From 22a7c31a9659deaddafbbcec6562d44141e84474 Mon Sep 17 00:00:00 2001
From: "Alan D. Brunelle" <Alan.Brunelle@hp.com>
Date: Mon, 4 May 2009 16:35:08 -0400
Subject: blktrace: from-sector redundant in trace_block_remap

Remove redundant from-sector parameter: it's /always/ the bio's sector
passed in.

[ Impact: cleanup ]

Signed-off-by: Alan D. Brunelle <alan.brunelle@hp.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49FF517C.7000503@hp.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/md/dm.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8a994be035ba..b01514afb6b5 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -657,8 +657,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 		/* the bio has been remapped so dispatch it */
 
 		trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
-				    tio->io->bio->bi_bdev->bd_dev,
-				    clone->bi_sector, sector);
+				    tio->io->bio->bi_bdev->bd_dev, sector);
 
 		generic_make_request(clone);
 	} else if (r < 0 || r == DM_MAPIO_REQUEUE) {
-- 
cgit v1.2.3


From e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:49 -0400
Subject: block: Do away with the notion of hardsect_size

Until now we have had a 1:1 mapping between storage device physical
block size and the logical block sized used when addressing the device.
With SATA 4KB drives coming out that will no longer be the case.  The
sector size will be 4KB but the logical block size will remain
512-bytes.  Hence we need to distinguish between the physical block size
and the logical ditto.

This patch renames hardsect_size to logical_block_size.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 arch/powerpc/sysdev/axonram.c       |  2 +-
 block/blk-integrity.c               |  2 +-
 block/blk-settings.c                | 21 ++++++++++-----------
 block/blk-sysfs.c                   | 12 +++++++++---
 block/compat_ioctl.c                |  2 +-
 block/ioctl.c                       |  2 +-
 drivers/block/cciss.c               |  6 +++---
 drivers/block/cpqarray.c            |  4 ++--
 drivers/block/hd.c                  |  2 +-
 drivers/block/mg_disk.c             |  2 +-
 drivers/block/pktcdvd.c             |  2 +-
 drivers/block/ps3disk.c             |  2 +-
 drivers/block/ub.c                  |  6 +++---
 drivers/block/virtio_blk.c          |  2 +-
 drivers/block/xen-blkfront.c        |  2 +-
 drivers/block/xsysace.c             |  2 +-
 drivers/cdrom/gdrom.c               |  2 +-
 drivers/cdrom/viocd.c               |  4 ++--
 drivers/char/raw.c                  |  2 +-
 drivers/ide/ide-cd.c                | 12 ++++++------
 drivers/md/bitmap.c                 |  4 ++--
 drivers/md/dm-exception-store.c     |  2 +-
 drivers/md/dm-log.c                 |  3 ++-
 drivers/md/dm-snap-persistent.c     |  2 +-
 drivers/md/dm-table.c               | 12 +++++++-----
 drivers/md/md.c                     |  2 +-
 drivers/memstick/core/mspro_block.c |  2 +-
 drivers/message/i2o/i2o_block.c     |  5 +++--
 drivers/mmc/card/block.c            |  2 +-
 drivers/mtd/mtd_blkdevs.c           |  2 +-
 drivers/s390/block/dasd.c           |  2 +-
 drivers/s390/block/dcssblk.c        |  2 +-
 drivers/s390/block/xpram.c          |  2 +-
 drivers/s390/char/tape_block.c      |  2 +-
 drivers/scsi/sd.c                   |  2 +-
 drivers/scsi/sr.c                   |  2 +-
 fs/bio.c                            |  3 ++-
 fs/block_dev.c                      |  6 +++---
 fs/buffer.c                         |  6 +++---
 fs/direct-io.c                      |  2 +-
 fs/ext3/super.c                     |  4 ++--
 fs/ext4/super.c                     |  2 +-
 fs/gfs2/ops_fstype.c                |  4 ++--
 fs/gfs2/rgrp.c                      |  2 +-
 fs/nilfs2/the_nilfs.c               |  2 +-
 fs/ntfs/super.c                     |  6 +++---
 fs/ocfs2/cluster/heartbeat.c        |  2 +-
 fs/ocfs2/super.c                    |  2 +-
 fs/partitions/ibm.c                 |  2 +-
 fs/partitions/msdos.c               |  4 ++--
 fs/udf/super.c                      |  2 +-
 fs/xfs/linux-2.6/xfs_buf.c          |  2 +-
 include/linux/blkdev.h              | 14 +++++++-------
 include/linux/device-mapper.h       |  2 +-
 54 files changed, 108 insertions(+), 98 deletions(-)

(limited to 'drivers/md')

diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c
index 9e105cbc5e5f..a4779912a5ca 100644
--- a/arch/powerpc/sysdev/axonram.c
+++ b/arch/powerpc/sysdev/axonram.c
@@ -250,7 +250,7 @@ axon_ram_probe(struct of_device *device, const struct of_device_id *device_id)
 
 	set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT);
 	blk_queue_make_request(bank->disk->queue, axon_ram_make_request);
-	blk_queue_hardsect_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
+	blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE);
 	add_disk(bank->disk);
 
 	bank->irq_id = irq_of_parse_and_map(device->node, 0);
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 91fa8e06b6a5..73e28d355688 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -340,7 +340,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
 		kobject_uevent(&bi->kobj, KOBJ_ADD);
 
 		bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE;
-		bi->sector_size = disk->queue->hardsect_size;
+		bi->sector_size = queue_logical_block_size(disk->queue);
 		disk->integrity = bi;
 	} else
 		bi = disk->integrity;
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 57af728d94bb..15c3164537b8 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -134,7 +134,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
 	q->backing_dev_info.state = 0;
 	q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
 	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
-	blk_queue_hardsect_size(q, 512);
+	blk_queue_logical_block_size(q, 512);
 	blk_queue_dma_alignment(q, 511);
 	blk_queue_congestion_threshold(q);
 	q->nr_batching = BLK_BATCH_REQ;
@@ -288,21 +288,20 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
 EXPORT_SYMBOL(blk_queue_max_segment_size);
 
 /**
- * blk_queue_hardsect_size - set hardware sector size for the queue
+ * blk_queue_logical_block_size - set logical block size for the queue
  * @q:  the request queue for the device
- * @size:  the hardware sector size, in bytes
+ * @size:  the logical block size, in bytes
  *
  * Description:
- *   This should typically be set to the lowest possible sector size
- *   that the hardware can operate on (possible without reverting to
- *   even internal read-modify-write operations). Usually the default
- *   of 512 covers most hardware.
+ *   This should be set to the lowest possible block size that the
+ *   storage device can address.  The default of 512 covers most
+ *   hardware.
  **/
-void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
+void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
 {
-	q->hardsect_size = size;
+	q->logical_block_size = size;
 }
-EXPORT_SYMBOL(blk_queue_hardsect_size);
+EXPORT_SYMBOL(blk_queue_logical_block_size);
 
 /*
  * Returns the minimum that is _not_ zero, unless both are zero.
@@ -324,7 +323,7 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
 	t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments);
 	t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments);
 	t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size);
-	t->hardsect_size = max(t->hardsect_size, b->hardsect_size);
+	t->logical_block_size = max(t->logical_block_size, b->logical_block_size);
 	if (!t->queue_lock)
 		WARN_ON_ONCE(1);
 	else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 3ff9bba3379a..13d38b7e4d0f 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -100,9 +100,9 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
 	return queue_var_show(max_sectors_kb, (page));
 }
 
-static ssize_t queue_hw_sector_size_show(struct request_queue *q, char *page)
+static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
 {
-	return queue_var_show(q->hardsect_size, page);
+	return queue_var_show(queue_logical_block_size(q), page);
 }
 
 static ssize_t
@@ -249,7 +249,12 @@ static struct queue_sysfs_entry queue_iosched_entry = {
 
 static struct queue_sysfs_entry queue_hw_sector_size_entry = {
 	.attr = {.name = "hw_sector_size", .mode = S_IRUGO },
-	.show = queue_hw_sector_size_show,
+	.show = queue_logical_block_size_show,
+};
+
+static struct queue_sysfs_entry queue_logical_block_size_entry = {
+	.attr = {.name = "logical_block_size", .mode = S_IRUGO },
+	.show = queue_logical_block_size_show,
 };
 
 static struct queue_sysfs_entry queue_nonrot_entry = {
@@ -283,6 +288,7 @@ static struct attribute *default_attrs[] = {
 	&queue_max_sectors_entry.attr,
 	&queue_iosched_entry.attr,
 	&queue_hw_sector_size_entry.attr,
+	&queue_logical_block_size_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index f87615dea46b..9eaa1940273a 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -763,7 +763,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 	case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
 		return compat_put_int(arg, block_size(bdev));
 	case BLKSSZGET: /* get block device hardware sector size */
-		return compat_put_int(arg, bdev_hardsect_size(bdev));
+		return compat_put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
 		return compat_put_ushort(arg,
 					 bdev_get_queue(bdev)->max_sectors);
diff --git a/block/ioctl.c b/block/ioctl.c
index ad474d4bbcce..7aa97f65da82 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -311,7 +311,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */
 		return put_int(arg, block_size(bdev));
 	case BLKSSZGET: /* get block device hardware sector size */
-		return put_int(arg, bdev_hardsect_size(bdev));
+		return put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
 		return put_ushort(arg, bdev_get_queue(bdev)->max_sectors);
 	case BLKRASET:
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index e714e7cce6f2..94474f5f8bce 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -1389,8 +1389,8 @@ static void cciss_add_disk(ctlr_info_t *h, struct gendisk *disk,
 
 	disk->queue->queuedata = h;
 
-	blk_queue_hardsect_size(disk->queue,
-				h->drv[drv_index].block_size);
+	blk_queue_logical_block_size(disk->queue,
+				     h->drv[drv_index].block_size);
 
 	/* Make sure all queue data is written out before */
 	/* setting h->drv[drv_index].queue, as setting this */
@@ -2298,7 +2298,7 @@ static int cciss_revalidate(struct gendisk *disk)
 	cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size,
 			       inq_buff, drv);
 
-	blk_queue_hardsect_size(drv->queue, drv->block_size);
+	blk_queue_logical_block_size(drv->queue, drv->block_size);
 	set_capacity(disk, drv->nr_blocks);
 
 	kfree(inq_buff);
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index a02dcfc00f13..44fa2018f6b0 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -474,7 +474,7 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev)
 		disk->fops = &ida_fops;
 		if (j && !drv->nr_blks)
 			continue;
-		blk_queue_hardsect_size(hba[i]->queue, drv->blk_size);
+		blk_queue_logical_block_size(hba[i]->queue, drv->blk_size);
 		set_capacity(disk, drv->nr_blks);
 		disk->queue = hba[i]->queue;
 		disk->private_data = drv;
@@ -1546,7 +1546,7 @@ static int revalidate_allvol(ctlr_info_t *host)
 		drv_info_t *drv = &host->drv[i];
 		if (i && !drv->nr_blks)
 			continue;
-		blk_queue_hardsect_size(host->queue, drv->blk_size);
+		blk_queue_logical_block_size(host->queue, drv->blk_size);
 		set_capacity(disk, drv->nr_blks);
 		disk->queue = host->queue;
 		disk->private_data = drv;
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index 961de56d00a9..f65b3f369eb0 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -724,7 +724,7 @@ static int __init hd_init(void)
 	blk_queue_max_sectors(hd_queue, 255);
 	init_timer(&device_timer);
 	device_timer.function = hd_times_out;
-	blk_queue_hardsect_size(hd_queue, 512);
+	blk_queue_logical_block_size(hd_queue, 512);
 
 	if (!NR_HD) {
 		/*
diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c
index c0cd0a03f698..60de5a01e71e 100644
--- a/drivers/block/mg_disk.c
+++ b/drivers/block/mg_disk.c
@@ -996,7 +996,7 @@ static int mg_probe(struct platform_device *plat_dev)
 		goto probe_err_6;
 	}
 	blk_queue_max_sectors(host->breq, MG_MAX_SECTS);
-	blk_queue_hardsect_size(host->breq, MG_SECTOR_SIZE);
+	blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE);
 
 	init_timer(&host->timer);
 	host->timer.function = mg_times_out;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index dc7a8c352da2..293f5858921d 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2657,7 +2657,7 @@ static void pkt_init_queue(struct pktcdvd_device *pd)
 	struct request_queue *q = pd->disk->queue;
 
 	blk_queue_make_request(q, pkt_make_request);
-	blk_queue_hardsect_size(q, CD_FRAMESIZE);
+	blk_queue_logical_block_size(q, CD_FRAMESIZE);
 	blk_queue_max_sectors(q, PACKET_MAX_SECTORS);
 	blk_queue_merge_bvec(q, pkt_merge_bvec);
 	q->queuedata = pd;
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 338cee4cc0ba..aaeeb544228a 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -477,7 +477,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
 	blk_queue_max_sectors(queue, dev->bounce_size >> 9);
 	blk_queue_segment_boundary(queue, -1UL);
 	blk_queue_dma_alignment(queue, dev->blk_size-1);
-	blk_queue_hardsect_size(queue, dev->blk_size);
+	blk_queue_logical_block_size(queue, dev->blk_size);
 
 	blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH,
 			  ps3disk_prepare_flush);
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index e67bbae9547d..cc54473b8e77 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -722,7 +722,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun,
 	/*
 	 * build the command
 	 *
-	 * The call to blk_queue_hardsect_size() guarantees that request
+	 * The call to blk_queue_logical_block_size() guarantees that request
 	 * is aligned, but it is given in terms of 512 byte units, always.
 	 */
 	block = blk_rq_pos(rq) >> lun->capacity.bshift;
@@ -1749,7 +1749,7 @@ static int ub_bd_revalidate(struct gendisk *disk)
 	ub_revalidate(lun->udev, lun);
 
 	/* XXX Support sector size switching like in sr.c */
-	blk_queue_hardsect_size(disk->queue, lun->capacity.bsize);
+	blk_queue_logical_block_size(disk->queue, lun->capacity.bsize);
 	set_capacity(disk, lun->capacity.nsec);
 	// set_disk_ro(sdkp->disk, lun->readonly);
 
@@ -2324,7 +2324,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum)
 	blk_queue_max_phys_segments(q, UB_MAX_REQ_SG);
 	blk_queue_segment_boundary(q, 0xffffffff);	/* Dubious. */
 	blk_queue_max_sectors(q, UB_MAX_SECTORS);
-	blk_queue_hardsect_size(q, lun->capacity.bsize);
+	blk_queue_logical_block_size(q, lun->capacity.bsize);
 
 	lun->disk = disk;
 	q->queuedata = lun;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 511d4ae2d176..c4845b169464 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -347,7 +347,7 @@ static int virtblk_probe(struct virtio_device *vdev)
 				offsetof(struct virtio_blk_config, blk_size),
 				&blk_size);
 	if (!err)
-		blk_queue_hardsect_size(vblk->disk->queue, blk_size);
+		blk_queue_logical_block_size(vblk->disk->queue, blk_size);
 
 	add_disk(vblk->disk);
 	return 0;
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 132120ae4bde..c1996829d5ec 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -344,7 +344,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 	queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
-	blk_queue_hardsect_size(rq, sector_size);
+	blk_queue_logical_block_size(rq, sector_size);
 	blk_queue_max_sectors(rq, 512);
 
 	/* Each segment in a request is up to an aligned page in size. */
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 3a4397edab71..f08491a3a813 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -984,7 +984,7 @@ static int __devinit ace_setup(struct ace_device *ace)
 	ace->queue = blk_init_queue(ace_request, &ace->lock);
 	if (ace->queue == NULL)
 		goto err_blk_initq;
-	blk_queue_hardsect_size(ace->queue, 512);
+	blk_queue_logical_block_size(ace->queue, 512);
 
 	/*
 	 * Allocate and initialize GD structure
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index 1e366ad8f680..b5621f27c4be 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -739,7 +739,7 @@ static void __devinit probe_gdrom_setupdisk(void)
 
 static int __devinit probe_gdrom_setupqueue(void)
 {
-	blk_queue_hardsect_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
+	blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
 	/* using DMA so memory will need to be contiguous */
 	blk_queue_max_hw_segments(gd.gdrom_rq, 1);
 	/* set a large max size to get most from DMA */
diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c
index f177c2d4017f..0fff646cc2f0 100644
--- a/drivers/cdrom/viocd.c
+++ b/drivers/cdrom/viocd.c
@@ -469,8 +469,8 @@ static void vio_handle_cd_event(struct HvLpEvent *event)
 	case viocdopen:
 		if (event->xRc == 0) {
 			di = &viocd_diskinfo[bevent->disk];
-			blk_queue_hardsect_size(di->viocd_disk->queue,
-					bevent->block_size);
+			blk_queue_logical_block_size(di->viocd_disk->queue,
+						     bevent->block_size);
 			set_capacity(di->viocd_disk,
 					bevent->media_size *
 					bevent->block_size / 512);
diff --git a/drivers/char/raw.c b/drivers/char/raw.c
index 20d90e6a6e50..db32f0e4c7dd 100644
--- a/drivers/char/raw.c
+++ b/drivers/char/raw.c
@@ -71,7 +71,7 @@ static int raw_open(struct inode *inode, struct file *filp)
 	err = bd_claim(bdev, raw_open);
 	if (err)
 		goto out1;
-	err = set_blocksize(bdev, bdev_hardsect_size(bdev));
+	err = set_blocksize(bdev, bdev_logical_block_size(bdev));
 	if (err)
 		goto out2;
 	filp->f_flags |= O_DIRECT;
diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c
index 1799328decfb..424140c6c400 100644
--- a/drivers/ide/ide-cd.c
+++ b/drivers/ide/ide-cd.c
@@ -182,7 +182,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
 				 (sense->information[2] <<  8) |
 				 (sense->information[3]);
 
-			if (drive->queue->hardsect_size == 2048)
+			if (queue_logical_block_size(drive->queue) == 2048)
 				/* device sector size is 2K */
 				sector <<= 2;
 
@@ -737,7 +737,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq)
 	struct request_queue *q = drive->queue;
 	int write = rq_data_dir(rq) == WRITE;
 	unsigned short sectors_per_frame =
-		queue_hardsect_size(q) >> SECTOR_BITS;
+		queue_logical_block_size(q) >> SECTOR_BITS;
 
 	ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, "
 				  "secs_per_frame: %u",
@@ -1021,8 +1021,8 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense)
 	/* save a private copy of the TOC capacity for error handling */
 	drive->probed_capacity = toc->capacity * sectors_per_frame;
 
-	blk_queue_hardsect_size(drive->queue,
-				sectors_per_frame << SECTOR_BITS);
+	blk_queue_logical_block_size(drive->queue,
+				     sectors_per_frame << SECTOR_BITS);
 
 	/* first read just the header, so we know how long the TOC is */
 	stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr,
@@ -1338,7 +1338,7 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive)
 /* standard prep_rq_fn that builds 10 byte cmds */
 static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq)
 {
-	int hard_sect = queue_hardsect_size(q);
+	int hard_sect = queue_logical_block_size(q);
 	long block = (long)blk_rq_pos(rq) / (hard_sect >> 9);
 	unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9);
 
@@ -1543,7 +1543,7 @@ static int ide_cdrom_setup(ide_drive_t *drive)
 
 	nslots = ide_cdrom_probe_capabilities(drive);
 
-	blk_queue_hardsect_size(q, CD_FRAMESIZE);
+	blk_queue_logical_block_size(q, CD_FRAMESIZE);
 
 	if (ide_cdrom_register(drive, nslots)) {
 		printk(KERN_ERR PFX "%s: %s failed to register device with the"
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 47c68bc75a17..06b0ded1ce23 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset,
 		target = rdev->sb_start + offset + index * (PAGE_SIZE/512);
 
 		if (sync_page_io(rdev->bdev, target,
-				 roundup(size, bdev_hardsect_size(rdev->bdev)),
+				 roundup(size, bdev_logical_block_size(rdev->bdev)),
 				 page, READ)) {
 			page->index = index;
 			attach_page_buffers(page, NULL); /* so that free_buffer will
@@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
 			int size = PAGE_SIZE;
 			if (page->index == bitmap->file_pages-1)
 				size = roundup(bitmap->last_page_size,
-					       bdev_hardsect_size(rdev->bdev));
+					       bdev_logical_block_size(rdev->bdev));
 			/* Just make sure we aren't corrupting data or
 			 * metadata
 			 */
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index a2e26c242141..75d8081a9041 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store,
 	}
 
 	/* Validate the chunk size against the device block size */
-	if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) {
+	if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) {
 		*error = "Chunk size is not a multiple of device blocksize";
 		return -EINVAL;
 	}
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index be233bc4d917..6fa8ccf91c70 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -413,7 +413,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		 * Buffer holds both header and bitset.
 		 */
 		buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
-				       bitset_size, ti->limits.hardsect_size);
+				       bitset_size,
+				       ti->limits.logical_block_size);
 
 		if (buf_size > dev->bdev->bd_inode->i_size) {
 			DMWARN("log device %s too small: need %llu bytes",
diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index e75c6dd76a9a..2662a41337e7 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot)
 	 */
 	if (!ps->store->chunk_size) {
 		ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS,
-		    bdev_hardsect_size(ps->store->cow->bdev) >> 9);
+		    bdev_logical_block_size(ps->store->cow->bdev) >> 9);
 		ps->store->chunk_mask = ps->store->chunk_size - 1;
 		ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1;
 		chunk_size_supplied = 0;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 429b50b975d5..65e2d9759857 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -108,7 +108,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs,
 	lhs->max_hw_segments =
 		min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
 
-	lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size);
+	lhs->logical_block_size = max(lhs->logical_block_size,
+				      rhs->logical_block_size);
 
 	lhs->max_segment_size =
 		min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
@@ -529,7 +530,8 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	rs->max_hw_segments =
 		min_not_zero(rs->max_hw_segments, q->max_hw_segments);
 
-	rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size);
+	rs->logical_block_size = max(rs->logical_block_size,
+				     queue_logical_block_size(q));
 
 	rs->max_segment_size =
 		min_not_zero(rs->max_segment_size, q->max_segment_size);
@@ -683,8 +685,8 @@ static void check_for_valid_limits(struct io_restrictions *rs)
 		rs->max_phys_segments = MAX_PHYS_SEGMENTS;
 	if (!rs->max_hw_segments)
 		rs->max_hw_segments = MAX_HW_SEGMENTS;
-	if (!rs->hardsect_size)
-		rs->hardsect_size = 1 << SECTOR_SHIFT;
+	if (!rs->logical_block_size)
+		rs->logical_block_size = 1 << SECTOR_SHIFT;
 	if (!rs->max_segment_size)
 		rs->max_segment_size = MAX_SEGMENT_SIZE;
 	if (!rs->seg_boundary_mask)
@@ -914,7 +916,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	blk_queue_max_sectors(q, t->limits.max_sectors);
 	q->max_phys_segments = t->limits.max_phys_segments;
 	q->max_hw_segments = t->limits.max_hw_segments;
-	q->hardsect_size = t->limits.hardsect_size;
+	q->logical_block_size = t->limits.logical_block_size;
 	q->max_segment_size = t->limits.max_segment_size;
 	q->max_hw_sectors = t->limits.max_hw_sectors;
 	q->seg_boundary_mask = t->limits.seg_boundary_mask;
diff --git a/drivers/md/md.c b/drivers/md/md.c
index fccc8343a250..4cbc19f5c304 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1202,7 +1202,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
 
 	rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
-	bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1;
+	bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
 	if (rdev->sb_size & bmask)
 		rdev->sb_size = (rdev->sb_size | bmask) + 1;
 
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index c0bebc6a2f2c..7847bbc1440d 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1242,7 +1242,7 @@ static int mspro_block_init_disk(struct memstick_dev *card)
 
 	sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
 
-	blk_queue_hardsect_size(msb->queue, msb->page_size);
+	blk_queue_logical_block_size(msb->queue, msb->page_size);
 
 	capacity = be16_to_cpu(sys_info->user_block_count);
 	capacity *= be16_to_cpu(sys_info->block_size);
diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c
index 6573ef4408f1..335d4c78a775 100644
--- a/drivers/message/i2o/i2o_block.c
+++ b/drivers/message/i2o/i2o_block.c
@@ -794,8 +794,9 @@ static int i2o_block_transfer(struct request *req)
 	if (c->adaptec) {
 		u8 cmd[10];
 		u32 scsi_flags;
-		u16 hwsec = queue_hardsect_size(req->q) >> KERNEL_SECTOR_SHIFT;
+		u16 hwsec;
 
+		hwsec = queue_logical_block_size(req->q) >> KERNEL_SECTOR_SHIFT;
 		memset(cmd, 0, 10);
 
 		sgl_offset = SGL_OFFSET_12;
@@ -1078,7 +1079,7 @@ static int i2o_block_probe(struct device *dev)
 	 */
 	if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) ||
 	    !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) {
-		blk_queue_hardsect_size(queue, le32_to_cpu(blocksize));
+		blk_queue_logical_block_size(queue, le32_to_cpu(blocksize));
 	} else
 		osm_warn("unable to get blocksize of %s\n", gd->disk_name);
 
diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c
index c5df86546458..98ffc41eaf2c 100644
--- a/drivers/mmc/card/block.c
+++ b/drivers/mmc/card/block.c
@@ -521,7 +521,7 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card)
 
 	sprintf(md->disk->disk_name, "mmcblk%d", devidx);
 
-	blk_queue_hardsect_size(md->queue.queue, 512);
+	blk_queue_logical_block_size(md->queue.queue, 512);
 
 	if (!mmc_card_sd(card) && mmc_card_blockaddr(card)) {
 		/*
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index 502622f628bc..aaac3b6800b7 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -378,7 +378,7 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr)
 	}
 
 	tr->blkcore_priv->rq->queuedata = tr;
-	blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize);
+	blk_queue_logical_block_size(tr->blkcore_priv->rq, tr->blksize);
 	if (tr->discard)
 		blk_queue_set_discard(tr->blkcore_priv->rq,
 				      blktrans_discard_request);
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index e64f62d5e0fc..27a1be0cd4d4 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -1990,7 +1990,7 @@ static void dasd_setup_queue(struct dasd_block *block)
 {
 	int max;
 
-	blk_queue_hardsect_size(block->request_queue, block->bp_block);
+	blk_queue_logical_block_size(block->request_queue, block->bp_block);
 	max = block->base->discipline->max_blocks << block->s2b_shift;
 	blk_queue_max_sectors(block->request_queue, max);
 	blk_queue_max_phys_segments(block->request_queue, -1L);
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index cfdcf1aed33c..a4c7ffcd9987 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -602,7 +602,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
 	dev_info->gd->private_data = dev_info;
 	dev_info->gd->driverfs_dev = &dev_info->dev;
 	blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request);
-	blk_queue_hardsect_size(dev_info->dcssblk_queue, 4096);
+	blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096);
 
 	seg_byte_size = (dev_info->end - dev_info->start + 1);
 	set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors
diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c
index 76814f3e898a..0ae0c83ef879 100644
--- a/drivers/s390/block/xpram.c
+++ b/drivers/s390/block/xpram.c
@@ -343,7 +343,7 @@ static int __init xpram_setup_blkdev(void)
 			goto out;
 		}
 		blk_queue_make_request(xpram_queues[i], xpram_make_request);
-		blk_queue_hardsect_size(xpram_queues[i], 4096);
+		blk_queue_logical_block_size(xpram_queues[i], 4096);
 	}
 
 	/*
diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c
index 1e7967675980..47ff695255ea 100644
--- a/drivers/s390/char/tape_block.c
+++ b/drivers/s390/char/tape_block.c
@@ -222,7 +222,7 @@ tapeblock_setup_device(struct tape_device * device)
 	if (rc)
 		goto cleanup_queue;
 
-	blk_queue_hardsect_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE);
+	blk_queue_logical_block_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE);
 	blk_queue_max_sectors(blkdat->request_queue, TAPEBLOCK_MAX_SEC);
 	blk_queue_max_phys_segments(blkdat->request_queue, -1L);
 	blk_queue_max_hw_segments(blkdat->request_queue, -1L);
diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c
index 40d2860f235a..bcf3bd40bbd5 100644
--- a/drivers/scsi/sd.c
+++ b/drivers/scsi/sd.c
@@ -1510,7 +1510,7 @@ got_data:
 		 */
 		sector_size = 512;
 	}
-	blk_queue_hardsect_size(sdp->request_queue, sector_size);
+	blk_queue_logical_block_size(sdp->request_queue, sector_size);
 
 	{
 		char cap_str_2[10], cap_str_10[10];
diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c
index fddba53c7fe5..cd350dfc1216 100644
--- a/drivers/scsi/sr.c
+++ b/drivers/scsi/sr.c
@@ -727,7 +727,7 @@ static void get_sectorsize(struct scsi_cd *cd)
 	}
 
 	queue = cd->device->request_queue;
-	blk_queue_hardsect_size(queue, sector_size);
+	blk_queue_logical_block_size(queue, sector_size);
 
 	return;
 }
diff --git a/fs/bio.c b/fs/bio.c
index 81dc93e72535..4445c3821730 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors)
 sector_t bio_sector_offset(struct bio *bio, unsigned short index,
 			   unsigned int offset)
 {
-	unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue);
+	unsigned int sector_sz;
 	struct bio_vec *bv;
 	sector_t sectors;
 	int i;
 
+	sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue);
 	sectors = 0;
 
 	if (index >= bio->bi_idx)
diff --git a/fs/block_dev.c b/fs/block_dev.c
index a85fe310fc6f..a29b4dcc1bca 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -76,7 +76,7 @@ int set_blocksize(struct block_device *bdev, int size)
 		return -EINVAL;
 
 	/* Size cannot be smaller than the size supported by the device */
-	if (size < bdev_hardsect_size(bdev))
+	if (size < bdev_logical_block_size(bdev))
 		return -EINVAL;
 
 	/* Don't change the size if it is same as current */
@@ -106,7 +106,7 @@ EXPORT_SYMBOL(sb_set_blocksize);
 
 int sb_min_blocksize(struct super_block *sb, int size)
 {
-	int minsize = bdev_hardsect_size(sb->s_bdev);
+	int minsize = bdev_logical_block_size(sb->s_bdev);
 	if (size < minsize)
 		size = minsize;
 	return sb_set_blocksize(sb, size);
@@ -1117,7 +1117,7 @@ EXPORT_SYMBOL(check_disk_change);
 
 void bd_set_size(struct block_device *bdev, loff_t size)
 {
-	unsigned bsize = bdev_hardsect_size(bdev);
+	unsigned bsize = bdev_logical_block_size(bdev);
 
 	bdev->bd_inode->i_size = size;
 	while (bsize < PAGE_CACHE_SIZE) {
diff --git a/fs/buffer.c b/fs/buffer.c
index aed297739eb0..36e2bbc60ec7 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1085,12 +1085,12 @@ static struct buffer_head *
 __getblk_slow(struct block_device *bdev, sector_t block, int size)
 {
 	/* Size must be multiple of hard sectorsize */
-	if (unlikely(size & (bdev_hardsect_size(bdev)-1) ||
+	if (unlikely(size & (bdev_logical_block_size(bdev)-1) ||
 			(size < 512 || size > PAGE_SIZE))) {
 		printk(KERN_ERR "getblk(): invalid block size %d requested\n",
 					size);
-		printk(KERN_ERR "hardsect size: %d\n",
-					bdev_hardsect_size(bdev));
+		printk(KERN_ERR "logical block size: %d\n",
+					bdev_logical_block_size(bdev));
 
 		dump_stack();
 		return NULL;
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 05763bbc2050..8b10b87dc01a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 		rw = WRITE_ODIRECT;
 
 	if (bdev)
-		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
+		bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev));
 
 	if (offset & blocksize_mask) {
 		if (bdev)
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 599dbfe504c3..acbb94fdf903 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1696,7 +1696,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		goto failed_mount;
 	}
 
-	hblock = bdev_hardsect_size(sb->s_bdev);
+	hblock = bdev_logical_block_size(sb->s_bdev);
 	if (sb->s_blocksize != blocksize) {
 		/*
 		 * Make sure the blocksize for the filesystem is larger
@@ -2119,7 +2119,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb,
 	}
 
 	blocksize = sb->s_blocksize;
-	hblock = bdev_hardsect_size(bdev);
+	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
 		printk(KERN_ERR
 			"EXT3-fs: blocksize too small for journal device.\n");
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 2958f4e6f222..a30549f7a305 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2962,7 +2962,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 	}
 
 	blocksize = sb->s_blocksize;
-	hblock = bdev_hardsect_size(bdev);
+	hblock = bdev_logical_block_size(bdev);
 	if (blocksize < hblock) {
 		printk(KERN_ERR
 			"EXT4-fs: blocksize too small for journal device.\n");
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 1ff9473ea753..a3b2ac989fc3 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent)
 	}
 
 	/* Set up the buffer cache and SB for real */
-	if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) {
+	if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) {
 		ret = -EINVAL;
 		fs_err(sdp, "FS block size (%u) is too small for device "
 		       "block size (%u)\n",
-		       sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev));
+		       sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev));
 		goto out;
 	}
 	if (sdp->sd_sb.sb_bsize > PAGE_SIZE) {
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 565038243fa2..a971d24e10ce 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -845,7 +845,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 	struct super_block *sb = sdp->sd_vfs;
 	struct block_device *bdev = sb->s_bdev;
 	const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize /
-					   bdev_hardsect_size(sb->s_bdev);
+					   bdev_logical_block_size(sb->s_bdev);
 	u64 blk;
 	sector_t start = 0;
 	sector_t nr_sects = 0;
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index 7f65b3be4aa9..a91f15b8673c 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -515,7 +515,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
 
 	blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
 	if (sb->s_blocksize != blocksize) {
-		int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
+		int hw_blocksize = bdev_logical_block_size(sb->s_bdev);
 
 		if (blocksize < hw_blocksize) {
 			printk(KERN_ERR
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index f76951dcd4a6..6aa7c4713536 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -25,7 +25,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
-#include <linux/blkdev.h>	/* For bdev_hardsect_size(). */
+#include <linux/blkdev.h>	/* For bdev_logical_block_size(). */
 #include <linux/backing-dev.h>
 #include <linux/buffer_head.h>
 #include <linux/vfs.h>
@@ -2785,13 +2785,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent)
 		goto err_out_now;
 
 	/* We support sector sizes up to the PAGE_CACHE_SIZE. */
-	if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
+	if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) {
 		if (!silent)
 			ntfs_error(sb, "Device has unsupported sector size "
 					"(%i).  The maximum supported sector "
 					"size on this architecture is %lu "
 					"bytes.",
-					bdev_hardsect_size(sb->s_bdev),
+					bdev_logical_block_size(sb->s_bdev),
 					PAGE_CACHE_SIZE);
 		goto err_out_now;
 	}
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 4f85eceab376..09cc25d04611 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
 
 	bdevname(reg->hr_bdev, reg->hr_dev_name);
 
-	sectsize = bdev_hardsect_size(reg->hr_bdev);
+	sectsize = bdev_logical_block_size(reg->hr_bdev);
 	if (sectsize != reg->hr_block_bytes) {
 		mlog(ML_ERROR,
 		     "blocksize %u incorrect for device, expected %d",
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 79ff8d9d37e0..5c6163f55039 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -713,7 +713,7 @@ static int ocfs2_sb_probe(struct super_block *sb,
 	*bh = NULL;
 
 	/* may be > 512 */
-	*sector_size = bdev_hardsect_size(sb->s_bdev);
+	*sector_size = bdev_logical_block_size(sb->s_bdev);
 	if (*sector_size > OCFS2_MAX_BLOCKSIZE) {
 		mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n",
 		     *sector_size, OCFS2_MAX_BLOCKSIZE);
diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c
index 46297683cd34..fc71aab08460 100644
--- a/fs/partitions/ibm.c
+++ b/fs/partitions/ibm.c
@@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev)
 	Sector sect;
 
 	res = 0;
-	blocksize = bdev_hardsect_size(bdev);
+	blocksize = bdev_logical_block_size(bdev);
 	if (blocksize <= 0)
 		goto out_exit;
 	i_size = i_size_read(bdev->bd_inode);
diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c
index 796511886f28..0028d2ef0662 100644
--- a/fs/partitions/msdos.c
+++ b/fs/partitions/msdos.c
@@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev,
 	Sector sect;
 	unsigned char *data;
 	u32 this_sector, this_size;
-	int sector_size = bdev_hardsect_size(bdev) / 512;
+	int sector_size = bdev_logical_block_size(bdev) / 512;
 	int loopct = 0;		/* number of links followed
 				   without finding a data partition */
 	int i;
@@ -415,7 +415,7 @@ static struct {
  
 int msdos_partition(struct parsed_partitions *state, struct block_device *bdev)
 {
-	int sector_size = bdev_hardsect_size(bdev) / 512;
+	int sector_size = bdev_logical_block_size(bdev) / 512;
 	Sector sect;
 	unsigned char *data;
 	struct partition *p;
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 72348cc855a4..0ba44107d8f1 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -1915,7 +1915,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) {
 		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
 	} else {
-		uopt.blocksize = bdev_hardsect_size(sb->s_bdev);
+		uopt.blocksize = bdev_logical_block_size(sb->s_bdev);
 		ret = udf_load_vrs(sb, &uopt, silent, &fileset);
 		if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) {
 			if (!silent)
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index e28800a9f2b5..1418b916fc27 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early(
 	struct block_device	*bdev)
 {
 	return xfs_setsize_buftarg_flags(btp,
-			PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0);
+			PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0);
 }
 
 int
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 56ce53fce72e..872b78b7a101 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -391,7 +391,7 @@ struct request_queue
 	unsigned int		max_hw_sectors;
 	unsigned short		max_phys_segments;
 	unsigned short		max_hw_segments;
-	unsigned short		hardsect_size;
+	unsigned short		logical_block_size;
 	unsigned int		max_segment_size;
 
 	unsigned long		seg_boundary_mask;
@@ -901,7 +901,7 @@ extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
-extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
+extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
@@ -988,19 +988,19 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 
-static inline int queue_hardsect_size(struct request_queue *q)
+static inline unsigned short queue_logical_block_size(struct request_queue *q)
 {
 	int retval = 512;
 
-	if (q && q->hardsect_size)
-		retval = q->hardsect_size;
+	if (q && q->logical_block_size)
+		retval = q->logical_block_size;
 
 	return retval;
 }
 
-static inline int bdev_hardsect_size(struct block_device *bdev)
+static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 {
-	return queue_hardsect_size(bdev_get_queue(bdev));
+	return queue_logical_block_size(bdev_get_queue(bdev));
 }
 
 static inline int queue_dma_alignment(struct request_queue *q)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index ded2d7c42668..49c2362977fd 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -149,7 +149,7 @@ struct io_restrictions {
 	unsigned max_hw_sectors;
 	unsigned max_sectors;
 	unsigned max_segment_size;
-	unsigned short hardsect_size;
+	unsigned short logical_block_size;
 	unsigned short max_hw_segments;
 	unsigned short max_phys_segments;
 	unsigned char no_cluster; /* inverted so that 0 is default */
-- 
cgit v1.2.3


From ae03bf639a5027d27270123f5f6e3ee6a412781d Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Fri, 22 May 2009 17:17:50 -0400
Subject: block: Use accessor functions for queue limits

Convert all external users of queue limits to using wrapper functions
instead of poking the request queue variables directly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-barrier.c            |  8 ++++----
 block/blk-core.c               | 16 ++++++++--------
 block/blk-map.c                |  4 ++--
 block/blk-merge.c              | 27 ++++++++++++++-------------
 block/blk-settings.c           | 15 ++++++++++++---
 block/blk-sysfs.c              |  8 ++++----
 block/compat_ioctl.c           |  2 +-
 block/ioctl.c                  | 10 +++++-----
 block/scsi_ioctl.c             |  8 ++++----
 drivers/block/pktcdvd.c        |  6 ++++--
 drivers/cdrom/cdrom.c          |  4 ++--
 drivers/md/dm-table.c          | 28 ++++++++++++++--------------
 drivers/md/linear.c            |  2 +-
 drivers/md/multipath.c         |  4 ++--
 drivers/md/raid0.c             |  2 +-
 drivers/md/raid1.c             |  4 ++--
 drivers/md/raid10.c            |  8 ++++----
 drivers/md/raid5.c             |  4 ++--
 drivers/scsi/sg.c              | 15 ++++++++-------
 drivers/scsi/st.c              |  4 ++--
 drivers/usb/storage/scsiglue.c |  4 ++--
 fs/bio.c                       | 19 ++++++++++---------
 include/linux/bio.h            |  2 +-
 include/linux/blkdev.h         | 36 ++++++++++++++++++++++++++++++++++++
 mm/bounce.c                    |  4 ++--
 25 files changed, 147 insertions(+), 97 deletions(-)

(limited to 'drivers/md')

diff --git a/block/blk-barrier.c b/block/blk-barrier.c
index 0d98054cdbd7..30022b4e2f63 100644
--- a/block/blk-barrier.c
+++ b/block/blk-barrier.c
@@ -388,10 +388,10 @@ int blkdev_issue_discard(struct block_device *bdev,
 
 		bio->bi_sector = sector;
 
-		if (nr_sects > q->max_hw_sectors) {
-			bio->bi_size = q->max_hw_sectors << 9;
-			nr_sects -= q->max_hw_sectors;
-			sector += q->max_hw_sectors;
+		if (nr_sects > queue_max_hw_sectors(q)) {
+			bio->bi_size = queue_max_hw_sectors(q) << 9;
+			nr_sects -= queue_max_hw_sectors(q);
+			sector += queue_max_hw_sectors(q);
 		} else {
 			bio->bi_size = nr_sects << 9;
 			nr_sects = 0;
diff --git a/block/blk-core.c b/block/blk-core.c
index 59c4af523112..7a4c40184a64 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1437,11 +1437,11 @@ static inline void __generic_make_request(struct bio *bio)
 			goto end_io;
 		}
 
-		if (unlikely(nr_sectors > q->max_hw_sectors)) {
+		if (unlikely(nr_sectors > queue_max_hw_sectors(q))) {
 			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
-				bdevname(bio->bi_bdev, b),
-				bio_sectors(bio),
-				q->max_hw_sectors);
+			       bdevname(bio->bi_bdev, b),
+			       bio_sectors(bio),
+			       queue_max_hw_sectors(q));
 			goto end_io;
 		}
 
@@ -1608,8 +1608,8 @@ EXPORT_SYMBOL(submit_bio);
  */
 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 {
-	if (blk_rq_sectors(rq) > q->max_sectors ||
-	    blk_rq_bytes(rq) > q->max_hw_sectors << 9) {
+	if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
+	    blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
 		printk(KERN_ERR "%s: over max size limit.\n", __func__);
 		return -EIO;
 	}
@@ -1621,8 +1621,8 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq)
 	 * limitation.
 	 */
 	blk_recalc_rq_segments(rq);
-	if (rq->nr_phys_segments > q->max_phys_segments ||
-	    rq->nr_phys_segments > q->max_hw_segments) {
+	if (rq->nr_phys_segments > queue_max_phys_segments(q) ||
+	    rq->nr_phys_segments > queue_max_hw_segments(q)) {
 		printk(KERN_ERR "%s: over max segments limit.\n", __func__);
 		return -EIO;
 	}
diff --git a/block/blk-map.c b/block/blk-map.c
index ef2492adca7e..9083cf0180cc 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -115,7 +115,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
 	struct bio *bio = NULL;
 	int ret;
 
-	if (len > (q->max_hw_sectors << 9))
+	if (len > (queue_max_hw_sectors(q) << 9))
 		return -EINVAL;
 	if (!len)
 		return -EINVAL;
@@ -292,7 +292,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
 	struct bio *bio;
 	int ret;
 
-	if (len > (q->max_hw_sectors << 9))
+	if (len > (queue_max_hw_sectors(q) << 9))
 		return -EINVAL;
 	if (!len || !kbuf)
 		return -EINVAL;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 4974dd5767e5..39ce64432ba6 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -32,11 +32,12 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
 			 * never considered part of another segment, since that
 			 * might change with the bounce page.
 			 */
-			high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
+			high = page_to_pfn(bv->bv_page) > queue_bounce_pfn(q);
 			if (high || highprv)
 				goto new_segment;
 			if (cluster) {
-				if (seg_size + bv->bv_len > q->max_segment_size)
+				if (seg_size + bv->bv_len
+				    > queue_max_segment_size(q))
 					goto new_segment;
 				if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
 					goto new_segment;
@@ -91,7 +92,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
 		return 0;
 
 	if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
-	    q->max_segment_size)
+	    queue_max_segment_size(q))
 		return 0;
 
 	if (!bio_has_data(bio))
@@ -134,7 +135,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
 		int nbytes = bvec->bv_len;
 
 		if (bvprv && cluster) {
-			if (sg->length + nbytes > q->max_segment_size)
+			if (sg->length + nbytes > queue_max_segment_size(q))
 				goto new_segment;
 
 			if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
@@ -205,8 +206,8 @@ static inline int ll_new_hw_segment(struct request_queue *q,
 {
 	int nr_phys_segs = bio_phys_segments(q, bio);
 
-	if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments
-	    || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
+	if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) ||
+	    req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) {
 		req->cmd_flags |= REQ_NOMERGE;
 		if (req == q->last_merge)
 			q->last_merge = NULL;
@@ -227,9 +228,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
 	unsigned short max_sectors;
 
 	if (unlikely(blk_pc_request(req)))
-		max_sectors = q->max_hw_sectors;
+		max_sectors = queue_max_hw_sectors(q);
 	else
-		max_sectors = q->max_sectors;
+		max_sectors = queue_max_sectors(q);
 
 	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
 		req->cmd_flags |= REQ_NOMERGE;
@@ -251,9 +252,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
 	unsigned short max_sectors;
 
 	if (unlikely(blk_pc_request(req)))
-		max_sectors = q->max_hw_sectors;
+		max_sectors = queue_max_hw_sectors(q);
 	else
-		max_sectors = q->max_sectors;
+		max_sectors = queue_max_sectors(q);
 
 
 	if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) {
@@ -287,7 +288,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 	/*
 	 * Will it become too large?
 	 */
-	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > q->max_sectors)
+	if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q))
 		return 0;
 
 	total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
@@ -299,10 +300,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
 		total_phys_segments--;
 	}
 
-	if (total_phys_segments > q->max_phys_segments)
+	if (total_phys_segments > queue_max_phys_segments(q))
 		return 0;
 
-	if (total_phys_segments > q->max_hw_segments)
+	if (total_phys_segments > queue_max_hw_segments(q))
 		return 0;
 
 	/* Merge is OK... */
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 15c3164537b8..0b32f984eed2 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -219,6 +219,15 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
 }
 EXPORT_SYMBOL(blk_queue_max_sectors);
 
+void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors)
+{
+	if (BLK_DEF_MAX_SECTORS > max_sectors)
+		q->max_hw_sectors = BLK_DEF_MAX_SECTORS;
+	else
+		q->max_hw_sectors = max_sectors;
+}
+EXPORT_SYMBOL(blk_queue_max_hw_sectors);
+
 /**
  * blk_queue_max_phys_segments - set max phys segments for a request for this queue
  * @q:  the request queue for the device
@@ -395,11 +404,11 @@ int blk_queue_dma_drain(struct request_queue *q,
 			       dma_drain_needed_fn *dma_drain_needed,
 			       void *buf, unsigned int size)
 {
-	if (q->max_hw_segments < 2 || q->max_phys_segments < 2)
+	if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2)
 		return -EINVAL;
 	/* make room for appending the drain */
-	--q->max_hw_segments;
-	--q->max_phys_segments;
+	blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1);
+	blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1);
 	q->dma_drain_needed = dma_drain_needed;
 	q->dma_drain_buffer = buf;
 	q->dma_drain_size = size;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 13d38b7e4d0f..142a4acddd43 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -95,7 +95,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
 {
-	int max_sectors_kb = q->max_sectors >> 1;
+	int max_sectors_kb = queue_max_sectors(q) >> 1;
 
 	return queue_var_show(max_sectors_kb, (page));
 }
@@ -109,7 +109,7 @@ static ssize_t
 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 {
 	unsigned long max_sectors_kb,
-			max_hw_sectors_kb = q->max_hw_sectors >> 1,
+		max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
 			page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
 	ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
 
@@ -117,7 +117,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 		return -EINVAL;
 
 	spin_lock_irq(q->queue_lock);
-	q->max_sectors = max_sectors_kb << 1;
+	blk_queue_max_sectors(q, max_sectors_kb << 1);
 	spin_unlock_irq(q->queue_lock);
 
 	return ret;
@@ -125,7 +125,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
 
 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
 {
-	int max_hw_sectors_kb = q->max_hw_sectors >> 1;
+	int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1;
 
 	return queue_var_show(max_hw_sectors_kb, (page));
 }
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 9eaa1940273a..df18a156d011 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -766,7 +766,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 		return compat_put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
 		return compat_put_ushort(arg,
-					 bdev_get_queue(bdev)->max_sectors);
+					 queue_max_sectors(bdev_get_queue(bdev)));
 	case BLKRASET: /* compatible, but no compat_ptr (!) */
 	case BLKFRASET:
 		if (!capable(CAP_SYS_ADMIN))
diff --git a/block/ioctl.c b/block/ioctl.c
index 7aa97f65da82..500e4c73cc52 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -152,10 +152,10 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start,
 		bio->bi_private = &wait;
 		bio->bi_sector = start;
 
-		if (len > q->max_hw_sectors) {
-			bio->bi_size = q->max_hw_sectors << 9;
-			len -= q->max_hw_sectors;
-			start += q->max_hw_sectors;
+		if (len > queue_max_hw_sectors(q)) {
+			bio->bi_size = queue_max_hw_sectors(q) << 9;
+			len -= queue_max_hw_sectors(q);
+			start += queue_max_hw_sectors(q);
 		} else {
 			bio->bi_size = len << 9;
 			len = 0;
@@ -313,7 +313,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 	case BLKSSZGET: /* get block device hardware sector size */
 		return put_int(arg, bdev_logical_block_size(bdev));
 	case BLKSECTGET:
-		return put_ushort(arg, bdev_get_queue(bdev)->max_sectors);
+		return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev)));
 	case BLKRASET:
 	case BLKFRASET:
 		if(!capable(CAP_SYS_ADMIN))
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
index a9670dd4b5de..5f8e798ede4e 100644
--- a/block/scsi_ioctl.c
+++ b/block/scsi_ioctl.c
@@ -75,7 +75,7 @@ static int sg_set_timeout(struct request_queue *q, int __user *p)
 
 static int sg_get_reserved_size(struct request_queue *q, int __user *p)
 {
-	unsigned val = min(q->sg_reserved_size, q->max_sectors << 9);
+	unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9);
 
 	return put_user(val, p);
 }
@@ -89,8 +89,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p)
 
 	if (size < 0)
 		return -EINVAL;
-	if (size > (q->max_sectors << 9))
-		size = q->max_sectors << 9;
+	if (size > (queue_max_sectors(q) << 9))
+		size = queue_max_sectors(q) << 9;
 
 	q->sg_reserved_size = size;
 	return 0;
@@ -264,7 +264,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
 	if (hdr->cmd_len > BLK_MAX_CDB)
 		return -EINVAL;
 
-	if (hdr->dxfer_len > (q->max_hw_sectors << 9))
+	if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
 		return -EIO;
 
 	if (hdr->dxfer_len)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 293f5858921d..d57f11759480 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -991,13 +991,15 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
  */
 static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
 {
-	if ((pd->settings.size << 9) / CD_FRAMESIZE <= q->max_phys_segments) {
+	if ((pd->settings.size << 9) / CD_FRAMESIZE
+	    <= queue_max_phys_segments(q)) {
 		/*
 		 * The cdrom device can handle one segment/frame
 		 */
 		clear_bit(PACKET_MERGE_SEGS, &pd->flags);
 		return 0;
-	} else if ((pd->settings.size << 9) / PAGE_SIZE <= q->max_phys_segments) {
+	} else if ((pd->settings.size << 9) / PAGE_SIZE
+		   <= queue_max_phys_segments(q)) {
 		/*
 		 * We can handle this case at the expense of some extra memory
 		 * copies during write operations
diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c
index cceace61ef28..71d1b9bab70b 100644
--- a/drivers/cdrom/cdrom.c
+++ b/drivers/cdrom/cdrom.c
@@ -2101,8 +2101,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf,
 		nr = nframes;
 		if (cdi->cdda_method == CDDA_BPC_SINGLE)
 			nr = 1;
-		if (nr * CD_FRAMESIZE_RAW > (q->max_sectors << 9))
-			nr = (q->max_sectors << 9) / CD_FRAMESIZE_RAW;
+		if (nr * CD_FRAMESIZE_RAW > (queue_max_sectors(q) << 9))
+			nr = (queue_max_sectors(q) << 9) / CD_FRAMESIZE_RAW;
 
 		len = nr * CD_FRAMESIZE_RAW;
 
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 65e2d9759857..e9a73bb242b0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -510,7 +510,7 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	 *        combine_restrictions_low()
 	 */
 	rs->max_sectors =
-		min_not_zero(rs->max_sectors, q->max_sectors);
+		min_not_zero(rs->max_sectors, queue_max_sectors(q));
 
 	/*
 	 * Check if merge fn is supported.
@@ -525,25 +525,25 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 
 	rs->max_phys_segments =
 		min_not_zero(rs->max_phys_segments,
-			     q->max_phys_segments);
+			     queue_max_phys_segments(q));
 
 	rs->max_hw_segments =
-		min_not_zero(rs->max_hw_segments, q->max_hw_segments);
+		min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q));
 
 	rs->logical_block_size = max(rs->logical_block_size,
 				     queue_logical_block_size(q));
 
 	rs->max_segment_size =
-		min_not_zero(rs->max_segment_size, q->max_segment_size);
+		min_not_zero(rs->max_segment_size, queue_max_segment_size(q));
 
 	rs->max_hw_sectors =
-		min_not_zero(rs->max_hw_sectors, q->max_hw_sectors);
+		min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q));
 
 	rs->seg_boundary_mask =
 		min_not_zero(rs->seg_boundary_mask,
-			     q->seg_boundary_mask);
+			     queue_segment_boundary(q));
 
-	rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn);
+	rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q));
 
 	rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 }
@@ -914,13 +914,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	 * restrictions.
 	 */
 	blk_queue_max_sectors(q, t->limits.max_sectors);
-	q->max_phys_segments = t->limits.max_phys_segments;
-	q->max_hw_segments = t->limits.max_hw_segments;
-	q->logical_block_size = t->limits.logical_block_size;
-	q->max_segment_size = t->limits.max_segment_size;
-	q->max_hw_sectors = t->limits.max_hw_sectors;
-	q->seg_boundary_mask = t->limits.seg_boundary_mask;
-	q->bounce_pfn = t->limits.bounce_pfn;
+	blk_queue_max_phys_segments(q, t->limits.max_phys_segments);
+	blk_queue_max_hw_segments(q, t->limits.max_hw_segments);
+	blk_queue_logical_block_size(q, t->limits.logical_block_size);
+	blk_queue_max_segment_size(q, t->limits.max_segment_size);
+	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
+	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
+	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 7a36e38393a1..64f1f3e046e0 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		 * a one page request is never in violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->num_sectors = rdev->sectors;
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 41ced0cbe823..4ee31aa13c40 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 		 * merge_bvec_fn will be involved in multipath.)
 		 */
 			if (q->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+			    queue_max_sectors(q) > (PAGE_SIZE>>9))
 				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			conf->working_disks++;
@@ -467,7 +467,7 @@ static int multipath_run (mddev_t *mddev)
 		 * violating it, not that we ever expect a device with
 		 * a merge_bvec_fn to be involved in multipath */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		if (!test_bit(Faulty, &rdev->flags))
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c08d7559be55..925507e7d673 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -144,7 +144,7 @@ static int create_strip_zones (mddev_t *mddev)
 		 */
 
 		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		if (!smallest || (rdev1->sectors < smallest->sectors))
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 36df9109cde1..e23758b4a34e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			 * a one page request is never in violation.
 			 */
 			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+			    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			p->head_position = 0;
@@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev)
 		 * a one page request is never in violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->head_position = 0;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 499620afb44b..750550c1166f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 			 * a one page request is never in violation.
 			 */
 			if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-			    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-				mddev->queue->max_sectors = (PAGE_SIZE>>9);
+			    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
+				blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 			p->head_position = 0;
 			rdev->raid_disk = mirror;
@@ -2145,8 +2145,8 @@ static int run(mddev_t *mddev)
 		 * a one page request is never in violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
-		    mddev->queue->max_sectors > (PAGE_SIZE>>9))
-			mddev->queue->max_sectors = (PAGE_SIZE>>9);
+		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
+			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
 		disk->head_position = 0;
 	}
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4616bc3a6e71..7970dc8c522e 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3463,10 +3463,10 @@ static int bio_fits_rdev(struct bio *bi)
 {
 	struct request_queue *q = bdev_get_queue(bi->bi_bdev);
 
-	if ((bi->bi_size>>9) > q->max_sectors)
+	if ((bi->bi_size>>9) > queue_max_sectors(q))
 		return 0;
 	blk_recount_segments(q, bi);
-	if (bi->bi_phys_segments > q->max_phys_segments)
+	if (bi->bi_phys_segments > queue_max_phys_segments(q))
 		return 0;
 
 	if (q->merge_bvec_fn)
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 0fc2c0ae7691..9bd407fa98e4 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -289,8 +289,8 @@ sg_open(struct inode *inode, struct file *filp)
 	if (list_empty(&sdp->sfds)) {	/* no existing opens on this device */
 		sdp->sgdebug = 0;
 		q = sdp->device->request_queue;
-		sdp->sg_tablesize = min(q->max_hw_segments,
-					q->max_phys_segments);
+		sdp->sg_tablesize = min(queue_max_hw_segments(q),
+					queue_max_phys_segments(q));
 	}
 	if ((sfp = sg_add_sfp(sdp, dev)))
 		filp->private_data = sfp;
@@ -909,7 +909,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
                 if (val < 0)
                         return -EINVAL;
 		val = min_t(int, val,
-				sdp->device->request_queue->max_sectors * 512);
+			    queue_max_sectors(sdp->device->request_queue) * 512);
 		if (val != sfp->reserve.bufflen) {
 			if (sg_res_in_use(sfp) || sfp->mmap_called)
 				return -EBUSY;
@@ -919,7 +919,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 		return 0;
 	case SG_GET_RESERVED_SIZE:
 		val = min_t(int, sfp->reserve.bufflen,
-				sdp->device->request_queue->max_sectors * 512);
+			    queue_max_sectors(sdp->device->request_queue) * 512);
 		return put_user(val, ip);
 	case SG_SET_COMMAND_Q:
 		result = get_user(val, ip);
@@ -1059,7 +1059,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
 			return -ENODEV;
 		return scsi_ioctl(sdp->device, cmd_in, p);
 	case BLKSECTGET:
-		return put_user(sdp->device->request_queue->max_sectors * 512,
+		return put_user(queue_max_sectors(sdp->device->request_queue) * 512,
 				ip);
 	case BLKTRACESETUP:
 		return blk_trace_setup(sdp->device->request_queue,
@@ -1377,7 +1377,8 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp)
 	sdp->device = scsidp;
 	INIT_LIST_HEAD(&sdp->sfds);
 	init_waitqueue_head(&sdp->o_excl_wait);
-	sdp->sg_tablesize = min(q->max_hw_segments, q->max_phys_segments);
+	sdp->sg_tablesize = min(queue_max_hw_segments(q),
+				queue_max_phys_segments(q));
 	sdp->index = k;
 	kref_init(&sdp->d_ref);
 
@@ -2055,7 +2056,7 @@ sg_add_sfp(Sg_device * sdp, int dev)
 		sg_big_buff = def_reserved_size;
 
 	bufflen = min_t(int, sg_big_buff,
-			sdp->device->request_queue->max_sectors * 512);
+			queue_max_sectors(sdp->device->request_queue) * 512);
 	sg_build_reserve(sfp, bufflen);
 	SCSI_LOG_TIMEOUT(3, printk("sg_add_sfp:   bufflen=%d, k_use_sg=%d\n",
 			   sfp->reserve.bufflen, sfp->reserve.k_use_sg));
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c
index 8681b708344f..89bd438e1fe3 100644
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -3983,8 +3983,8 @@ static int st_probe(struct device *dev)
 		return -ENODEV;
 	}
 
-	i = min(SDp->request_queue->max_hw_segments,
-		SDp->request_queue->max_phys_segments);
+	i = min(queue_max_hw_segments(SDp->request_queue),
+		queue_max_phys_segments(SDp->request_queue));
 	if (st_max_sg_segs < i)
 		i = st_max_sg_segs;
 	buffer = new_tape_buffer((SDp->host)->unchecked_isa_dma, i);
diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c
index 4ca3b5860643..cfa26d56ce60 100644
--- a/drivers/usb/storage/scsiglue.c
+++ b/drivers/usb/storage/scsiglue.c
@@ -132,7 +132,7 @@ static int slave_configure(struct scsi_device *sdev)
 
 		if (us->fflags & US_FL_MAX_SECTORS_MIN)
 			max_sectors = PAGE_CACHE_SIZE >> 9;
-		if (sdev->request_queue->max_sectors > max_sectors)
+		if (queue_max_sectors(sdev->request_queue) > max_sectors)
 			blk_queue_max_sectors(sdev->request_queue,
 					      max_sectors);
 	} else if (sdev->type == TYPE_TAPE) {
@@ -483,7 +483,7 @@ static ssize_t show_max_sectors(struct device *dev, struct device_attribute *att
 {
 	struct scsi_device *sdev = to_scsi_device(dev);
 
-	return sprintf(buf, "%u\n", sdev->request_queue->max_sectors);
+	return sprintf(buf, "%u\n", queue_max_sectors(sdev->request_queue));
 }
 
 /* Input routine for the sysfs max_sectors file */
diff --git a/fs/bio.c b/fs/bio.c
index 4445c3821730..ab423a1024ab 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -499,11 +499,11 @@ int bio_get_nr_vecs(struct block_device *bdev)
 	struct request_queue *q = bdev_get_queue(bdev);
 	int nr_pages;
 
-	nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	if (nr_pages > q->max_phys_segments)
-		nr_pages = q->max_phys_segments;
-	if (nr_pages > q->max_hw_segments)
-		nr_pages = q->max_hw_segments;
+	nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (nr_pages > queue_max_phys_segments(q))
+		nr_pages = queue_max_phys_segments(q);
+	if (nr_pages > queue_max_hw_segments(q))
+		nr_pages = queue_max_hw_segments(q);
 
 	return nr_pages;
 }
@@ -562,8 +562,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 	 * make this too complex.
 	 */
 
-	while (bio->bi_phys_segments >= q->max_phys_segments
-	       || bio->bi_phys_segments >= q->max_hw_segments) {
+	while (bio->bi_phys_segments >= queue_max_phys_segments(q)
+	       || bio->bi_phys_segments >= queue_max_hw_segments(q)) {
 
 		if (retried_segments)
 			return 0;
@@ -634,7 +634,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
 int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
 		    unsigned int len, unsigned int offset)
 {
-	return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors);
+	return __bio_add_page(q, bio, page, len, offset,
+			      queue_max_hw_sectors(q));
 }
 
 /**
@@ -654,7 +655,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
 		 unsigned int offset)
 {
 	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	return __bio_add_page(q, bio, page, len, offset, q->max_sectors);
+	return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
 }
 
 struct bio_map_data {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index d30ec6f30dd7..12737be58601 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -279,7 +279,7 @@ static inline int bio_has_allocated_vec(struct bio *bio)
 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
 	(((addr1) | (mask)) == (((addr2) - 1) | (mask)))
 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
-	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask)
+	__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q)))
 #define BIO_SEG_BOUNDARY(q, b1, b2) \
 	BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
 
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 872b78b7a101..29b48f7b4ba8 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -898,6 +898,7 @@ extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
+extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
@@ -988,6 +989,41 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter);
 
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 
+static inline unsigned long queue_bounce_pfn(struct request_queue *q)
+{
+	return q->bounce_pfn;
+}
+
+static inline unsigned long queue_segment_boundary(struct request_queue *q)
+{
+	return q->seg_boundary_mask;
+}
+
+static inline unsigned int queue_max_sectors(struct request_queue *q)
+{
+	return q->max_sectors;
+}
+
+static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
+{
+	return q->max_hw_sectors;
+}
+
+static inline unsigned short queue_max_hw_segments(struct request_queue *q)
+{
+	return q->max_hw_segments;
+}
+
+static inline unsigned short queue_max_phys_segments(struct request_queue *q)
+{
+	return q->max_phys_segments;
+}
+
+static inline unsigned int queue_max_segment_size(struct request_queue *q)
+{
+	return q->max_segment_size;
+}
+
 static inline unsigned short queue_logical_block_size(struct request_queue *q)
 {
 	int retval = 512;
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a8..8dcd4315e01c 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -192,7 +192,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
 		/*
 		 * is destination page below bounce pfn?
 		 */
-		if (page_to_pfn(page) <= q->bounce_pfn)
+		if (page_to_pfn(page) <= queue_bounce_pfn(q))
 			continue;
 
 		/*
@@ -284,7 +284,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
 	 * don't waste time iterating over bio segments
 	 */
 	if (!(q->bounce_gfp & GFP_DMA)) {
-		if (q->bounce_pfn >= blk_max_pfn)
+		if (queue_bounce_pfn(q) >= blk_max_pfn)
 			return;
 		pool = page_pool;
 	} else {
-- 
cgit v1.2.3


From a05c0205ba031c01bba33a21bf0a35920eb64833 Mon Sep 17 00:00:00 2001
From: "Martin K. Petersen" <martin.petersen@oracle.com>
Date: Wed, 3 Jun 2009 09:33:18 +0200
Subject: block: Fix bounce limit setting in DM

blk_queue_bounce_limit() is more than a wrapper about the request queue
limits.bounce_pfn variable.  Introduce blk_queue_bounce_pfn() which can
be called by stacking drivers that wish to set the bounce limit
explicitly.

Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c   | 17 +++++++++++++++++
 drivers/md/dm-table.c  |  2 +-
 include/linux/blkdev.h |  1 +
 3 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8d3393492891..9acd0b7e802a 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -193,6 +193,23 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
+/**
+ * blk_queue_bounce_pfn - set the bounce buffer limit for queue
+ * @q: the request queue for the device
+ * @pfn: max address
+ *
+ * Description:
+ *    This function is similar to blk_queue_bounce_limit except it
+ *    neither changes allocation flags, nor does it set up the ISA DMA
+ *    pool. This function should only be used by stacking drivers.
+ *    Hardware drivers should use blk_queue_bounce_limit instead.
+ */
+void blk_queue_bounce_pfn(struct request_queue *q, u64 pfn)
+{
+	q->limits.bounce_pfn = pfn;
+}
+EXPORT_SYMBOL(blk_queue_bounce_pfn);
+
 /**
  * blk_queue_max_sectors - set max sectors for a request for this queue
  * @q:  the request queue for the device
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e9a73bb242b0..3ca1604ddd5c 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	blk_queue_max_segment_size(q, t->limits.max_segment_size);
 	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
 	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
-	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
+	blk_queue_bounce_pfn(q, t->limits.bounce_pfn);
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 5e740a135e73..989aa1790f48 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -910,6 +910,7 @@ extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
+extern void blk_queue_bounce_pfn(struct request_queue *, u64);
 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
-- 
cgit v1.2.3


From 9df1bb9b516daeece159ab7fb262d01a0359247c Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Tue, 9 Jun 2009 06:22:57 +0200
Subject: Revert "block: Fix bounce limit setting in DM"

This reverts commit a05c0205ba031c01bba33a21bf0a35920eb64833.

DM doesn't need to access the bounce_pfn directly.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-settings.c   | 17 -----------------
 drivers/md/dm-table.c  |  2 +-
 include/linux/blkdev.h |  1 -
 3 files changed, 1 insertion(+), 19 deletions(-)

(limited to 'drivers/md')

diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9acd0b7e802a..8d3393492891 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -193,23 +193,6 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
 }
 EXPORT_SYMBOL(blk_queue_bounce_limit);
 
-/**
- * blk_queue_bounce_pfn - set the bounce buffer limit for queue
- * @q: the request queue for the device
- * @pfn: max address
- *
- * Description:
- *    This function is similar to blk_queue_bounce_limit except it
- *    neither changes allocation flags, nor does it set up the ISA DMA
- *    pool. This function should only be used by stacking drivers.
- *    Hardware drivers should use blk_queue_bounce_limit instead.
- */
-void blk_queue_bounce_pfn(struct request_queue *q, u64 pfn)
-{
-	q->limits.bounce_pfn = pfn;
-}
-EXPORT_SYMBOL(blk_queue_bounce_pfn);
-
 /**
  * blk_queue_max_sectors - set max sectors for a request for this queue
  * @q:  the request queue for the device
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 3ca1604ddd5c..e9a73bb242b0 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -920,7 +920,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 	blk_queue_max_segment_size(q, t->limits.max_segment_size);
 	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
 	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
-	blk_queue_bounce_pfn(q, t->limits.bounce_pfn);
+	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 989aa1790f48..5e740a135e73 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -910,7 +910,6 @@ extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
-extern void blk_queue_bounce_pfn(struct request_queue *, u64);
 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
-- 
cgit v1.2.3


From 55782138e47d9baf2f7d3a7af9e7cf42adf72c56 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 9 Jun 2009 13:43:05 +0800
Subject: tracing/events: convert block trace points to TRACE_EVENT()

TRACE_EVENT is a more generic way to define tracepoints. Doing so adds
these new capabilities to this tracepoint:

  - zero-copy and per-cpu splice() tracing
  - binary tracing without printf overhead
  - structured logging records exposed under /debug/tracing/events
  - trace events embedded in function tracer output and other plugins
  - user-defined, per tracepoint filter expressions
  ...

Cons:

  - no dev_t info for the output of plug, unplug_timer and unplug_io events.
    no dev_t info for getrq and sleeprq events if bio == NULL.
    no dev_t info for rq_abort,...,rq_requeue events if rq->rq_disk == NULL.

    This is mainly because we can't get the deivce from a request queue.
    But this may change in the future.

  - A packet command is converted to a string in TP_assign, not TP_print.
    While blktrace do the convertion just before output.

    Since pc requests should be rather rare, this is not a big issue.

  - In blktrace, an event can have 2 different print formats, but a TRACE_EVENT
    has a unique format, which means we have some unused data in a trace entry.

    The overhead is minimized by using __dynamic_array() instead of __array().

I've benchmarked the ioctl blktrace vs the splice based TRACE_EVENT tracing:

      dd                   dd + ioctl blktrace       dd + TRACE_EVENT (splice)
1     7.36s, 42.7 MB/s     7.50s, 42.0 MB/s          7.41s, 42.5 MB/s
2     7.43s, 42.3 MB/s     7.48s, 42.1 MB/s          7.43s, 42.4 MB/s
3     7.38s, 42.6 MB/s     7.45s, 42.2 MB/s          7.41s, 42.5 MB/s

So the overhead of tracing is very small, and no regression when using
those trace events vs blktrace.

And the binary output of TRACE_EVENT is much smaller than blktrace:

 # ls -l -h
 -rw-r--r-- 1 root root 8.8M 06-09 13:24 sda.blktrace.0
 -rw-r--r-- 1 root root 195K 06-09 13:24 sda.blktrace.1
 -rw-r--r-- 1 root root 2.7M 06-09 13:25 trace_splice.out

Following are some comparisons between TRACE_EVENT and blktrace:

plug:
  kjournald-480   [000]   303.084981: block_plug: [kjournald]
  kjournald-480   [000]   303.084981:   8,0    P   N [kjournald]

unplug_io:
  kblockd/0-118   [000]   300.052973: block_unplug_io: [kblockd/0] 1
  kblockd/0-118   [000]   300.052974:   8,0    U   N [kblockd/0] 1

remap:
  kjournald-480   [000]   303.085042: block_remap: 8,0 W 102736992 + 8 <- (8,8) 33384
  kjournald-480   [000]   303.085043:   8,0    A   W 102736992 + 8 <- (8,8) 33384

bio_backmerge:
  kjournald-480   [000]   303.085086: block_bio_backmerge: 8,0 W 102737032 + 8 [kjournald]
  kjournald-480   [000]   303.085086:   8,0    M   W 102737032 + 8 [kjournald]

getrq:
  kjournald-480   [000]   303.084974: block_getrq: 8,0 W 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084975:   8,0    G   W 102736984 + 8 [kjournald]

  bash-2066  [001]  1072.953770:   8,0    G   N [bash]
  bash-2066  [001]  1072.953773: block_getrq: 0,0 N 0 + 0 [bash]

rq_complete:
  konsole-2065  [001]   300.053184: block_rq_complete: 8,0 W () 103669040 + 16 [0]
  konsole-2065  [001]   300.053191:   8,0    C   W 103669040 + 16 [0]

  ksoftirqd/1-7   [001]  1072.953811:   8,0    C   N (5a 00 08 00 00 00 00 00 24 00) [0]
  ksoftirqd/1-7   [001]  1072.953813: block_rq_complete: 0,0 N (5a 00 08 00 00 00 00 00 24 00) 0 + 0 [0]

rq_insert:
  kjournald-480   [000]   303.084985: block_rq_insert: 8,0 W 0 () 102736984 + 8 [kjournald]
  kjournald-480   [000]   303.084986:   8,0    I   W 102736984 + 8 [kjournald]

Changelog from v2 -> v3:

- use the newly introduced __dynamic_array().

Changelog from v1 -> v2:

- use __string() instead of __array() to minimize the memory required
  to store hex dump of rq->cmd().

- support large pc requests.

- add missing blk_fill_rwbs_rq() in block_rq_requeue TRACE_EVENT.

- some cleanups.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4A2DF669.5070905@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 block/blk-core.c             |  16 +-
 block/elevator.c             |   8 +-
 drivers/md/dm.c              |   5 +-
 fs/bio.c                     |   3 +-
 include/linux/blktrace_api.h |  13 ++
 include/trace/block.h        |  76 -------
 include/trace/events/block.h | 483 +++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/Makefile        |   5 +-
 kernel/trace/blktrace.c      |  78 ++++++-
 mm/bounce.c                  |   5 +-
 10 files changed, 588 insertions(+), 104 deletions(-)
 delete mode 100644 include/trace/block.h
 create mode 100644 include/trace/events/block.h

(limited to 'drivers/md')

diff --git a/block/blk-core.c b/block/blk-core.c
index 1306de9cce04..9475bf99b891 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,22 +28,14 @@
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
-#include <trace/block.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/block.h>
 
 #include "blk.h"
 
-DEFINE_TRACE(block_plug);
-DEFINE_TRACE(block_unplug_io);
-DEFINE_TRACE(block_unplug_timer);
-DEFINE_TRACE(block_getrq);
-DEFINE_TRACE(block_sleeprq);
-DEFINE_TRACE(block_rq_requeue);
-DEFINE_TRACE(block_bio_backmerge);
-DEFINE_TRACE(block_bio_frontmerge);
-DEFINE_TRACE(block_bio_queue);
-DEFINE_TRACE(block_rq_complete);
-DEFINE_TRACE(block_remap);	/* Also used in drivers/md/dm.c */
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
 static int __make_request(struct request_queue *q, struct bio *bio);
 
diff --git a/block/elevator.c b/block/elevator.c
index 7073a9072577..e220f0c543e3 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -33,17 +33,16 @@
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
+#include <trace/events/block.h>
+
 #include "blk.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
-DEFINE_TRACE(block_rq_abort);
-
 /*
  * Merge hash stuff.
  */
@@ -55,9 +54,6 @@ static const int elv_hash_shift = 6;
 #define rq_hash_key(rq)		((rq)->sector + (rq)->nr_sectors)
 #define ELV_ON_HASH(rq)		(!hlist_unhashed(&(rq)->hash))
 
-DEFINE_TRACE(block_rq_insert);
-DEFINE_TRACE(block_rq_issue);
-
 /*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e2ee4a79ea2c..3fd8b1e65483 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -20,7 +20,8 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
+
+#include <trace/events/block.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -53,8 +54,6 @@ struct dm_target_io {
 	union map_info info;
 };
 
-DEFINE_TRACE(block_bio_complete);
-
 /*
  * For request-based dm.
  * One of these is allocated per request.
diff --git a/fs/bio.c b/fs/bio.c
index 98711647ece4..740699c4f90c 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -26,10 +26,9 @@
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
-DEFINE_TRACE(block_split);
+#include <trace/events/block.h>
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 82b4636030e9..c7ec31dd04c9 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -218,5 +218,18 @@ static inline int blk_trace_init_sysfs(struct device *dev)
 
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
+#ifdef CONFIG_EVENT_TRACING
+
+static inline int blk_cmd_buf_len(struct request *rq)
+{
+	return blk_pc_request(rq) ? rq->cmd_len * 3 : 1;
+}
+
+extern void blk_dump_cmd(char *buf, struct request *rq);
+extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
+extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
+
+#endif /* CONFIG_EVENT_TRACING */
+
 #endif /* __KERNEL__ */
 #endif
diff --git a/include/trace/block.h b/include/trace/block.h
deleted file mode 100644
index 5b12efa096b6..000000000000
--- a/include/trace/block.h
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef _TRACE_BLOCK_H
-#define _TRACE_BLOCK_H
-
-#include <linux/blkdev.h>
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(block_rq_abort,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_insert,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_issue,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_requeue,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_complete,
-	TP_PROTO(struct request_queue *q, struct request *rq),
-	      TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_bio_bounce,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_complete,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_backmerge,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_frontmerge,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_queue,
-	TP_PROTO(struct request_queue *q, struct bio *bio),
-	      TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_getrq,
-	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-	      TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_sleeprq,
-	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-	      TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_plug,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_timer,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_io,
-	TP_PROTO(struct request_queue *q),
-	      TP_ARGS(q));
-
-DECLARE_TRACE(block_split,
-	TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
-	      TP_ARGS(q, bio, pdu));
-
-DECLARE_TRACE(block_remap,
-	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		 sector_t from),
-	      TP_ARGS(q, bio, dev, from));
-
-#endif
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
new file mode 100644
index 000000000000..a99d1e565bb0
--- /dev/null
+++ b/include/trace/events/block.h
@@ -0,0 +1,483 @@
+#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BLOCK_H
+
+#include <linux/blktrace_api.h>
+#include <linux/blkdev.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM block
+
+TRACE_EVENT(block_rq_abort,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors    = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_insert,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  unsigned int,	bytes			)
+		__array(  char,		rwbs,	6		)
+		__array(  char,         comm,   TASK_COMM_LEN   )
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __entry->bytes, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_issue,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  unsigned int,	bytes			)
+		__array(  char,		rwbs,	6		)
+		__array(  char,		comm,   TASK_COMM_LEN   )
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __entry->bytes, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_rq_requeue,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors	   = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->errors)
+);
+
+TRACE_EVENT(block_rq_complete,
+
+	TP_PROTO(struct request_queue *q, struct request *rq),
+
+	TP_ARGS(q, rq),
+
+	TP_STRUCT__entry(
+		__field(  dev_t,	dev			)
+		__field(  sector_t,	sector			)
+		__field(  unsigned int,	nr_sector		)
+		__field(  int,		errors			)
+		__array(  char,		rwbs,	6		)
+		__dynamic_array( char,	cmd,	blk_cmd_buf_len(rq)	)
+	),
+
+	TP_fast_assign(
+		__entry->dev	   = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+		__entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+		__entry->nr_sector = blk_pc_request(rq) ?
+						0 : rq->hard_nr_sectors;
+		__entry->errors    = rq->errors;
+
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+		blk_dump_cmd(__get_str(cmd), rq);
+	),
+
+	TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->rwbs, __get_str(cmd),
+		  __entry->sector, __entry->nr_sector, __entry->errors)
+);
+TRACE_EVENT(block_bio_bounce,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_complete,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned,	nr_sector	)
+		__field( int,		error		)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%d]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->error)
+);
+
+TRACE_EVENT(block_bio_backmerge,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_frontmerge,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_bio_queue,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+
+	TP_ARGS(q, bio),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_getrq,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+	TP_ARGS(q, bio, rw),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+        ),
+
+	TP_fast_assign(
+		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
+		__entry->sector		= bio ? bio->bi_sector : 0;
+		__entry->nr_sector	= bio ? bio->bi_size >> 9 : 0;
+		blk_fill_rwbs(__entry->rwbs,
+			      bio ? bio->bi_rw : 0, __entry->nr_sector);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+        ),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_sleeprq,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+	TP_ARGS(q, bio, rw),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev			)
+		__field( sector_t,	sector			)
+		__field( unsigned int,	nr_sector		)
+		__array( char,		rwbs,	6		)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio ? bio->bi_bdev->bd_dev : 0;
+		__entry->sector		= bio ? bio->bi_sector : 0;
+		__entry->nr_sector	= bio ? bio->bi_size >> 9 : 0;
+		blk_fill_rwbs(__entry->rwbs,
+			    bio ? bio->bi_rw : 0, __entry->nr_sector);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu + %u [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_plug,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s]", __entry->comm)
+);
+
+TRACE_EVENT(block_unplug_timer,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__field( int,		nr_rq			)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->nr_rq	= q->rq.count[READ] + q->rq.count[WRITE];
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_unplug_io,
+
+	TP_PROTO(struct request_queue *q),
+
+	TP_ARGS(q),
+
+	TP_STRUCT__entry(
+		__field( int,		nr_rq			)
+		__array( char,		comm,	TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->nr_rq	= q->rq.count[READ] + q->rq.count[WRITE];
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+TRACE_EVENT(block_split,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio,
+		 unsigned int new_sector),
+
+	TP_ARGS(q, bio, new_sector),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev				)
+		__field( sector_t,	sector				)
+		__field( sector_t,	new_sector			)
+		__array( char,		rwbs,		6		)
+		__array( char,		comm,		TASK_COMM_LEN	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->new_sector	= new_sector;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+	),
+
+	TP_printk("%d,%d %s %llu / %llu [%s]",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->new_sector, __entry->comm)
+);
+
+TRACE_EVENT(block_remap,
+
+	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+		 sector_t from),
+
+	TP_ARGS(q, bio, dev, from),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned int,	nr_sector	)
+		__field( dev_t,		old_dev		)
+		__field( sector_t,	old_sector	)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= bio->bi_bdev->bd_dev;
+		__entry->sector		= bio->bi_sector;
+		__entry->nr_sector	= bio->bi_size >> 9;
+		__entry->old_dev	= dev;
+		__entry->old_sector	= from;
+		blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+	),
+
+	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  __entry->sector, __entry->nr_sector,
+		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
+		  __entry->old_sector)
+);
+
+#endif /* _TRACE_BLOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 06b85850fab4..844164dca90a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -45,7 +45,10 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
 obj-$(CONFIG_EVENT_TRACING) += trace_events.o
 obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e3abf55bc8e5..7bd6a9893c24 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,10 +23,14 @@
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
-#include <trace/block.h>
 #include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
 #include "trace_output.h"
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
 static unsigned int blktrace_seq __read_mostly = 1;
 
 static struct trace_array *blk_tr;
@@ -1658,3 +1662,75 @@ int blk_trace_init_sysfs(struct device *dev)
 	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
 }
 
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+	int i, end;
+	int len = rq->cmd_len;
+	unsigned char *cmd = rq->cmd;
+
+	if (!blk_pc_request(rq)) {
+		buf[0] = '\0';
+		return;
+	}
+
+	for (end = len - 1; end >= 0; end--)
+		if (cmd[end])
+			break;
+	end++;
+
+	for (i = 0; i < len; i++) {
+		buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+		if (i == end && end != len - 1) {
+			sprintf(buf, " ..");
+			break;
+		}
+	}
+}
+
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+	int i = 0;
+
+	if (rw & WRITE)
+		rwbs[i++] = 'W';
+	else if (rw & 1 << BIO_RW_DISCARD)
+		rwbs[i++] = 'D';
+	else if (bytes)
+		rwbs[i++] = 'R';
+	else
+		rwbs[i++] = 'N';
+
+	if (rw & 1 << BIO_RW_AHEAD)
+		rwbs[i++] = 'A';
+	if (rw & 1 << BIO_RW_BARRIER)
+		rwbs[i++] = 'B';
+	if (rw & 1 << BIO_RW_SYNCIO)
+		rwbs[i++] = 'S';
+	if (rw & 1 << BIO_RW_META)
+		rwbs[i++] = 'M';
+
+	rwbs[i] = '\0';
+}
+
+void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
+{
+	int rw = rq->cmd_flags & 0x03;
+	int bytes;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq))
+		bytes = rq->data_len;
+	else
+		bytes = rq->hard_nr_sectors << 9;
+
+	blk_fill_rwbs(rwbs, rw, bytes);
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
diff --git a/mm/bounce.c b/mm/bounce.c
index e590272fe7a8..65f5e17e411a 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -14,16 +14,15 @@
 #include <linux/hash.h>
 #include <linux/highmem.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <asm/tlbflush.h>
 
+#include <trace/events/block.h>
+
 #define POOL_SIZE	64
 #define ISA_POOL_SIZE	16
 
 static mempool_t *page_pool, *isa_page_pool;
 
-DEFINE_TRACE(block_bio_bounce);
-
 #ifdef CONFIG_HIGHMEM
 static __init int init_emergency_pool(void)
 {
-- 
cgit v1.2.3


From d405640539555b601e52f7d18f1f0b1345d18bf5 Mon Sep 17 00:00:00 2001
From: Kay Sievers <kay.sievers@vrfy.org>
Date: Thu, 30 Apr 2009 15:23:42 +0200
Subject: Driver Core: misc: add nodename support for misc devices.

This adds support for misc devices to report their requested nodename to
userspace.  It also updates a number of misc drivers to provide the
needed subdirectory and device name to be used for them.

Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Jan Blunck <jblunck@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 arch/x86/kernel/microcode_core.c |  1 +
 drivers/char/hw_random/core.c    |  1 +
 drivers/char/misc.c              | 15 ++++++++++++---
 drivers/md/dm-ioctl.c            |  1 +
 drivers/net/tun.c                |  1 +
 include/linux/miscdevice.h       |  1 +
 6 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 9c4461501fcb..9371448290ac 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -236,6 +236,7 @@ static const struct file_operations microcode_fops = {
 static struct miscdevice microcode_dev = {
 	.minor			= MICROCODE_MINOR,
 	.name			= "microcode",
+	.devnode		= "cpu/microcode",
 	.fops			= &microcode_fops,
 };
 
diff --git a/drivers/char/hw_random/core.c b/drivers/char/hw_random/core.c
index e5d583c84e4f..fc93e2fc7c71 100644
--- a/drivers/char/hw_random/core.c
+++ b/drivers/char/hw_random/core.c
@@ -153,6 +153,7 @@ static const struct file_operations rng_chrdev_ops = {
 static struct miscdevice rng_miscdev = {
 	.minor		= RNG_MISCDEV_MINOR,
 	.name		= RNG_MODULE_NAME,
+	.devnode	= "hwrng",
 	.fops		= &rng_chrdev_ops,
 };
 
diff --git a/drivers/char/misc.c b/drivers/char/misc.c
index a5e0db9d7662..62c99fa59e2b 100644
--- a/drivers/char/misc.c
+++ b/drivers/char/misc.c
@@ -168,7 +168,6 @@ static const struct file_operations misc_fops = {
 	.open		= misc_open,
 };
 
-
 /**
  *	misc_register	-	register a miscellaneous device
  *	@misc: device structure
@@ -217,8 +216,8 @@ int misc_register(struct miscdevice * misc)
 		misc_minors[misc->minor >> 3] |= 1 << (misc->minor & 7);
 	dev = MKDEV(MISC_MAJOR, misc->minor);
 
-	misc->this_device = device_create(misc_class, misc->parent, dev, NULL,
-					  "%s", misc->name);
+	misc->this_device = device_create(misc_class, misc->parent, dev,
+					  misc, "%s", misc->name);
 	if (IS_ERR(misc->this_device)) {
 		err = PTR_ERR(misc->this_device);
 		goto out;
@@ -264,6 +263,15 @@ int misc_deregister(struct miscdevice *misc)
 EXPORT_SYMBOL(misc_register);
 EXPORT_SYMBOL(misc_deregister);
 
+static char *misc_nodename(struct device *dev)
+{
+	struct miscdevice *c = dev_get_drvdata(dev);
+
+	if (c->devnode)
+		return kstrdup(c->devnode, GFP_KERNEL);
+	return NULL;
+}
+
 static int __init misc_init(void)
 {
 	int err;
@@ -279,6 +287,7 @@ static int __init misc_init(void)
 	err = -EIO;
 	if (register_chrdev(MISC_MAJOR,"misc",&misc_fops))
 		goto fail_printk;
+	misc_class->nodename = misc_nodename;
 	return 0;
 
 fail_printk:
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 823ceba6efa8..1128d3fba797 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1513,6 +1513,7 @@ static const struct file_operations _ctl_fops = {
 static struct miscdevice _dm_misc = {
 	.minor 		= MISC_DYNAMIC_MINOR,
 	.name  		= DM_NAME,
+	.devnode	= "mapper/control",
 	.fops  		= &_ctl_fops
 };
 
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 811d3517fce0..11a0ba47b677 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1366,6 +1366,7 @@ static const struct file_operations tun_fops = {
 static struct miscdevice tun_miscdev = {
 	.minor = TUN_MINOR,
 	.name = "tun",
+	.devnode = "net/tun",
 	.fops = &tun_fops,
 };
 
diff --git a/include/linux/miscdevice.h b/include/linux/miscdevice.h
index beb6ec99cfef..052117744629 100644
--- a/include/linux/miscdevice.h
+++ b/include/linux/miscdevice.h
@@ -41,6 +41,7 @@ struct miscdevice  {
 	struct list_head list;
 	struct device *parent;
 	struct device *this_device;
+	const char *devnode;
 };
 
 extern int misc_register(struct miscdevice * misc);
-- 
cgit v1.2.3


From dc58266385e51420298275c90a616c34f1473a73 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Tue, 16 Jun 2009 16:18:43 +1000
Subject: md: raid0: Replace hash table lookup by looping over all strip_zones.

The number of strip_zones of a raid0 array is bounded by the number of
drives in the array and is in fact much smaller for typical setups. For
example, any raid0 array containing identical disks will have only
a single strip_zone.

Therefore, the hash tables which are used for quickly finding the
strip_zone that holds a particular sector are of questionable value
and add quite a bit of unnecessary complexity.

This patch replaces the hash table lookup by equivalent code which
simply loops over all strip zones to find the zone that holds the
given sector.

In order to make this loop as fast as possible, the zone->start field
of struct strip_zone has been renamed to zone_end, and it now stores
the beginning of the next zone in sectors. This allows to save one
addition in the loop.

Subsequent cleanup patches will remove the hash table structure.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 40 ++++++++++++++++++++--------------------
 drivers/md/raid0.h |  2 +-
 2 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 925507e7d673..bb245a6d16c8 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -52,7 +52,6 @@ static int raid0_congested(void *data, int bits)
 	return ret;
 }
 
-
 static int create_strip_zones (mddev_t *mddev)
 {
 	int i, c, j;
@@ -158,7 +157,7 @@ static int create_strip_zones (mddev_t *mddev)
 	}
 	zone->nb_dev = cnt;
 	zone->sectors = smallest->sectors * cnt;
-	zone->zone_start = 0;
+	zone->zone_end = zone->sectors;
 
 	current_start = smallest->sectors;
 	curr_zone_start = zone->sectors;
@@ -198,14 +197,13 @@ static int create_strip_zones (mddev_t *mddev)
 		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
 			zone->nb_dev, (unsigned long long)zone->sectors);
 
-		zone->zone_start = curr_zone_start;
+		zone->zone_end = curr_zone_start + zone->sectors;
 		curr_zone_start += zone->sectors;
 
 		current_start = smallest->sectors;
 		printk(KERN_INFO "raid0: current zone start: %llu\n",
 			(unsigned long long)current_start);
 	}
-
 	/* Now find appropriate hash spacing.
 	 * We want a number which causes most hash entries to cover
 	 * at most two strips, but the hash table must be at most
@@ -398,6 +396,19 @@ static int raid0_stop (mddev_t *mddev)
 	return 0;
 }
 
+/* Find the zone which holds a particular offset */
+static struct strip_zone *find_zone(struct raid0_private_data *conf,
+		sector_t sector)
+{
+	int i;
+	struct strip_zone *z = conf->strip_zone;
+
+	for (i = 0; i < conf->nr_strip_zones; i++)
+		if (sector < z[i].zone_end)
+			return z + i;
+	BUG();
+}
+
 static int raid0_make_request (struct request_queue *q, struct bio *bio)
 {
 	mddev_t *mddev = q->queuedata;
@@ -443,22 +454,11 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		bio_pair_release(bp);
 		return 0;
 	}
- 
-
-	{
-		sector_t x = sector >> conf->sector_shift;
-		sector_div(x, (u32)conf->spacing);
-		zone = conf->hash_table[x];
-	}
-
-	while (sector >= zone->zone_start + zone->sectors)
-		zone++;
-
+	zone = find_zone(conf, sector);
 	sect_in_chunk = bio->bi_sector & (chunk_sects - 1);
-
-
 	{
-		sector_t x = (sector - zone->zone_start) >> chunksect_bits;
+		sector_t x = (zone->sectors + sector - zone->zone_end)
+				>> chunksect_bits;
 
 		sector_div(x, zone->nb_dev);
 		chunk = x;
@@ -503,8 +503,8 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
 			seq_printf(seq, "%s/", bdevname(
 				conf->strip_zone[j].dev[k]->bdev,b));
 
-		seq_printf(seq, "] zs=%d ds=%d s=%d\n",
-				conf->strip_zone[j].zone_start,
+		seq_printf(seq, "] ze=%d ds=%d s=%d\n",
+				conf->strip_zone[j].zone_end,
 				conf->strip_zone[j].dev_start,
 				conf->strip_zone[j].sectors);
 	}
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 824b12eb1d4f..556666fec3a5 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -3,7 +3,7 @@
 
 struct strip_zone
 {
-	sector_t zone_start;	/* Zone offset in md_dev (in sectors) */
+	sector_t zone_end;	/* Start of the next zone (in sectors) */
 	sector_t dev_start;	/* Zone offset in real dev (in sectors) */
 	sector_t sectors;	/* Zone size in sectors */
 	int nb_dev;		/* # of devices attached to the zone */
-- 
cgit v1.2.3


From d27a43abd7be0ab4b2337e4587feca8c7340e5f9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 16 Jun 2009 16:46:46 +1000
Subject: md/raid0: two cleanups in create_stripe_zones.

1/ remove current_start.  The same value is available in
     zone->dev_start and storing it separately doesn't gain anything.
2/ rename curr_zone_start to curr_zone_end as we are now more
     focused on the 'end' of each zone.  We end up storing the
     same number though - the old name was a little confusing
     (and what does 'current' mean in this context anyway).

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index bb245a6d16c8..1afdfd120bba 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -55,7 +55,7 @@ static int raid0_congested(void *data, int bits)
 static int create_strip_zones (mddev_t *mddev)
 {
 	int i, c, j;
-	sector_t current_start, curr_zone_start;
+	sector_t curr_zone_end;
 	sector_t min_spacing;
 	raid0_conf_t *conf = mddev_to_conf(mddev);
 	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
@@ -159,8 +159,7 @@ static int create_strip_zones (mddev_t *mddev)
 	zone->sectors = smallest->sectors * cnt;
 	zone->zone_end = zone->sectors;
 
-	current_start = smallest->sectors;
-	curr_zone_start = zone->sectors;
+	curr_zone_end = zone->sectors;
 
 	/* now do the other zones */
 	for (i = 1; i < conf->nr_strip_zones; i++)
@@ -169,7 +168,7 @@ static int create_strip_zones (mddev_t *mddev)
 		zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;
 
 		printk(KERN_INFO "raid0: zone %d\n", i);
-		zone->dev_start = current_start;
+		zone->dev_start = smallest->sectors;
 		smallest = NULL;
 		c = 0;
 
@@ -178,7 +177,7 @@ static int create_strip_zones (mddev_t *mddev)
 			rdev = conf->strip_zone[0].dev[j];
 			printk(KERN_INFO "raid0: checking %s ...",
 				bdevname(rdev->bdev, b));
-			if (rdev->sectors <= current_start) {
+			if (rdev->sectors <= zone->dev_start) {
 				printk(KERN_INFO " nope.\n");
 				continue;
 			}
@@ -193,16 +192,15 @@ static int create_strip_zones (mddev_t *mddev)
 		}
 
 		zone->nb_dev = c;
-		zone->sectors = (smallest->sectors - current_start) * c;
+		zone->sectors = (smallest->sectors - zone->dev_start) * c;
 		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
 			zone->nb_dev, (unsigned long long)zone->sectors);
 
-		zone->zone_end = curr_zone_start + zone->sectors;
-		curr_zone_start += zone->sectors;
+		curr_zone_end += zone->sectors;
+		zone->zone_end = curr_zone_end;
 
-		current_start = smallest->sectors;
 		printk(KERN_INFO "raid0: current zone start: %llu\n",
-			(unsigned long long)current_start);
+			(unsigned long long)smallest->sectors);
 	}
 	/* Now find appropriate hash spacing.
 	 * We want a number which causes most hash entries to cover
@@ -212,8 +210,8 @@ static int create_strip_zones (mddev_t *mddev)
 	 * strip though as it's size has no bearing on the efficacy of the hash
 	 * table.
 	 */
-	conf->spacing = curr_zone_start;
-	min_spacing = curr_zone_start;
+	conf->spacing = curr_zone_end;
+	min_spacing = curr_zone_end;
 	sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
 	for (i=0; i < conf->nr_strip_zones-1; i++) {
 		sector_t s = 0;
-- 
cgit v1.2.3


From 09770e0b6ee649313611a2d6a9b44f456072dbd6 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Tue, 16 Jun 2009 16:46:48 +1000
Subject: md: raid0: Remove hash table.

The raid0 hash table has become unused due to the changes in the
previous patch. This patch removes the hash table allocation and
setup code and kills the hash_table field of struct raid0_private_data.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 12 ------------
 drivers/md/raid0.h |  1 -
 2 files changed, 13 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 1afdfd120bba..d4c9c5d5d7f5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -326,22 +326,14 @@ static int raid0_run (mddev_t *mddev)
 		nb_zone = s + round;
 	}
 	printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone);
-
-	printk(KERN_INFO "raid0 : Allocating %zu bytes for hash.\n",
-				nb_zone*sizeof(struct strip_zone*));
-	conf->hash_table = kmalloc (sizeof (struct strip_zone *)*nb_zone, GFP_KERNEL);
-	if (!conf->hash_table)
-		goto out_free_conf;
 	sectors = conf->strip_zone[cur].sectors;
 
-	conf->hash_table[0] = conf->strip_zone + cur;
 	for (i=1; i< nb_zone; i++) {
 		while (sectors <= conf->spacing) {
 			cur++;
 			sectors += conf->strip_zone[cur].sectors;
 		}
 		sectors -= conf->spacing;
-		conf->hash_table[i] = conf->strip_zone + cur;
 	}
 	if (conf->sector_shift) {
 		conf->spacing >>= conf->sector_shift;
@@ -384,8 +376,6 @@ static int raid0_stop (mddev_t *mddev)
 	raid0_conf_t *conf = mddev_to_conf(mddev);
 
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
-	kfree(conf->hash_table);
-	conf->hash_table = NULL;
 	kfree(conf->strip_zone);
 	conf->strip_zone = NULL;
 	kfree(conf);
@@ -494,8 +484,6 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
 	h = 0;
 	for (j = 0; j < conf->nr_strip_zones; j++) {
 		seq_printf(seq, "      z%d", j);
-		if (conf->hash_table[h] == conf->strip_zone+j)
-			seq_printf(seq, "(h%d)", h++);
 		seq_printf(seq, "=[");
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
 			seq_printf(seq, "%s/", bdevname(
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 556666fec3a5..a14630a25aa4 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -12,7 +12,6 @@ struct strip_zone
 
 struct raid0_private_data
 {
-	struct strip_zone **hash_table; /* Table of indexes into strip_zone */
 	struct strip_zone *strip_zone;
 	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
-- 
cgit v1.2.3


From 8f79cfcdb65472f1504ade2f53e5f2bfdaeb95da Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Tue, 16 Jun 2009 16:47:10 +1000
Subject: md: raid0: Remove hash spacing and sector shift.

The "sector_shift" and "spacing" fields of struct raid0_private_data
were only used for the hash table lookups. So the removal of the
hash table allows get rid of these fields as well which simplifies
create_strip_zones() and raid0_run() quite a bit.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 63 +-----------------------------------------------------
 drivers/md/raid0.h |  3 ---
 2 files changed, 1 insertion(+), 65 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d4c9c5d5d7f5..edffc4940b49 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -56,7 +56,6 @@ static int create_strip_zones (mddev_t *mddev)
 {
 	int i, c, j;
 	sector_t curr_zone_end;
-	sector_t min_spacing;
 	raid0_conf_t *conf = mddev_to_conf(mddev);
 	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
 	struct strip_zone *zone;
@@ -202,28 +201,7 @@ static int create_strip_zones (mddev_t *mddev)
 		printk(KERN_INFO "raid0: current zone start: %llu\n",
 			(unsigned long long)smallest->sectors);
 	}
-	/* Now find appropriate hash spacing.
-	 * We want a number which causes most hash entries to cover
-	 * at most two strips, but the hash table must be at most
-	 * 1 PAGE.  We choose the smallest strip, or contiguous collection
-	 * of strips, that has big enough size.  We never consider the last
-	 * strip though as it's size has no bearing on the efficacy of the hash
-	 * table.
-	 */
-	conf->spacing = curr_zone_end;
-	min_spacing = curr_zone_end;
-	sector_div(min_spacing, PAGE_SIZE/sizeof(struct strip_zone*));
-	for (i=0; i < conf->nr_strip_zones-1; i++) {
-		sector_t s = 0;
-		for (j = i; j < conf->nr_strip_zones - 1 &&
-				s < min_spacing; j++)
-			s += conf->strip_zone[j].sectors;
-		if (s >= min_spacing && s < conf->spacing)
-			conf->spacing = s;
-	}
-
 	mddev->queue->unplug_fn = raid0_unplug;
-
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
@@ -273,10 +251,8 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	return array_sectors;
 }
 
-static int raid0_run (mddev_t *mddev)
+static int raid0_run(mddev_t *mddev)
 {
-	unsigned  cur=0, i=0, nb_zone;
-	s64 sectors;
 	raid0_conf_t *conf;
 
 	if (mddev->chunk_size == 0) {
@@ -306,43 +282,6 @@ static int raid0_run (mddev_t *mddev)
 
 	printk(KERN_INFO "raid0 : md_size is %llu sectors.\n",
 		(unsigned long long)mddev->array_sectors);
-	printk(KERN_INFO "raid0 : conf->spacing is %llu sectors.\n",
-		(unsigned long long)conf->spacing);
-	{
-		sector_t s = raid0_size(mddev, 0, 0);
-		sector_t space = conf->spacing;
-		int round;
-		conf->sector_shift = 0;
-		if (sizeof(sector_t) > sizeof(u32)) {
-			/*shift down space and s so that sector_div will work */
-			while (space > (sector_t) (~(u32)0)) {
-				s >>= 1;
-				space >>= 1;
-				s += 1; /* force round-up */
-				conf->sector_shift++;
-			}
-		}
-		round = sector_div(s, (u32)space) ? 1 : 0;
-		nb_zone = s + round;
-	}
-	printk(KERN_INFO "raid0 : nb_zone is %d.\n", nb_zone);
-	sectors = conf->strip_zone[cur].sectors;
-
-	for (i=1; i< nb_zone; i++) {
-		while (sectors <= conf->spacing) {
-			cur++;
-			sectors += conf->strip_zone[cur].sectors;
-		}
-		sectors -= conf->spacing;
-	}
-	if (conf->sector_shift) {
-		conf->spacing >>= conf->sector_shift;
-		/* round spacing up so when we divide by it, we
-		 * err on the side of too-low, which is safest
-		 */
-		conf->spacing++;
-	}
-
 	/* calculate the max read-ahead size.
 	 * For read-ahead of large files to be effective, we need to
 	 * readahead at least twice a whole stripe. i.e. number of devices
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index a14630a25aa4..dbcf1da916b7 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -15,9 +15,6 @@ struct raid0_private_data
 	struct strip_zone *strip_zone;
 	mdk_rdev_t **devlist; /* lists of rdevs, pointed to by strip_zone->dev */
 	int nr_strip_zones;
-
-	sector_t spacing;
-	int sector_shift; /* shift this before divide by spacing */
 };
 
 typedef struct raid0_private_data raid0_conf_t;
-- 
cgit v1.2.3


From 5568a6035d9fca2cd8f1ef7005e215eae4e65fab Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Tue, 16 Jun 2009 16:47:21 +1000
Subject: md: raid0: Make raid0_run() return a proper error code.

Currently raid0_run() always returns -ENOMEM on errors. This is
incorrect as running the array might fail for other reasons, for
example because not all component devices were available.

This patch changes create_strip_zones() so that it returns a proper
error code (either -ENOMEM or -EINVAL) rather than 1 on errors and
makes raid0_run(), its single caller, return that value instead
of -ENOMEM.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index edffc4940b49..e5648b660e75 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -105,12 +105,12 @@ static int create_strip_zones (mddev_t *mddev)
 	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones, GFP_KERNEL);
 	if (!conf->strip_zone)
-		return 1;
+		return -ENOMEM;
 	conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
 				conf->nr_strip_zones*mddev->raid_disks,
 				GFP_KERNEL);
 	if (!conf->devlist)
-		return 1;
+		return -ENOMEM;
 
 	/* The first zone must contain all devices, so here we check that
 	 * there is a proper alignment of slots to devices and find them all
@@ -207,8 +207,8 @@ static int create_strip_zones (mddev_t *mddev)
 
 	printk(KERN_INFO "raid0: done.\n");
 	return 0;
- abort:
-	return 1;
+abort:
+	return -EINVAL;
 }
 
 /**
@@ -254,6 +254,7 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 static int raid0_run(mddev_t *mddev)
 {
 	raid0_conf_t *conf;
+	int ret;
 
 	if (mddev->chunk_size == 0) {
 		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
@@ -269,12 +270,13 @@ static int raid0_run(mddev_t *mddev)
 
 	conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
 	if (!conf)
-		goto out;
+		return -ENOMEM;
 	mddev->private = (void *)conf;
  
 	conf->strip_zone = NULL;
 	conf->devlist = NULL;
-	if (create_strip_zones (mddev)) 
+	ret = create_strip_zones(mddev);
+	if (ret < 0)
 		goto out_free_conf;
 
 	/* calculate array device size */
@@ -306,8 +308,7 @@ out_free_conf:
 	kfree(conf->devlist);
 	kfree(conf);
 	mddev->private = NULL;
-out:
-	return -ENOMEM;
+	return ret;
 }
 
 static int raid0_stop (mddev_t *mddev)
-- 
cgit v1.2.3


From ed7b00380d957ec770b5e90380d012c6062c13cc Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Tue, 16 Jun 2009 16:47:36 +1000
Subject: md: raid0: Allocate all buffers for the raid0 configuration in one
 function.

Currently the raid0 configuration is allocated in raid0_run() while
the buffers for the strip_zone and the dev_list arrays are allocated
in create_strip_zones(). On errors, all three buffers are freed
in raid0_run().

It's easier and more readable to do the allocation and cleanup within
a single function. So move that code into create_strip_zones().

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 47 +++++++++++++++++------------------------------
 1 file changed, 17 insertions(+), 30 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e5648b660e75..99cee51734e5 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -52,21 +52,18 @@ static int raid0_congested(void *data, int bits)
 	return ret;
 }
 
-static int create_strip_zones (mddev_t *mddev)
+static int create_strip_zones(mddev_t *mddev)
 {
-	int i, c, j;
+	int i, c, j, err;
 	sector_t curr_zone_end;
-	raid0_conf_t *conf = mddev_to_conf(mddev);
 	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
 	struct strip_zone *zone;
 	int cnt;
 	char b[BDEVNAME_SIZE];
- 
-	/*
-	 * The number of 'same size groups'
-	 */
-	conf->nr_strip_zones = 0;
- 
+	raid0_conf_t *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
+
+	if (!conf)
+		return -ENOMEM;
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
 		printk(KERN_INFO "raid0: looking at %s\n",
 			bdevname(rdev1->bdev,b));
@@ -101,16 +98,16 @@ static int create_strip_zones (mddev_t *mddev)
 		}
 	}
 	printk(KERN_INFO "raid0: FINAL %d zones\n", conf->nr_strip_zones);
-
+	err = -ENOMEM;
 	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones, GFP_KERNEL);
 	if (!conf->strip_zone)
-		return -ENOMEM;
+		goto abort;
 	conf->devlist = kzalloc(sizeof(mdk_rdev_t*)*
 				conf->nr_strip_zones*mddev->raid_disks,
 				GFP_KERNEL);
 	if (!conf->devlist)
-		return -ENOMEM;
+		goto abort;
 
 	/* The first zone must contain all devices, so here we check that
 	 * there is a proper alignment of slots to devices and find them all
@@ -119,6 +116,7 @@ static int create_strip_zones (mddev_t *mddev)
 	cnt = 0;
 	smallest = NULL;
 	zone->dev = conf->devlist;
+	err = -EINVAL;
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
 		int j = rdev1->raid_disk;
 
@@ -206,9 +204,14 @@ static int create_strip_zones (mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
 	printk(KERN_INFO "raid0: done.\n");
+	mddev->private = conf;
 	return 0;
 abort:
-	return -EINVAL;
+	kfree(conf->strip_zone);
+	kfree(conf->devlist);
+	kfree(conf);
+	mddev->private = NULL;
+	return err;
 }
 
 /**
@@ -253,7 +256,6 @@ static sector_t raid0_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 
 static int raid0_run(mddev_t *mddev)
 {
-	raid0_conf_t *conf;
 	int ret;
 
 	if (mddev->chunk_size == 0) {
@@ -268,16 +270,9 @@ static int raid0_run(mddev_t *mddev)
 	blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
 	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 
-	conf = kmalloc(sizeof (raid0_conf_t), GFP_KERNEL);
-	if (!conf)
-		return -ENOMEM;
-	mddev->private = (void *)conf;
- 
-	conf->strip_zone = NULL;
-	conf->devlist = NULL;
 	ret = create_strip_zones(mddev);
 	if (ret < 0)
-		goto out_free_conf;
+		return ret;
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));
@@ -299,16 +294,8 @@ static int raid0_run(mddev_t *mddev)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
 	}
 
-
 	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
 	return 0;
-
-out_free_conf:
-	kfree(conf->strip_zone);
-	kfree(conf->devlist);
-	kfree(conf);
-	mddev->private = NULL;
-	return ret;
 }
 
 static int raid0_stop (mddev_t *mddev)
-- 
cgit v1.2.3


From fb5ab4b5d6e16fd5006c9f800d0116f3547cb760 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Tue, 16 Jun 2009 16:48:19 +1000
Subject: md: raid0: Fix a memory leak when stopping a raid0 array.

raid0_stop() removes all references to the raid0 configuration but
misses to free the ->devlist buffer.

This patch closes this leak, removes a pointless initialization and
fixes a coding style issue in raid0_stop().

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 99cee51734e5..0d62ad6df212 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -298,16 +298,15 @@ static int raid0_run(mddev_t *mddev)
 	return 0;
 }
 
-static int raid0_stop (mddev_t *mddev)
+static int raid0_stop(mddev_t *mddev)
 {
 	raid0_conf_t *conf = mddev_to_conf(mddev);
 
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf->strip_zone);
-	conf->strip_zone = NULL;
+	kfree(conf->devlist);
 	kfree(conf);
 	mddev->private = NULL;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 49f357a22b3fa3eeac042dfa0a6cae920c174e48 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 16 Jun 2009 16:50:35 +1000
Subject: md: raid0: remove ->sectors from the strip_zone structure.

storing ->sectors is redundant as is can be computed from the
difference  z->zone_end - (z-1)->zone_end

The one place where it is used, it is just as efficient to use
a zone_end value instead.

And removing it makes strip_zone smaller, so they array of these that
is searched on every request has a better chance to say in cache.

So discard the field and get the value from elsewhere.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 33 +++++++++++++++++++--------------
 drivers/md/raid0.h |  1 -
 2 files changed, 19 insertions(+), 15 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 0d62ad6df212..07ef936afc71 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -55,7 +55,7 @@ static int raid0_congested(void *data, int bits)
 static int create_strip_zones(mddev_t *mddev)
 {
 	int i, c, j, err;
-	sector_t curr_zone_end;
+	sector_t curr_zone_end, sectors;
 	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
 	struct strip_zone *zone;
 	int cnt;
@@ -153,10 +153,9 @@ static int create_strip_zones(mddev_t *mddev)
 		goto abort;
 	}
 	zone->nb_dev = cnt;
-	zone->sectors = smallest->sectors * cnt;
-	zone->zone_end = zone->sectors;
+	zone->zone_end = smallest->sectors * cnt;
 
-	curr_zone_end = zone->sectors;
+	curr_zone_end = zone->zone_end;
 
 	/* now do the other zones */
 	for (i = 1; i < conf->nr_strip_zones; i++)
@@ -189,11 +188,11 @@ static int create_strip_zones(mddev_t *mddev)
 		}
 
 		zone->nb_dev = c;
-		zone->sectors = (smallest->sectors - zone->dev_start) * c;
+		sectors = (smallest->sectors - zone->dev_start) * c;
 		printk(KERN_INFO "raid0: zone->nb_dev: %d, sectors: %llu\n",
-			zone->nb_dev, (unsigned long long)zone->sectors);
+			zone->nb_dev, (unsigned long long)sectors);
 
-		curr_zone_end += zone->sectors;
+		curr_zone_end += sectors;
 		zone->zone_end = curr_zone_end;
 
 		printk(KERN_INFO "raid0: current zone start: %llu\n",
@@ -310,16 +309,22 @@ static int raid0_stop(mddev_t *mddev)
 	return 0;
 }
 
-/* Find the zone which holds a particular offset */
+/* Find the zone which holds a particular offset
+ * Update *sectorp to be an offset in that zone
+ */
 static struct strip_zone *find_zone(struct raid0_private_data *conf,
-		sector_t sector)
+				    sector_t *sectorp)
 {
 	int i;
 	struct strip_zone *z = conf->strip_zone;
+	sector_t sector = *sectorp;
 
 	for (i = 0; i < conf->nr_strip_zones; i++)
-		if (sector < z[i].zone_end)
+		if (sector < z[i].zone_end) {
+			if (i)
+				*sectorp = sector - z[i-1].zone_end;
 			return z + i;
+		}
 	BUG();
 }
 
@@ -331,7 +336,7 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
 	sector_t chunk;
-	sector_t sector, rsect;
+	sector_t sector, rsect, sector_offset;
 	const int rw = bio_data_dir(bio);
 	int cpu;
 
@@ -368,11 +373,11 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		bio_pair_release(bp);
 		return 0;
 	}
-	zone = find_zone(conf, sector);
+	sector_offset = sector;
+	zone = find_zone(conf, &sector_offset);
 	sect_in_chunk = bio->bi_sector & (chunk_sects - 1);
 	{
-		sector_t x = (zone->sectors + sector - zone->zone_end)
-				>> chunksect_bits;
+		sector_t x = sector_offset >> chunksect_bits;
 
 		sector_div(x, zone->nb_dev);
 		chunk = x;
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index dbcf1da916b7..124ba34c8eed 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -5,7 +5,6 @@ struct strip_zone
 {
 	sector_t zone_end;	/* Start of the next zone (in sectors) */
 	sector_t dev_start;	/* Zone offset in real dev (in sectors) */
-	sector_t sectors;	/* Zone size in sectors */
 	int nb_dev;		/* # of devices attached to the zone */
 	mdk_rdev_t **dev;	/* Devices attached to the zone */
 };
-- 
cgit v1.2.3


From b414579f4573b6dc8583e31b01dcffd13f49fd62 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 16 Jun 2009 16:50:52 +1000
Subject: md: raid0: remove ->dev pointer from strip_zone structure

If we treat conf->devlist more like a 2 dimensional array,
we can get the devlist for a particular zone simply by indexing
that array, so we don't need to store the pointers to subarrays
in strip_zone.  This makes strip_zone smaller and so (hopefully)
searches faster.

Signed-of-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 21 +++++++++++----------
 drivers/md/raid0.h |  1 -
 2 files changed, 11 insertions(+), 11 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 07ef936afc71..af0df78223b1 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -27,7 +27,7 @@ static void raid0_unplug(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
 	raid0_conf_t *conf = mddev_to_conf(mddev);
-	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	mdk_rdev_t **devlist = conf->devlist;
 	int i;
 
 	for (i=0; i<mddev->raid_disks; i++) {
@@ -41,7 +41,7 @@ static int raid0_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
 	raid0_conf_t *conf = mddev_to_conf(mddev);
-	mdk_rdev_t **devlist = conf->strip_zone[0].dev;
+	mdk_rdev_t **devlist = conf->devlist;
 	int i, ret = 0;
 
 	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
@@ -56,7 +56,7 @@ static int create_strip_zones(mddev_t *mddev)
 {
 	int i, c, j, err;
 	sector_t curr_zone_end, sectors;
-	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev;
+	mdk_rdev_t *smallest, *rdev1, *rdev2, *rdev, **dev;
 	struct strip_zone *zone;
 	int cnt;
 	char b[BDEVNAME_SIZE];
@@ -115,7 +115,7 @@ static int create_strip_zones(mddev_t *mddev)
 	zone = &conf->strip_zone[0];
 	cnt = 0;
 	smallest = NULL;
-	zone->dev = conf->devlist;
+	dev = conf->devlist;
 	err = -EINVAL;
 	list_for_each_entry(rdev1, &mddev->disks, same_set) {
 		int j = rdev1->raid_disk;
@@ -125,12 +125,12 @@ static int create_strip_zones(mddev_t *mddev)
 				"aborting!\n", j);
 			goto abort;
 		}
-		if (zone->dev[j]) {
+		if (dev[j]) {
 			printk(KERN_ERR "raid0: multiple devices for %d - "
 				"aborting!\n", j);
 			goto abort;
 		}
-		zone->dev[j] = rdev1;
+		dev[j] = rdev1;
 
 		blk_queue_stack_limits(mddev->queue,
 				       rdev1->bdev->bd_disk->queue);
@@ -161,7 +161,7 @@ static int create_strip_zones(mddev_t *mddev)
 	for (i = 1; i < conf->nr_strip_zones; i++)
 	{
 		zone = conf->strip_zone + i;
-		zone->dev = conf->strip_zone[i-1].dev + mddev->raid_disks;
+		dev = conf->devlist + i * mddev->raid_disks;
 
 		printk(KERN_INFO "raid0: zone %d\n", i);
 		zone->dev_start = smallest->sectors;
@@ -170,7 +170,7 @@ static int create_strip_zones(mddev_t *mddev)
 
 		for (j=0; j<cnt; j++) {
 			char b[BDEVNAME_SIZE];
-			rdev = conf->strip_zone[0].dev[j];
+			rdev = conf->devlist[j];
 			printk(KERN_INFO "raid0: checking %s ...",
 				bdevname(rdev->bdev, b));
 			if (rdev->sectors <= zone->dev_start) {
@@ -178,7 +178,7 @@ static int create_strip_zones(mddev_t *mddev)
 				continue;
 			}
 			printk(KERN_INFO " contained as device %d\n", c);
-			zone->dev[c] = rdev;
+			dev[c] = rdev;
 			c++;
 			if (!smallest || rdev->sectors < smallest->sectors) {
 				smallest = rdev;
@@ -383,7 +383,8 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		chunk = x;
 
 		x = sector >> chunksect_bits;
-		tmp_dev = zone->dev[sector_div(x, zone->nb_dev)];
+		tmp_dev = conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks
+					+ sector_div(x, zone->nb_dev)];
 	}
 	rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk;
  
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 124ba34c8eed..7b3605e570c0 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -6,7 +6,6 @@ struct strip_zone
 	sector_t zone_end;	/* Start of the next zone (in sectors) */
 	sector_t dev_start;	/* Zone offset in real dev (in sectors) */
 	int nb_dev;		/* # of devices attached to the zone */
-	mdk_rdev_t **dev;	/* Devices attached to the zone */
 };
 
 struct raid0_private_data
-- 
cgit v1.2.3


From a6b3deafe0c50e3e873e8ed5cc8abfcb25c05eff Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 16 Jun 2009 16:54:07 +1000
Subject: md: raid0:  remove setting of segment boundary.

This setting doesn't seem to make sense (half the chunk size??) and
shouldn't be needed.
The segment boundary exported by raid0 should simply be the minimum
of the segment boundary of all component devices.  And we already
get that right.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index af0df78223b1..e2e9c1833336 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -261,12 +261,7 @@ static int raid0_run(mddev_t *mddev)
 		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
 		return -EINVAL;
 	}
-	printk(KERN_INFO "%s: setting max_sectors to %d, segment boundary to %d\n",
-	       mdname(mddev),
-	       mddev->chunk_size >> 9,
-	       (mddev->chunk_size>>1)-1);
 	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
-	blk_queue_segment_boundary(mddev->queue, (mddev->chunk_size>>1) - 1);
 	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 
 	ret = create_strip_zones(mddev);
-- 
cgit v1.2.3


From 070ec55d07157a3041f92654135c3c6e2eaaf901 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Tue, 16 Jun 2009 16:54:21 +1000
Subject: md: remove mddev_to_conf "helper" macro

Having a macro just to cast a void* isn't really helpful.
I would must rather see that we are simply de-referencing ->private,
than have to know what the macro does.

So open code the macro everywhere and remove the pointless cast.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c    | 12 ++++++------
 drivers/md/linear.h    |  2 --
 drivers/md/multipath.c | 20 ++++++++++----------
 drivers/md/multipath.h |  6 ------
 drivers/md/raid0.c     | 10 +++++-----
 drivers/md/raid0.h     |  2 --
 drivers/md/raid1.c     | 38 +++++++++++++++++++-------------------
 drivers/md/raid1.h     |  6 ------
 drivers/md/raid10.c    | 42 +++++++++++++++++++++---------------------
 drivers/md/raid10.h    |  6 ------
 drivers/md/raid5.c     | 36 ++++++++++++++++++------------------
 drivers/md/raid5.h     |  2 --
 12 files changed, 79 insertions(+), 103 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 64f1f3e046e0..31f8ec7131bd 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -28,7 +28,7 @@
 static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 {
 	dev_info_t *hash;
-	linear_conf_t *conf = mddev_to_conf(mddev);
+	linear_conf_t *conf = mddev->private;
 	sector_t idx = sector >> conf->sector_shift;
 
 	/*
@@ -79,7 +79,7 @@ static int linear_mergeable_bvec(struct request_queue *q,
 static void linear_unplug(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
-	linear_conf_t *conf = mddev_to_conf(mddev);
+	linear_conf_t *conf = mddev->private;
 	int i;
 
 	for (i=0; i < mddev->raid_disks; i++) {
@@ -91,7 +91,7 @@ static void linear_unplug(struct request_queue *q)
 static int linear_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	linear_conf_t *conf = mddev_to_conf(mddev);
+	linear_conf_t *conf = mddev->private;
 	int i, ret = 0;
 
 	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
@@ -103,7 +103,7 @@ static int linear_congested(void *data, int bits)
 
 static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 {
-	linear_conf_t *conf = mddev_to_conf(mddev);
+	linear_conf_t *conf = mddev->private;
 
 	WARN_ONCE(sectors || raid_disks,
 		  "%s does not support generic reshape\n", __func__);
@@ -294,7 +294,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!newconf)
 		return -ENOMEM;
 
-	newconf->prev = mddev_to_conf(mddev);
+	newconf->prev = mddev->private;
 	mddev->private = newconf;
 	mddev->raid_disks++;
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
@@ -304,7 +304,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 
 static int linear_stop (mddev_t *mddev)
 {
-	linear_conf_t *conf = mddev_to_conf(mddev);
+	linear_conf_t *conf = mddev->private;
   
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	do {
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index bf8179587f95..76078f1cded0 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -24,6 +24,4 @@ struct linear_private_data
 
 typedef struct linear_private_data linear_conf_t;
 
-#define mddev_to_conf(mddev) ((linear_conf_t *) mddev->private)
-
 #endif
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index 4ee31aa13c40..c1ca63f278a9 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -58,7 +58,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
 {
 	unsigned long flags;
 	mddev_t *mddev = mp_bh->mddev;
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&mp_bh->retry_list, &conf->retry_list);
@@ -75,7 +75,7 @@ static void multipath_reschedule_retry (struct multipath_bh *mp_bh)
 static void multipath_end_bh_io (struct multipath_bh *mp_bh, int err)
 {
 	struct bio *bio = mp_bh->master_bio;
-	multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
+	multipath_conf_t *conf = mp_bh->mddev->private;
 
 	bio_endio(bio, err);
 	mempool_free(mp_bh, conf->pool);
@@ -85,7 +85,7 @@ static void multipath_end_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct multipath_bh * mp_bh = (struct multipath_bh *)(bio->bi_private);
-	multipath_conf_t *conf = mddev_to_conf(mp_bh->mddev);
+	multipath_conf_t *conf = mp_bh->mddev->private;
 	mdk_rdev_t *rdev = conf->multipaths[mp_bh->path].rdev;
 
 	if (uptodate)
@@ -107,7 +107,7 @@ static void multipath_end_request(struct bio *bio, int error)
 
 static void unplug_slaves(mddev_t *mddev)
 {
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	int i;
 
 	rcu_read_lock();
@@ -138,7 +138,7 @@ static void multipath_unplug(struct request_queue *q)
 static int multipath_make_request (struct request_queue *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	struct multipath_bh * mp_bh;
 	struct multipath_info *multipath;
 	const int rw = bio_data_dir(bio);
@@ -180,7 +180,7 @@ static int multipath_make_request (struct request_queue *q, struct bio * bio)
 
 static void multipath_status (struct seq_file *seq, mddev_t *mddev)
 {
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	int i;
 	
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
@@ -195,7 +195,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
 static int multipath_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	int i, ret = 0;
 
 	rcu_read_lock();
@@ -220,7 +220,7 @@ static int multipath_congested(void *data, int bits)
  */
 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 {
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 
 	if (conf->working_disks <= 1) {
 		/*
@@ -367,7 +367,7 @@ static void multipathd (mddev_t *mddev)
 	struct multipath_bh *mp_bh;
 	struct bio *bio;
 	unsigned long flags;
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 
 	md_check_recovery(mddev);
@@ -531,7 +531,7 @@ out:
 
 static int multipath_stop (mddev_t *mddev)
 {
-	multipath_conf_t *conf = mddev_to_conf(mddev);
+	multipath_conf_t *conf = mddev->private;
 
 	md_unregister_thread(mddev->thread);
 	mddev->thread = NULL;
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index 6fa70b400cda..d1c2a8d78395 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -18,12 +18,6 @@ struct multipath_private_data {
 
 typedef struct multipath_private_data multipath_conf_t;
 
-/*
- * this is the only point in the RAID code where we violate
- * C type safety. mddev->private is an 'opaque' pointer.
- */
-#define mddev_to_conf(mddev) ((multipath_conf_t *) mddev->private)
-
 /*
  * this is our 'private' 'collective' MULTIPATH buffer head.
  * it contains information about what kind of IO operations were started
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index e2e9c1833336..77764dad1bcb 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -26,7 +26,7 @@
 static void raid0_unplug(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
-	raid0_conf_t *conf = mddev_to_conf(mddev);
+	raid0_conf_t *conf = mddev->private;
 	mdk_rdev_t **devlist = conf->devlist;
 	int i;
 
@@ -40,7 +40,7 @@ static void raid0_unplug(struct request_queue *q)
 static int raid0_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	raid0_conf_t *conf = mddev_to_conf(mddev);
+	raid0_conf_t *conf = mddev->private;
 	mdk_rdev_t **devlist = conf->devlist;
 	int i, ret = 0;
 
@@ -294,7 +294,7 @@ static int raid0_run(mddev_t *mddev)
 
 static int raid0_stop(mddev_t *mddev)
 {
-	raid0_conf_t *conf = mddev_to_conf(mddev);
+	raid0_conf_t *conf = mddev->private;
 
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	kfree(conf->strip_zone);
@@ -327,7 +327,7 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 {
 	mddev_t *mddev = q->queuedata;
 	unsigned int sect_in_chunk, chunksect_bits, chunk_sects;
-	raid0_conf_t *conf = mddev_to_conf(mddev);
+	raid0_conf_t *conf = mddev->private;
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
 	sector_t chunk;
@@ -406,7 +406,7 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
 #ifdef MD_DEBUG
 	int j, k, h;
 	char b[BDEVNAME_SIZE];
-	raid0_conf_t *conf = mddev_to_conf(mddev);
+	raid0_conf_t *conf = mddev->private;
 
 	h = 0;
 	for (j = 0; j < conf->nr_strip_zones; j++) {
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 7b3605e570c0..91f8e876ee64 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -17,6 +17,4 @@ struct raid0_private_data
 
 typedef struct raid0_private_data raid0_conf_t;
 
-#define mddev_to_conf(mddev) ((raid0_conf_t *) mddev->private)
-
 #endif
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index e23758b4a34e..5ea5bca53a5e 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -182,7 +182,7 @@ static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
 
 static void free_r1bio(r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 
 	/*
 	 * Wake up any possible resync thread that waits for the device
@@ -196,7 +196,7 @@ static void free_r1bio(r1bio_t *r1_bio)
 
 static void put_buf(r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 	int i;
 
 	for (i=0; i<conf->raid_disks; i++) {
@@ -214,7 +214,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
 {
 	unsigned long flags;
 	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r1_bio->retry_list, &conf->retry_list);
@@ -253,7 +253,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
  */
 static inline void update_head_pos(int disk, r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 
 	conf->mirrors[disk].head_position =
 		r1_bio->sector + (r1_bio->sectors);
@@ -264,7 +264,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
 	int mirror;
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 
 	mirror = r1_bio->read_disk;
 	/*
@@ -309,7 +309,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
 	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
-	conf_t *conf = mddev_to_conf(r1_bio->mddev);
+	conf_t *conf = r1_bio->mddev->private;
 	struct bio *to_put = NULL;
 
 
@@ -541,7 +541,7 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
 static void unplug_slaves(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	rcu_read_lock();
@@ -573,7 +573,7 @@ static void raid1_unplug(struct request_queue *q)
 static int raid1_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i, ret = 0;
 
 	rcu_read_lock();
@@ -772,7 +772,7 @@ do_sync_io:
 static int make_request(struct request_queue *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
 	r1bio_t *r1_bio;
 	struct bio *read_bio;
@@ -991,7 +991,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 static void status(struct seq_file *seq, mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
@@ -1010,7 +1010,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	char b[BDEVNAME_SIZE];
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1214,7 +1214,7 @@ static void end_sync_write(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
 	mddev_t *mddev = r1_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 	int mirror=0;
 
@@ -1248,7 +1248,7 @@ static void end_sync_write(struct bio *bio, int error)
 
 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 	int disks = conf->raid_disks;
 	struct bio *bio, *wbio;
@@ -1562,7 +1562,7 @@ static void raid1d(mddev_t *mddev)
 	r1bio_t *r1_bio;
 	struct bio *bio;
 	unsigned long flags;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	int unplug=0;
 	mdk_rdev_t *rdev;
@@ -1585,7 +1585,7 @@ static void raid1d(mddev_t *mddev)
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r1_bio->mddev;
-		conf = mddev_to_conf(mddev);
+		conf = mddev->private;
 		if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 			sync_request_write(mddev, r1_bio);
 			unplug = 1;
@@ -1706,7 +1706,7 @@ static int init_resync(conf_t *conf)
 
 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	r1bio_t *r1_bio;
 	struct bio *bio;
 	sector_t max_sector, nr_sectors;
@@ -2087,7 +2087,7 @@ out:
 
 static int stop(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	struct bitmap *bitmap = mddev->bitmap;
 	int behind_wait = 0;
 
@@ -2155,7 +2155,7 @@ static int raid1_reshape(mddev_t *mddev)
 	mempool_t *newpool, *oldpool;
 	struct pool_info *newpoolinfo;
 	mirror_info_t *newmirrors;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int cnt, raid_disks;
 	unsigned long flags;
 	int d, d2, err;
@@ -2252,7 +2252,7 @@ static int raid1_reshape(mddev_t *mddev)
 
 static void raid1_quiesce(mddev_t *mddev, int state)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	switch(state) {
 	case 1:
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 1620eea3d57c..e87b84deff68 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -63,12 +63,6 @@ struct r1_private_data_s {
 
 typedef struct r1_private_data_s conf_t;
 
-/*
- * this is the only point in the RAID code where we violate
- * C type safety. mddev->private is an 'opaque' pointer.
- */
-#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
-
 /*
  * this is our 'private' RAID1 bio.
  *
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 750550c1166f..9a5beb4fd954 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -188,7 +188,7 @@ static void put_all_bios(conf_t *conf, r10bio_t *r10_bio)
 
 static void free_r10bio(r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 	/*
 	 * Wake up any possible resync thread that waits for the device
@@ -202,7 +202,7 @@ static void free_r10bio(r10bio_t *r10_bio)
 
 static void put_buf(r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 	mempool_free(r10_bio, conf->r10buf_pool);
 
@@ -213,7 +213,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
 {
 	unsigned long flags;
 	mddev_t *mddev = r10_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r10_bio->retry_list, &conf->retry_list);
@@ -245,7 +245,7 @@ static void raid_end_bio_io(r10bio_t *r10_bio)
  */
 static inline void update_head_pos(int slot, r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
 		r10_bio->devs[slot].addr + (r10_bio->sectors);
@@ -256,7 +256,7 @@ static void raid10_end_read_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 	int slot, dev;
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 
 	slot = r10_bio->read_slot;
@@ -297,7 +297,7 @@ static void raid10_end_write_request(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 	int slot, dev;
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 
 	for (slot = 0; slot < conf->copies; slot++)
 		if (r10_bio->devs[slot].bio == bio)
@@ -596,7 +596,7 @@ rb_out:
 
 static void unplug_slaves(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	rcu_read_lock();
@@ -628,7 +628,7 @@ static void raid10_unplug(struct request_queue *q)
 static int raid10_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i, ret = 0;
 
 	rcu_read_lock();
@@ -788,7 +788,7 @@ static void unfreeze_array(conf_t *conf)
 static int make_request(struct request_queue *q, struct bio * bio)
 {
 	mddev_t *mddev = q->queuedata;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	mirror_info_t *mirror;
 	r10bio_t *r10_bio;
 	struct bio *read_bio;
@@ -981,7 +981,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
 
 static void status(struct seq_file *seq, mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i;
 
 	if (conf->near_copies < conf->raid_disks)
@@ -1006,7 +1006,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	char b[BDEVNAME_SIZE];
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	/*
 	 * If it is not operational, then we have already marked it as dead
@@ -1215,7 +1215,7 @@ abort:
 static void end_sync_read(struct bio *bio, int error)
 {
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
-	conf_t *conf = mddev_to_conf(r10_bio->mddev);
+	conf_t *conf = r10_bio->mddev->private;
 	int i,d;
 
 	for (i=0; i<conf->copies; i++)
@@ -1253,7 +1253,7 @@ static void end_sync_write(struct bio *bio, int error)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	r10bio_t * r10_bio = (r10bio_t *)(bio->bi_private);
 	mddev_t *mddev = r10_bio->mddev;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i,d;
 
 	for (i = 0; i < conf->copies; i++)
@@ -1300,7 +1300,7 @@ static void end_sync_write(struct bio *bio, int error)
  */
 static void sync_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i, first;
 	struct bio *tbio, *fbio;
 
@@ -1400,7 +1400,7 @@ done:
 
 static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	int i, d;
 	struct bio *bio, *wbio;
 
@@ -1549,7 +1549,7 @@ static void raid10d(mddev_t *mddev)
 	r10bio_t *r10_bio;
 	struct bio *bio;
 	unsigned long flags;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	int unplug=0;
 	mdk_rdev_t *rdev;
@@ -1572,7 +1572,7 @@ static void raid10d(mddev_t *mddev)
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 
 		mddev = r10_bio->mddev;
-		conf = mddev_to_conf(mddev);
+		conf = mddev->private;
 		if (test_bit(R10BIO_IsSync, &r10_bio->state)) {
 			sync_request_write(mddev, r10_bio);
 			unplug = 1;
@@ -1680,7 +1680,7 @@ static int init_resync(conf_t *conf)
 
 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 	r10bio_t *r10_bio;
 	struct bio *biolist = NULL, *bio;
 	sector_t max_sector, nr_sectors;
@@ -2026,7 +2026,7 @@ static sector_t
 raid10_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 {
 	sector_t size;
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	if (!raid_disks)
 		raid_disks = mddev->raid_disks;
@@ -2227,7 +2227,7 @@ out:
 
 static int stop(mddev_t *mddev)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	raise_barrier(conf, 0);
 	lower_barrier(conf);
@@ -2245,7 +2245,7 @@ static int stop(mddev_t *mddev)
 
 static void raid10_quiesce(mddev_t *mddev, int state)
 {
-	conf_t *conf = mddev_to_conf(mddev);
+	conf_t *conf = mddev->private;
 
 	switch(state) {
 	case 1:
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 244dbe507a54..59cd1efb8d30 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -61,12 +61,6 @@ struct r10_private_data_s {
 
 typedef struct r10_private_data_s conf_t;
 
-/*
- * this is the only point in the RAID code where we violate
- * C type safety. mddev->private is an 'opaque' pointer.
- */
-#define mddev_to_conf(mddev) ((conf_t *) mddev->private)
-
 /*
  * this is our 'private' RAID10 bio.
  *
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bef876698232..7fb97c65ad37 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3284,7 +3284,7 @@ static void activate_bit_delay(raid5_conf_t *conf)
 
 static void unplug_slaves(mddev_t *mddev)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	int i;
 
 	rcu_read_lock();
@@ -3308,7 +3308,7 @@ static void unplug_slaves(mddev_t *mddev)
 static void raid5_unplug_device(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	unsigned long flags;
 
 	spin_lock_irqsave(&conf->device_lock, flags);
@@ -3327,7 +3327,7 @@ static void raid5_unplug_device(struct request_queue *q)
 static int raid5_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	/* No difference between reads and writes.  Just check
 	 * how busy the stripe_cache is
@@ -3440,7 +3440,7 @@ static void raid5_align_endio(struct bio *bi, int error)
 	bio_put(bi);
 
 	mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
-	conf = mddev_to_conf(mddev);
+	conf = mddev->private;
 	rdev = (void*)raid_bi->bi_next;
 	raid_bi->bi_next = NULL;
 
@@ -3482,7 +3482,7 @@ static int bio_fits_rdev(struct bio *bi)
 static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
 {
 	mddev_t *mddev = q->queuedata;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	unsigned int dd_idx;
 	struct bio* align_bi;
 	mdk_rdev_t *rdev;
@@ -3599,7 +3599,7 @@ static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
 static int make_request(struct request_queue *q, struct bio * bi)
 {
 	mddev_t *mddev = q->queuedata;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	int dd_idx;
 	sector_t new_sector;
 	sector_t logical_sector, last_sector;
@@ -4129,7 +4129,7 @@ static int  retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
 static void raid5d(mddev_t *mddev)
 {
 	struct stripe_head *sh;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	int handled;
 
 	pr_debug("+++ raid5d active\n");
@@ -4185,7 +4185,7 @@ static void raid5d(mddev_t *mddev)
 static ssize_t
 raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	if (conf)
 		return sprintf(page, "%d\n", conf->max_nr_stripes);
 	else
@@ -4195,7 +4195,7 @@ raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
 static ssize_t
 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	unsigned long new;
 	int err;
 
@@ -4233,7 +4233,7 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
 static ssize_t
 raid5_show_preread_threshold(mddev_t *mddev, char *page)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	if (conf)
 		return sprintf(page, "%d\n", conf->bypass_threshold);
 	else
@@ -4243,7 +4243,7 @@ raid5_show_preread_threshold(mddev_t *mddev, char *page)
 static ssize_t
 raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	unsigned long new;
 	if (len >= PAGE_SIZE)
 		return -EINVAL;
@@ -4267,7 +4267,7 @@ raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
 static ssize_t
 stripe_cache_active_show(mddev_t *mddev, char *page)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	if (conf)
 		return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
 	else
@@ -4291,7 +4291,7 @@ static struct attribute_group raid5_attrs_group = {
 static sector_t
 raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	if (!sectors)
 		sectors = mddev->dev_sectors;
@@ -4845,7 +4845,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 
 static int raid5_check_reshape(mddev_t *mddev)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
@@ -4890,7 +4890,7 @@ static int raid5_check_reshape(mddev_t *mddev)
 
 static int raid5_start_reshape(mddev_t *mddev)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 	mdk_rdev_t *rdev;
 	int spares = 0;
 	int added_devices = 0;
@@ -5022,7 +5022,7 @@ static void end_reshape(raid5_conf_t *conf)
 static void raid5_finish_reshape(mddev_t *mddev)
 {
 	struct block_device *bdev;
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
 
@@ -5061,7 +5061,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
 
 static void raid5_quiesce(mddev_t *mddev, int state)
 {
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	switch(state) {
 	case 2: /* resume for a suspend */
@@ -5157,7 +5157,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 	 * For larger arrays we record the new value - after validation
 	 * to be used by a reshape pass.
 	 */
-	raid5_conf_t *conf = mddev_to_conf(mddev);
+	raid5_conf_t *conf = mddev->private;
 
 	if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
 		return -EINVAL;
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 52ba99954dec..1a25c9e252b4 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -408,8 +408,6 @@ struct raid5_private_data {
 
 typedef struct raid5_private_data raid5_conf_t;
 
-#define mddev_to_conf(mddev) ((raid5_conf_t *) mddev->private)
-
 /*
  * Our supported algorithms
  */
-- 
cgit v1.2.3


From 45d4582f219619e368ea91ea1189085e1c5f1969 Mon Sep 17 00:00:00 2001
From: Sandeep K Sinha <sandeepksinha@gmail.com>
Date: Tue, 16 Jun 2009 16:55:26 +1000
Subject: md: Removal of hash table in linear raid

Get rid of sector_div and hash table for linear raid and replace
with a linear search in which_dev.
The hash table adds a lot of complexity for little if any gain.
Ultimately a binary search will be used which will have smaller
cache foot print, a similar number of memory access, and no
divisions.

Signed-off-by: Sandeep K Sinha <sandeepksinha@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c | 93 ++---------------------------------------------------
 drivers/md/linear.h |  5 ---
 2 files changed, 3 insertions(+), 95 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 31f8ec7131bd..92bcd3dd52cc 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -29,13 +29,8 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 {
 	dev_info_t *hash;
 	linear_conf_t *conf = mddev->private;
-	sector_t idx = sector >> conf->sector_shift;
 
-	/*
-	 * sector_div(a,b) returns the remainer and sets a to a/b
-	 */
-	(void)sector_div(idx, conf->spacing);
-	hash = conf->hash_table[idx];
+	hash = conf->disks;
 
 	while (sector >= hash->num_sectors + hash->start_sector)
 		hash++;
@@ -114,11 +109,8 @@ static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 {
 	linear_conf_t *conf;
-	dev_info_t **table;
 	mdk_rdev_t *rdev;
-	int i, nb_zone, cnt;
-	sector_t min_sectors;
-	sector_t curr_sector;
+	int i, cnt;
 
 	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(dev_info_t),
 			GFP_KERNEL);
@@ -159,63 +151,8 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		goto out;
 	}
 
-	min_sectors = conf->array_sectors;
-	sector_div(min_sectors, PAGE_SIZE/sizeof(struct dev_info *));
-	if (min_sectors == 0)
-		min_sectors = 1;
-
-	/* min_sectors is the minimum spacing that will fit the hash
-	 * table in one PAGE.  This may be much smaller than needed.
-	 * We find the smallest non-terminal set of consecutive devices
-	 * that is larger than min_sectors and use the size of that as
-	 * the actual spacing
-	 */
-	conf->spacing = conf->array_sectors;
-	for (i=0; i < cnt-1 ; i++) {
-		sector_t tmp = 0;
-		int j;
-		for (j = i; j < cnt - 1 && tmp < min_sectors; j++)
-			tmp += conf->disks[j].num_sectors;
-		if (tmp >= min_sectors && tmp < conf->spacing)
-			conf->spacing = tmp;
-	}
-
-	/* spacing may be too large for sector_div to work with,
-	 * so we might need to pre-shift
-	 */
-	conf->sector_shift = 0;
-	if (sizeof(sector_t) > sizeof(u32)) {
-		sector_t space = conf->spacing;
-		while (space > (sector_t)(~(u32)0)) {
-			space >>= 1;
-			conf->sector_shift++;
-		}
-	}
 	/*
-	 * This code was restructured to work around a gcc-2.95.3 internal
-	 * compiler error.  Alter it with care.
-	 */
-	{
-		sector_t sz;
-		unsigned round;
-		unsigned long base;
-
-		sz = conf->array_sectors >> conf->sector_shift;
-		sz += 1; /* force round-up */
-		base = conf->spacing >> conf->sector_shift;
-		round = sector_div(sz, base);
-		nb_zone = sz + (round ? 1 : 0);
-	}
-	BUG_ON(nb_zone > PAGE_SIZE / sizeof(struct dev_info *));
-
-	conf->hash_table = kmalloc (sizeof (struct dev_info *) * nb_zone,
-					GFP_KERNEL);
-	if (!conf->hash_table)
-		goto out;
-
-	/*
-	 * Here we generate the linear hash table
-	 * First calculate the device offsets.
+	 * Here we calculate the device offsets.
 	 */
 	conf->disks[0].start_sector = 0;
 	for (i = 1; i < raid_disks; i++)
@@ -223,29 +160,6 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 			conf->disks[i-1].start_sector +
 			conf->disks[i-1].num_sectors;
 
-	table = conf->hash_table;
-	i = 0;
-	for (curr_sector = 0;
-	     curr_sector < conf->array_sectors;
-	     curr_sector += conf->spacing) {
-
-		while (i < raid_disks-1 &&
-		       curr_sector >= conf->disks[i+1].start_sector)
-			i++;
-
-		*table ++ = conf->disks + i;
-	}
-
-	if (conf->sector_shift) {
-		conf->spacing >>= conf->sector_shift;
-		/* round spacing up so that when we divide by it,
-		 * we err on the side of "too-low", which is safest.
-		 */
-		conf->spacing++;
-	}
-
-	BUG_ON(table - conf->hash_table > nb_zone);
-
 	return conf;
 
 out:
@@ -309,7 +223,6 @@ static int linear_stop (mddev_t *mddev)
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	do {
 		linear_conf_t *t = conf->prev;
-		kfree(conf->hash_table);
 		kfree(conf);
 		conf = t;
 	} while (conf);
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index 76078f1cded0..721a878403d1 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -12,12 +12,7 @@ typedef struct dev_info dev_info_t;
 struct linear_private_data
 {
 	struct linear_private_data *prev;	/* earlier version */
-	dev_info_t		**hash_table;
-	sector_t		spacing;
 	sector_t		array_sectors;
-	int			sector_shift;	/* shift before dividing
-						 * by spacing
-						 */
 	dev_info_t		disks[0];
 };
 
-- 
cgit v1.2.3


From 4db7cdc859f56ecf0a186e0cfb238b5bb3af2efb Mon Sep 17 00:00:00 2001
From: Sandeep K Sinha <sandeepksinha@gmail.com>
Date: Tue, 16 Jun 2009 16:56:13 +1000
Subject: md: Removing num_sector and replacing start_sector with end_sector

Remove num_sectors from dev_info and replace start_sector with
end_sector.  This makes a lot of comparisons much simpler.

Signed-off-by: Sandeep K Sinha <sandeepksinha@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c | 37 ++++++++++++++++++-------------------
 drivers/md/linear.h |  3 +--
 2 files changed, 19 insertions(+), 21 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 92bcd3dd52cc..529a3d37e3fe 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -32,7 +32,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 
 	hash = conf->disks;
 
-	while (sector >= hash->num_sectors + hash->start_sector)
+	while (sector >= hash->end_sector)
 		hash++;
 	return hash;
 }
@@ -55,7 +55,7 @@ static int linear_mergeable_bvec(struct request_queue *q,
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 
 	dev0 = which_dev(mddev, sector);
-	maxsectors = dev0->num_sectors - (sector - dev0->start_sector);
+	maxsectors = dev0->end_sector - sector;
 
 	if (maxsectors < bio_sectors)
 		maxsectors = 0;
@@ -141,10 +141,9 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		    queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9))
 			blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
 
-		disk->num_sectors = rdev->sectors;
 		conf->array_sectors += rdev->sectors;
-
 		cnt++;
+
 	}
 	if (cnt != raid_disks) {
 		printk("linear: not enough drives present. Aborting!\n");
@@ -154,11 +153,12 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	/*
 	 * Here we calculate the device offsets.
 	 */
-	conf->disks[0].start_sector = 0;
+	conf->disks[0].end_sector = conf->disks[0].rdev->sectors;
+
 	for (i = 1; i < raid_disks; i++)
-		conf->disks[i].start_sector =
-			conf->disks[i-1].start_sector +
-			conf->disks[i-1].num_sectors;
+		conf->disks[i].end_sector =
+			conf->disks[i-1].end_sector +
+			conf->disks[i].rdev->sectors;
 
 	return conf;
 
@@ -235,6 +235,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	const int rw = bio_data_dir(bio);
 	mddev_t *mddev = q->queuedata;
 	dev_info_t *tmp_dev;
+	sector_t start_sector;
 	int cpu;
 
 	if (unlikely(bio_barrier(bio))) {
@@ -249,32 +250,30 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	part_stat_unlock();
 
 	tmp_dev = which_dev(mddev, bio->bi_sector);
-    
-	if (unlikely(bio->bi_sector >= (tmp_dev->num_sectors +
-					tmp_dev->start_sector)
-		     || (bio->bi_sector <
-			 tmp_dev->start_sector))) {
+	start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
+
+	if (unlikely(bio->bi_sector >= (tmp_dev->end_sector)
+		     || (bio->bi_sector < start_sector))) {
 		char b[BDEVNAME_SIZE];
 
 		printk("linear_make_request: Sector %llu out of bounds on "
 			"dev %s: %llu sectors, offset %llu\n",
 			(unsigned long long)bio->bi_sector,
 			bdevname(tmp_dev->rdev->bdev, b),
-			(unsigned long long)tmp_dev->num_sectors,
-			(unsigned long long)tmp_dev->start_sector);
+			(unsigned long long)tmp_dev->rdev->sectors,
+			(unsigned long long)start_sector);
 		bio_io_error(bio);
 		return 0;
 	}
 	if (unlikely(bio->bi_sector + (bio->bi_size >> 9) >
-		     tmp_dev->start_sector + tmp_dev->num_sectors)) {
+		     tmp_dev->end_sector)) {
 		/* This bio crosses a device boundary, so we have to
 		 * split it.
 		 */
 		struct bio_pair *bp;
 
 		bp = bio_split(bio,
-			       tmp_dev->start_sector + tmp_dev->num_sectors
-			       - bio->bi_sector);
+			       tmp_dev->end_sector - bio->bi_sector);
 
 		if (linear_make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
@@ -285,7 +284,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	}
 		    
 	bio->bi_bdev = tmp_dev->rdev->bdev;
-	bio->bi_sector = bio->bi_sector - tmp_dev->start_sector
+	bio->bi_sector = bio->bi_sector - start_sector
 		+ tmp_dev->rdev->data_offset;
 
 	return 1;
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index 721a878403d1..599e5c1bbb01 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -3,8 +3,7 @@
 
 struct dev_info {
 	mdk_rdev_t	*rdev;
-	sector_t	num_sectors;
-	sector_t	start_sector;
+	sector_t	end_sector;
 };
 
 typedef struct dev_info dev_info_t;
-- 
cgit v1.2.3


From aece3d1f40879759f641dfbfdbb9e2593adeb43c Mon Sep 17 00:00:00 2001
From: Sandeep K Sinha <sandeepksinha@gmail.com>
Date: Tue, 16 Jun 2009 16:57:08 +1000
Subject: md: Binary search in linear raid

Replace the linear search with binary search in which_dev.

Signed-off-by: Sandeep K Sinha <sandeepksinha@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 529a3d37e3fe..9b02a73fbc6b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -27,14 +27,26 @@
  */
 static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 {
-	dev_info_t *hash;
+	int lo, mid, hi;
 	linear_conf_t *conf = mddev->private;
 
-	hash = conf->disks;
+	lo = 0;
+	hi = mddev->raid_disks - 1;
 
-	while (sector >= hash->end_sector)
-		hash++;
-	return hash;
+	/*
+	 * Binary Search
+	 */
+
+	while (hi > lo) {
+
+		mid = (hi + lo) / 2;
+		if (sector < conf->disks[mid].end_sector)
+			hi = mid;
+		else
+			lo = mid + 1;
+	}
+
+	return conf->disks + lo;
 }
 
 /**
-- 
cgit v1.2.3


From 1b9614291eb319fad96de45392eb4452ad39f0ee Mon Sep 17 00:00:00 2001
From: raz ben yehuda <raziebe@gmail.com>
Date: Tue, 16 Jun 2009 16:57:40 +1000
Subject: md: have raid0 compile with MD_DEBUG on

Because of the removal of the device list from
the strips raid0 did not compile with MD_DEBUG flag on

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 77764dad1bcb..d8692fc17963 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -400,7 +400,7 @@ bad_map:
 	return 0;
 }
 
-static void raid0_status (struct seq_file *seq, mddev_t *mddev)
+static void raid0_status(struct seq_file *seq, mddev_t *mddev)
 {
 #undef MD_DEBUG
 #ifdef MD_DEBUG
@@ -408,18 +408,24 @@ static void raid0_status (struct seq_file *seq, mddev_t *mddev)
 	char b[BDEVNAME_SIZE];
 	raid0_conf_t *conf = mddev->private;
 
+	sector_t zone_size;
+	sector_t zone_start = 0;
 	h = 0;
+
 	for (j = 0; j < conf->nr_strip_zones; j++) {
 		seq_printf(seq, "      z%d", j);
 		seq_printf(seq, "=[");
 		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
 			seq_printf(seq, "%s/", bdevname(
-				conf->strip_zone[j].dev[k]->bdev,b));
-
-		seq_printf(seq, "] ze=%d ds=%d s=%d\n",
-				conf->strip_zone[j].zone_end,
-				conf->strip_zone[j].dev_start,
-				conf->strip_zone[j].sectors);
+				conf->devlist[j*mddev->raid_disks + k]
+						->bdev, b));
+
+		zone_size  = conf->strip_zone[j].zone_end - zone_start;
+		seq_printf(seq, "] ze=%lld ds=%lld s=%lld\n",
+			(unsigned long long)zone_start>>1,
+			(unsigned long long)conf->strip_zone[j].dev_start>>1,
+			(unsigned long long)zone_size>>1);
+		zone_start = conf->strip_zone[j].zone_end;
 	}
 #endif
 	seq_printf(seq, " %dk chunks", mddev->chunk_size/1024);
-- 
cgit v1.2.3


From 46994191ae8fdf1cbcc1f29282576b269a638c69 Mon Sep 17 00:00:00 2001
From: raz ben yehuda <raziebe@gmail.com>
Date: Tue, 16 Jun 2009 17:00:54 +1000
Subject: md: have raid0 report its formation

Report to the user what are the raid zones

Signed-off-by: raziebe@gmail.com
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 33 +++++++++++++++++++++++++++++++++
 1 file changed, 33 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index d8692fc17963..62fde23bf281 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -52,6 +52,38 @@ static int raid0_congested(void *data, int bits)
 	return ret;
 }
 
+/*
+ * inform the user of the raid configuration
+*/
+static void dump_zones(mddev_t *mddev)
+{
+	int j, k, h;
+	sector_t zone_size = 0;
+	sector_t zone_start = 0;
+	char b[BDEVNAME_SIZE];
+	raid0_conf_t *conf = mddev->private;
+	printk(KERN_INFO "******* %s configuration *********\n",
+		mdname(mddev));
+	h = 0;
+	for (j = 0; j < conf->nr_strip_zones; j++) {
+		printk(KERN_INFO "zone%d=[", j);
+		for (k = 0; k < conf->strip_zone[j].nb_dev; k++)
+			printk("%s/",
+			bdevname(conf->devlist[j*mddev->raid_disks
+						+ k]->bdev, b));
+		printk("]\n");
+
+		zone_size  = conf->strip_zone[j].zone_end - zone_start;
+		printk(KERN_INFO "        zone offset=%llukb "
+				"device offset=%llukb size=%llukb\n",
+			(unsigned long long)zone_start>>1,
+			(unsigned long long)conf->strip_zone[j].dev_start>>1,
+			(unsigned long long)zone_size>>1);
+		zone_start = conf->strip_zone[j].zone_end;
+	}
+	printk(KERN_INFO "**********************************\n\n");
+}
+
 static int create_strip_zones(mddev_t *mddev)
 {
 	int i, c, j, err;
@@ -289,6 +321,7 @@ static int raid0_run(mddev_t *mddev)
 	}
 
 	blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec);
+	dump_zones(mddev);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 92e59b6ba21845fadd2cce725010a9351740b76e Mon Sep 17 00:00:00 2001
From: raz ben yehuda <raziebe@gmail.com>
Date: Tue, 16 Jun 2009 17:00:57 +1000
Subject: md: raid0: chunk size check in raid0_run

have raid0 check chunk size in run method instead of in md.
This is part of a series moving the checks from common code to
the personalities where they belong.

hardsect is short and chunksize is an int, so it is safe to use %.

Signed-off-by: raziebe@gmail.com
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 62fde23bf281..39936a217f95 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -234,6 +234,16 @@ static int create_strip_zones(mddev_t *mddev)
 	mddev->queue->backing_dev_info.congested_fn = raid0_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 
+	/*
+	 * now since we have the hard sector sizes, we can make sure
+	 * chunk size is a multiple of that sector size
+	 */
+	if (mddev->chunk_size % queue_logical_block_size(mddev->queue)) {
+		printk(KERN_ERR "%s chunk_size of %d not valid\n",
+		       mdname(mddev),
+		       mddev->chunk_size);
+		goto abort;
+	}
 	printk(KERN_INFO "raid0: done.\n");
 	mddev->private = conf;
 	return 0;
@@ -289,8 +299,9 @@ static int raid0_run(mddev_t *mddev)
 {
 	int ret;
 
-	if (mddev->chunk_size == 0) {
-		printk(KERN_ERR "md/raid0: non-zero chunk size required.\n");
+	if (mddev->chunk_size == 0 ||
+	    !is_power_of_2(mddev->chunk_size)) {
+		printk(KERN_ERR "md/raid0: chunk size must be a power of 2.\n");
 		return -EINVAL;
 	}
 	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
-- 
cgit v1.2.3


From 964e7913b0d25b988e27a7cd9378bc55cc572bb4 Mon Sep 17 00:00:00 2001
From: raz ben yehuda <raziebe@gmail.com>
Date: Tue, 16 Jun 2009 17:01:22 +1000
Subject: md: raid10: chunk size check in run

have raid10 check chunk size in run method instead of in md

Signed-off-by: raziebe@gmail.com
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid10.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 9a5beb4fd954..06bef686f91b 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2050,9 +2050,10 @@ static int run(mddev_t *mddev)
 	int nc, fc, fo;
 	sector_t stride, size;
 
-	if (mddev->chunk_size < PAGE_SIZE) {
+	if (mddev->chunk_size < PAGE_SIZE ||
+	    !is_power_of_2(mddev->chunk_size)) {
 		printk(KERN_ERR "md/raid10: chunk size must be "
-		       "at least PAGE_SIZE(%ld).\n", PAGE_SIZE);
+		       "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE);
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 740da44918680a0c72411ae4ccdd1861069afcc4 Mon Sep 17 00:00:00 2001
From: raz ben yehuda <raziebe@gmail.com>
Date: Tue, 16 Jun 2009 17:01:36 +1000
Subject: md: raid5: chunk size check in setup_conf

have raid5 check chunk size in run/reshape method instead of in md

Signed-off-by: raziebe@gmail.com
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7fb97c65ad37..be4e62f611bc 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4336,7 +4336,8 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE) {
+	if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE ||
+	    !is_power_of_2(mddev->new_chunk)) {
 		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
 			mddev->new_chunk, mdname(mddev));
 		return ERR_PTR(-EINVAL);
-- 
cgit v1.2.3


From 2ac06c3332898103210b478c5a17c20e28929287 Mon Sep 17 00:00:00 2001
From: raz ben yehuda <raziebe@gmail.com>
Date: Tue, 16 Jun 2009 17:01:42 +1000
Subject: md: prepare for non-power-of-two chunk sizes

Remove chunk size check from md as this is now performed in the run
function in each personality.

Replace chunk size power 2 code calculations by a regular division.

Signed-off-by: raziebe@gmail.com
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 20f6ac338349..a02bde70874b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -444,8 +444,11 @@ static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
 {
 	sector_t num_sectors = rdev->sb_start;
 
-	if (chunk_size)
-		num_sectors &= ~((sector_t)chunk_size/512 - 1);
+	if (chunk_size) {
+		unsigned chunk_sects = chunk_size>>9;
+		sector_div(num_sectors, chunk_sects);
+		num_sectors *= chunk_sects;
+	}
 	return num_sectors;
 }
 
@@ -1248,8 +1251,12 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (rdev->sectors < le64_to_cpu(sb->data_size))
 		return -EINVAL;
 	rdev->sectors = le64_to_cpu(sb->data_size);
-	if (le32_to_cpu(sb->chunksize))
-		rdev->sectors &= ~((sector_t)le32_to_cpu(sb->chunksize) - 1);
+	if (le32_to_cpu(sb->chunksize)) {
+		int chunk_sects = le32_to_cpu(sb->chunksize);
+		sector_t chunks = rdev->sectors;
+		sector_div(chunks, chunk_sects);
+		rdev->sectors = chunks * chunk_sects;
+	}
 
 	if (le64_to_cpu(sb->size) > rdev->sectors)
 		return -EINVAL;
@@ -3528,7 +3535,8 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len)
 
 	/* Must be a multiple of chunk_size */
 	if (mddev->chunk_size) {
-		if (min & (sector_t)((mddev->chunk_size>>9)-1))
+		sector_t temp = min;
+		if (sector_div(temp, (mddev->chunk_size>>9)))
 			return -EINVAL;
 	}
 	mddev->resync_min = min;
@@ -3565,7 +3573,8 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
 
 		/* Must be a multiple of chunk_size */
 		if (mddev->chunk_size) {
-			if (max & (sector_t)((mddev->chunk_size>>9)-1))
+			sector_t temp = max;
+			if (sector_div(temp, (mddev->chunk_size>>9)))
 				return -EINVAL;
 		}
 		mddev->resync_max = max;
@@ -4006,14 +4015,6 @@ static int do_md_run(mddev_t * mddev)
 				chunk_size, MAX_CHUNK_SIZE);
 			return -EINVAL;
 		}
-		/*
-		 * chunk-size has to be a power of 2
-		 */
-		if ( (1 << ffz(~chunk_size)) != chunk_size) {
-			printk(KERN_ERR "chunk_size of %d not valid\n", chunk_size);
-			return -EINVAL;
-		}
-
 		/* devices must have minimum size of one chunk */
 		list_for_each_entry(rdev, &mddev->disks, same_set) {
 			if (test_bit(Faulty, &rdev->flags))
-- 
cgit v1.2.3


From fbb704efb784e2c8418e34dc3013af76bdd58101 Mon Sep 17 00:00:00 2001
From: raz ben yehuda <raziebe@gmail.com>
Date: Tue, 16 Jun 2009 17:02:05 +1000
Subject: md: raid0 :Enables chunk size other than powers of 2.

Maintain two flows, one for pow2 chunk sizes (which uses masks and
shift), and a flow for the general case (which uses sector_div).
This is for the sake of performance.

 - introduce map_sector and is_io_in_chunk_boundary to encapsulate
   those two flows better for raid0_make_request
 - fix blk_mergeable to support the two flows.

Signed-off-by: raziebe@gmail.com
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 107 ++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 77 insertions(+), 30 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 39936a217f95..7cd2671cc794 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -273,7 +273,12 @@ static int raid0_mergeable_bvec(struct request_queue *q,
 	unsigned int chunk_sectors = mddev->chunk_size >> 9;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
-	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
+	if (is_power_of_2(mddev->chunk_size))
+		max =  (chunk_sectors - ((sector & (chunk_sectors-1))
+						+ bio_sectors)) << 9;
+	else
+		max =  (chunk_sectors - (sector_div(sector, chunk_sectors)
+						+ bio_sectors)) << 9;
 	if (max < 0) max = 0; /* bio_add cannot handle a negative return */
 	if (max <= biovec->bv_len && bio_sectors == 0)
 		return biovec->bv_len;
@@ -299,9 +304,8 @@ static int raid0_run(mddev_t *mddev)
 {
 	int ret;
 
-	if (mddev->chunk_size == 0 ||
-	    !is_power_of_2(mddev->chunk_size)) {
-		printk(KERN_ERR "md/raid0: chunk size must be a power of 2.\n");
+	if (mddev->chunk_size == 0) {
+		printk(KERN_ERR "md/raid0: chunk size must be set.\n");
 		return -EINVAL;
 	}
 	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
@@ -367,15 +371,65 @@ static struct strip_zone *find_zone(struct raid0_private_data *conf,
 	BUG();
 }
 
-static int raid0_make_request (struct request_queue *q, struct bio *bio)
+/*
+ * remaps the bio to the target device. we separate two flows.
+ * power 2 flow and a general flow for the sake of perfromance
+*/
+static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
+				sector_t sector, sector_t *sector_offset)
 {
-	mddev_t *mddev = q->queuedata;
-	unsigned int sect_in_chunk, chunksect_bits, chunk_sects;
+	unsigned int sect_in_chunk;
+	sector_t chunk;
 	raid0_conf_t *conf = mddev->private;
+	unsigned int chunk_sects = mddev->chunk_size >> 9;
+
+	if (is_power_of_2(mddev->chunk_size)) {
+		int chunksect_bits = ffz(~chunk_sects);
+		/* find the sector offset inside the chunk */
+		sect_in_chunk  = sector & (chunk_sects - 1);
+		sector >>= chunksect_bits;
+		/* chunk in zone */
+		chunk = *sector_offset;
+		/* quotient is the chunk in real device*/
+		sector_div(chunk, zone->nb_dev << chunksect_bits);
+	} else{
+		sect_in_chunk = sector_div(sector, chunk_sects);
+		chunk = *sector_offset;
+		sector_div(chunk, chunk_sects * zone->nb_dev);
+	}
+	/*
+	*  position the bio over the real device
+	*  real sector = chunk in device + starting of zone
+	*	+ the position in the chunk
+	*/
+	*sector_offset = (chunk * chunk_sects) + sect_in_chunk;
+	return conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks
+			     + sector_div(sector, zone->nb_dev)];
+}
+
+/*
+ * Is io distribute over 1 or more chunks ?
+*/
+static inline int is_io_in_chunk_boundary(mddev_t *mddev,
+			unsigned int chunk_sects, struct bio *bio)
+{
+	if (likely(is_power_of_2(mddev->chunk_size))) {
+		return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
+					+ (bio->bi_size >> 9));
+	} else{
+		sector_t sector = bio->bi_sector;
+		return chunk_sects >= (sector_div(sector, chunk_sects)
+						+ (bio->bi_size >> 9));
+	}
+}
+
+static int raid0_make_request(struct request_queue *q, struct bio *bio)
+{
+	mddev_t *mddev = q->queuedata;
+	unsigned int chunk_sects;
+	sector_t sector_offset;
 	struct strip_zone *zone;
 	mdk_rdev_t *tmp_dev;
-	sector_t chunk;
-	sector_t sector, rsect, sector_offset;
 	const int rw = bio_data_dir(bio);
 	int cpu;
 
@@ -391,10 +445,8 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 	part_stat_unlock();
 
 	chunk_sects = mddev->chunk_size >> 9;
-	chunksect_bits = ffz(~chunk_sects);
-	sector = bio->bi_sector;
-
-	if (unlikely(chunk_sects < (bio->bi_sector & (chunk_sects - 1)) + (bio->bi_size >> 9))) {
+	if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
+		sector_t sector = bio->bi_sector;
 		struct bio_pair *bp;
 		/* Sanity check -- queue functions should prevent this happening */
 		if (bio->bi_vcnt != 1 ||
@@ -403,7 +455,12 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		/* This is a one page bio that upper layers
 		 * refuse to split for us, so we need to split it.
 		 */
-		bp = bio_split(bio, chunk_sects - (bio->bi_sector & (chunk_sects - 1)));
+		if (likely(is_power_of_2(mddev->chunk_size)))
+			bp = bio_split(bio, chunk_sects - (sector &
+							   (chunk_sects-1)));
+		else
+			bp = bio_split(bio, chunk_sects -
+				       sector_div(sector, chunk_sects));
 		if (raid0_make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
 		if (raid0_make_request(q, &bp->bio2))
@@ -412,24 +469,14 @@ static int raid0_make_request (struct request_queue *q, struct bio *bio)
 		bio_pair_release(bp);
 		return 0;
 	}
-	sector_offset = sector;
-	zone = find_zone(conf, &sector_offset);
-	sect_in_chunk = bio->bi_sector & (chunk_sects - 1);
-	{
-		sector_t x = sector_offset >> chunksect_bits;
-
-		sector_div(x, zone->nb_dev);
-		chunk = x;
 
-		x = sector >> chunksect_bits;
-		tmp_dev = conf->devlist[(zone - conf->strip_zone)*mddev->raid_disks
-					+ sector_div(x, zone->nb_dev)];
-	}
-	rsect = (chunk << chunksect_bits) + zone->dev_start + sect_in_chunk;
- 
+	sector_offset = bio->bi_sector;
+	zone =  find_zone(mddev->private, &sector_offset);
+	tmp_dev = map_sector(mddev, zone, bio->bi_sector,
+			     &sector_offset);
 	bio->bi_bdev = tmp_dev->bdev;
-	bio->bi_sector = rsect + tmp_dev->data_offset;
-
+	bio->bi_sector = sector_offset + zone->dev_start +
+		tmp_dev->data_offset;
 	/*
 	 * Let the main block layer submit the IO and resolve recursion:
 	 */
-- 
cgit v1.2.3


From e212d6f25084e8e9b02a04ba514d7bb1e4a4924a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 16 Jun 2009 11:19:36 +0200
Subject: block: remove some includings of blktrace_api.h

When porting blktrace to tracepoints, we changed to trace/block.h
for trace prober declarations.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c      | 1 -
 drivers/block/cciss.c | 1 -
 drivers/md/dm.c       | 1 -
 fs/bio.c              | 1 -
 mm/bounce.c           | 1 -
 mm/highmem.c          | 1 -
 6 files changed, 6 deletions(-)

(limited to 'drivers/md')

diff --git a/block/blk-core.c b/block/blk-core.c
index 94d88fabc4bd..b06cf5c2a829 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -26,7 +26,6 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/task_io_accounting_ops.h>
-#include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
 
 #define CREATE_TRACE_POINTS
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index b22cec97ea19..c7a527c08a09 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -38,7 +38,6 @@
 #include <linux/hdreg.h>
 #include <linux/spinlock.h>
 #include <linux/compat.h>
-#include <linux/blktrace_api.h>
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 3fd8b1e65483..48db308fae67 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -19,7 +19,6 @@
 #include <linux/slab.h>
 #include <linux/idr.h>
 #include <linux/hdreg.h>
-#include <linux/blktrace_api.h>
 
 #include <trace/events/block.h>
 
diff --git a/fs/bio.c b/fs/bio.c
index 5f80848c320c..24c914043532 100644
--- a/fs/bio.c
+++ b/fs/bio.c
@@ -25,7 +25,6 @@
 #include <linux/module.h>
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
-#include <linux/blktrace_api.h>
 #include <scsi/sg.h>		/* for struct sg_iovec */
 
 #include <trace/events/block.h>
diff --git a/mm/bounce.c b/mm/bounce.c
index 4ebe3ea83795..a2b76a588e34 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -13,7 +13,6 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
 #include <trace/events/block.h>
diff --git a/mm/highmem.c b/mm/highmem.c
index 68eb1d9b63fa..25878cc49daa 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -26,7 +26,6 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/highmem.h>
-#include <linux/blktrace_api.h>
 #include <asm/tlbflush.h>
 
 /*
-- 
cgit v1.2.3


From 9d8f0363623b3da12c43007cf77f5e1a4e8a5964 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Thu, 18 Jun 2009 08:45:01 +1000
Subject: md: Make mddev->chunk_size sector-based.

This patch renames the chunk_size field to chunk_sectors with the
implied change of semantics.  Since

	is_power_of_2(chunk_size) = is_power_of_2(chunk_sectors << 9)
				  = is_power_of_2(chunk_sectors)

these bits don't need an adjustment for the shift.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c |  2 +-
 drivers/md/md.c     | 51 ++++++++++++++++++++++++++-------------------------
 drivers/md/md.h     |  2 +-
 drivers/md/raid0.c  | 27 ++++++++++++++-------------
 drivers/md/raid1.c  |  4 ++--
 drivers/md/raid10.c | 15 ++++++++-------
 drivers/md/raid5.c  | 41 ++++++++++++++++++++++-------------------
 7 files changed, 74 insertions(+), 68 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 9b02a73fbc6b..9f7cec42dd8e 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -305,7 +305,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 static void linear_status (struct seq_file *seq, mddev_t *mddev)
 {
 
-	seq_printf(seq, " %dk rounding", mddev->chunk_size/1024);
+	seq_printf(seq, " %dk rounding", mddev->chunk_sectors / 2);
 }
 
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a02bde70874b..abcc0fef30e3 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -869,7 +869,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->minor_version = sb->minor_version;
 		mddev->patch_version = sb->patch_version;
 		mddev->external = 0;
-		mddev->chunk_size = sb->chunk_size;
+		mddev->chunk_sectors = sb->chunk_size >> 9;
 		mddev->ctime = sb->ctime;
 		mddev->utime = sb->utime;
 		mddev->level = sb->level;
@@ -892,7 +892,7 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			mddev->delta_disks = 0;
 			mddev->new_level = mddev->level;
 			mddev->new_layout = mddev->layout;
-			mddev->new_chunk = mddev->chunk_size;
+			mddev->new_chunk = mddev->chunk_sectors << 9;
 		}
 
 		if (sb->state & (1<<MD_SB_CLEAN))
@@ -1021,7 +1021,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		sb->recovery_cp = 0;
 
 	sb->layout = mddev->layout;
-	sb->chunk_size = mddev->chunk_size;
+	sb->chunk_size = mddev->chunk_sectors << 9;
 
 	if (mddev->bitmap && mddev->bitmap_file == NULL)
 		sb->state |= (1<<MD_SB_BITMAP_PRESENT);
@@ -1278,7 +1278,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 		mddev->major_version = 1;
 		mddev->patch_version = 0;
 		mddev->external = 0;
-		mddev->chunk_size = le32_to_cpu(sb->chunksize) << 9;
+		mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
 		mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
 		mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
 		mddev->level = le32_to_cpu(sb->level);
@@ -1310,7 +1310,7 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			mddev->delta_disks = 0;
 			mddev->new_level = mddev->level;
 			mddev->new_layout = mddev->layout;
-			mddev->new_chunk = mddev->chunk_size;
+			mddev->new_chunk = mddev->chunk_sectors << 9;
 		}
 
 	} else if (mddev->pers == NULL) {
@@ -1382,7 +1382,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 
 	sb->raid_disks = cpu_to_le32(mddev->raid_disks);
 	sb->size = cpu_to_le64(mddev->dev_sectors);
-	sb->chunksize = cpu_to_le32(mddev->chunk_size >> 9);
+	sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
 	sb->level = cpu_to_le32(mddev->level);
 	sb->layout = cpu_to_le32(mddev->layout);
 
@@ -2753,7 +2753,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	if (IS_ERR(priv)) {
 		mddev->new_level = mddev->level;
 		mddev->new_layout = mddev->layout;
-		mddev->new_chunk = mddev->chunk_size;
+		mddev->new_chunk = mddev->chunk_sectors << 9;
 		mddev->raid_disks -= mddev->delta_disks;
 		mddev->delta_disks = 0;
 		module_put(pers->owner);
@@ -2771,7 +2771,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 	mddev->level = mddev->new_level;
 	mddev->layout = mddev->new_layout;
-	mddev->chunk_size = mddev->new_chunk;
+	mddev->chunk_sectors = mddev->new_chunk >> 9;
 	mddev->delta_disks = 0;
 	pers->run(mddev);
 	mddev_resume(mddev);
@@ -2864,10 +2864,10 @@ static ssize_t
 chunk_size_show(mddev_t *mddev, char *page)
 {
 	if (mddev->reshape_position != MaxSector &&
-	    mddev->chunk_size != mddev->new_chunk)
+	    mddev->chunk_sectors << 9 != mddev->new_chunk)
 		return sprintf(page, "%d (%d)\n", mddev->new_chunk,
-			       mddev->chunk_size);
-	return sprintf(page, "%d\n", mddev->chunk_size);
+			       mddev->chunk_sectors << 9);
+	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
 }
 
 static ssize_t
@@ -2889,7 +2889,7 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 	} else {
 		mddev->new_chunk = n;
 		if (mddev->reshape_position == MaxSector)
-			mddev->chunk_size = n;
+			mddev->chunk_sectors = n >> 9;
 	}
 	return len;
 }
@@ -3534,9 +3534,9 @@ min_sync_store(mddev_t *mddev, const char *buf, size_t len)
 		return -EBUSY;
 
 	/* Must be a multiple of chunk_size */
-	if (mddev->chunk_size) {
+	if (mddev->chunk_sectors) {
 		sector_t temp = min;
-		if (sector_div(temp, (mddev->chunk_size>>9)))
+		if (sector_div(temp, mddev->chunk_sectors))
 			return -EINVAL;
 	}
 	mddev->resync_min = min;
@@ -3572,9 +3572,9 @@ max_sync_store(mddev_t *mddev, const char *buf, size_t len)
 			return -EBUSY;
 
 		/* Must be a multiple of chunk_size */
-		if (mddev->chunk_size) {
+		if (mddev->chunk_sectors) {
 			sector_t temp = max;
-			if (sector_div(temp, (mddev->chunk_size>>9)))
+			if (sector_div(temp, mddev->chunk_sectors))
 				return -EINVAL;
 		}
 		mddev->resync_max = max;
@@ -3665,7 +3665,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev->delta_disks = 0;
 	mddev->new_level = mddev->level;
 	mddev->new_layout = mddev->layout;
-	mddev->new_chunk = mddev->chunk_size;
+	mddev->new_chunk = mddev->chunk_sectors << 9;
 	return len;
 }
 
@@ -4007,7 +4007,7 @@ static int do_md_run(mddev_t * mddev)
 		analyze_sbs(mddev);
 	}
 
-	chunk_size = mddev->chunk_size;
+	chunk_size = mddev->chunk_sectors << 9;
 
 	if (chunk_size) {
 		if (chunk_size > MAX_CHUNK_SIZE) {
@@ -4406,7 +4406,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		mddev->flags = 0;
 		mddev->ro = 0;
 		mddev->metadata_type[0] = 0;
-		mddev->chunk_size = 0;
+		mddev->chunk_sectors = 0;
 		mddev->ctime = mddev->utime = 0;
 		mddev->layout = 0;
 		mddev->max_disks = 0;
@@ -4619,7 +4619,7 @@ static int get_array_info(mddev_t * mddev, void __user * arg)
 	info.spare_disks   = spare;
 
 	info.layout        = mddev->layout;
-	info.chunk_size    = mddev->chunk_size;
+	info.chunk_size    = mddev->chunk_sectors << 9;
 
 	if (copy_to_user(arg, &info, sizeof(info)))
 		return -EFAULT;
@@ -4844,7 +4844,8 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 		} else 
 			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
-		rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
+		rdev->sectors = calc_num_sectors(rdev,
+						 mddev->chunk_sectors << 9);
 
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
@@ -4914,7 +4915,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	else
 		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
-	rdev->sectors = calc_num_sectors(rdev, mddev->chunk_size);
+	rdev->sectors = calc_num_sectors(rdev, mddev->chunk_sectors << 9);
 
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING 
@@ -5063,7 +5064,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	mddev->external	     = 0;
 
 	mddev->layout        = info->layout;
-	mddev->chunk_size    = info->chunk_size;
+	mddev->chunk_sectors = info->chunk_size >> 9;
 
 	mddev->max_disks     = MD_SB_DISKS;
 
@@ -5082,7 +5083,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	get_random_bytes(mddev->uuid, 16);
 
 	mddev->new_level = mddev->level;
-	mddev->new_chunk = mddev->chunk_size;
+	mddev->new_chunk = mddev->chunk_sectors << 9;
 	mddev->new_layout = mddev->layout;
 	mddev->delta_disks = 0;
 
@@ -5192,7 +5193,7 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 	    mddev->level         != info->level         ||
 /*	    mddev->layout        != info->layout        || */
 	    !mddev->persistent	 != info->not_persistent||
-	    mddev->chunk_size    != info->chunk_size    ||
+	    mddev->chunk_sectors != info->chunk_size >> 9 ||
 	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
 	    ((state^info->state) & 0xfffffe00)
 		)
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 8227ab909d44..5d78830043d0 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -145,7 +145,7 @@ struct mddev_s
 	int 				external;	/* metadata is
 							 * managed externally */
 	char				metadata_type[17]; /* externally set*/
-	int				chunk_size;
+	int				chunk_sectors;
 	time_t				ctime, utime;
 	int				level, layout;
 	char				clevel[16];
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7cd2671cc794..f20b18ff7969 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -238,10 +238,10 @@ static int create_strip_zones(mddev_t *mddev)
 	 * now since we have the hard sector sizes, we can make sure
 	 * chunk size is a multiple of that sector size
 	 */
-	if (mddev->chunk_size % queue_logical_block_size(mddev->queue)) {
+	if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
 		printk(KERN_ERR "%s chunk_size of %d not valid\n",
 		       mdname(mddev),
-		       mddev->chunk_size);
+		       mddev->chunk_sectors << 9);
 		goto abort;
 	}
 	printk(KERN_INFO "raid0: done.\n");
@@ -270,10 +270,10 @@ static int raid0_mergeable_bvec(struct request_queue *q,
 	mddev_t *mddev = q->queuedata;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
-	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
-	if (is_power_of_2(mddev->chunk_size))
+	if (is_power_of_2(mddev->chunk_sectors))
 		max =  (chunk_sectors - ((sector & (chunk_sectors-1))
 						+ bio_sectors)) << 9;
 	else
@@ -304,11 +304,11 @@ static int raid0_run(mddev_t *mddev)
 {
 	int ret;
 
-	if (mddev->chunk_size == 0) {
+	if (mddev->chunk_sectors == 0) {
 		printk(KERN_ERR "md/raid0: chunk size must be set.\n");
 		return -EINVAL;
 	}
-	blk_queue_max_sectors(mddev->queue, mddev->chunk_size >> 9);
+	blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors);
 	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 
 	ret = create_strip_zones(mddev);
@@ -330,7 +330,8 @@ static int raid0_run(mddev_t *mddev)
 	 * chunksize should be used in that case.
 	 */
 	{
-		int stripe = mddev->raid_disks * mddev->chunk_size / PAGE_SIZE;
+		int stripe = mddev->raid_disks *
+			(mddev->chunk_sectors << 9) / PAGE_SIZE;
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
 	}
@@ -381,9 +382,9 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
 	unsigned int sect_in_chunk;
 	sector_t chunk;
 	raid0_conf_t *conf = mddev->private;
-	unsigned int chunk_sects = mddev->chunk_size >> 9;
+	unsigned int chunk_sects = mddev->chunk_sectors;
 
-	if (is_power_of_2(mddev->chunk_size)) {
+	if (is_power_of_2(mddev->chunk_sectors)) {
 		int chunksect_bits = ffz(~chunk_sects);
 		/* find the sector offset inside the chunk */
 		sect_in_chunk  = sector & (chunk_sects - 1);
@@ -413,7 +414,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
 static inline int is_io_in_chunk_boundary(mddev_t *mddev,
 			unsigned int chunk_sects, struct bio *bio)
 {
-	if (likely(is_power_of_2(mddev->chunk_size))) {
+	if (likely(is_power_of_2(mddev->chunk_sectors))) {
 		return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
 					+ (bio->bi_size >> 9));
 	} else{
@@ -444,7 +445,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 		      bio_sectors(bio));
 	part_stat_unlock();
 
-	chunk_sects = mddev->chunk_size >> 9;
+	chunk_sects = mddev->chunk_sectors;
 	if (unlikely(!is_io_in_chunk_boundary(mddev, chunk_sects, bio))) {
 		sector_t sector = bio->bi_sector;
 		struct bio_pair *bp;
@@ -455,7 +456,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 		/* This is a one page bio that upper layers
 		 * refuse to split for us, so we need to split it.
 		 */
-		if (likely(is_power_of_2(mddev->chunk_size)))
+		if (likely(is_power_of_2(mddev->chunk_sectors)))
 			bp = bio_split(bio, chunk_sects - (sector &
 							   (chunk_sects-1)));
 		else
@@ -519,7 +520,7 @@ static void raid0_status(struct seq_file *seq, mddev_t *mddev)
 		zone_start = conf->strip_zone[j].zone_end;
 	}
 #endif
-	seq_printf(seq, " %dk chunks", mddev->chunk_size/1024);
+	seq_printf(seq, " %dk chunks", mddev->chunk_sectors / 2);
 	return;
 }
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 5ea5bca53a5e..388635735ae5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2161,10 +2161,10 @@ static int raid1_reshape(mddev_t *mddev)
 	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
-	if (mddev->chunk_size != mddev->new_chunk ||
+	if (mddev->chunk_sectors << 9 != mddev->new_chunk ||
 	    mddev->layout != mddev->new_layout ||
 	    mddev->level != mddev->new_level) {
-		mddev->new_chunk = mddev->chunk_size;
+		mddev->new_chunk = mddev->chunk_sectors << 9;
 		mddev->new_layout = mddev->layout;
 		mddev->new_level = mddev->level;
 		return -EINVAL;
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 06bef686f91b..30029a312cf5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -461,7 +461,7 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 	mddev_t *mddev = q->queuedata;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
-	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
@@ -985,7 +985,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 	int i;
 
 	if (conf->near_copies < conf->raid_disks)
-		seq_printf(seq, " %dK chunks", mddev->chunk_size/1024);
+		seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
 	if (conf->near_copies > 1)
 		seq_printf(seq, " %d near-copies", conf->near_copies);
 	if (conf->far_copies > 1) {
@@ -2050,8 +2050,8 @@ static int run(mddev_t *mddev)
 	int nc, fc, fo;
 	sector_t stride, size;
 
-	if (mddev->chunk_size < PAGE_SIZE ||
-	    !is_power_of_2(mddev->chunk_size)) {
+	if (mddev->chunk_sectors < (PAGE_SIZE >> 9) ||
+	    !is_power_of_2(mddev->chunk_sectors)) {
 		printk(KERN_ERR "md/raid10: chunk size must be "
 		       "at least PAGE_SIZE(%ld) and be a power of 2.\n", PAGE_SIZE);
 		return -EINVAL;
@@ -2096,8 +2096,8 @@ static int run(mddev_t *mddev)
 	conf->far_copies = fc;
 	conf->copies = nc*fc;
 	conf->far_offset = fo;
-	conf->chunk_mask = (sector_t)(mddev->chunk_size>>9)-1;
-	conf->chunk_shift = ffz(~mddev->chunk_size) - 9;
+	conf->chunk_mask = mddev->chunk_sectors - 1;
+	conf->chunk_shift = ffz(~mddev->chunk_sectors);
 	size = mddev->dev_sectors >> conf->chunk_shift;
 	sector_div(size, fc);
 	size = size * conf->raid_disks;
@@ -2205,7 +2205,8 @@ static int run(mddev_t *mddev)
 	 * maybe...
 	 */
 	{
-		int stripe = conf->raid_disks * (mddev->chunk_size / PAGE_SIZE);
+		int stripe = conf->raid_disks *
+			((mddev->chunk_sectors << 9) / PAGE_SIZE);
 		stripe /= conf->near_copies;
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index be4e62f611bc..1e4fd5e8bfdd 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3352,13 +3352,13 @@ static int raid5_mergeable_bvec(struct request_queue *q,
 	mddev_t *mddev = q->queuedata;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
-	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
 	if ((bvm->bi_rw & 1) == WRITE)
 		return biovec->bv_len; /* always allow writes to be mergeable */
 
-	if (mddev->new_chunk < mddev->chunk_size)
+	if (mddev->new_chunk < mddev->chunk_sectors << 9)
 		chunk_sectors = mddev->new_chunk >> 9;
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
 	if (max < 0) max = 0;
@@ -3372,10 +3372,10 @@ static int raid5_mergeable_bvec(struct request_queue *q,
 static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
 {
 	sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
-	unsigned int chunk_sectors = mddev->chunk_size >> 9;
+	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bio->bi_size >> 9;
 
-	if (mddev->new_chunk < mddev->chunk_size)
+	if (mddev->new_chunk < mddev->chunk_sectors << 9)
 		chunk_sectors = mddev->new_chunk >> 9;
 	return  chunk_sectors >=
 		((sector & (chunk_sectors - 1)) + bio_sectors);
@@ -3791,10 +3791,10 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 * If old and new chunk sizes differ, we need to process the
 	 * largest of these
 	 */
-	if (mddev->new_chunk > mddev->chunk_size)
+	if (mddev->new_chunk > mddev->chunk_sectors << 9)
 		reshape_sectors = mddev->new_chunk / 512;
 	else
-		reshape_sectors = mddev->chunk_size / 512;
+		reshape_sectors = mddev->chunk_sectors;
 
 	/* we update the metadata when there is more than 3Meg
 	 * in the block range (that is rather arbitrary, should
@@ -4303,7 +4303,7 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 			raid_disks = conf->previous_raid_disks;
 	}
 
-	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
+	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
 	sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
 	return sectors * (raid_disks - conf->max_degraded);
 }
@@ -4412,7 +4412,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	conf->max_nr_stripes = NR_STRIPES;
 	conf->reshape_progress = mddev->reshape_position;
 	if (conf->reshape_progress != MaxSector) {
-		conf->prev_chunk = mddev->chunk_size;
+		conf->prev_chunk = mddev->chunk_sectors << 9;
 		conf->prev_algo = mddev->layout;
 	}
 
@@ -4484,7 +4484,7 @@ static int run(mddev_t *mddev)
 		}
 		/* here_new is the stripe we will write to */
 		here_old = mddev->reshape_position;
-		sector_div(here_old, (mddev->chunk_size>>9)*
+		sector_div(here_old, mddev->chunk_sectors *
 			   (old_disks-max_degraded));
 		/* here_old is the first stripe that we might need to read
 		 * from */
@@ -4499,7 +4499,7 @@ static int run(mddev_t *mddev)
 	} else {
 		BUG_ON(mddev->level != mddev->new_level);
 		BUG_ON(mddev->layout != mddev->new_layout);
-		BUG_ON(mddev->chunk_size != mddev->new_chunk);
+		BUG_ON(mddev->chunk_sectors << 9 != mddev->new_chunk);
 		BUG_ON(mddev->delta_disks != 0);
 	}
 
@@ -4533,7 +4533,7 @@ static int run(mddev_t *mddev)
 	}
 
 	/* device size must be a multiple of chunk size */
-	mddev->dev_sectors &= ~(mddev->chunk_size / 512 - 1);
+	mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
 	mddev->resync_max_sectors = mddev->dev_sectors;
 
 	if (mddev->degraded > 0 &&
@@ -4582,7 +4582,7 @@ static int run(mddev_t *mddev)
 	{
 		int data_disks = conf->previous_raid_disks - conf->max_degraded;
 		int stripe = data_disks *
-			(mddev->chunk_size / PAGE_SIZE);
+			((mddev->chunk_sectors << 9) / PAGE_SIZE);
 		if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
 	}
@@ -4679,7 +4679,8 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 	raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
 	int i;
 
-	seq_printf (seq, " level %d, %dk chunk, algorithm %d", mddev->level, mddev->chunk_size >> 10, mddev->layout);
+	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
+		mddev->chunk_sectors / 2, mddev->layout);
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
 	for (i = 0; i < conf->raid_disks; i++)
 		seq_printf (seq, "%s",
@@ -4827,7 +4828,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	 * any io in the removed space completes, but it hardly seems
 	 * worth it.
 	 */
-	sectors &= ~((sector_t)mddev->chunk_size/512 - 1);
+	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
 	md_set_array_sectors(mddev, raid5_size(mddev, sectors,
 					       mddev->raid_disks));
 	if (mddev->array_sectors >
@@ -4850,7 +4851,7 @@ static int raid5_check_reshape(mddev_t *mddev)
 
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
-	    mddev->new_chunk == mddev->chunk_size)
+	    mddev->new_chunk == mddev->chunk_sectors << 9)
 		return -EINVAL; /* nothing to do */
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
@@ -4878,10 +4879,11 @@ static int raid5_check_reshape(mddev_t *mddev)
 	 * If the chunk size is greater, user-space should request more
 	 * stripe_heads first.
 	 */
-	if ((mddev->chunk_size / STRIPE_SIZE) * 4 > conf->max_nr_stripes ||
+	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
+		> conf->max_nr_stripes ||
 	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
 		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
-		       (max(mddev->chunk_size, mddev->new_chunk)
+		       (max(mddev->chunk_sectors << 9, mddev->new_chunk)
 			/ STRIPE_SIZE)*4);
 		return -ENOSPC;
 	}
@@ -5054,7 +5056,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
 				raid5_remove_disk(mddev, d);
 		}
 		mddev->layout = conf->algorithm;
-		mddev->chunk_size = conf->chunk_size;
+		mddev->chunk_sectors = conf->chunk_size >> 9;
 		mddev->reshape_position = MaxSector;
 		mddev->delta_disks = 0;
 	}
@@ -5183,7 +5185,8 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 		}
 		if (new_chunk > 0) {
 			conf->chunk_size = new_chunk;
-			mddev->chunk_size = mddev->new_chunk = new_chunk;
+			mddev->new_chunk = new_chunk;
+			mddev->chunk_sectors = new_chunk >> 9;
 		}
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
-- 
cgit v1.2.3


From 664e7c413f1e90eceb0b2596dd73a0832faec058 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Thu, 18 Jun 2009 08:45:27 +1000
Subject: md: Convert mddev->new_chunk to sectors.

A straight-forward conversion which gets rid of some
multiplications/divisions/shifts. The patch also introduces a couple
of new ones, most of which are due to conf->chunk_size still being
represented in bytes. This will be cleaned up in subsequent patches.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c    | 29 +++++++++++++++--------------
 drivers/md/md.h    |  3 ++-
 drivers/md/raid1.c |  4 ++--
 drivers/md/raid5.c | 45 ++++++++++++++++++++++++---------------------
 4 files changed, 43 insertions(+), 38 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index abcc0fef30e3..f996d8342a85 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -886,13 +886,13 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			mddev->delta_disks = sb->delta_disks;
 			mddev->new_level = sb->new_level;
 			mddev->new_layout = sb->new_layout;
-			mddev->new_chunk = sb->new_chunk;
+			mddev->new_chunk_sectors = sb->new_chunk >> 9;
 		} else {
 			mddev->reshape_position = MaxSector;
 			mddev->delta_disks = 0;
 			mddev->new_level = mddev->level;
 			mddev->new_layout = mddev->layout;
-			mddev->new_chunk = mddev->chunk_sectors << 9;
+			mddev->new_chunk_sectors = mddev->chunk_sectors;
 		}
 
 		if (sb->state & (1<<MD_SB_CLEAN))
@@ -1007,7 +1007,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		sb->new_level = mddev->new_level;
 		sb->delta_disks = mddev->delta_disks;
 		sb->new_layout = mddev->new_layout;
-		sb->new_chunk = mddev->new_chunk;
+		sb->new_chunk = mddev->new_chunk_sectors << 9;
 	}
 	mddev->minor_version = sb->minor_version;
 	if (mddev->in_sync)
@@ -1304,13 +1304,13 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
 			mddev->delta_disks = le32_to_cpu(sb->delta_disks);
 			mddev->new_level = le32_to_cpu(sb->new_level);
 			mddev->new_layout = le32_to_cpu(sb->new_layout);
-			mddev->new_chunk = le32_to_cpu(sb->new_chunk)<<9;
+			mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
 		} else {
 			mddev->reshape_position = MaxSector;
 			mddev->delta_disks = 0;
 			mddev->new_level = mddev->level;
 			mddev->new_layout = mddev->layout;
-			mddev->new_chunk = mddev->chunk_sectors << 9;
+			mddev->new_chunk_sectors = mddev->chunk_sectors;
 		}
 
 	} else if (mddev->pers == NULL) {
@@ -1409,7 +1409,7 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
 		sb->new_layout = cpu_to_le32(mddev->new_layout);
 		sb->delta_disks = cpu_to_le32(mddev->delta_disks);
 		sb->new_level = cpu_to_le32(mddev->new_level);
-		sb->new_chunk = cpu_to_le32(mddev->new_chunk>>9);
+		sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
 	}
 
 	max_dev = 0;
@@ -2753,7 +2753,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	if (IS_ERR(priv)) {
 		mddev->new_level = mddev->level;
 		mddev->new_layout = mddev->layout;
-		mddev->new_chunk = mddev->chunk_sectors << 9;
+		mddev->new_chunk_sectors = mddev->chunk_sectors;
 		mddev->raid_disks -= mddev->delta_disks;
 		mddev->delta_disks = 0;
 		module_put(pers->owner);
@@ -2771,7 +2771,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
 	strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
 	mddev->level = mddev->new_level;
 	mddev->layout = mddev->new_layout;
-	mddev->chunk_sectors = mddev->new_chunk >> 9;
+	mddev->chunk_sectors = mddev->new_chunk_sectors;
 	mddev->delta_disks = 0;
 	pers->run(mddev);
 	mddev_resume(mddev);
@@ -2864,8 +2864,9 @@ static ssize_t
 chunk_size_show(mddev_t *mddev, char *page)
 {
 	if (mddev->reshape_position != MaxSector &&
-	    mddev->chunk_sectors << 9 != mddev->new_chunk)
-		return sprintf(page, "%d (%d)\n", mddev->new_chunk,
+	    mddev->chunk_sectors != mddev->new_chunk_sectors)
+		return sprintf(page, "%d (%d)\n",
+			       mddev->new_chunk_sectors << 9,
 			       mddev->chunk_sectors << 9);
 	return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
 }
@@ -2887,7 +2888,7 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 		if (err)
 			return err;
 	} else {
-		mddev->new_chunk = n;
+		mddev->new_chunk_sectors = n >> 9;
 		if (mddev->reshape_position == MaxSector)
 			mddev->chunk_sectors = n >> 9;
 	}
@@ -3665,7 +3666,7 @@ reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
 	mddev->delta_disks = 0;
 	mddev->new_level = mddev->level;
 	mddev->new_layout = mddev->layout;
-	mddev->new_chunk = mddev->chunk_sectors << 9;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	return len;
 }
 
@@ -4414,7 +4415,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
 		mddev->delta_disks = 0;
 		mddev->new_level = LEVEL_NONE;
 		mddev->new_layout = 0;
-		mddev->new_chunk = 0;
+		mddev->new_chunk_sectors = 0;
 		mddev->curr_resync = 0;
 		mddev->resync_mismatches = 0;
 		mddev->suspend_lo = mddev->suspend_hi = 0;
@@ -5083,7 +5084,7 @@ static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
 	get_random_bytes(mddev->uuid, 16);
 
 	mddev->new_level = mddev->level;
-	mddev->new_chunk = mddev->chunk_sectors << 9;
+	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->new_layout = mddev->layout;
 	mddev->delta_disks = 0;
 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 5d78830043d0..e0a2b8e3985d 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -166,7 +166,8 @@ struct mddev_s
 	 * If reshape_position is MaxSector, then no reshape is happening (yet).
 	 */
 	sector_t			reshape_position;
-	int				delta_disks, new_level, new_layout, new_chunk;
+	int				delta_disks, new_level, new_layout;
+	int				new_chunk_sectors;
 
 	struct mdk_thread_s		*thread;	/* management thread */
 	struct mdk_thread_s		*sync_thread;	/* doing resync or reconstruct */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 388635735ae5..12f8f34f17ae 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2161,10 +2161,10 @@ static int raid1_reshape(mddev_t *mddev)
 	int d, d2, err;
 
 	/* Cannot change chunk_size, layout, or level */
-	if (mddev->chunk_sectors << 9 != mddev->new_chunk ||
+	if (mddev->chunk_sectors != mddev->new_chunk_sectors ||
 	    mddev->layout != mddev->new_layout ||
 	    mddev->level != mddev->new_level) {
-		mddev->new_chunk = mddev->chunk_sectors << 9;
+		mddev->new_chunk_sectors = mddev->chunk_sectors;
 		mddev->new_layout = mddev->layout;
 		mddev->new_level = mddev->level;
 		return -EINVAL;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1e4fd5e8bfdd..bc3564cfbba0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3358,8 +3358,8 @@ static int raid5_mergeable_bvec(struct request_queue *q,
 	if ((bvm->bi_rw & 1) == WRITE)
 		return biovec->bv_len; /* always allow writes to be mergeable */
 
-	if (mddev->new_chunk < mddev->chunk_sectors << 9)
-		chunk_sectors = mddev->new_chunk >> 9;
+	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
+		chunk_sectors = mddev->new_chunk_sectors;
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
 	if (max < 0) max = 0;
 	if (max <= biovec->bv_len && bio_sectors == 0)
@@ -3375,8 +3375,8 @@ static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
 	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bio->bi_size >> 9;
 
-	if (mddev->new_chunk < mddev->chunk_sectors << 9)
-		chunk_sectors = mddev->new_chunk >> 9;
+	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
+		chunk_sectors = mddev->new_chunk_sectors;
 	return  chunk_sectors >=
 		((sector & (chunk_sectors - 1)) + bio_sectors);
 }
@@ -3791,8 +3791,8 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 	 * If old and new chunk sizes differ, we need to process the
 	 * largest of these
 	 */
-	if (mddev->new_chunk > mddev->chunk_sectors << 9)
-		reshape_sectors = mddev->new_chunk / 512;
+	if (mddev->new_chunk_sectors > mddev->chunk_sectors)
+		reshape_sectors = mddev->new_chunk_sectors;
 	else
 		reshape_sectors = mddev->chunk_sectors;
 
@@ -4304,7 +4304,7 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 	}
 
 	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
-	sectors &= ~((sector_t)mddev->new_chunk/512 - 1);
+	sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
 	return sectors * (raid_disks - conf->max_degraded);
 }
 
@@ -4336,10 +4336,11 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!mddev->new_chunk || mddev->new_chunk % PAGE_SIZE ||
-	    !is_power_of_2(mddev->new_chunk)) {
+	if (!mddev->new_chunk_sectors ||
+	    (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
+	    !is_power_of_2(mddev->new_chunk_sectors)) {
 		printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
-			mddev->new_chunk, mdname(mddev));
+		       mddev->new_chunk_sectors << 9, mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 
@@ -4402,7 +4403,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 			conf->fullsync = 1;
 	}
 
-	conf->chunk_size = mddev->new_chunk;
+	conf->chunk_size = mddev->new_chunk_sectors << 9;
 	conf->level = mddev->new_level;
 	if (conf->level == 6)
 		conf->max_degraded = 2;
@@ -4476,7 +4477,7 @@ static int run(mddev_t *mddev)
 		 * geometry.
 		 */
 		here_new = mddev->reshape_position;
-		if (sector_div(here_new, (mddev->new_chunk>>9)*
+		if (sector_div(here_new, mddev->new_chunk_sectors *
 			       (mddev->raid_disks - max_degraded))) {
 			printk(KERN_ERR "raid5: reshape_position not "
 			       "on a stripe boundary\n");
@@ -4499,7 +4500,7 @@ static int run(mddev_t *mddev)
 	} else {
 		BUG_ON(mddev->level != mddev->new_level);
 		BUG_ON(mddev->layout != mddev->new_layout);
-		BUG_ON(mddev->chunk_sectors << 9 != mddev->new_chunk);
+		BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
 		BUG_ON(mddev->delta_disks != 0);
 	}
 
@@ -4851,7 +4852,7 @@ static int raid5_check_reshape(mddev_t *mddev)
 
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
-	    mddev->new_chunk == mddev->chunk_sectors << 9)
+	    mddev->new_chunk_sectors == mddev->chunk_sectors)
 		return -EINVAL; /* nothing to do */
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
@@ -4881,9 +4882,11 @@ static int raid5_check_reshape(mddev_t *mddev)
 	 */
 	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
 		> conf->max_nr_stripes ||
-	    (mddev->new_chunk / STRIPE_SIZE) * 4 > conf->max_nr_stripes) {
+	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
+		> conf->max_nr_stripes) {
 		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
-		       (max(mddev->chunk_sectors << 9, mddev->new_chunk)
+		       (max(mddev->chunk_sectors << 9,
+			mddev->new_chunk_sectors << 9)
 			/ STRIPE_SIZE)*4);
 		return -ENOSPC;
 	}
@@ -4929,7 +4932,7 @@ static int raid5_start_reshape(mddev_t *mddev)
 	conf->previous_raid_disks = conf->raid_disks;
 	conf->raid_disks += mddev->delta_disks;
 	conf->prev_chunk = conf->chunk_size;
-	conf->chunk_size = mddev->new_chunk;
+	conf->chunk_size = mddev->new_chunk_sectors << 9;
 	conf->prev_algo = conf->algorithm;
 	conf->algorithm = mddev->new_layout;
 	if (mddev->delta_disks < 0)
@@ -5114,7 +5117,7 @@ static void *raid5_takeover_raid1(mddev_t *mddev)
 
 	mddev->new_level = 5;
 	mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
-	mddev->new_chunk = chunksect << 9;
+	mddev->new_chunk_sectors = chunksect;
 
 	return setup_conf(mddev);
 }
@@ -5185,7 +5188,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 		}
 		if (new_chunk > 0) {
 			conf->chunk_size = new_chunk;
-			mddev->new_chunk = new_chunk;
+			mddev->new_chunk_sectors = new_chunk >> 9;
 			mddev->chunk_sectors = new_chunk >> 9;
 		}
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
@@ -5194,7 +5197,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 		if (new_layout >= 0)
 			mddev->new_layout = new_layout;
 		if (new_chunk > 0)
-			mddev->new_chunk = new_chunk;
+			mddev->new_chunk_sectors = new_chunk >> 9;
 	}
 	return 0;
 }
@@ -5219,7 +5222,7 @@ static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 	if (new_layout >= 0)
 		mddev->new_layout = new_layout;
 	if (new_chunk > 0)
-		mddev->new_chunk = new_chunk;
+		mddev->new_chunk_sectors = new_chunk >> 9;
 
 	return 0;
 }
-- 
cgit v1.2.3


From 09c9e5fa1b93ad5b81c9dcf8ce3a5b9ae2ac31e4 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Thu, 18 Jun 2009 08:45:55 +1000
Subject: md: convert conf->chunk_size and conf->prev_chunk to sectors.

This kills some more shifts.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 27 +++++++++++++--------------
 drivers/md/raid5.h |  6 ++++--
 2 files changed, 17 insertions(+), 16 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index bc3564cfbba0..eaa2d3ee2b5d 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -1274,8 +1274,8 @@ static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
 	sector_t new_sector;
 	int algorithm = previous ? conf->prev_algo
 				 : conf->algorithm;
-	int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
-					 : (conf->chunk_size >> 9);
+	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
+					 : conf->chunk_sectors;
 	int raid_disks = previous ? conf->previous_raid_disks
 				  : conf->raid_disks;
 	int data_disks = raid_disks - conf->max_degraded;
@@ -1480,8 +1480,8 @@ static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
 	int raid_disks = sh->disks;
 	int data_disks = raid_disks - conf->max_degraded;
 	sector_t new_sector = sh->sector, check;
-	int sectors_per_chunk = previous ? (conf->prev_chunk >> 9)
-					 : (conf->chunk_size >> 9);
+	int sectors_per_chunk = previous ? conf->prev_chunk_sectors
+					 : conf->chunk_sectors;
 	int algorithm = previous ? conf->prev_algo
 				 : conf->algorithm;
 	sector_t stripe;
@@ -1997,8 +1997,7 @@ static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
 			    struct stripe_head *sh)
 {
 	int sectors_per_chunk =
-		previous ? (conf->prev_chunk >> 9)
-			 : (conf->chunk_size >> 9);
+		previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
 	int dd_idx;
 	int chunk_offset = sector_div(stripe, sectors_per_chunk);
 	int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
@@ -3917,7 +3916,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 				     1, &dd_idx, NULL);
 	last_sector =
 		raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
-					    *(new_data_disks) - 1),
+					    * new_data_disks - 1),
 				     1, &dd_idx, NULL);
 	if (last_sector >= mddev->dev_sectors)
 		last_sector = mddev->dev_sectors - 1;
@@ -4403,7 +4402,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 			conf->fullsync = 1;
 	}
 
-	conf->chunk_size = mddev->new_chunk_sectors << 9;
+	conf->chunk_sectors = mddev->new_chunk_sectors;
 	conf->level = mddev->new_level;
 	if (conf->level == 6)
 		conf->max_degraded = 2;
@@ -4413,7 +4412,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev)
 	conf->max_nr_stripes = NR_STRIPES;
 	conf->reshape_progress = mddev->reshape_position;
 	if (conf->reshape_progress != MaxSector) {
-		conf->prev_chunk = mddev->chunk_sectors << 9;
+		conf->prev_chunk_sectors = mddev->chunk_sectors;
 		conf->prev_algo = mddev->layout;
 	}
 
@@ -4931,8 +4930,8 @@ static int raid5_start_reshape(mddev_t *mddev)
 	spin_lock_irq(&conf->device_lock);
 	conf->previous_raid_disks = conf->raid_disks;
 	conf->raid_disks += mddev->delta_disks;
-	conf->prev_chunk = conf->chunk_size;
-	conf->chunk_size = mddev->new_chunk_sectors << 9;
+	conf->prev_chunk_sectors = conf->chunk_sectors;
+	conf->chunk_sectors = mddev->new_chunk_sectors;
 	conf->prev_algo = conf->algorithm;
 	conf->algorithm = mddev->new_layout;
 	if (mddev->delta_disks < 0)
@@ -5014,7 +5013,7 @@ static void end_reshape(raid5_conf_t *conf)
 		 */
 		{
 			int data_disks = conf->raid_disks - conf->max_degraded;
-			int stripe = data_disks * (conf->chunk_size
+			int stripe = data_disks * ((conf->chunk_sectors << 9)
 						   / PAGE_SIZE);
 			if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
 				conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
@@ -5059,7 +5058,7 @@ static void raid5_finish_reshape(mddev_t *mddev)
 				raid5_remove_disk(mddev, d);
 		}
 		mddev->layout = conf->algorithm;
-		mddev->chunk_sectors = conf->chunk_size >> 9;
+		mddev->chunk_sectors = conf->chunk_sectors;
 		mddev->reshape_position = MaxSector;
 		mddev->delta_disks = 0;
 	}
@@ -5187,7 +5186,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 			mddev->layout = mddev->new_layout = new_layout;
 		}
 		if (new_chunk > 0) {
-			conf->chunk_size = new_chunk;
+			conf->chunk_sectors = new_chunk >> 9;
 			mddev->new_chunk_sectors = new_chunk >> 9;
 			mddev->chunk_sectors = new_chunk >> 9;
 		}
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 1a25c9e252b4..9459689c4ea0 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -334,7 +334,8 @@ struct raid5_private_data {
 	struct hlist_head	*stripe_hashtbl;
 	mddev_t			*mddev;
 	struct disk_info	*spare;
-	int			chunk_size, level, algorithm;
+	int			chunk_sectors;
+	int			level, algorithm;
 	int			max_degraded;
 	int			raid_disks;
 	int			max_nr_stripes;
@@ -350,7 +351,8 @@ struct raid5_private_data {
 	 */
 	sector_t		reshape_safe;
 	int			previous_raid_disks;
-	int			prev_chunk, prev_algo;
+	int			prev_chunk_sectors;
+	int			prev_algo;
 	short			generation; /* increments with every reshape */
 	unsigned long		reshape_checkpoint; /* Time we last updated
 						     * metadata */
-- 
cgit v1.2.3


From 0ba459d26260d4d13346c76642f461b2bf607eef Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Thu, 18 Jun 2009 08:46:10 +1000
Subject: md/raid5: Use is_power_of_2() in raid5_reconfig()/raid6_reconfig().

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index eaa2d3ee2b5d..72e8a324dcfb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5167,8 +5167,7 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 	if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
 		return -EINVAL;
 	if (new_chunk > 0) {
-		if (new_chunk & (new_chunk-1))
-			/* not a power of 2 */
+		if (!is_power_of_2(new_chunk))
 			return -EINVAL;
 		if (new_chunk < PAGE_SIZE)
 			return -EINVAL;
@@ -5206,8 +5205,7 @@ static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 	if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
 		return -EINVAL;
 	if (new_chunk > 0) {
-		if (new_chunk & (new_chunk-1))
-			/* not a power of 2 */
+		if (!is_power_of_2(new_chunk))
 			return -EINVAL;
 		if (new_chunk < PAGE_SIZE)
 			return -EINVAL;
-- 
cgit v1.2.3


From cdc2ae6d6a30df8fd92c5e300d0e3005e13eb6b0 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Thu, 18 Jun 2009 08:46:47 +1000
Subject: md: fix some comments.

1/ Raid5 has learned to take over also raid4 and raid6 arrays.
2/ new_chunk in mdp_superblock_1 is in sectors, not bytes.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c        | 2 --
 include/linux/raid/md_p.h | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 72e8a324dcfb..009344156751 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5231,8 +5231,6 @@ static void *raid5_takeover(mddev_t *mddev)
 	 *  raid1 - if there are two drives.  We need to know the chunk size
 	 *  raid4 - trivial - just use a raid4 layout.
 	 *  raid6 - Providing it is a *_6 layout
-	 *
-	 * For now, just do raid1
 	 */
 
 	if (mddev->level == 1)
diff --git a/include/linux/raid/md_p.h b/include/linux/raid/md_p.h
index 6ba830fa8538..ffa2efbbe382 100644
--- a/include/linux/raid/md_p.h
+++ b/include/linux/raid/md_p.h
@@ -232,7 +232,7 @@ struct mdp_superblock_1 {
 	__le64	reshape_position;	/* next address in array-space for reshape */
 	__le32	delta_disks;	/* change in number of raid_disks		*/
 	__le32	new_layout;	/* new layout					*/
-	__le32	new_chunk;	/* new chunk size (bytes)			*/
+	__le32	new_chunk;	/* new chunk size (512byte sectors)		*/
 	__u8	pad1[128-124];	/* set to 0 when written */
 
 	/* constant this-device information - 64 bytes */
-- 
cgit v1.2.3


From d6e412eaa52db82010f12ea7d2c9b9468e933c44 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Jun 2009 08:47:00 +1000
Subject: md: raid0: chunk_sectors cleanups.

following the conversion to chunk_sectors, there is room
for cleaning up a little.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid0.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index f20b18ff7969..11e384253718 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -273,7 +273,7 @@ static int raid0_mergeable_bvec(struct request_queue *q,
 	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
-	if (is_power_of_2(mddev->chunk_sectors))
+	if (is_power_of_2(chunk_sectors))
 		max =  (chunk_sectors - ((sector & (chunk_sectors-1))
 						+ bio_sectors)) << 9;
 	else
@@ -384,7 +384,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
 	raid0_conf_t *conf = mddev->private;
 	unsigned int chunk_sects = mddev->chunk_sectors;
 
-	if (is_power_of_2(mddev->chunk_sectors)) {
+	if (is_power_of_2(chunk_sects)) {
 		int chunksect_bits = ffz(~chunk_sects);
 		/* find the sector offset inside the chunk */
 		sect_in_chunk  = sector & (chunk_sects - 1);
@@ -414,7 +414,7 @@ static mdk_rdev_t *map_sector(mddev_t *mddev, struct strip_zone *zone,
 static inline int is_io_in_chunk_boundary(mddev_t *mddev,
 			unsigned int chunk_sects, struct bio *bio)
 {
-	if (likely(is_power_of_2(mddev->chunk_sectors))) {
+	if (likely(is_power_of_2(chunk_sects))) {
 		return chunk_sects >= ((bio->bi_sector & (chunk_sects-1))
 					+ (bio->bi_size >> 9));
 	} else{
@@ -456,7 +456,7 @@ static int raid0_make_request(struct request_queue *q, struct bio *bio)
 		/* This is a one page bio that upper layers
 		 * refuse to split for us, so we need to split it.
 		 */
-		if (likely(is_power_of_2(mddev->chunk_sectors)))
+		if (likely(is_power_of_2(chunk_sects)))
 			bp = bio_split(bio, chunk_sects - (sector &
 							   (chunk_sects-1)));
 		else
-- 
cgit v1.2.3


From 01ee22b496c41384eaa6dcae983c86d8bc32fbb8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Jun 2009 08:47:20 +1000
Subject: md: raid5: check stripe cache is large enough in start_reshape

In reshape cases that do not change the number of devices,
start_reshape is called without first calling check_reshape.

Currently, the check that the stripe_cache is large enough is
only done in check_reshape.  It should be in start_reshape too.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 44 +++++++++++++++++++++++++++-----------------
 1 file changed, 27 insertions(+), 17 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 009344156751..b84766e347c3 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4845,6 +4845,29 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors)
 	return 0;
 }
 
+static int check_stripe_cache(mddev_t *mddev)
+{
+	/* Can only proceed if there are plenty of stripe_heads.
+	 * We need a minimum of one full stripe,, and for sensible progress
+	 * it is best to have about 4 times that.
+	 * If we require 4 times, then the default 256 4K stripe_heads will
+	 * allow for chunk sizes up to 256K, which is probably OK.
+	 * If the chunk size is greater, user-space should request more
+	 * stripe_heads first.
+	 */
+	raid5_conf_t *conf = mddev->private;
+	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
+	    > conf->max_nr_stripes ||
+	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
+	    > conf->max_nr_stripes) {
+		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
+		       ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
+			/ STRIPE_SIZE)*4);
+		return 0;
+	}
+	return 1;
+}
+
 static int raid5_check_reshape(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev->private;
@@ -4871,24 +4894,8 @@ static int raid5_check_reshape(mddev_t *mddev)
 			return -EINVAL;
 	}
 
-	/* Can only proceed if there are plenty of stripe_heads.
-	 * We need a minimum of one full stripe,, and for sensible progress
-	 * it is best to have about 4 times that.
-	 * If we require 4 times, then the default 256 4K stripe_heads will
-	 * allow for chunk sizes up to 256K, which is probably OK.
-	 * If the chunk size is greater, user-space should request more
-	 * stripe_heads first.
-	 */
-	if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
-		> conf->max_nr_stripes ||
-	    ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
-		> conf->max_nr_stripes) {
-		printk(KERN_WARNING "raid5: reshape: not enough stripes.  Needed %lu\n",
-		       (max(mddev->chunk_sectors << 9,
-			mddev->new_chunk_sectors << 9)
-			/ STRIPE_SIZE)*4);
+	if (!check_stripe_cache(mddev))
 		return -ENOSPC;
-	}
 
 	return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
 }
@@ -4904,6 +4911,9 @@ static int raid5_start_reshape(mddev_t *mddev)
 	if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
 		return -EBUSY;
 
+	if (!check_stripe_cache(mddev))
+		return -ENOSPC;
+
 	list_for_each_entry(rdev, &mddev->disks, same_set)
 		if (rdev->raid_disk < 0 &&
 		    !test_bit(Faulty, &rdev->flags))
-- 
cgit v1.2.3


From 597a711b69cfff95c4b8f6069037e7ad3fc71f56 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Jun 2009 08:47:42 +1000
Subject: md: remove unnecessary arguments from ->reconfig method.

Passing the new layout and chunksize as args is not necessary as
the mddev has fields for new_check and new_layout.

This is preparation for combining the check_reshape and reconfig
methods

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/faulty.c | 13 +++++++------
 drivers/md/md.c     | 23 +++++++++++++++++------
 drivers/md/md.h     |  2 +-
 drivers/md/raid5.c  | 42 ++++++++++++++++--------------------------
 4 files changed, 41 insertions(+), 39 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 8695809b24b0..6513b7b3e379 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -255,14 +255,14 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 }
 
 
-static int reconfig(mddev_t *mddev, int layout, int chunk_size)
+static int reconfig(mddev_t *mddev)
 {
-	int mode = layout & ModeMask;
-	int count = layout >> ModeShift;
+	int mode = mddev->new_layout & ModeMask;
+	int count = mddev->new_layout >> ModeShift;
 	conf_t *conf = mddev->private;
 
-	if (chunk_size != -1)
-		return -EINVAL;
+	if (mddev->new_layout < 0)
+		return 0;
 
 	/* new layout */
 	if (mode == ClearFaults)
@@ -279,6 +279,7 @@ static int reconfig(mddev_t *mddev, int layout, int chunk_size)
 		atomic_set(&conf->counters[mode], count);
 	} else
 		return -EINVAL;
+	mddev->new_layout = -1;
 	mddev->layout = -1; /* makes sure further changes come through */
 	return 0;
 }
@@ -315,7 +316,7 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
 	mddev->private = conf;
 
-	reconfig(mddev, mddev->layout, -1);
+	reconfig(mddev);
 
 	return 0;
 }
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f996d8342a85..5caa421c2367 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2809,9 +2809,12 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
 		int err;
 		if (mddev->pers->reconfig == NULL)
 			return -EBUSY;
-		err = mddev->pers->reconfig(mddev, n, -1);
-		if (err)
+		mddev->new_layout = n;
+		err = mddev->pers->reconfig(mddev);
+		if (err) {
+			mddev->new_layout = mddev->layout;
 			return err;
+		}
 	} else {
 		mddev->new_layout = n;
 		if (mddev->reshape_position == MaxSector)
@@ -2884,9 +2887,12 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 		int err;
 		if (mddev->pers->reconfig == NULL)
 			return -EBUSY;
-		err = mddev->pers->reconfig(mddev, -1, n);
-		if (err)
+		mddev->new_chunk_sectors = n >> 9;
+		err = mddev->pers->reconfig(mddev);
+		if (err) {
+			mddev->new_chunk_sectors = mddev->chunk_sectors;
 			return err;
+		}
 	} else {
 		mddev->new_chunk_sectors = n >> 9;
 		if (mddev->reshape_position == MaxSector)
@@ -5220,8 +5226,13 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		 */
 		if (mddev->pers->reconfig == NULL)
 			return -EINVAL;
-		else
-			return mddev->pers->reconfig(mddev, info->layout, -1);
+		else {
+			mddev->new_layout = info->layout;
+			rv = mddev->pers->reconfig(mddev);
+			if (rv)
+				mddev->new_layout = mddev->layout;
+			return rv;
+		}
 	}
 	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
 		rv = update_size(mddev, (sector_t)info->size * 2);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e0a2b8e3985d..815013f8da6c 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -326,7 +326,7 @@ struct mdk_personality
 	int (*check_reshape) (mddev_t *mddev);
 	int (*start_reshape) (mddev_t *mddev);
 	void (*finish_reshape) (mddev_t *mddev);
-	int (*reconfig) (mddev_t *mddev, int layout, int chunk_size);
+	int (*reconfig) (mddev_t *mddev);
 	/* quiesce moves between quiescence states
 	 * 0 - fully active
 	 * 1 - no new requests allowed
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index b84766e347c3..136051bc6725 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5165,7 +5165,7 @@ static void *raid5_takeover_raid6(mddev_t *mddev)
 }
 
 
-static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+static int raid5_reconfig(mddev_t *mddev)
 {
 	/* For a 2-drive array, the layout and chunk size can be changed
 	 * immediately as not restriping is needed.
@@ -5173,15 +5173,16 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 	 * to be used by a reshape pass.
 	 */
 	raid5_conf_t *conf = mddev->private;
+	int new_chunk = mddev->new_chunk_sectors;
 
-	if (new_layout >= 0 && !algorithm_valid_raid5(new_layout))
+	if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
 		return -EINVAL;
 	if (new_chunk > 0) {
 		if (!is_power_of_2(new_chunk))
 			return -EINVAL;
-		if (new_chunk < PAGE_SIZE)
+		if (new_chunk < (PAGE_SIZE>>9))
 			return -EINVAL;
-		if (mddev->array_sectors & ((new_chunk>>9)-1))
+		if (mddev->array_sectors & (new_chunk-1))
 			/* not factor of array size */
 			return -EINVAL;
 	}
@@ -5189,48 +5190,37 @@ static int raid5_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
 	/* They look valid */
 
 	if (mddev->raid_disks == 2) {
-
-		if (new_layout >= 0) {
-			conf->algorithm = new_layout;
-			mddev->layout = mddev->new_layout = new_layout;
+		/* can make the change immediately */
+		if (mddev->new_layout >= 0) {
+			conf->algorithm = mddev->new_layout;
+			mddev->layout = mddev->new_layout;
 		}
 		if (new_chunk > 0) {
-			conf->chunk_sectors = new_chunk >> 9;
-			mddev->new_chunk_sectors = new_chunk >> 9;
-			mddev->chunk_sectors = new_chunk >> 9;
+			conf->chunk_sectors = new_chunk ;
+			mddev->chunk_sectors = new_chunk;
 		}
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
-	} else {
-		if (new_layout >= 0)
-			mddev->new_layout = new_layout;
-		if (new_chunk > 0)
-			mddev->new_chunk_sectors = new_chunk >> 9;
 	}
 	return 0;
 }
 
-static int raid6_reconfig(mddev_t *mddev, int new_layout, int new_chunk)
+static int raid6_reconfig(mddev_t *mddev)
 {
-	if (new_layout >= 0 && !algorithm_valid_raid6(new_layout))
+	int new_chunk = mddev->new_chunk_sectors;
+	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
 		return -EINVAL;
 	if (new_chunk > 0) {
 		if (!is_power_of_2(new_chunk))
 			return -EINVAL;
-		if (new_chunk < PAGE_SIZE)
+		if (new_chunk < (PAGE_SIZE >> 9))
 			return -EINVAL;
-		if (mddev->array_sectors & ((new_chunk>>9)-1))
+		if (mddev->array_sectors & (new_chunk-1))
 			/* not factor of array size */
 			return -EINVAL;
 	}
 
 	/* They look valid */
-
-	if (new_layout >= 0)
-		mddev->new_layout = new_layout;
-	if (new_chunk > 0)
-		mddev->new_chunk_sectors = new_chunk >> 9;
-
 	return 0;
 }
 
-- 
cgit v1.2.3


From 50ac168a6e0a061bf5346d53aa9e7beb94c97527 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Jun 2009 08:47:55 +1000
Subject: md: merge reconfig and check_reshape methods.

The difference between these two methods is artificial.
Both check that a pending reshape is valid, and perform any
aspect of it that can be done immediately.
'reconfig' handles chunk size and layout.
'check_reshape' handles raid_disks.

So make them just one method.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/faulty.c |  6 +++---
 drivers/md/md.c     | 15 ++++++++-------
 drivers/md/md.h     |  1 -
 drivers/md/raid5.c  | 17 ++++++++---------
 4 files changed, 19 insertions(+), 20 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 6513b7b3e379..6e83b38d931d 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -255,7 +255,7 @@ static void status(struct seq_file *seq, mddev_t *mddev)
 }
 
 
-static int reconfig(mddev_t *mddev)
+static int reshape(mddev_t *mddev)
 {
 	int mode = mddev->new_layout & ModeMask;
 	int count = mddev->new_layout >> ModeShift;
@@ -316,7 +316,7 @@ static int run(mddev_t *mddev)
 	md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
 	mddev->private = conf;
 
-	reconfig(mddev);
+	reshape(mddev);
 
 	return 0;
 }
@@ -339,7 +339,7 @@ static struct mdk_personality faulty_personality =
 	.run		= run,
 	.stop		= stop,
 	.status		= status,
-	.reconfig	= reconfig,
+	.check_reshape	= reshape,
 	.size		= faulty_size,
 };
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5caa421c2367..80f039ec3ac2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2807,10 +2807,10 @@ layout_store(mddev_t *mddev, const char *buf, size_t len)
 
 	if (mddev->pers) {
 		int err;
-		if (mddev->pers->reconfig == NULL)
+		if (mddev->pers->check_reshape == NULL)
 			return -EBUSY;
 		mddev->new_layout = n;
-		err = mddev->pers->reconfig(mddev);
+		err = mddev->pers->check_reshape(mddev);
 		if (err) {
 			mddev->new_layout = mddev->layout;
 			return err;
@@ -2885,10 +2885,10 @@ chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
 
 	if (mddev->pers) {
 		int err;
-		if (mddev->pers->reconfig == NULL)
+		if (mddev->pers->check_reshape == NULL)
 			return -EBUSY;
 		mddev->new_chunk_sectors = n >> 9;
-		err = mddev->pers->reconfig(mddev);
+		err = mddev->pers->check_reshape(mddev);
 		if (err) {
 			mddev->new_chunk_sectors = mddev->chunk_sectors;
 			return err;
@@ -5224,11 +5224,11 @@ static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
 		 * we don't need to do anything at the md level, the
 		 * personality will take care of it all.
 		 */
-		if (mddev->pers->reconfig == NULL)
+		if (mddev->pers->check_reshape == NULL)
 			return -EINVAL;
 		else {
 			mddev->new_layout = info->layout;
-			rv = mddev->pers->reconfig(mddev);
+			rv = mddev->pers->check_reshape(mddev);
 			if (rv)
 				mddev->new_layout = mddev->layout;
 			return rv;
@@ -6731,7 +6731,8 @@ void md_check_recovery(mddev_t *mddev)
 		 */
 
 		if (mddev->reshape_position != MaxSector) {
-			if (mddev->pers->check_reshape(mddev) != 0)
+			if (mddev->pers->check_reshape == NULL ||
+			    mddev->pers->check_reshape(mddev) != 0)
 				/* Cannot proceed */
 				goto unlock;
 			set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 815013f8da6c..bac7c2bf8616 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -326,7 +326,6 @@ struct mdk_personality
 	int (*check_reshape) (mddev_t *mddev);
 	int (*start_reshape) (mddev_t *mddev);
 	void (*finish_reshape) (mddev_t *mddev);
-	int (*reconfig) (mddev_t *mddev);
 	/* quiesce moves between quiescence states
 	 * 0 - fully active
 	 * 1 - no new requests allowed
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 136051bc6725..5ea2bdece278 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4868,14 +4868,14 @@ static int check_stripe_cache(mddev_t *mddev)
 	return 1;
 }
 
-static int raid5_check_reshape(mddev_t *mddev)
+static int check_reshape(mddev_t *mddev)
 {
 	raid5_conf_t *conf = mddev->private;
 
 	if (mddev->delta_disks == 0 &&
 	    mddev->new_layout == mddev->layout &&
 	    mddev->new_chunk_sectors == mddev->chunk_sectors)
-		return -EINVAL; /* nothing to do */
+		return 0; /* nothing to do */
 	if (mddev->bitmap)
 		/* Cannot grow a bitmap yet */
 		return -EBUSY;
@@ -5165,7 +5165,7 @@ static void *raid5_takeover_raid6(mddev_t *mddev)
 }
 
 
-static int raid5_reconfig(mddev_t *mddev)
+static int raid5_check_reshape(mddev_t *mddev)
 {
 	/* For a 2-drive array, the layout and chunk size can be changed
 	 * immediately as not restriping is needed.
@@ -5202,12 +5202,13 @@ static int raid5_reconfig(mddev_t *mddev)
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
 	}
-	return 0;
+	return check_reshape(mddev);
 }
 
-static int raid6_reconfig(mddev_t *mddev)
+static int raid6_check_reshape(mddev_t *mddev)
 {
 	int new_chunk = mddev->new_chunk_sectors;
+
 	if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
 		return -EINVAL;
 	if (new_chunk > 0) {
@@ -5221,7 +5222,7 @@ static int raid6_reconfig(mddev_t *mddev)
 	}
 
 	/* They look valid */
-	return 0;
+	return check_reshape(mddev);
 }
 
 static void *raid5_takeover(mddev_t *mddev)
@@ -5312,12 +5313,11 @@ static struct mdk_personality raid6_personality =
 	.sync_request	= sync_request,
 	.resize		= raid5_resize,
 	.size		= raid5_size,
-	.check_reshape	= raid5_check_reshape,
+	.check_reshape	= raid6_check_reshape,
 	.start_reshape  = raid5_start_reshape,
 	.finish_reshape = raid5_finish_reshape,
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid6_takeover,
-	.reconfig	= raid6_reconfig,
 };
 static struct mdk_personality raid5_personality =
 {
@@ -5340,7 +5340,6 @@ static struct mdk_personality raid5_personality =
 	.finish_reshape = raid5_finish_reshape,
 	.quiesce	= raid5_quiesce,
 	.takeover	= raid5_takeover,
-	.reconfig	= raid5_reconfig,
 };
 
 static struct mdk_personality raid4_personality =
-- 
cgit v1.2.3


From 8c6ac868b107ed50a46204f6d14e2ad9443ff146 Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Thu, 18 Jun 2009 08:48:06 +1000
Subject: md: Push down reconstruction log message to personality code.

Currently, the md layer checks in analyze_sbs() if the raid level
supports reconstruction (mddev->level >= 1) and if reconstruction is
in progress (mddev->recovery_cp != MaxSector).

Move that printk into the personality code of those raid levels that
care (levels 1, 4, 5, 6, 10).

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c     | 9 ---------
 drivers/md/raid1.c  | 4 ++++
 drivers/md/raid10.c | 4 ++++
 drivers/md/raid5.c  | 4 ++++
 4 files changed, 12 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 80f039ec3ac2..90147370bfd7 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -2604,15 +2604,6 @@ static void analyze_sbs(mddev_t * mddev)
 			clear_bit(In_sync, &rdev->flags);
 		}
 	}
-
-
-
-	if (mddev->recovery_cp != MaxSector &&
-	    mddev->level >= 1)
-		printk(KERN_ERR "md: %s: raid array is not clean"
-		       " -- starting background reconstruction\n",
-		       mdname(mddev));
-
 }
 
 static void md_safemode_timeout(unsigned long data);
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 12f8f34f17ae..89939a7aef57 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2052,6 +2052,10 @@ static int run(mddev_t *mddev)
 		goto out_free_conf;
 	}
 
+	if (mddev->recovery_cp != MaxSector)
+		printk(KERN_NOTICE "raid1: %s is not clean"
+		       " -- starting background reconstruction\n",
+		       mdname(mddev));
 	printk(KERN_INFO 
 		"raid1: raid set %s active with %d out of %d mirrors\n",
 		mdname(mddev), mddev->raid_disks - mddev->degraded, 
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 30029a312cf5..ae12ceafe10c 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -2186,6 +2186,10 @@ static int run(mddev_t *mddev)
 		goto out_free_conf;
 	}
 
+	if (mddev->recovery_cp != MaxSector)
+		printk(KERN_NOTICE "raid10: %s is not clean"
+		       " -- starting background reconstruction\n",
+		       mdname(mddev));
 	printk(KERN_INFO
 		"raid10: raid set %s active with %d out of %d devices\n",
 		mdname(mddev), mddev->raid_disks - mddev->degraded,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 5ea2bdece278..8f2e3740a05a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4454,6 +4454,10 @@ static int run(mddev_t *mddev)
 	int working_disks = 0;
 	mdk_rdev_t *rdev;
 
+	if (mddev->recovery_cp != MaxSector)
+		printk(KERN_NOTICE "raid5: %s is not clean"
+		       " -- starting background reconstruction\n",
+		       mdname(mddev));
 	if (mddev->reshape_position != MaxSector) {
 		/* Check that we can continue the reshape.
 		 * Currently only disks can change, it must
-- 
cgit v1.2.3


From 1b57f132231593923cb4ab99943ddd777e8745bc Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Jun 2009 08:48:19 +1000
Subject: md: move assignment of ->utime so that it never gets skipped.

Currently the assignment to utime gets skipped for 'external'
metadata.  So move it to the top of the function so that it
always gets effected.
This is of largely cosmetic interest.  Nothing actually depends
on ->utime being right for external arrays.
"mdadm --monitor" does use it for 0.90 and 1.x arrays, but with
mdadm-3.0, this is not important for external metadata.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 90147370bfd7..d9cce54cc2d5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1904,6 +1904,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
 	int sync_req;
 	int nospares = 0;
 
+	mddev->utime = get_seconds();
 	if (mddev->external)
 		return;
 repeat:
@@ -1933,7 +1934,6 @@ repeat:
 		nospares = 0;
 
 	sync_req = mddev->in_sync;
-	mddev->utime = get_seconds();
 
 	/* If this is just a dirty<->clean transition, and the array is clean
 	 * and 'events' is odd, we can roll back to the previous clean state */
-- 
cgit v1.2.3


From 13f2682b7216ebebd72b3d5868fe7fccec91a92d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Jun 2009 08:48:55 +1000
Subject: md: raid0/linear: ensure device sizes are rounded to chunk size.

This is currently ensured by common code, but it is more reliable to
ensure it where it is needed in personality code.
All the other personalities that care already round the size to
the chunk_size.  raid0 and linear are the only hold-outs.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c | 6 ++++++
 drivers/md/raid0.c  | 6 ++++++
 2 files changed, 12 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 9f7cec42dd8e..dda2f1b64a6d 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -135,6 +135,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		int j = rdev->raid_disk;
 		dev_info_t *disk = conf->disks + j;
+		sector_t sectors;
 
 		if (j < 0 || j >= raid_disks || disk->rdev) {
 			printk("linear: disk numbering problem. Aborting!\n");
@@ -142,6 +143,11 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
 		}
 
 		disk->rdev = rdev;
+		if (mddev->chunk_sectors) {
+			sectors = rdev->sectors;
+			sector_div(sectors, mddev->chunk_sectors);
+			rdev->sectors = sectors * mddev->chunk_sectors;
+		}
 
 		blk_queue_stack_limits(mddev->queue,
 				       rdev->bdev->bd_disk->queue);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 11e384253718..717e64a4af9a 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -100,6 +100,12 @@ static int create_strip_zones(mddev_t *mddev)
 		printk(KERN_INFO "raid0: looking at %s\n",
 			bdevname(rdev1->bdev,b));
 		c = 0;
+
+		/* round size to chunk_size */
+		sectors = rdev1->sectors;
+		sector_div(sectors, mddev->chunk_sectors);
+		rdev1->sectors = sectors * mddev->chunk_sectors;
+
 		list_for_each_entry(rdev2, &mddev->disks, same_set) {
 			printk(KERN_INFO "raid0:   comparing %s(%llu)",
 			       bdevname(rdev1->bdev,b),
-- 
cgit v1.2.3


From 8190e754e0723de7cecb80bdd9eb93911dfa04a1 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Jun 2009 08:48:58 +1000
Subject: md: remove chunksize rounding from common code.

It is easiest to round sizes to multiples of chunk size in
the personality code for those personalities which care.
Those personalities now do the rounding, so we can
remove that function from common code.

Also remove the upper bound on the size of a chunk, and the lower
bound on the size of a device (1 chunk), neither of which really buy
us anything.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/md.c | 52 +++-------------------------------------------------
 drivers/md/md.h |  7 -------
 2 files changed, 3 insertions(+), 56 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/md.c b/drivers/md/md.c
index d9cce54cc2d5..0f11fd1417ab 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -440,18 +440,6 @@ static inline sector_t calc_dev_sboffset(struct block_device *bdev)
 	return MD_NEW_SIZE_SECTORS(num_sectors);
 }
 
-static sector_t calc_num_sectors(mdk_rdev_t *rdev, unsigned chunk_size)
-{
-	sector_t num_sectors = rdev->sb_start;
-
-	if (chunk_size) {
-		unsigned chunk_sects = chunk_size>>9;
-		sector_div(num_sectors, chunk_sects);
-		num_sectors *= chunk_sects;
-	}
-	return num_sectors;
-}
-
 static int alloc_disk_sb(mdk_rdev_t * rdev)
 {
 	if (rdev->sb_page)
@@ -839,7 +827,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 		else 
 			ret = 0;
 	}
-	rdev->sectors = calc_num_sectors(rdev, sb->chunk_size);
+	rdev->sectors = rdev->sb_start;
 
 	if (rdev->sectors < sb->size * 2 && sb->level > 1)
 		/* "this cannot possibly happen" ... */
@@ -1251,13 +1239,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 	if (rdev->sectors < le64_to_cpu(sb->data_size))
 		return -EINVAL;
 	rdev->sectors = le64_to_cpu(sb->data_size);
-	if (le32_to_cpu(sb->chunksize)) {
-		int chunk_sects = le32_to_cpu(sb->chunksize);
-		sector_t chunks = rdev->sectors;
-		sector_div(chunks, chunk_sects);
-		rdev->sectors = chunks * chunk_sects;
-	}
-
 	if (le64_to_cpu(sb->size) > rdev->sectors)
 		return -EINVAL;
 	return ret;
@@ -3983,11 +3964,9 @@ static int start_dirty_degraded;
 static int do_md_run(mddev_t * mddev)
 {
 	int err;
-	int chunk_size;
 	mdk_rdev_t *rdev;
 	struct gendisk *disk;
 	struct mdk_personality *pers;
-	char b[BDEVNAME_SIZE];
 
 	if (list_empty(&mddev->disks))
 		/* cannot run an array with no devices.. */
@@ -4005,30 +3984,6 @@ static int do_md_run(mddev_t * mddev)
 		analyze_sbs(mddev);
 	}
 
-	chunk_size = mddev->chunk_sectors << 9;
-
-	if (chunk_size) {
-		if (chunk_size > MAX_CHUNK_SIZE) {
-			printk(KERN_ERR "too big chunk_size: %d > %d\n",
-				chunk_size, MAX_CHUNK_SIZE);
-			return -EINVAL;
-		}
-		/* devices must have minimum size of one chunk */
-		list_for_each_entry(rdev, &mddev->disks, same_set) {
-			if (test_bit(Faulty, &rdev->flags))
-				continue;
-			if (rdev->sectors < chunk_size / 512) {
-				printk(KERN_WARNING
-					"md: Dev %s smaller than chunk_size:"
-					" %llu < %d\n",
-					bdevname(rdev->bdev,b),
-					(unsigned long long)rdev->sectors,
-					chunk_size / 512);
-				return -EINVAL;
-			}
-		}
-	}
-
 	if (mddev->level != LEVEL_NONE)
 		request_module("md-level-%d", mddev->level);
 	else if (mddev->clevel[0])
@@ -4842,8 +4797,7 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 			rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 		} else 
 			rdev->sb_start = calc_dev_sboffset(rdev->bdev);
-		rdev->sectors = calc_num_sectors(rdev,
-						 mddev->chunk_sectors << 9);
+		rdev->sectors = rdev->sb_start;
 
 		err = bind_rdev_to_array(rdev, mddev);
 		if (err) {
@@ -4913,7 +4867,7 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
 	else
 		rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
 
-	rdev->sectors = calc_num_sectors(rdev, mddev->chunk_sectors << 9);
+	rdev->sectors = rdev->sb_start;
 
 	if (test_bit(Faulty, &rdev->flags)) {
 		printk(KERN_WARNING 
diff --git a/drivers/md/md.h b/drivers/md/md.h
index bac7c2bf8616..ea2c441449d4 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -29,13 +29,6 @@
 typedef struct mddev_s mddev_t;
 typedef struct mdk_rdev_s mdk_rdev_t;
 
-/*
- * options passed in raidrun:
- */
-
-/* Currently this must fit in an 'int' */
-#define MAX_CHUNK_SIZE (1<<30)
-
 /*
  * MD's 'extended' device
  */
-- 
cgit v1.2.3


From 0894cc3066aaa3e75a99383c0d25feebf9b688ac Mon Sep 17 00:00:00 2001
From: Andre Noll <maan@systemlinux.org>
Date: Thu, 18 Jun 2009 08:49:23 +1000
Subject: md: Move check for bitmap presence to personality code.

If the superblock of a component device indicates the presence of a
bitmap but the corresponding raid personality does not support bitmaps
(raid0, linear, multipath, faulty), then something is seriously wrong
and we'd better refuse to run such an array.

Currently, this check is performed while the superblocks are examined,
i.e. before entering personality code. Therefore the generic md layer
must know which raid levels support bitmaps and which do not.

This patch avoids this layer violation without adding identical code
to various personalities. This is accomplished by introducing a new
public function to md.c, md_check_no_bitmap(), which replaces the
hard-coded checks in the superblock loading functions.

A call to md_check_no_bitmap() is added to the ->run method of each
personality which does not support bitmaps and assembly is aborted
if at least one component device contains a bitmap.

Signed-off-by: Andre Noll <maan@systemlinux.org>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/faulty.c    |  6 +++++-
 drivers/md/linear.c    |  2 ++
 drivers/md/md.c        | 40 ++++++++++++++++++----------------------
 drivers/md/md.h        |  1 +
 drivers/md/multipath.c |  3 +++
 drivers/md/raid0.c     |  2 ++
 6 files changed, 31 insertions(+), 23 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index 6e83b38d931d..87d88dbb667f 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -299,8 +299,12 @@ static int run(mddev_t *mddev)
 {
 	mdk_rdev_t *rdev;
 	int i;
+	conf_t *conf;
 
-	conf_t *conf = kmalloc(sizeof(*conf), GFP_KERNEL);
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
+
+	conf = kmalloc(sizeof(*conf), GFP_KERNEL);
 	if (!conf)
 		return -ENOMEM;
 
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index dda2f1b64a6d..564c390f8a1b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -189,6 +189,8 @@ static int linear_run (mddev_t *mddev)
 {
 	linear_conf_t *conf;
 
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
 	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 	conf = linear_conf(mddev, mddev->raid_disks);
 
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 0f11fd1417ab..09be637d52cb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -735,6 +735,24 @@ struct super_type  {
 						sector_t num_sectors);
 };
 
+/*
+ * Check that the given mddev has no bitmap.
+ *
+ * This function is called from the run method of all personalities that do not
+ * support bitmaps. It prints an error message and returns non-zero if mddev
+ * has a bitmap. Otherwise, it returns 0.
+ *
+ */
+int md_check_no_bitmap(mddev_t *mddev)
+{
+	if (!mddev->bitmap_file && !mddev->bitmap_offset)
+		return 0;
+	printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
+		mdname(mddev), mddev->pers->name);
+	return 1;
+}
+EXPORT_SYMBOL(md_check_no_bitmap);
+
 /*
  * load_super for 0.90.0 
  */
@@ -788,17 +806,6 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
 	rdev->data_offset = 0;
 	rdev->sb_size = MD_SB_BYTES;
 
-	if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
-		if (sb->level != 1 && sb->level != 4
-		    && sb->level != 5 && sb->level != 6
-		    && sb->level != 10) {
-			/* FIXME use a better test */
-			printk(KERN_WARNING
-			       "md: bitmaps not supported for this level.\n");
-			goto abort;
-		}
-	}
-
 	if (sb->level == LEVEL_MULTIPATH)
 		rdev->desc_nr = -1;
 	else
@@ -1176,17 +1183,6 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
 		       bdevname(rdev->bdev,b));
 		return -EINVAL;
 	}
-	if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) {
-		if (sb->level != cpu_to_le32(1) &&
-		    sb->level != cpu_to_le32(4) &&
-		    sb->level != cpu_to_le32(5) &&
-		    sb->level != cpu_to_le32(6) &&
-		    sb->level != cpu_to_le32(10)) {
-			printk(KERN_WARNING
-			       "md: bitmaps not supported for this level.\n");
-			return -EINVAL;
-		}
-	}
 
 	rdev->preferred_minor = 0xffff;
 	rdev->data_offset = le64_to_cpu(sb->data_offset);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index ea2c441449d4..9430a110db93 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -430,5 +430,6 @@ extern void md_new_event(mddev_t *mddev);
 extern int md_allow_write(mddev_t *mddev);
 extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
 extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
+extern int md_check_no_bitmap(mddev_t *mddev);
 
 #endif /* _MD_MD_H */
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index c1ca63f278a9..cbe368fa6598 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -421,6 +421,9 @@ static int multipath_run (mddev_t *mddev)
 	struct multipath_info *disk;
 	mdk_rdev_t *rdev;
 
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
+
 	if (mddev->level != LEVEL_MULTIPATH) {
 		printk("multipath: %s: raid level not set to multipath IO (%d)\n",
 		       mdname(mddev), mddev->level);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 717e64a4af9a..ab4a489d8695 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -314,6 +314,8 @@ static int raid0_run(mddev_t *mddev)
 		printk(KERN_ERR "md/raid0: chunk size must be set.\n");
 		return -EINVAL;
 	}
+	if (md_check_no_bitmap(mddev))
+		return -EINVAL;
 	blk_queue_max_sectors(mddev->queue, mddev->chunk_sectors);
 	mddev->queue->queue_lock = &mddev->queue->__queue_lock;
 
-- 
cgit v1.2.3


From af11c397fd8835c70ec0bb777104e4ab98b2d660 Mon Sep 17 00:00:00 2001
From: SandeepKsinha <sandeepksinha@gmail.com>
Date: Thu, 18 Jun 2009 08:49:35 +1000
Subject: md linear: Protecting mddev with rcu locks to avoid races

Due to the lack of memory ordering guarantees, we may have races around
mddev->conf.

In particular, the correct contents of the structure we get from
dereferencing ->private might not be visible to this CPU yet, and
they might not be correct w.r.t mddev->raid_disks.

This patch addresses the problem using rcu protection to avoid
such race conditions.

Signed-off-by: SandeepKsinha <sandeepksinha@gmail.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c | 47 ++++++++++++++++++++++++++++++++++++++---------
 1 file changed, 38 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 564c390f8a1b..93f2b1d18398 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -28,10 +28,11 @@
 static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector)
 {
 	int lo, mid, hi;
-	linear_conf_t *conf = mddev->private;
+	linear_conf_t *conf;
 
 	lo = 0;
 	hi = mddev->raid_disks - 1;
+	conf = rcu_dereference(mddev->private);
 
 	/*
 	 * Binary Search
@@ -66,8 +67,10 @@ static int linear_mergeable_bvec(struct request_queue *q,
 	unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 
+	rcu_read_lock();
 	dev0 = which_dev(mddev, sector);
 	maxsectors = dev0->end_sector - sector;
+	rcu_read_unlock();
 
 	if (maxsectors < bio_sectors)
 		maxsectors = 0;
@@ -86,36 +89,50 @@ static int linear_mergeable_bvec(struct request_queue *q,
 static void linear_unplug(struct request_queue *q)
 {
 	mddev_t *mddev = q->queuedata;
-	linear_conf_t *conf = mddev->private;
+	linear_conf_t *conf;
 	int i;
 
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
+
 	for (i=0; i < mddev->raid_disks; i++) {
 		struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev);
 		blk_unplug(r_queue);
 	}
+	rcu_read_unlock();
 }
 
 static int linear_congested(void *data, int bits)
 {
 	mddev_t *mddev = data;
-	linear_conf_t *conf = mddev->private;
+	linear_conf_t *conf;
 	int i, ret = 0;
 
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
+
 	for (i = 0; i < mddev->raid_disks && !ret ; i++) {
 		struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev);
 		ret |= bdi_congested(&q->backing_dev_info, bits);
 	}
+
+	rcu_read_unlock();
 	return ret;
 }
 
 static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks)
 {
-	linear_conf_t *conf = mddev->private;
+	linear_conf_t *conf;
+	sector_t array_sectors;
 
+	rcu_read_lock();
+	conf = rcu_dereference(mddev->private);
 	WARN_ONCE(sectors || raid_disks,
 		  "%s does not support generic reshape\n", __func__);
+	array_sectors = conf->array_sectors;
+	rcu_read_unlock();
 
-	return conf->array_sectors;
+	return array_sectors;
 }
 
 static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks)
@@ -229,8 +246,8 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 		return -ENOMEM;
 
 	newconf->prev = mddev->private;
-	mddev->private = newconf;
 	mddev->raid_disks++;
+	rcu_assign_pointer(mddev->private, newconf);
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	return 0;
@@ -239,7 +256,13 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 static int linear_stop (mddev_t *mddev)
 {
 	linear_conf_t *conf = mddev->private;
-  
+
+	/*
+	 * We do not require rcu protection here since
+	 * we hold reconfig_mutex for both linear_add and
+	 * linear_stop, so they cannot race.
+	 */
+
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	do {
 		linear_conf_t *t = conf->prev;
@@ -269,9 +292,11 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 		      bio_sectors(bio));
 	part_stat_unlock();
 
+	rcu_read_lock();
 	tmp_dev = which_dev(mddev, bio->bi_sector);
 	start_sector = tmp_dev->end_sector - tmp_dev->rdev->sectors;
 
+
 	if (unlikely(bio->bi_sector >= (tmp_dev->end_sector)
 		     || (bio->bi_sector < start_sector))) {
 		char b[BDEVNAME_SIZE];
@@ -282,6 +307,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 			bdevname(tmp_dev->rdev->bdev, b),
 			(unsigned long long)tmp_dev->rdev->sectors,
 			(unsigned long long)start_sector);
+		rcu_read_unlock();
 		bio_io_error(bio);
 		return 0;
 	}
@@ -291,9 +317,11 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 		 * split it.
 		 */
 		struct bio_pair *bp;
+		sector_t end_sector = tmp_dev->end_sector;
+
+		rcu_read_unlock();
 
-		bp = bio_split(bio,
-			       tmp_dev->end_sector - bio->bi_sector);
+		bp = bio_split(bio, end_sector - bio->bi_sector);
 
 		if (linear_make_request(q, &bp->bio1))
 			generic_make_request(&bp->bio1);
@@ -306,6 +334,7 @@ static int linear_make_request (struct request_queue *q, struct bio *bio)
 	bio->bi_bdev = tmp_dev->rdev->bdev;
 	bio->bi_sector = bio->bi_sector - start_sector
 		+ tmp_dev->rdev->data_offset;
+	rcu_read_unlock();
 
 	return 1;
 }
-- 
cgit v1.2.3


From 495d357301e1de01fabe30ce9a555301fb4675c3 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Jun 2009 08:49:42 +1000
Subject: md/linear: use call_rcu to free obsolete 'conf' structures.

Current, when we update the 'conf' structure, when adding a
drive to a linear array, we keep the old version around until
the array is finally stopped, as it is not safe to free it
immediately.

Now that we have rcu protection on all accesses to 'conf',
we can use call_rcu to free it more promptly.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/linear.c | 21 +++++++++++++--------
 drivers/md/linear.h |  2 +-
 2 files changed, 14 insertions(+), 9 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 93f2b1d18398..15c8b7b25a9b 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -223,6 +223,12 @@ static int linear_run (mddev_t *mddev)
 	return 0;
 }
 
+static void free_conf(struct rcu_head *head)
+{
+	linear_conf_t *conf = container_of(head, linear_conf_t, rcu);
+	kfree(conf);
+}
+
 static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	/* Adding a drive to a linear array allows the array to grow.
@@ -233,7 +239,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	 * The current one is never freed until the array is stopped.
 	 * This avoids races.
 	 */
-	linear_conf_t *newconf;
+	linear_conf_t *newconf, *oldconf;
 
 	if (rdev->saved_raid_disk != mddev->raid_disks)
 		return -EINVAL;
@@ -245,11 +251,12 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev)
 	if (!newconf)
 		return -ENOMEM;
 
-	newconf->prev = mddev->private;
+	oldconf = rcu_dereference(mddev->private);
 	mddev->raid_disks++;
 	rcu_assign_pointer(mddev->private, newconf);
 	md_set_array_sectors(mddev, linear_size(mddev, 0, 0));
 	set_capacity(mddev->gendisk, mddev->array_sectors);
+	call_rcu(&oldconf->rcu, free_conf);
 	return 0;
 }
 
@@ -261,14 +268,12 @@ static int linear_stop (mddev_t *mddev)
 	 * We do not require rcu protection here since
 	 * we hold reconfig_mutex for both linear_add and
 	 * linear_stop, so they cannot race.
+	 * We should make sure any old 'conf's are properly
+	 * freed though.
 	 */
-
+	rcu_barrier();
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
-	do {
-		linear_conf_t *t = conf->prev;
-		kfree(conf);
-		conf = t;
-	} while (conf);
+	kfree(conf);
 
 	return 0;
 }
diff --git a/drivers/md/linear.h b/drivers/md/linear.h
index 599e5c1bbb01..0ce29b61605a 100644
--- a/drivers/md/linear.h
+++ b/drivers/md/linear.h
@@ -10,9 +10,9 @@ typedef struct dev_info dev_info_t;
 
 struct linear_private_data
 {
-	struct linear_private_data *prev;	/* earlier version */
 	sector_t		array_sectors;
 	dev_info_t		disks[0];
+	struct rcu_head		rcu;
 };
 
 
-- 
cgit v1.2.3


From 7a3ab908948b6296ee7e81d42f7c176361c51975 Mon Sep 17 00:00:00 2001
From: Dan Williams <dan.j.williams@intel.com>
Date: Tue, 16 Jun 2009 16:00:33 -0700
Subject: md/raid5: add missing call to schedule() after prepare_to_wait()

In the unlikely event that reshape progresses past the current request
while it is waiting for a stripe we need to schedule() before retrying
for 2 reasons:
1/ Prevent list corruption from duplicated list_add() calls without
   intervening list_del().
2/ Give the reshape code a chance to make some progress to resolve the
   conflict.

Cc: <stable@kernel.org>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 8f2e3740a05a..59e29c2983cb 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3695,6 +3695,7 @@ static int make_request(struct request_queue *q, struct bio * bi)
 				spin_unlock_irq(&conf->device_lock);
 				if (must_retry) {
 					release_stripe(sh);
+					schedule();
 					goto retry;
 				}
 			}
-- 
cgit v1.2.3


From 48606a9f2fc034f0b308d088c1f7ab6d407c462c Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 18 Jun 2009 09:14:12 +1000
Subject: md/raid5: correctly update sync_completed when we reach max_resync

At the end of reshape_request we update cyrr_resync_completed
if we are about to pause due to reaching resync_max.
However we update it to the wrong value.  We need to add the
"reshape_sectors" that have just been reshaped.

Signed-off-by: NeilBrown <neilb@suse.de>
---
 drivers/md/raid5.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 59e29c2983cb..f9f991e6e138 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3946,7 +3946,7 @@ static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped
 		wait_event(conf->wait_for_overlap,
 			   atomic_read(&conf->reshape_stripes) == 0);
 		mddev->reshape_position = conf->reshape_progress;
-		mddev->curr_resync_completed = mddev->curr_resync;
+		mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
 		conf->reshape_checkpoint = jiffies;
 		set_bit(MD_CHANGE_DEVS, &mddev->flags);
 		md_wakeup_thread(mddev->thread);
-- 
cgit v1.2.3


From 90c699a9ee4be165966d40f1837909ccb8890a68 Mon Sep 17 00:00:00 2001
From: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Date: Fri, 19 Jun 2009 08:08:50 +0200
Subject: block: rename CONFIG_LBD to CONFIG_LBDAF

Follow-up to "block: enable by default support for large devices
and files on 32-bit archs".

Rename CONFIG_LBD to CONFIG_LBDAF to:
- allow update of existing [def]configs for "default y" change
- reflect that it is used also for large files support nowadays

Signed-off-by: Bartlomiej Zolnierkiewicz <bzolnier@gmail.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 Documentation/SubmitChecklist       |  2 +-
 Documentation/ja_JP/SubmitChecklist |  2 +-
 block/Kconfig                       |  4 ++--
 drivers/md/dm-exception-store.h     |  2 +-
 fs/ext3/resize.c                    |  2 +-
 fs/ext3/super.c                     |  2 +-
 fs/ext4/resize.c                    |  2 +-
 fs/ext4/super.c                     | 10 +++++-----
 fs/gfs2/Kconfig                     |  2 +-
 fs/ocfs2/super.c                    |  2 +-
 fs/xfs/linux-2.6/xfs_linux.h        |  2 +-
 fs/xfs/linux-2.6/xfs_super.c        |  2 +-
 include/linux/kernel.h              |  2 +-
 include/linux/types.h               |  2 +-
 14 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'drivers/md')

diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist
index ac5e0b2f1097..78a9168ff377 100644
--- a/Documentation/SubmitChecklist
+++ b/Documentation/SubmitChecklist
@@ -54,7 +54,7 @@ kernel patches.
     CONFIG_PREEMPT.
 
 14: If the patch affects IO/Disk, etc: has been tested with and without
-    CONFIG_LBD.
+    CONFIG_LBDAF.
 
 15: All codepaths have been exercised with all lockdep features enabled.
 
diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist
index 6c42e071d723..2df4576f1173 100644
--- a/Documentation/ja_JP/SubmitChecklist
+++ b/Documentation/ja_JP/SubmitChecklist
@@ -75,7 +75,7 @@ Linux カーネルパッチ投稿者向けチェックリスト
     ビルドした上、動作確認を行ってください。
 
 14: もしパッチがディスクのI/O性能などに影響を与えるようであれば、
-    'CONFIG_LBD'オプションを有効にした場合と無効にした場合の両方で
+    'CONFIG_LBDAF'オプションを有効にした場合と無効にした場合の両方で
     テストを実施してみてください。
 
 15: lockdepの機能を全て有効にした上で、全てのコードパスを評価してください。
diff --git a/block/Kconfig b/block/Kconfig
index 2c39527aa7db..95a86adc33a1 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -23,8 +23,8 @@ menuconfig BLOCK
 
 if BLOCK
 
-config LBD
-	bool "Support for large block devices and files"
+config LBDAF
+	bool "Support for large (2TB+) block devices and files"
 	depends on !64BIT
 	default y
 	help
diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index 0a2e6e7f67b3..c92701dc5001 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -111,7 +111,7 @@ struct dm_exception_store {
 /*
  * Funtions to manipulate consecutive chunks
  */
-#  if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
+#  if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
 #    define DM_CHUNK_CONSECUTIVE_BITS 8
 #    define DM_CHUNK_NUMBER_BITS 56
 
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 8a0b26340b54..8359e7b3dc89 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -990,7 +990,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
 			ext3_warning(sb, __func__,
-			"CONFIG_LBD not enabled\n");
+			"CONFIG_LBDAF not enabled\n");
 		return -EINVAL;
 	}
 
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 26aa64dee6aa..601e881e6105 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -1812,7 +1812,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 		printk(KERN_ERR "EXT3-fs: filesystem on %s:"
 			" too large to mount safely\n", sb->s_id);
 		if (sizeof(sector_t) < 8)
-			printk(KERN_WARNING "EXT3-fs: CONFIG_LBD not "
+			printk(KERN_WARNING "EXT3-fs: CONFIG_LBDAF not "
 					"enabled\n");
 		goto failed_mount;
 	}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 27eb289eea37..68b0351fc647 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1002,7 +1002,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 			" too large to resize to %llu blocks safely\n",
 			sb->s_id, n_blocks_count);
 		if (sizeof(sector_t) < 8)
-			ext4_warning(sb, __func__, "CONFIG_LBD not enabled");
+			ext4_warning(sb, __func__, "CONFIG_LBDAF not enabled");
 		return -EINVAL;
 	}
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 012c4251397e..6e0c2d77c87e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1957,7 +1957,7 @@ static loff_t ext4_max_size(int blkbits, int has_huge_files)
 	/* small i_blocks in vfs inode? */
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * CONFIG_LBD is not enabled implies the inode
+		 * CONFIG_LBDAF is not enabled implies the inode
 		 * i_block represent total blocks in 512 bytes
 		 * 32 == size of vfs inode i_blocks * 8
 		 */
@@ -2000,7 +2000,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 
 	if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
 		/*
-		 * !has_huge_files or CONFIG_LBD not enabled implies that
+		 * !has_huge_files or CONFIG_LBDAF not enabled implies that
 		 * the inode i_block field represents total file blocks in
 		 * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
 		 */
@@ -2436,13 +2436,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (has_huge_files) {
 		/*
 		 * Large file size enabled file system can only be
-		 * mount if kernel is build with CONFIG_LBD
+		 * mount if kernel is build with CONFIG_LBDAF
 		 */
 		if (sizeof(root->i_blocks) < sizeof(u64) &&
 				!(sb->s_flags & MS_RDONLY)) {
 			ext4_msg(sb, KERN_ERR, "Filesystem with huge "
 					"files cannot be mounted read-write "
-					"without CONFIG_LBD");
+					"without CONFIG_LBDAF");
 			goto failed_mount;
 		}
 	}
@@ -2566,7 +2566,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		ext4_msg(sb, KERN_ERR, "filesystem"
 			" too large to mount safely");
 		if (sizeof(sector_t) < 8)
-			ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled");
+			ext4_msg(sb, KERN_WARNING, "CONFIG_LBDAF not enabled");
 		goto failed_mount;
 	}
 
diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig
index cad957cdb1e5..5971359d2090 100644
--- a/fs/gfs2/Kconfig
+++ b/fs/gfs2/Kconfig
@@ -1,6 +1,6 @@
 config GFS2_FS
 	tristate "GFS2 file system support"
-	depends on EXPERIMENTAL && (64BIT || LBD)
+	depends on EXPERIMENTAL && (64BIT || LBDAF)
 	select DLM if GFS2_FS_LOCKING_DLM
 	select CONFIGFS_FS if GFS2_FS_LOCKING_DLM
 	select SYSFS if GFS2_FS_LOCKING_DLM
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index d33767f17ba3..0d3ed7407a04 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -552,7 +552,7 @@ static unsigned long long ocfs2_max_file_offset(unsigned int bbits,
 	 */
 
 #if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
+# if defined(CONFIG_LBDAF)
 	BUILD_BUG_ON(sizeof(sector_t) != 8);
 	/*
 	 * We might be limited by page cache size.
diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h
index f65a53f8752f..6127e24062d0 100644
--- a/fs/xfs/linux-2.6/xfs_linux.h
+++ b/fs/xfs/linux-2.6/xfs_linux.h
@@ -24,7 +24,7 @@
  * XFS_BIG_BLKNOS needs block layer disk addresses to be 64 bits.
  * XFS_BIG_INUMS requires XFS_BIG_BLKNOS to be set.
  */
-#if defined(CONFIG_LBD) || (BITS_PER_LONG == 64)
+#if defined(CONFIG_LBDAF) || (BITS_PER_LONG == 64)
 # define XFS_BIG_BLKNOS	1
 # define XFS_BIG_INUMS	1
 #else
diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c
index 2e09efbca8db..a220d36f789b 100644
--- a/fs/xfs/linux-2.6/xfs_super.c
+++ b/fs/xfs/linux-2.6/xfs_super.c
@@ -616,7 +616,7 @@ xfs_max_file_offset(
 	 */
 
 #if BITS_PER_LONG == 32
-# if defined(CONFIG_LBD)
+# if defined(CONFIG_LBDAF)
 	ASSERT(sizeof(sector_t) == 8);
 	pagefactor = PAGE_CACHE_SIZE;
 	bitshift = BITS_PER_LONG;
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index c5a71c38a95f..fac104e7186a 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -58,7 +58,7 @@ extern const char linux_proc_banner[];
 #define _RET_IP_		(unsigned long)__builtin_return_address(0)
 #define _THIS_IP_  ({ __label__ __here; __here: (unsigned long)&&__here; })
 
-#ifdef CONFIG_LBD
+#ifdef CONFIG_LBDAF
 # include <asm/div64.h>
 # define sector_div(a, b) do_div(a, b)
 #else
diff --git a/include/linux/types.h b/include/linux/types.h
index 5abe354020f9..c42724f8c802 100644
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -131,7 +131,7 @@ typedef		__s64		int64_t;
  *
  * blkcnt_t is the type of the inode's block count.
  */
-#ifdef CONFIG_LBD
+#ifdef CONFIG_LBDAF
 typedef u64 sector_t;
 typedef u64 blkcnt_t;
 #else
-- 
cgit v1.2.3


From 0e0497c0c017664994819f4602dc07fd95896c52 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:08:02 +0100
Subject: dm mpath: validate table argument count

The parser reads the argument count as a number but doesn't check that
sufficient arguments are supplied. This command triggers the bug:

dmsetup create mpath --table "0 `blockdev --getsize /dev/mapper/cr0`
    multipath 0 0 2 1 round-robin 1000 0 1 1 /dev/mapper/cr0
    round-robin 0 1 1 /dev/mapper/cr1 1000"
kernel BUG at drivers/md/dm-mpath.c:530!

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 6a386ab4f7eb..d880299334e9 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -553,6 +553,12 @@ static int parse_path_selector(struct arg_set *as, struct priority_group *pg,
 		return -EINVAL;
 	}
 
+	if (ps_argc > as->argc) {
+		dm_put_path_selector(pst);
+		ti->error = "not enough arguments for path selector";
+		return -EINVAL;
+	}
+
 	r = pst->create(&pg->ps, ps_argc, as->argv);
 	if (r) {
 		dm_put_path_selector(pst);
-- 
cgit v1.2.3


From e094f4f15f5169526c7200b9bde44b900548a81e Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:10 +0100
Subject: dm mpath: validate hw_handler argument count

Fix arg count parsing error in hw handlers.

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index d880299334e9..f25bdebcb4ab 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -705,6 +705,11 @@ static int parse_hw_handler(struct arg_set *as, struct multipath *m)
 	if (!hw_argc)
 		return 0;
 
+	if (hw_argc > as->argc) {
+		ti->error = "not enough arguments for hardware handler";
+		return -EINVAL;
+	}
+
 	m->hw_handler_name = kstrdup(shift(as), GFP_KERNEL);
 	request_module("scsi_dh_%s", m->hw_handler_name);
 	if (scsi_dh_handler_exist(m->hw_handler_name) == 0) {
-- 
cgit v1.2.3


From 4d89b7b4e4726893453d0fb4ddbb5b3e16353994 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Mon, 22 Jun 2009 10:12:11 +0100
Subject: dm: sysfs skip output when device is being destroyed

Do not process sysfs attributes when device is being destroyed.

Otherwise code can cause
  BUG_ON(test_bit(DMF_FREEING, &md->flags));
in dm_put() call.

Cc: stable@kernel.org
Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 48db308fae67..f1db689667ea 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1777,6 +1777,10 @@ struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
 	if (&md->kobj != kobj)
 		return NULL;
 
+	if (test_bit(DMF_FREEING, &md->flags) ||
+	    test_bit(DMF_DELETING, &md->flags))
+		return NULL;
+
 	dm_get(md);
 	return md;
 }
-- 
cgit v1.2.3


From a0cf7ea9549ec60988369f90e5c0f855f08abac9 Mon Sep 17 00:00:00 2001
From: Hannes Reinecke <hare@suse.de>
Date: Mon, 22 Jun 2009 10:12:11 +0100
Subject: dm mpath: change attached scsi_dh

When specifying a different hardware handler via multipath
features we should be able to override the built-in defaults.

The problem here is the hardware table from scsi_dh is compiled
in and cannot be changed from userland. The multipath.conf OTOH
is purely user-defined and, what's more, the user might have a valid
reason for modifying it.
(EG EMC Clariion can well be run in PNR mode even though ALUA is
active, or the user might want to try ALUA on any as-of-yet unknown
devices)

So _not_ allowing multipath to override the device handler setting
will just add to the confusion and makes error tracking even more
difficult.

Signed-off-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f25bdebcb4ab..545abcc25c42 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -597,9 +597,20 @@ static struct pgpath *parse_path(struct arg_set *as, struct path_selector *ps,
 	}
 
 	if (m->hw_handler_name) {
-		r = scsi_dh_attach(bdev_get_queue(p->path.dev->bdev),
-				   m->hw_handler_name);
+		struct request_queue *q = bdev_get_queue(p->path.dev->bdev);
+
+		r = scsi_dh_attach(q, m->hw_handler_name);
+		if (r == -EBUSY) {
+			/*
+			 * Already attached to different hw_handler,
+			 * try to reattach with correct one.
+			 */
+			scsi_dh_detach(q);
+			r = scsi_dh_attach(q, m->hw_handler_name);
+		}
+
 		if (r < 0) {
+			ti->error = "error attaching hardware handler";
 			dm_put_device(ti, p->path.dev);
 			goto bad;
 		}
-- 
cgit v1.2.3


From e54f77ddda72781ec1c1696b21aabd6a30cbb7c6 Mon Sep 17 00:00:00 2001
From: Chandra Seetharaman <sekharan@us.ibm.com>
Date: Mon, 22 Jun 2009 10:12:12 +0100
Subject: dm mpath: call activate fn for each path in pg_init

Fixed a problem affecting reinstatement of passive paths.

Before we moved the hardware handler from dm to SCSI, it performed a pg_init
for a path group and didn't maintain any state about each path in hardware
handler code.

But in SCSI dh, such state is now maintained, as we want to fail I/O early on a
path if it is not the active path.

All the hardware handlers have a state now and set to active or some form of
inactive.  They have prep_fn() which uses this state to fail the I/O without
it ever being sent to the device.

So in effect when dm-multipath calls scsi_dh_activate(), activate is
sent to only one path and the "state" of that path is changed appropriately
to "active" while other paths in the same path group are never changed
as they never got an "activate".

In order make sure all the paths in a path group gets their state set
properly when a pg_init happens, we need to call scsi_dh_activate() on
all paths in a path group.

Doing this at the hardware handler layer is not a good option as we
want the multipath layer to define the relationship between path and path
groups and not the hardware handler.

Attached patch sends an "activate" on each path in a path group when a
path group is switched. It also sends an activate when a path is reinstated.

Signed-off-by: Chandra Seetharaman <sekharan@us.ibm.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 63 +++++++++++++++++++++------------------------------
 1 file changed, 26 insertions(+), 37 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 545abcc25c42..838f01b1dd30 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -35,6 +35,7 @@ struct pgpath {
 
 	struct dm_path path;
 	struct work_struct deactivate_path;
+	struct work_struct activate_path;
 };
 
 #define path_to_pgpath(__pgp) container_of((__pgp), struct pgpath, path)
@@ -64,8 +65,6 @@ struct multipath {
 	spinlock_t lock;
 
 	const char *hw_handler_name;
-	struct work_struct activate_path;
-	struct pgpath *pgpath_to_activate;
 	unsigned nr_priority_groups;
 	struct list_head priority_groups;
 	unsigned pg_init_required;	/* pg_init needs calling? */
@@ -128,6 +127,7 @@ static struct pgpath *alloc_pgpath(void)
 	if (pgpath) {
 		pgpath->is_active = 1;
 		INIT_WORK(&pgpath->deactivate_path, deactivate_path);
+		INIT_WORK(&pgpath->activate_path, activate_path);
 	}
 
 	return pgpath;
@@ -160,7 +160,6 @@ static struct priority_group *alloc_priority_group(void)
 
 static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 {
-	unsigned long flags;
 	struct pgpath *pgpath, *tmp;
 	struct multipath *m = ti->private;
 
@@ -169,10 +168,6 @@ static void free_pgpaths(struct list_head *pgpaths, struct dm_target *ti)
 		if (m->hw_handler_name)
 			scsi_dh_detach(bdev_get_queue(pgpath->path.dev->bdev));
 		dm_put_device(ti, pgpath->path.dev);
-		spin_lock_irqsave(&m->lock, flags);
-		if (m->pgpath_to_activate == pgpath)
-			m->pgpath_to_activate = NULL;
-		spin_unlock_irqrestore(&m->lock, flags);
 		free_pgpath(pgpath);
 	}
 }
@@ -202,7 +197,6 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 		m->queue_io = 1;
 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
 		INIT_WORK(&m->trigger_event, trigger_event);
-		INIT_WORK(&m->activate_path, activate_path);
 		m->mpio_pool = mempool_create_slab_pool(MIN_IOS, _mpio_cache);
 		if (!m->mpio_pool) {
 			kfree(m);
@@ -427,8 +421,8 @@ static void process_queued_ios(struct work_struct *work)
 {
 	struct multipath *m =
 		container_of(work, struct multipath, process_queued_ios);
-	struct pgpath *pgpath = NULL;
-	unsigned init_required = 0, must_queue = 1;
+	struct pgpath *pgpath = NULL, *tmp;
+	unsigned must_queue = 1;
 	unsigned long flags;
 
 	spin_lock_irqsave(&m->lock, flags);
@@ -446,19 +440,15 @@ static void process_queued_ios(struct work_struct *work)
 		must_queue = 0;
 
 	if (m->pg_init_required && !m->pg_init_in_progress && pgpath) {
-		m->pgpath_to_activate = pgpath;
 		m->pg_init_count++;
 		m->pg_init_required = 0;
-		m->pg_init_in_progress = 1;
-		init_required = 1;
+		list_for_each_entry(tmp, &pgpath->pg->pgpaths, list) {
+			if (queue_work(kmpath_handlerd, &tmp->activate_path))
+				m->pg_init_in_progress++;
+		}
 	}
-
 out:
 	spin_unlock_irqrestore(&m->lock, flags);
-
-	if (init_required)
-		queue_work(kmpath_handlerd, &m->activate_path);
-
 	if (!must_queue)
 		dispatch_queued_ios(m);
 }
@@ -946,9 +936,13 @@ static int reinstate_path(struct pgpath *pgpath)
 
 	pgpath->is_active = 1;
 
-	m->current_pgpath = NULL;
-	if (!m->nr_valid_paths++ && m->queue_size)
+	if (!m->nr_valid_paths++ && m->queue_size) {
+		m->current_pgpath = NULL;
 		queue_work(kmultipathd, &m->process_queued_ios);
+	} else if (m->hw_handler_name && (m->current_pg == pgpath->pg)) {
+		if (queue_work(kmpath_handlerd, &pgpath->activate_path))
+			m->pg_init_in_progress++;
+	}
 
 	dm_path_uevent(DM_UEVENT_PATH_REINSTATED, m->ti,
 		      pgpath->path.dev->name, m->nr_valid_paths);
@@ -1124,35 +1118,30 @@ static void pg_init_done(struct dm_path *path, int errors)
 
 	spin_lock_irqsave(&m->lock, flags);
 	if (errors) {
-		DMERR("Could not failover device. Error %d.", errors);
-		m->current_pgpath = NULL;
-		m->current_pg = NULL;
+		if (pgpath == m->current_pgpath) {
+			DMERR("Could not failover device. Error %d.", errors);
+			m->current_pgpath = NULL;
+			m->current_pg = NULL;
+		}
 	} else if (!m->pg_init_required) {
 		m->queue_io = 0;
 		pg->bypassed = 0;
 	}
 
-	m->pg_init_in_progress = 0;
-	queue_work(kmultipathd, &m->process_queued_ios);
+	m->pg_init_in_progress--;
+	if (!m->pg_init_in_progress)
+		queue_work(kmultipathd, &m->process_queued_ios);
 	spin_unlock_irqrestore(&m->lock, flags);
 }
 
 static void activate_path(struct work_struct *work)
 {
 	int ret;
-	struct multipath *m =
-		container_of(work, struct multipath, activate_path);
-	struct dm_path *path;
-	unsigned long flags;
+	struct pgpath *pgpath =
+		container_of(work, struct pgpath, activate_path);
 
-	spin_lock_irqsave(&m->lock, flags);
-	path = &m->pgpath_to_activate->path;
-	m->pgpath_to_activate = NULL;
-	spin_unlock_irqrestore(&m->lock, flags);
-	if (!path)
-		return;
-	ret = scsi_dh_activate(bdev_get_queue(path->dev->bdev));
-	pg_init_done(path, ret);
+	ret = scsi_dh_activate(bdev_get_queue(pgpath->path.dev->bdev));
+	pg_init_done(&pgpath->path, ret);
 }
 
 /*
-- 
cgit v1.2.3


From a72986c562eeec3f7b992198c168f0f41606fe53 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:13 +0100
Subject: dm raid1: keep retrying alloc if mempool_alloc failed

If the code can't handle allocation failures, use __GFP_NOFAIL so that
in case of memory pressure the allocator will retry indefinitely and
won't return NULL which would cause a crash in the function.

This is still not a correct fix, it may cause a classic deadlock when
memory manager waits for I/O being done and I/O waits for some free memory.
I/O code shouldn't allocate any memory. But in this case it probably
doesn't matter much in practice, people usually do not swap on RAID.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-region-hash.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-region-hash.c b/drivers/md/dm-region-hash.c
index 7b899be0b087..36dbe29f2fd6 100644
--- a/drivers/md/dm-region-hash.c
+++ b/drivers/md/dm-region-hash.c
@@ -283,7 +283,7 @@ static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
 
 	nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
 	if (unlikely(!nreg))
-		nreg = kmalloc(sizeof(*nreg), GFP_NOIO);
+		nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
 
 	nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
 		      DM_RH_CLEAN : DM_RH_NOSYNC;
-- 
cgit v1.2.3


From 53b351f972a882ea8b6cdb19602535f1057c884a Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:13 +0100
Subject: dm mpath: flush keventd queue in destructor

The commit fe9cf30eb8186ef267d1868dc9f12f2d0f40835a moves dm table event
submission from kmultipath queue to kernel kevent queue to avoid a
deadlock.

There is a possibility of race condition because kevent queue is not flushed
in the multipath destructor. The scenario is:
- some event happens and is queued to keventd
- keventd thread is delayed due to scheuling latency or some other work
- multipath device is destroyed
- keventd now attempts to process work_struct that is residing in already
  released memory.

The patch flushes the keventd queue in multipath constructor.
I've already fixed similar bug in dm-raid1.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Cc: stable@kernel.org
---
 drivers/md/dm-mpath.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 838f01b1dd30..92bdcbb7c935 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -848,6 +848,7 @@ static void multipath_dtr(struct dm_target *ti)
 
 	flush_workqueue(kmpath_handlerd);
 	flush_workqueue(kmultipathd);
+	flush_scheduled_work();
 	free_multipath(m);
 }
 
-- 
cgit v1.2.3


From 8cbeb67ad50f7d68e5e83be2cb2284de8f9c03b5 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:14 +0100
Subject: dm: avoid unsupported spanning of md stripe boundaries

A bio that has two or more vector entries, size less than or equal to
page size, that crosses a stripe boundary of an underlying md device is
accepted by device mapper (it conforms to all its limits) but not by the
underlying device.

The fix is: If device mapper selects the one-page maximum request size,
it also needs to set its own q->merge_bvec_fn to reject any bios with
multiple vector entries that span more pages.

The problem was discovered in the following scenario:
  * MD - RAID-0
  * LV on the top of it (raid1, snapshot or striped with chunk
size/stripe larger than RAID-0 stripe)
  * one of the logical volumes is exported to xen domU
  * inside xen domU it is partitioned, the key point is that the partition
must be unaligned on page boundary (fdisk normally aligns the partition to
63 sectors which will trigger it)
  * install the system on the partitioned disk in domU
This causes I/O failures in dom0.
Reference: https://bugzilla.redhat.com/show_bug.cgi?id=223947

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f1db689667ea..59806e67a175 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -925,6 +925,16 @@ static int dm_merge_bvec(struct request_queue *q,
 	 */
 	if (max_size && ti->type->merge)
 		max_size = ti->type->merge(ti, bvm, biovec, max_size);
+	/*
+	 * If the target doesn't support merge method and some of the devices
+	 * provided their merge_bvec method (we know this by looking at
+	 * queue_max_hw_sectors), then we can't allow bios with multiple vector
+	 * entries.  So always set max_size to 0, and the code below allows
+	 * just one page.
+	 */
+	else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
+
+		max_size = 0;
 
 out_table:
 	dm_table_put(map);
-- 
cgit v1.2.3


From 5657e8fa45cf230df278040c420fb80e06309d8f Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:14 +0100
Subject: dm: use i_size_read

Use i_size_read() instead of reading i_size.

If someone changes the size of the device simultaneously, i_size_read
is guaranteed to return a valid value (either the old one or the new one).

i_size can return some intermediate invalid value (on 32-bit computers
with 64-bit i_size, the reads to both halves of i_size can be interleaved
with updates to i_size, resulting in garbage being returned).

Cc: stable@kernel.org
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.h | 2 +-
 drivers/md/dm-log.c             | 2 +-
 drivers/md/dm-table.c           | 3 ++-
 3 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-exception-store.h b/drivers/md/dm-exception-store.h
index c92701dc5001..2442c8c07898 100644
--- a/drivers/md/dm-exception-store.h
+++ b/drivers/md/dm-exception-store.h
@@ -156,7 +156,7 @@ static inline void dm_consecutive_chunk_count_inc(struct dm_snap_exception *e)
  */
 static inline sector_t get_dev_size(struct block_device *bdev)
 {
-	return bdev->bd_inode->i_size >> SECTOR_SHIFT;
+	return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
 }
 
 static inline chunk_t sector_to_chunk(struct dm_exception_store *store,
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6fa8ccf91c70..6352a9ad4446 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -416,7 +416,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 				       bitset_size,
 				       ti->limits.logical_block_size);
 
-		if (buf_size > dev->bdev->bd_inode->i_size) {
+		if (buf_size > i_size_read(dev->bdev->bd_inode)) {
 			DMWARN("log device %s too small: need %llu bytes",
 				dev->name, (unsigned long long)buf_size);
 			kfree(lc);
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e9a73bb242b0..0e2210c0c168 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -388,7 +388,8 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 static int check_device_area(struct dm_dev_internal *dd, sector_t start,
 			     sector_t len)
 {
-	sector_t dev_size = dd->dm_dev.bdev->bd_inode->i_size >> SECTOR_SHIFT;
+	sector_t dev_size = i_size_read(dd->dm_dev.bdev->bd_inode) >>
+			    SECTOR_SHIFT;
 
 	if (!dev_size)
 		return 1;
-- 
cgit v1.2.3


From f6bd4eb73cdf2a5bf954e497972842f39cabb7e3 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow <jbrassow@redhat.com>
Date: Mon, 22 Jun 2009 10:12:15 +0100
Subject: dm exception store: fix exstore lookup to be case insensitive

When snapshots are created using 'p' instead of 'P' as the
exception store type, the device-mapper table loading fails.

This patch makes the code case insensitive as intended and fixes some
regressions reported with device-mapper snapshots.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-exception-store.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 75d8081a9041..c3ae51584b12 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -216,7 +216,7 @@ int dm_exception_store_create(struct dm_target *ti, int argc, char **argv,
 		return -EINVAL;
 	}
 
-	type = get_type(argv[1]);
+	type = get_type(&persistent);
 	if (!type) {
 		ti->error = "Exception store type not recognised";
 		r = -EINVAL;
-- 
cgit v1.2.3


From db8fef4fabe4a546ce74f80bff64fd43776e5912 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:15 +0100
Subject: dm: rename suspended_bdev to bdev

Rename suspended_bdev to bdev.

This patch doesn't change any functionality, just renames the variable.
In the next patch, the variable will be used even for non-suspended device.

(Pre-requisite for the per-target barrier support patches.)

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 36 ++++++++++++++++++------------------
 1 file changed, 18 insertions(+), 18 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 59806e67a175..1cfd9b72403d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -157,7 +157,7 @@ struct mapped_device {
 	 * freeze/thaw support require holding onto a super block
 	 */
 	struct super_block *frozen_sb;
-	struct block_device *suspended_bdev;
+	struct block_device *bdev;
 
 	/* forced geometry settings */
 	struct hd_geometry geometry;
@@ -1214,9 +1214,9 @@ static void free_dev(struct mapped_device *md)
 {
 	int minor = MINOR(disk_devt(md->disk));
 
-	if (md->suspended_bdev) {
+	if (md->bdev) {
 		unlock_fs(md);
-		bdput(md->suspended_bdev);
+		bdput(md->bdev);
 	}
 	destroy_workqueue(md->wq);
 	mempool_destroy(md->tio_pool);
@@ -1259,9 +1259,9 @@ static void __set_size(struct mapped_device *md, sector_t size)
 {
 	set_capacity(md->disk, size);
 
-	mutex_lock(&md->suspended_bdev->bd_inode->i_mutex);
-	i_size_write(md->suspended_bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
-	mutex_unlock(&md->suspended_bdev->bd_inode->i_mutex);
+	mutex_lock(&md->bdev->bd_inode->i_mutex);
+	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
+	mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
 static int __bind(struct mapped_device *md, struct dm_table *t)
@@ -1277,7 +1277,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 	if (size != get_capacity(md->disk))
 		memset(&md->geometry, 0, sizeof(md->geometry));
 
-	if (md->suspended_bdev)
+	if (md->bdev)
 		__set_size(md, size);
 
 	if (!size) {
@@ -1521,7 +1521,7 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 		goto out;
 
 	/* without bdev, the device size cannot be changed */
-	if (!md->suspended_bdev)
+	if (!md->bdev)
 		if (get_capacity(md->disk) != dm_table_get_size(table))
 			goto out;
 
@@ -1543,7 +1543,7 @@ static int lock_fs(struct mapped_device *md)
 
 	WARN_ON(md->frozen_sb);
 
-	md->frozen_sb = freeze_bdev(md->suspended_bdev);
+	md->frozen_sb = freeze_bdev(md->bdev);
 	if (IS_ERR(md->frozen_sb)) {
 		r = PTR_ERR(md->frozen_sb);
 		md->frozen_sb = NULL;
@@ -1563,7 +1563,7 @@ static void unlock_fs(struct mapped_device *md)
 	if (!test_bit(DMF_FROZEN, &md->flags))
 		return;
 
-	thaw_bdev(md->suspended_bdev, md->frozen_sb);
+	thaw_bdev(md->bdev, md->frozen_sb);
 	md->frozen_sb = NULL;
 	clear_bit(DMF_FROZEN, &md->flags);
 }
@@ -1603,8 +1603,8 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	/* bdget() can stall if the pending I/Os are not flushed */
 	if (!noflush) {
-		md->suspended_bdev = bdget_disk(md->disk, 0);
-		if (!md->suspended_bdev) {
+		md->bdev = bdget_disk(md->disk, 0);
+		if (!md->bdev) {
 			DMWARN("bdget failed in dm_suspend");
 			r = -ENOMEM;
 			goto out;
@@ -1675,9 +1675,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	set_bit(DMF_SUSPENDED, &md->flags);
 
 out:
-	if (r && md->suspended_bdev) {
-		bdput(md->suspended_bdev);
-		md->suspended_bdev = NULL;
+	if (r && md->bdev) {
+		bdput(md->bdev);
+		md->bdev = NULL;
 	}
 
 	dm_table_put(map);
@@ -1708,9 +1708,9 @@ int dm_resume(struct mapped_device *md)
 
 	unlock_fs(md);
 
-	if (md->suspended_bdev) {
-		bdput(md->suspended_bdev);
-		md->suspended_bdev = NULL;
+	if (md->bdev) {
+		bdput(md->bdev);
+		md->bdev = NULL;
 	}
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
-- 
cgit v1.2.3


From 32a926da5a16c01a8213331e5764472ce2f14a8d Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:17 +0100
Subject: dm: always hold bdev reference

Fix a potential deadlock when creating multiple snapshots by holding a
reference to struct block_device for the whole lifecycle of every dm
device instead of obtaining it independently at each point it is needed.

bdget_disk() was called while the device was being suspended, in
dm_suspend().  However there could be other devices already suspended,
for example when creating additional snapshots of a device. bdget_disk()
can wait for IO and allocate memory resulting in waiting for the
already-suspended device - deadlock.

This patch changes the code so that it gets the reference to struct
block_device when struct mapped_device is allocated and initialized in
alloc_dev() where it is always OK to allocate memory or wait for I/O.
It drops the reference when it is destroyed in free_dev().  Thus there
is no call to bdget_disk() while any device is suspended.

Previously unlock_fs() was called only if bdev was held.  Now it is
called unconditionally, but the superfluous calls are harmless because
it returns immediately if the filesystem was not previously frozen.

This patch also now allows the device size to be changed in a
noflush suspend because the bdev is held.  This has no adverse effect.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 57 ++++++++++++++++-----------------------------------------
 1 file changed, 16 insertions(+), 41 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 1cfd9b72403d..5e06f1e6234f 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1180,6 +1180,10 @@ static struct mapped_device *alloc_dev(int minor)
 	if (!md->wq)
 		goto bad_thread;
 
+	md->bdev = bdget_disk(md->disk, 0);
+	if (!md->bdev)
+		goto bad_bdev;
+
 	/* Populate the mapping, nobody knows we exist yet */
 	spin_lock(&_minor_lock);
 	old_md = idr_replace(&_minor_idr, md, minor);
@@ -1189,6 +1193,8 @@ static struct mapped_device *alloc_dev(int minor)
 
 	return md;
 
+bad_bdev:
+	destroy_workqueue(md->wq);
 bad_thread:
 	put_disk(md->disk);
 bad_disk:
@@ -1214,10 +1220,8 @@ static void free_dev(struct mapped_device *md)
 {
 	int minor = MINOR(disk_devt(md->disk));
 
-	if (md->bdev) {
-		unlock_fs(md);
-		bdput(md->bdev);
-	}
+	unlock_fs(md);
+	bdput(md->bdev);
 	destroy_workqueue(md->wq);
 	mempool_destroy(md->tio_pool);
 	mempool_destroy(md->io_pool);
@@ -1277,8 +1281,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 	if (size != get_capacity(md->disk))
 		memset(&md->geometry, 0, sizeof(md->geometry));
 
-	if (md->bdev)
-		__set_size(md, size);
+	__set_size(md, size);
 
 	if (!size) {
 		dm_table_destroy(t);
@@ -1520,11 +1523,6 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	if (!dm_suspended(md))
 		goto out;
 
-	/* without bdev, the device size cannot be changed */
-	if (!md->bdev)
-		if (get_capacity(md->disk) != dm_table_get_size(table))
-			goto out;
-
 	__unbind(md);
 	r = __bind(md, table);
 
@@ -1552,9 +1550,6 @@ static int lock_fs(struct mapped_device *md)
 
 	set_bit(DMF_FROZEN, &md->flags);
 
-	/* don't bdput right now, we don't want the bdev
-	 * to go away while it is locked.
-	 */
 	return 0;
 }
 
@@ -1601,24 +1596,14 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	/* This does not get reverted if there's an error later. */
 	dm_table_presuspend_targets(map);
 
-	/* bdget() can stall if the pending I/Os are not flushed */
-	if (!noflush) {
-		md->bdev = bdget_disk(md->disk, 0);
-		if (!md->bdev) {
-			DMWARN("bdget failed in dm_suspend");
-			r = -ENOMEM;
+	/*
+	 * Flush I/O to the device. noflush supersedes do_lockfs,
+	 * because lock_fs() needs to flush I/Os.
+	 */
+	if (!noflush && do_lockfs) {
+		r = lock_fs(md);
+		if (r)
 			goto out;
-		}
-
-		/*
-		 * Flush I/O to the device. noflush supersedes do_lockfs,
-		 * because lock_fs() needs to flush I/Os.
-		 */
-		if (do_lockfs) {
-			r = lock_fs(md);
-			if (r)
-				goto out;
-		}
 	}
 
 	/*
@@ -1675,11 +1660,6 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	set_bit(DMF_SUSPENDED, &md->flags);
 
 out:
-	if (r && md->bdev) {
-		bdput(md->bdev);
-		md->bdev = NULL;
-	}
-
 	dm_table_put(map);
 
 out_unlock:
@@ -1708,11 +1688,6 @@ int dm_resume(struct mapped_device *md)
 
 	unlock_fs(md);
 
-	if (md->bdev) {
-		bdput(md->bdev);
-		md->bdev = NULL;
-	}
-
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
 	dm_table_unplug_all(map);
-- 
cgit v1.2.3


From 531fe96364f30879753d46c1f52ab839e12d2e5d Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:17 +0100
Subject: dm: make dm_flush return void

Make dm_flush return void.

The first error during flush is stored in md->barrier_error instead.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5e06f1e6234f..e34d694ddd04 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1439,34 +1439,25 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 	return r;
 }
 
-static int dm_flush(struct mapped_device *md)
+static void dm_flush(struct mapped_device *md)
 {
 	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
-	return 0;
 }
 
 static void process_barrier(struct mapped_device *md, struct bio *bio)
 {
-	int error = dm_flush(md);
+	dm_flush(md);
 
-	if (unlikely(error)) {
-		bio_endio(bio, error);
-		return;
-	}
 	if (bio_empty_barrier(bio)) {
 		bio_endio(bio, 0);
 		return;
 	}
 
 	__split_and_process_bio(md, bio);
-
-	error = dm_flush(md);
-
-	if (!error && md->barrier_error)
-		error = md->barrier_error;
+	dm_flush(md);
 
 	if (md->barrier_error != DM_ENDIO_REQUEUE)
-		bio_endio(bio, error);
+		bio_endio(bio, md->barrier_error);
 }
 
 /*
-- 
cgit v1.2.3


From 2761e95fe40ca0d01864310fa4d488d7c5e34e18 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:18 +0100
Subject: dm: process requeue in dm_wq_work

If barrier request was returned with DM_ENDIO_REQUEUE,
requeue it in dm_wq_work instead of dec_pending.

This allows us to correctly handle a situation when some targets
are asking for a requeue and other targets signal an error.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index e34d694ddd04..910bce85f443 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -536,9 +536,11 @@ static void dec_pending(struct dm_io *io, int error)
 			 * Target requested pushing back the I/O.
 			 */
 			spin_lock_irqsave(&md->deferred_lock, flags);
-			if (__noflush_suspending(md))
-				bio_list_add_head(&md->deferred, io->bio);
-			else
+			if (__noflush_suspending(md)) {
+				if (!bio_barrier(io->bio))
+					bio_list_add_head(&md->deferred,
+							  io->bio);
+			} else
 				/* noflush suspend was interrupted. */
 				io->error = -EIO;
 			spin_unlock_irqrestore(&md->deferred_lock, flags);
@@ -1458,6 +1460,11 @@ static void process_barrier(struct mapped_device *md, struct bio *bio)
 
 	if (md->barrier_error != DM_ENDIO_REQUEUE)
 		bio_endio(bio, md->barrier_error);
+	else {
+		spin_lock_irq(&md->deferred_lock);
+		bio_list_add_head(&md->deferred, bio);
+		spin_unlock_irq(&md->deferred_lock);
+	}
 }
 
 /*
-- 
cgit v1.2.3


From 5aa2781d964e9835c441932110484bc454b5c207 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:18 +0100
Subject: dm: store only first barrier error

With the following patches, more than one error can occur during
processing.  Change md->barrier_error so that only the first one is
recorded and returned to the caller.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 910bce85f443..77972090abe5 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -555,7 +555,8 @@ static void dec_pending(struct dm_io *io, int error)
 			 * a per-device variable for error reporting.
 			 * Note that you can't touch the bio after end_io_acct
 			 */
-			md->barrier_error = io_error;
+			if (!md->barrier_error)
+				md->barrier_error = io_error;
 			end_io_acct(io);
 		} else {
 			end_io_acct(io);
@@ -867,7 +868,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 		if (!bio_barrier(bio))
 			bio_io_error(bio);
 		else
-			md->barrier_error = -EIO;
+			if (!md->barrier_error)
+				md->barrier_error = -EIO;
 		return;
 	}
 
@@ -1448,16 +1450,15 @@ static void dm_flush(struct mapped_device *md)
 
 static void process_barrier(struct mapped_device *md, struct bio *bio)
 {
+	md->barrier_error = 0;
+
 	dm_flush(md);
 
-	if (bio_empty_barrier(bio)) {
-		bio_endio(bio, 0);
-		return;
+	if (!bio_empty_barrier(bio)) {
+		__split_and_process_bio(md, bio);
+		dm_flush(md);
 	}
 
-	__split_and_process_bio(md, bio);
-	dm_flush(md);
-
 	if (md->barrier_error != DM_ENDIO_REQUEUE)
 		bio_endio(bio, md->barrier_error);
 	else {
-- 
cgit v1.2.3


From fdb9572b73abd008b80931c3b1f157dac3888bb9 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:19 +0100
Subject: dm: remove EOPNOTSUPP for barriers

If the underlying device doesn't support barriers and dm receives a
barrier, it waits until all requests on that device drain so it no
longer needs to report -EOPNOTSUPP to the caller.

This patch deals with the confusing situation when moving a volume from
one physical device to another triggers an EOPNOTSUPP on a volume that
didn't report it before.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 77972090abe5..8498dc4ce1f0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -555,7 +555,7 @@ static void dec_pending(struct dm_io *io, int error)
 			 * a per-device variable for error reporting.
 			 * Note that you can't touch the bio after end_io_acct
 			 */
-			if (!md->barrier_error)
+			if (!md->barrier_error && io_error != -EOPNOTSUPP)
 				md->barrier_error = io_error;
 			end_io_acct(io);
 		} else {
-- 
cgit v1.2.3


From 27eaa14975d8b53f0bad422e53cdf8e5f6dd44ec Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:20 +0100
Subject: dm: remove check that prevents mapping empty bios

Remove the check that the size of the cloned bio is not zero because a
subsequent patch needs to send zero-sized barriers down this path.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 8498dc4ce1f0..7d9ca7094337 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -637,11 +637,6 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
 	sector_t sector;
 	struct mapped_device *md;
 
-	/*
-	 * Sanity checks.
-	 */
-	BUG_ON(!clone->bi_size);
-
 	clone->bi_end_io = clone_endio;
 	clone->bi_private = tio;
 
-- 
cgit v1.2.3


From f9ab94cee313746573b2d693bc2afb807ebb0998 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:20 +0100
Subject: dm: introduce num_flush_requests

Introduce num_flush_requests for a target to set to say how many flush
instructions (empty barriers) it wants to receive.  These are sent by
__clone_and_map_empty_barrier with map_info->flush_request going from 0
to (num_flush_requests - 1).

Old targets without flush support won't receive any flush requests.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c               | 39 +++++++++++++++++++++++++++++++++++++++
 include/linux/device-mapper.h | 11 +++++++++++
 2 files changed, 50 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 7d9ca7094337..badb7519cccb 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -750,6 +750,40 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 	return clone;
 }
 
+static void __flush_target(struct clone_info *ci, struct dm_target *ti,
+			  unsigned flush_nr)
+{
+	struct dm_target_io *tio = alloc_tio(ci->md);
+	struct bio *clone;
+
+	tio->io = ci->io;
+	tio->ti = ti;
+
+	memset(&tio->info, 0, sizeof(tio->info));
+	tio->info.flush_request = flush_nr;
+
+	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
+	__bio_clone(clone, ci->bio);
+	clone->bi_destructor = dm_bio_destructor;
+
+	__map_bio(ti, clone, tio);
+}
+
+static int __clone_and_map_empty_barrier(struct clone_info *ci)
+{
+	unsigned target_nr = 0, flush_nr;
+	struct dm_target *ti;
+
+	while ((ti = dm_table_get_target(ci->map, target_nr++)))
+		for (flush_nr = 0; flush_nr < ti->num_flush_requests;
+		     flush_nr++)
+			__flush_target(ci, ti, flush_nr);
+
+	ci->sector_count = 0;
+
+	return 0;
+}
+
 static int __clone_and_map(struct clone_info *ci)
 {
 	struct bio *clone, *bio = ci->bio;
@@ -757,6 +791,9 @@ static int __clone_and_map(struct clone_info *ci)
 	sector_t len = 0, max;
 	struct dm_target_io *tio;
 
+	if (unlikely(bio_empty_barrier(bio)))
+		return __clone_and_map_empty_barrier(ci);
+
 	ti = dm_table_find_target(ci->map, ci->sector);
 	if (!dm_target_is_valid(ti))
 		return -EIO;
@@ -877,6 +914,8 @@ static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
 	ci.io->md = md;
 	ci.sector = bio->bi_sector;
 	ci.sector_count = bio_sectors(bio);
+	if (unlikely(bio_empty_barrier(bio)))
+		ci.sector_count = 1;
 	ci.idx = bio->bi_idx;
 
 	start_io_acct(ci.io);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 49c2362977fd..fc36a4d07723 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -21,6 +21,7 @@ typedef enum { STATUSTYPE_INFO, STATUSTYPE_TABLE } status_type_t;
 union map_info {
 	void *ptr;
 	unsigned long long ll;
+	unsigned flush_request;
 };
 
 /*
@@ -167,6 +168,16 @@ struct dm_target {
 	/* Always a power of 2 */
 	sector_t split_io;
 
+	/*
+	 * A number of zero-length barrier requests that will be submitted
+	 * to the target for the purpose of flushing cache.
+	 *
+	 * The request number will be placed in union map_info->flush_request.
+	 * It is a responsibility of the target driver to remap these requests
+	 * to the real underlying devices.
+	 */
+	unsigned num_flush_requests;
+
 	/*
 	 * These are automatically filled in by
 	 * dm_table_get_device.
-- 
cgit v1.2.3


From 9015df24a8008d7bea2bd3df881783ebe0dcb9af Mon Sep 17 00:00:00 2001
From: Alasdair G Kergon <agk@redhat.com>
Date: Mon, 22 Jun 2009 10:12:21 +0100
Subject: dm: initialise tio in alloc_tio

Move repeated dm_target_io initialisation inside alloc_tio().

Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index badb7519cccb..edf9f2467691 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -391,11 +391,6 @@ static void free_io(struct mapped_device *md, struct dm_io *io)
 	mempool_free(io, md->io_pool);
 }
 
-static struct dm_target_io *alloc_tio(struct mapped_device *md)
-{
-	return mempool_alloc(md->tio_pool, GFP_NOIO);
-}
-
 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 {
 	mempool_free(tio, md->tio_pool);
@@ -750,16 +745,24 @@ static struct bio *clone_bio(struct bio *bio, sector_t sector,
 	return clone;
 }
 
-static void __flush_target(struct clone_info *ci, struct dm_target *ti,
-			  unsigned flush_nr)
+static struct dm_target_io *alloc_tio(struct clone_info *ci,
+				      struct dm_target *ti)
 {
-	struct dm_target_io *tio = alloc_tio(ci->md);
-	struct bio *clone;
+	struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
 
 	tio->io = ci->io;
 	tio->ti = ti;
-
 	memset(&tio->info, 0, sizeof(tio->info));
+
+	return tio;
+}
+
+static void __flush_target(struct clone_info *ci, struct dm_target *ti,
+			  unsigned flush_nr)
+{
+	struct dm_target_io *tio = alloc_tio(ci, ti);
+	struct bio *clone;
+
 	tio->info.flush_request = flush_nr;
 
 	clone = bio_alloc_bioset(GFP_NOIO, 0, ci->md->bs);
@@ -803,10 +806,7 @@ static int __clone_and_map(struct clone_info *ci)
 	/*
 	 * Allocate a target io object.
 	 */
-	tio = alloc_tio(ci->md);
-	tio->io = ci->io;
-	tio->ti = ti;
-	memset(&tio->info, 0, sizeof(tio->info));
+	tio = alloc_tio(ci, ti);
 
 	if (ci->sector_count <= max) {
 		/*
@@ -862,10 +862,7 @@ static int __clone_and_map(struct clone_info *ci)
 
 				max = max_io_len(ci->md, ci->sector, ti);
 
-				tio = alloc_tio(ci->md);
-				tio->io = ci->io;
-				tio->ti = ti;
-				memset(&tio->info, 0, sizeof(tio->info));
+				tio = alloc_tio(ci, ti);
 			}
 
 			len = min(remaining, max);
-- 
cgit v1.2.3


From 52b1fd5a27c625c78373e024bf570af3c9d44a79 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:21 +0100
Subject: dm: send empty barriers to targets in dm_flush

Pass empty barrier flushes to the targets in dm_flush().

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index edf9f2467691..36142e947ffc 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -164,6 +164,9 @@ struct mapped_device {
 
 	/* sysfs handle */
 	struct kobject kobj;
+
+	/* zero-length barrier that will be cloned and submitted to targets */
+	struct bio barrier_bio;
 };
 
 #define MIN_IOS 256
@@ -1477,6 +1480,13 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 static void dm_flush(struct mapped_device *md)
 {
 	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+
+	bio_init(&md->barrier_bio);
+	md->barrier_bio.bi_bdev = md->bdev;
+	md->barrier_bio.bi_rw = WRITE_BARRIER;
+	__split_and_process_bio(md, &md->barrier_bio);
+
+	dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
 }
 
 static void process_barrier(struct mapped_device *md, struct bio *bio)
-- 
cgit v1.2.3


From 433bcac5645508b71eab2710b6817c3ef937eba8 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:22 +0100
Subject: dm: linear support flush

Flush support for the linear target.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-linear.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 79fb53e51c70..ecbb17421da4 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -53,6 +53,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad;
 	}
 
+	ti->num_flush_requests = 1;
 	ti->private = lc;
 	return 0;
 
@@ -81,7 +82,8 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
 	struct linear_c *lc = ti->private;
 
 	bio->bi_bdev = lc->dev->bdev;
-	bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
+	if (bio_sectors(bio))
+		bio->bi_sector = linear_map_sector(ti, bio->bi_sector);
 }
 
 static int linear_map(struct dm_target *ti, struct bio *bio,
-- 
cgit v1.2.3


From 374bf7e7f6cc38b0483351a2029a97910eadde1b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:22 +0100
Subject: dm: stripe support flush

Flush support for the stripe target.

This sets ti->num_flush_requests to the number of stripes and
remaps individual flush requests to the appropriate stripe devices.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-stripe.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 41569bc60abc..c64fe827a5f1 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -167,6 +167,7 @@ static int stripe_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	sc->stripes = stripes;
 	sc->stripe_width = width;
 	ti->split_io = chunk_size;
+	ti->num_flush_requests = stripes;
 
 	sc->chunk_mask = ((sector_t) chunk_size) - 1;
 	for (sc->chunk_shift = 0; chunk_size; sc->chunk_shift++)
@@ -211,10 +212,18 @@ static int stripe_map(struct dm_target *ti, struct bio *bio,
 		      union map_info *map_context)
 {
 	struct stripe_c *sc = (struct stripe_c *) ti->private;
+	sector_t offset, chunk;
+	uint32_t stripe;
 
-	sector_t offset = bio->bi_sector - ti->begin;
-	sector_t chunk = offset >> sc->chunk_shift;
-	uint32_t stripe = sector_div(chunk, sc->stripes);
+	if (unlikely(bio_empty_barrier(bio))) {
+		BUG_ON(map_context->flush_request >= sc->stripes);
+		bio->bi_bdev = sc->stripe[map_context->flush_request].dev->bdev;
+		return DM_MAPIO_REMAPPED;
+	}
+
+	offset = bio->bi_sector - ti->begin;
+	chunk = offset >> sc->chunk_shift;
+	stripe = sector_div(chunk, sc->stripes);
 
 	bio->bi_bdev = sc->stripe[stripe].dev->bdev;
 	bio->bi_sector = sc->stripe[stripe].physical_start +
-- 
cgit v1.2.3


From 647c7db14ef9cacc4ccb3683e206b61f0de6dc2b Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:23 +0100
Subject: dm crypt: support flush

Flush support for dm-crypt target.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-crypt.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 53394e863c74..04db6c4004a8 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1132,6 +1132,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		goto bad_crypt_queue;
 	}
 
+	ti->num_flush_requests = 1;
 	ti->private = cc;
 	return 0;
 
@@ -1189,6 +1190,13 @@ static int crypt_map(struct dm_target *ti, struct bio *bio,
 		     union map_info *map_context)
 {
 	struct dm_crypt_io *io;
+	struct crypt_config *cc;
+
+	if (unlikely(bio_empty_barrier(bio))) {
+		cc = ti->private;
+		bio->bi_bdev = cc->dev->bdev;
+		return DM_MAPIO_REMAPPED;
+	}
 
 	io = crypt_io_alloc(ti, bio, bio->bi_sector - ti->begin);
 
-- 
cgit v1.2.3


From c927259e34e518d913d86f51c71b786a513f94d6 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:23 +0100
Subject: dm delay: support barriers

Flush support for dm-delay target.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-delay.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 559dbb52bc85..8ad8a9044bbf 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -197,6 +197,7 @@ out:
 	mutex_init(&dc->timer_lock);
 	atomic_set(&dc->may_delay, 1);
 
+	ti->num_flush_requests = 1;
 	ti->private = dc;
 	return 0;
 
@@ -278,8 +279,9 @@ static int delay_map(struct dm_target *ti, struct bio *bio,
 
 	if ((bio_data_dir(bio) == WRITE) && (dc->dev_write)) {
 		bio->bi_bdev = dc->dev_write->bdev;
-		bio->bi_sector = dc->start_write +
-				 (bio->bi_sector - ti->begin);
+		if (bio_sectors(bio))
+			bio->bi_sector = dc->start_write +
+					 (bio->bi_sector - ti->begin);
 
 		return delay_bio(dc, dc->write_delay, bio);
 	}
-- 
cgit v1.2.3


From 8627921fa2ef6d40fd9b787e163ba3a9ff8f471d Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:24 +0100
Subject: dm mpath: support barriers

Flush support for dm-multipath target.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 92bdcbb7c935..f1cf8f7449d6 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -835,6 +835,8 @@ static int multipath_ctr(struct dm_target *ti, unsigned int argc,
 		goto bad;
 	}
 
+	ti->num_flush_requests = 1;
+
 	return 0;
 
  bad:
-- 
cgit v1.2.3


From 494b3ee7d4f69210def80aecce28d08c3f0755d5 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:25 +0100
Subject: dm snapshot: support barriers

Flush support for dm-snapshot target.

This patch just forwards the flush request to either the origin or the snapshot
device.  (It doesn't flush exception store metadata.)

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index d73f17fc7778..d573165cd2b7 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -678,6 +678,7 @@ static int snapshot_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 
 	ti->private = s;
 	ti->split_io = s->store->chunk_size;
+	ti->num_flush_requests = 1;
 
 	return 0;
 
@@ -1030,6 +1031,11 @@ static int snapshot_map(struct dm_target *ti, struct bio *bio,
 	chunk_t chunk;
 	struct dm_snap_pending_exception *pe = NULL;
 
+	if (unlikely(bio_empty_barrier(bio))) {
+		bio->bi_bdev = s->store->cow->bdev;
+		return DM_MAPIO_REMAPPED;
+	}
+
 	chunk = sector_to_chunk(s->store, bio->bi_sector);
 
 	/* Full snapshots are not usable */
@@ -1338,6 +1344,8 @@ static int origin_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 	}
 
 	ti->private = dev;
+	ti->num_flush_requests = 1;
+
 	return 0;
 }
 
@@ -1353,6 +1361,9 @@ static int origin_map(struct dm_target *ti, struct bio *bio,
 	struct dm_dev *dev = ti->private;
 	bio->bi_bdev = dev->bdev;
 
+	if (unlikely(bio_empty_barrier(bio)))
+		return DM_MAPIO_REMAPPED;
+
 	/* Only tell snapshots if this is a write */
 	return (bio_rw(bio) == WRITE) ? do_origin(dev, bio) : DM_MAPIO_REMAPPED;
 }
-- 
cgit v1.2.3


From 5af443a7e1c0864100cc44525a9821aa2a2f4719 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:25 +0100
Subject: dm io: record eopnotsupp

Add another field, eopnotsupp_bits. It is subset of error_bits, representing
regions that returned -EOPNOTSUPP.  (The bit is set in both error_bits and
eopnotsupp_bits).

This value will be used in further patches.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-io.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index e73aabd61cd7..a89f41f00757 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -22,6 +22,7 @@ struct dm_io_client {
 /* FIXME: can we shrink this ? */
 struct io {
 	unsigned long error_bits;
+	unsigned long eopnotsupp_bits;
 	atomic_t count;
 	struct task_struct *sleeper;
 	struct dm_io_client *client;
@@ -107,8 +108,11 @@ static inline unsigned bio_get_region(struct bio *bio)
  *---------------------------------------------------------------*/
 static void dec_count(struct io *io, unsigned int region, int error)
 {
-	if (error)
+	if (error) {
 		set_bit(region, &io->error_bits);
+		if (error == -EOPNOTSUPP)
+			set_bit(region, &io->eopnotsupp_bits);
+	}
 
 	if (atomic_dec_and_test(&io->count)) {
 		if (io->sleeper)
@@ -361,6 +365,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 	}
 
 	io.error_bits = 0;
+	io.eopnotsupp_bits = 0;
 	atomic_set(&io.count, 1); /* see dispatch_io() */
 	io.sleeper = current;
 	io.client = client;
@@ -397,6 +402,7 @@ static int async_io(struct dm_io_client *client, unsigned int num_regions,
 
 	io = mempool_alloc(client->pool, GFP_NOIO);
 	io->error_bits = 0;
+	io->eopnotsupp_bits = 0;
 	atomic_set(&io->count, 1); /* see dispatch_io() */
 	io->sleeper = NULL;
 	io->client = client;
-- 
cgit v1.2.3


From 51aa322849581f1a73594e48ea0df63f914ee6a2 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:26 +0100
Subject: dm io: retry after barrier error

If -EOPNOTSUPP was returned and the request was a barrier request, retry it
without barrier.

Retry all regions for now. Barriers are submitted only for one-region requests,
so it doesn't matter.  (In the future, retries can be limited to the actual
regions that failed.)

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-io.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index a89f41f00757..3a2e6a2f8bdd 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -364,6 +364,7 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 		return -EIO;
 	}
 
+retry:
 	io.error_bits = 0;
 	io.eopnotsupp_bits = 0;
 	atomic_set(&io.count, 1); /* see dispatch_io() */
@@ -382,6 +383,11 @@ static int sync_io(struct dm_io_client *client, unsigned int num_regions,
 	}
 	set_current_state(TASK_RUNNING);
 
+	if (io.eopnotsupp_bits && (rw & (1 << BIO_RW_BARRIER))) {
+		rw &= ~(1 << BIO_RW_BARRIER);
+		goto retry;
+	}
+
 	if (error_bits)
 		*error_bits = io.error_bits;
 
-- 
cgit v1.2.3


From 2bd023452592e5f5cf90dd426cc39b7632b15b76 Mon Sep 17 00:00:00 2001
From: Mikulas Patocka <mpatocka@redhat.com>
Date: Mon, 22 Jun 2009 10:12:26 +0100
Subject: dm snapshot: use barrier when writing exception store

Send barrier requests when updating the exception area.

Exception area updates need to be ordered w.r.t. data writes, so that
the writes are not reordered in hardware disk cache.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-snap-persistent.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c
index 2662a41337e7..6e3fe4f14934 100644
--- a/drivers/md/dm-snap-persistent.c
+++ b/drivers/md/dm-snap-persistent.c
@@ -636,7 +636,7 @@ static void persistent_commit_exception(struct dm_exception_store *store,
 	/*
 	 * Commit exceptions to disk.
 	 */
-	if (ps->valid && area_io(ps, WRITE))
+	if (ps->valid && area_io(ps, WRITE_BARRIER))
 		ps->valid = 0;
 
 	/*
-- 
cgit v1.2.3


From 02ab823fd1a27d193bda06b74fdad685a20a3e5e Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Mon, 22 Jun 2009 10:12:27 +0100
Subject: dm mpath: add start_io and nr_bytes to path selectors

This patch makes two additions to the dm path selector interface for
dynamic load balancers:
  o a new hook, start_io()
  o a new parameter 'nr_bytes' to select_path()/start_io()/end_io()
    to pass the size of the I/O

start_io() is called when a target driver actually submits I/O
to the selected path.
Path selectors can use it to start accounting of the I/O.
(e.g. counting the number of in-flight I/Os.)
The start_io hook is based on the patch posted by Stefan Bader:
https://www.redhat.com/archives/dm-devel/2005-October/msg00050.html

nr_bytes, the size of the I/O, is so path selectors can take the
size of the I/O into account when deciding which path to use.
dm-service-time uses it to estimate service time, for example.
(Added the nr_bytes member to dm_mpath_io instead of using existing
 details.bi_size, since request-based dm patch deletes it.)

Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c         | 28 ++++++++++++++++++----------
 drivers/md/dm-path-selector.h |  8 ++++++--
 drivers/md/dm-round-robin.c   |  2 +-
 3 files changed, 25 insertions(+), 13 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f1cf8f7449d6..890c0e8ed13e 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -101,6 +101,7 @@ struct multipath {
 struct dm_mpath_io {
 	struct pgpath *pgpath;
 	struct dm_bio_details details;
+	size_t nr_bytes;
 };
 
 typedef int (*action_fn) (struct pgpath *pgpath);
@@ -244,11 +245,12 @@ static void __switch_pg(struct multipath *m, struct pgpath *pgpath)
 	m->pg_init_count = 0;
 }
 
-static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
+static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg,
+			       size_t nr_bytes)
 {
 	struct dm_path *path;
 
-	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count);
+	path = pg->ps.type->select_path(&pg->ps, &m->repeat_count, nr_bytes);
 	if (!path)
 		return -ENXIO;
 
@@ -260,7 +262,7 @@ static int __choose_path_in_pg(struct multipath *m, struct priority_group *pg)
 	return 0;
 }
 
-static void __choose_pgpath(struct multipath *m)
+static void __choose_pgpath(struct multipath *m, size_t nr_bytes)
 {
 	struct priority_group *pg;
 	unsigned bypassed = 1;
@@ -272,12 +274,12 @@ static void __choose_pgpath(struct multipath *m)
 	if (m->next_pg) {
 		pg = m->next_pg;
 		m->next_pg = NULL;
-		if (!__choose_path_in_pg(m, pg))
+		if (!__choose_path_in_pg(m, pg, nr_bytes))
 			return;
 	}
 
 	/* Don't change PG until it has no remaining paths */
-	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg))
+	if (m->current_pg && !__choose_path_in_pg(m, m->current_pg, nr_bytes))
 		return;
 
 	/*
@@ -289,7 +291,7 @@ static void __choose_pgpath(struct multipath *m)
 		list_for_each_entry(pg, &m->priority_groups, list) {
 			if (pg->bypassed == bypassed)
 				continue;
-			if (!__choose_path_in_pg(m, pg))
+			if (!__choose_path_in_pg(m, pg, nr_bytes))
 				return;
 		}
 	} while (bypassed--);
@@ -320,6 +322,7 @@ static int map_io(struct multipath *m, struct bio *bio,
 		  struct dm_mpath_io *mpio, unsigned was_queued)
 {
 	int r = DM_MAPIO_REMAPPED;
+	size_t nr_bytes = bio->bi_size;
 	unsigned long flags;
 	struct pgpath *pgpath;
 
@@ -328,7 +331,7 @@ static int map_io(struct multipath *m, struct bio *bio,
 	/* Do we need to select a new pgpath? */
 	if (!m->current_pgpath ||
 	    (!m->queue_io && (m->repeat_count && --m->repeat_count == 0)))
-		__choose_pgpath(m);
+		__choose_pgpath(m, nr_bytes);
 
 	pgpath = m->current_pgpath;
 
@@ -353,6 +356,11 @@ static int map_io(struct multipath *m, struct bio *bio,
 		r = -EIO;	/* Failed */
 
 	mpio->pgpath = pgpath;
+	mpio->nr_bytes = nr_bytes;
+
+	if (r == DM_MAPIO_REMAPPED && pgpath->pg->ps.type->start_io)
+		pgpath->pg->ps.type->start_io(&pgpath->pg->ps, &pgpath->path,
+					      nr_bytes);
 
 	spin_unlock_irqrestore(&m->lock, flags);
 
@@ -431,7 +439,7 @@ static void process_queued_ios(struct work_struct *work)
 		goto out;
 
 	if (!m->current_pgpath)
-		__choose_pgpath(m);
+		__choose_pgpath(m, 0);
 
 	pgpath = m->current_pgpath;
 
@@ -1209,7 +1217,7 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
 	if (pgpath) {
 		ps = &pgpath->pg->ps;
 		if (ps->type->end_io)
-			ps->type->end_io(ps, &pgpath->path);
+			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
 	if (r != DM_ENDIO_INCOMPLETE)
 		mempool_free(mpio, m->mpio_pool);
@@ -1425,7 +1433,7 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	spin_lock_irqsave(&m->lock, flags);
 
 	if (!m->current_pgpath)
-		__choose_pgpath(m);
+		__choose_pgpath(m, 0);
 
 	if (m->current_pgpath) {
 		bdev = m->current_pgpath->path.dev->bdev;
diff --git a/drivers/md/dm-path-selector.h b/drivers/md/dm-path-selector.h
index 27357b85d73d..e7d1fa8b0459 100644
--- a/drivers/md/dm-path-selector.h
+++ b/drivers/md/dm-path-selector.h
@@ -56,7 +56,8 @@ struct path_selector_type {
 	 * the path fails.
 	 */
 	struct dm_path *(*select_path) (struct path_selector *ps,
-				     unsigned *repeat_count);
+					unsigned *repeat_count,
+					size_t nr_bytes);
 
 	/*
 	 * Notify the selector that a path has failed.
@@ -75,7 +76,10 @@ struct path_selector_type {
 	int (*status) (struct path_selector *ps, struct dm_path *path,
 		       status_type_t type, char *result, unsigned int maxlen);
 
-	int (*end_io) (struct path_selector *ps, struct dm_path *path);
+	int (*start_io) (struct path_selector *ps, struct dm_path *path,
+			 size_t nr_bytes);
+	int (*end_io) (struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes);
 };
 
 /* Register a path selector */
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index cdfbf65b28cb..24752f449bef 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -161,7 +161,7 @@ static int rr_reinstate_path(struct path_selector *ps, struct dm_path *p)
 }
 
 static struct dm_path *rr_select_path(struct path_selector *ps,
-				   unsigned *repeat_count)
+				      unsigned *repeat_count, size_t nr_bytes)
 {
 	struct selector *s = (struct selector *) ps->context;
 	struct path_info *pi = NULL;
-- 
cgit v1.2.3


From fd5e033908b7b743b5650790f196761dd930f988 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Mon, 22 Jun 2009 10:12:27 +0100
Subject: dm mpath: add queue length load balancer

This patch adds a dynamic load balancer, dm-queue-length, which
balances the number of in-flight I/Os across the paths.

The code is based on the patch posted by Stefan Bader:
https://www.redhat.com/archives/dm-devel/2005-October/msg00050.html

Signed-off-by: Stefan Bader <stefan.bader@canonical.com>
Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-queue-length.txt |  39 ++++
 drivers/md/Kconfig                              |   9 +
 drivers/md/Makefile                             |   1 +
 drivers/md/dm-queue-length.c                    | 263 ++++++++++++++++++++++++
 4 files changed, 312 insertions(+)
 create mode 100644 Documentation/device-mapper/dm-queue-length.txt
 create mode 100644 drivers/md/dm-queue-length.c

(limited to 'drivers/md')

diff --git a/Documentation/device-mapper/dm-queue-length.txt b/Documentation/device-mapper/dm-queue-length.txt
new file mode 100644
index 000000000000..f4db2562175c
--- /dev/null
+++ b/Documentation/device-mapper/dm-queue-length.txt
@@ -0,0 +1,39 @@
+dm-queue-length
+===============
+
+dm-queue-length is a path selector module for device-mapper targets,
+which selects a path with the least number of in-flight I/Os.
+The path selector name is 'queue-length'.
+
+Table parameters for each path: [<repeat_count>]
+	<repeat_count>: The number of I/Os to dispatch using the selected
+			path before switching to the next path.
+			If not given, internal default is used. To check
+			the default value, see the activated table.
+
+Status for each path: <status> <fail-count> <in-flight>
+	<status>: 'A' if the path is active, 'F' if the path is failed.
+	<fail-count>: The number of path failures.
+	<in-flight>: The number of in-flight I/Os on the path.
+
+
+Algorithm
+=========
+
+dm-queue-length increments/decrements 'in-flight' when an I/O is
+dispatched/completed respectively.
+dm-queue-length selects a path with the minimum 'in-flight'.
+
+
+Examples
+========
+In case that 2 paths (sda and sdb) are used with repeat_count == 128.
+
+# echo "0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128" \
+  dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 queue-length 0 2 1 8:0 128 8:16 128
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 1 8:0 A 0 0 8:16 A 0 0
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 36e0675be9f7..3b311d273346 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -249,6 +249,15 @@ config DM_MULTIPATH
 	---help---
 	  Allow volume managers to support multipath hardware.
 
+config DM_MULTIPATH_QL
+	tristate "I/O Path Selector based on the number of in-flight I/Os"
+	depends on DM_MULTIPATH
+	---help---
+	  This path selector is a dynamic load balancer which selects
+	  the path with the least number of in-flight I/Os.
+
+	  If unsure, say N.
+
 config DM_DELAY
 	tristate "I/O delaying target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 45cc5951d928..ff9f545dd516 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_DM)	+= dm-mod.o
 obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
+obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
new file mode 100644
index 000000000000..f92b6cea9d9c
--- /dev/null
+++ b/drivers/md/dm-queue-length.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright (C) 2004-2005 IBM Corp.  All Rights Reserved.
+ * Copyright (C) 2006-2009 NEC Corporation.
+ *
+ * dm-queue-length.c
+ *
+ * Module Author: Stefan Bader, IBM
+ * Modified by: Kiyoshi Ueda, NEC
+ *
+ * This file is released under the GPL.
+ *
+ * queue-length path selector - choose a path with the least number of
+ * in-flight I/Os.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#include <linux/slab.h>
+#include <linux/ctype.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+#include <asm/atomic.h>
+
+#define DM_MSG_PREFIX	"multipath queue-length"
+#define QL_MIN_IO	128
+#define QL_VERSION	"0.1.0"
+
+struct selector {
+	struct list_head	valid_paths;
+	struct list_head	failed_paths;
+};
+
+struct path_info {
+	struct list_head	list;
+	struct dm_path		*path;
+	unsigned		repeat_count;
+	atomic_t		qlen;	/* the number of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (s) {
+		INIT_LIST_HEAD(&s->valid_paths);
+		INIT_LIST_HEAD(&s->failed_paths);
+	}
+
+	return s;
+}
+
+static int ql_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+	struct selector *s = alloc_selector();
+
+	if (!s)
+		return -ENOMEM;
+
+	ps->context = s;
+	return 0;
+}
+
+static void ql_free_paths(struct list_head *paths)
+{
+	struct path_info *pi, *next;
+
+	list_for_each_entry_safe(pi, next, paths, list) {
+		list_del(&pi->list);
+		kfree(pi);
+	}
+}
+
+static void ql_destroy(struct path_selector *ps)
+{
+	struct selector *s = ps->context;
+
+	ql_free_paths(&s->valid_paths);
+	ql_free_paths(&s->failed_paths);
+	kfree(s);
+	ps->context = NULL;
+}
+
+static int ql_status(struct path_selector *ps, struct dm_path *path,
+		     status_type_t type, char *result, unsigned maxlen)
+{
+	unsigned sz = 0;
+	struct path_info *pi;
+
+	/* When called with NULL path, return selector status/args. */
+	if (!path)
+		DMEMIT("0 ");
+	else {
+		pi = path->pscontext;
+
+		switch (type) {
+		case STATUSTYPE_INFO:
+			DMEMIT("%d ", atomic_read(&pi->qlen));
+			break;
+		case STATUSTYPE_TABLE:
+			DMEMIT("%u ", pi->repeat_count);
+			break;
+		}
+	}
+
+	return sz;
+}
+
+static int ql_add_path(struct path_selector *ps, struct dm_path *path,
+		       int argc, char **argv, char **error)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi;
+	unsigned repeat_count = QL_MIN_IO;
+
+	/*
+	 * Arguments: [<repeat_count>]
+	 * 	<repeat_count>: The number of I/Os before switching path.
+	 * 			If not given, default (QL_MIN_IO) is used.
+	 */
+	if (argc > 1) {
+		*error = "queue-length ps: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+		*error = "queue-length ps: invalid repeat count";
+		return -EINVAL;
+	}
+
+	/* Allocate the path information structure */
+	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+	if (!pi) {
+		*error = "queue-length ps: Error allocating path information";
+		return -ENOMEM;
+	}
+
+	pi->path = path;
+	pi->repeat_count = repeat_count;
+	atomic_set(&pi->qlen, 0);
+
+	path->pscontext = pi;
+
+	list_add_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static void ql_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move(&pi->list, &s->failed_paths);
+}
+
+static int ql_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+/*
+ * Select a path having the minimum number of in-flight I/Os
+ */
+static struct dm_path *ql_select_path(struct path_selector *ps,
+				      unsigned *repeat_count, size_t nr_bytes)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = NULL, *best = NULL;
+
+	if (list_empty(&s->valid_paths))
+		return NULL;
+
+	/* Change preferred (first in list) path to evenly balance. */
+	list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+	list_for_each_entry(pi, &s->valid_paths, list) {
+		if (!best ||
+		    (atomic_read(&pi->qlen) < atomic_read(&best->qlen)))
+			best = pi;
+
+		if (!atomic_read(&best->qlen))
+			break;
+	}
+
+	if (!best)
+		return NULL;
+
+	*repeat_count = best->repeat_count;
+
+	return best->path;
+}
+
+static int ql_start_io(struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_inc(&pi->qlen);
+
+	return 0;
+}
+
+static int ql_end_io(struct path_selector *ps, struct dm_path *path,
+		     size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_dec(&pi->qlen);
+
+	return 0;
+}
+
+static struct path_selector_type ql_ps = {
+	.name		= "queue-length",
+	.module		= THIS_MODULE,
+	.table_args	= 1,
+	.info_args	= 1,
+	.create		= ql_create,
+	.destroy	= ql_destroy,
+	.status		= ql_status,
+	.add_path	= ql_add_path,
+	.fail_path	= ql_fail_path,
+	.reinstate_path	= ql_reinstate_path,
+	.select_path	= ql_select_path,
+	.start_io	= ql_start_io,
+	.end_io		= ql_end_io,
+};
+
+static int __init dm_ql_init(void)
+{
+	int r = dm_register_path_selector(&ql_ps);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	DMINFO("version " QL_VERSION " loaded");
+
+	return r;
+}
+
+static void __exit dm_ql_exit(void)
+{
+	int r = dm_unregister_path_selector(&ql_ps);
+
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+}
+
+module_init(dm_ql_init);
+module_exit(dm_ql_exit);
+
+MODULE_AUTHOR("Stefan Bader <Stefan.Bader at de.ibm.com>");
+MODULE_DESCRIPTION(
+	"(C) Copyright IBM Corp. 2004,2005   All Rights Reserved.\n"
+	DM_NAME " path selector to balance the number of in-flight I/Os"
+);
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From f392ba889b019602976082bfe7bf486c2594f85c Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Mon, 22 Jun 2009 10:12:28 +0100
Subject: dm mpath: add service time load balancer

This patch adds a service time oriented dynamic load balancer,
dm-service-time, which selects the path with the shortest estimated
service time for the incoming I/O.
The service time is estimated by dividing the in-flight I/O size
by a performance value of each path.

The performance value can be given as a table argument at the table
loading time.  If no performance value is given, all paths are
considered equal.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-service-time.txt |  91 +++++++
 drivers/md/Kconfig                              |  10 +
 drivers/md/Makefile                             |   1 +
 drivers/md/dm-service-time.c                    | 339 ++++++++++++++++++++++++
 4 files changed, 441 insertions(+)
 create mode 100644 Documentation/device-mapper/dm-service-time.txt
 create mode 100644 drivers/md/dm-service-time.c

(limited to 'drivers/md')

diff --git a/Documentation/device-mapper/dm-service-time.txt b/Documentation/device-mapper/dm-service-time.txt
new file mode 100644
index 000000000000..7d00668e97bb
--- /dev/null
+++ b/Documentation/device-mapper/dm-service-time.txt
@@ -0,0 +1,91 @@
+dm-service-time
+===============
+
+dm-service-time is a path selector module for device-mapper targets,
+which selects a path with the shortest estimated service time for
+the incoming I/O.
+
+The service time for each path is estimated by dividing the total size
+of in-flight I/Os on a path with the performance value of the path.
+The performance value is a relative throughput value among all paths
+in a path-group, and it can be specified as a table argument.
+
+The path selector name is 'service-time'.
+
+Table parameters for each path: [<repeat_count> [<relative_throughput>]]
+	<repeat_count>: The number of I/Os to dispatch using the selected
+			path before switching to the next path.
+			If not given, internal default is used.  To check
+			the default value, see the activated table.
+	<relative_throughput>: The relative throughput value of the path
+			among all paths in the path-group.
+			The valid range is 0-100.
+			If not given, minimum value '1' is used.
+			If '0' is given, the path isn't selected while
+			other paths having a positive value are available.
+
+Status for each path: <status> <fail-count> <in-flight-size> \
+		      <relative_throughput>
+	<status>: 'A' if the path is active, 'F' if the path is failed.
+	<fail-count>: The number of path failures.
+	<in-flight-size>: The size of in-flight I/Os on the path.
+	<relative_throughput>: The relative throughput value of the path
+			among all paths in the path-group.
+
+
+Algorithm
+=========
+
+dm-service-time adds the I/O size to 'in-flight-size' when the I/O is
+dispatched and substracts when completed.
+Basically, dm-service-time selects a path having minimum service time
+which is calculated by:
+
+	('in-flight-size' + 'size-of-incoming-io') / 'relative_throughput'
+
+However, some optimizations below are used to reduce the calculation
+as much as possible.
+
+	1. If the paths have the same 'relative_throughput', skip
+	   the division and just compare the 'in-flight-size'.
+
+	2. If the paths have the same 'in-flight-size', skip the division
+	   and just compare the 'relative_throughput'.
+
+	3. If some paths have non-zero 'relative_throughput' and others
+	   have zero 'relative_throughput', ignore those paths with zero
+	   'relative_throughput'.
+
+If such optimizations can't be applied, calculate service time, and
+compare service time.
+If calculated service time is equal, the path having maximum
+'relative_throughput' may be better.  So compare 'relative_throughput'
+then.
+
+
+Examples
+========
+In case that 2 paths (sda and sdb) are used with repeat_count == 128
+and sda has an average throughput 1GB/s and sdb has 4GB/s,
+'relative_throughput' value may be '1' for sda and '4' for sdb.
+
+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4" \
+  dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 1 8:16 128 4
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 1 8:16 A 0 0 4
+
+
+Or '2' for sda and '8' for sdb would be also true.
+
+# echo "0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8" \
+  dmsetup create test
+#
+# dmsetup table
+test: 0 10 multipath 0 0 1 1 service-time 0 2 2 8:0 128 2 8:16 128 8
+#
+# dmsetup status
+test: 0 10 multipath 2 0 0 0 1 1 E 0 2 2 8:0 A 0 0 2 8:16 A 0 0 8
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 3b311d273346..09f93fa68912 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -258,6 +258,16 @@ config DM_MULTIPATH_QL
 
 	  If unsure, say N.
 
+config DM_MULTIPATH_ST
+	tristate "I/O Path Selector based on the service time"
+	depends on DM_MULTIPATH
+	---help---
+	  This path selector is a dynamic load balancer which selects
+	  the path expected to complete the incoming I/O in the shortest
+	  time.
+
+	  If unsure, say N.
+
 config DM_DELAY
 	tristate "I/O delaying target (EXPERIMENTAL)"
 	depends on BLK_DEV_DM && EXPERIMENTAL
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index ff9f545dd516..dade52f60733 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -37,6 +37,7 @@ obj-$(CONFIG_DM_CRYPT)		+= dm-crypt.o
 obj-$(CONFIG_DM_DELAY)		+= dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH)	+= dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
+obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
new file mode 100644
index 000000000000..cfa668f46c40
--- /dev/null
+++ b/drivers/md/dm-service-time.c
@@ -0,0 +1,339 @@
+/*
+ * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
+ *
+ * Module Author: Kiyoshi Ueda
+ *
+ * This file is released under the GPL.
+ *
+ * Throughput oriented path selector.
+ */
+
+#include "dm.h"
+#include "dm-path-selector.h"
+
+#define DM_MSG_PREFIX	"multipath service-time"
+#define ST_MIN_IO	1
+#define ST_MAX_RELATIVE_THROUGHPUT	100
+#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7
+#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
+#define ST_VERSION	"0.2.0"
+
+struct selector {
+	struct list_head valid_paths;
+	struct list_head failed_paths;
+};
+
+struct path_info {
+	struct list_head list;
+	struct dm_path *path;
+	unsigned repeat_count;
+	unsigned relative_throughput;
+	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
+};
+
+static struct selector *alloc_selector(void)
+{
+	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
+
+	if (s) {
+		INIT_LIST_HEAD(&s->valid_paths);
+		INIT_LIST_HEAD(&s->failed_paths);
+	}
+
+	return s;
+}
+
+static int st_create(struct path_selector *ps, unsigned argc, char **argv)
+{
+	struct selector *s = alloc_selector();
+
+	if (!s)
+		return -ENOMEM;
+
+	ps->context = s;
+	return 0;
+}
+
+static void free_paths(struct list_head *paths)
+{
+	struct path_info *pi, *next;
+
+	list_for_each_entry_safe(pi, next, paths, list) {
+		list_del(&pi->list);
+		kfree(pi);
+	}
+}
+
+static void st_destroy(struct path_selector *ps)
+{
+	struct selector *s = ps->context;
+
+	free_paths(&s->valid_paths);
+	free_paths(&s->failed_paths);
+	kfree(s);
+	ps->context = NULL;
+}
+
+static int st_status(struct path_selector *ps, struct dm_path *path,
+		     status_type_t type, char *result, unsigned maxlen)
+{
+	unsigned sz = 0;
+	struct path_info *pi;
+
+	if (!path)
+		DMEMIT("0 ");
+	else {
+		pi = path->pscontext;
+
+		switch (type) {
+		case STATUSTYPE_INFO:
+			DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
+			       pi->relative_throughput);
+			break;
+		case STATUSTYPE_TABLE:
+			DMEMIT("%u %u ", pi->repeat_count,
+			       pi->relative_throughput);
+			break;
+		}
+	}
+
+	return sz;
+}
+
+static int st_add_path(struct path_selector *ps, struct dm_path *path,
+		       int argc, char **argv, char **error)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi;
+	unsigned repeat_count = ST_MIN_IO;
+	unsigned relative_throughput = 1;
+
+	/*
+	 * Arguments: [<repeat_count> [<relative_throughput>]]
+	 * 	<repeat_count>: The number of I/Os before switching path.
+	 * 			If not given, default (ST_MIN_IO) is used.
+	 * 	<relative_throughput>: The relative throughput value of
+	 *			the path among all paths in the path-group.
+	 * 			The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
+	 *			If not given, minimum value '1' is used.
+	 *			If '0' is given, the path isn't selected while
+	 * 			other paths having a positive value are
+	 * 			available.
+	 */
+	if (argc > 2) {
+		*error = "service-time ps: incorrect number of arguments";
+		return -EINVAL;
+	}
+
+	if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
+		*error = "service-time ps: invalid repeat count";
+		return -EINVAL;
+	}
+
+	if ((argc == 2) &&
+	    (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
+	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
+		*error = "service-time ps: invalid relative_throughput value";
+		return -EINVAL;
+	}
+
+	/* allocate the path */
+	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
+	if (!pi) {
+		*error = "service-time ps: Error allocating path context";
+		return -ENOMEM;
+	}
+
+	pi->path = path;
+	pi->repeat_count = repeat_count;
+	pi->relative_throughput = relative_throughput;
+	atomic_set(&pi->in_flight_size, 0);
+
+	path->pscontext = pi;
+
+	list_add_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+static void st_fail_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move(&pi->list, &s->failed_paths);
+}
+
+static int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = path->pscontext;
+
+	list_move_tail(&pi->list, &s->valid_paths);
+
+	return 0;
+}
+
+/*
+ * Compare the estimated service time of 2 paths, pi1 and pi2,
+ * for the incoming I/O.
+ *
+ * Returns:
+ * < 0 : pi1 is better
+ * 0   : no difference between pi1 and pi2
+ * > 0 : pi2 is better
+ *
+ * Description:
+ * Basically, the service time is estimated by:
+ *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
+ * To reduce the calculation, some optimizations are made.
+ * (See comments inline)
+ */
+static int st_compare_load(struct path_info *pi1, struct path_info *pi2,
+			   size_t incoming)
+{
+	size_t sz1, sz2, st1, st2;
+
+	sz1 = atomic_read(&pi1->in_flight_size);
+	sz2 = atomic_read(&pi2->in_flight_size);
+
+	/*
+	 * Case 1: Both have same throughput value. Choose less loaded path.
+	 */
+	if (pi1->relative_throughput == pi2->relative_throughput)
+		return sz1 - sz2;
+
+	/*
+	 * Case 2a: Both have same load. Choose higher throughput path.
+	 * Case 2b: One path has no throughput value. Choose the other one.
+	 */
+	if (sz1 == sz2 ||
+	    !pi1->relative_throughput || !pi2->relative_throughput)
+		return pi2->relative_throughput - pi1->relative_throughput;
+
+	/*
+	 * Case 3: Calculate service time. Choose faster path.
+	 *         Service time using pi1:
+	 *             st1 = (sz1 + incoming) / pi1->relative_throughput
+	 *         Service time using pi2:
+	 *             st2 = (sz2 + incoming) / pi2->relative_throughput
+	 *
+	 *         To avoid the division, transform the expression to use
+	 *         multiplication.
+	 *         Because ->relative_throughput > 0 here, if st1 < st2,
+	 *         the expressions below are the same meaning:
+	 *             (sz1 + incoming) / pi1->relative_throughput <
+	 *                 (sz2 + incoming) / pi2->relative_throughput
+	 *             (sz1 + incoming) * pi2->relative_throughput <
+	 *                 (sz2 + incoming) * pi1->relative_throughput
+	 *         So use the later one.
+	 */
+	sz1 += incoming;
+	sz2 += incoming;
+	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
+		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
+		/*
+		 * Size may be too big for multiplying pi->relative_throughput
+		 * and overflow.
+		 * To avoid the overflow and mis-selection, shift down both.
+		 */
+		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
+	}
+	st1 = sz1 * pi2->relative_throughput;
+	st2 = sz2 * pi1->relative_throughput;
+	if (st1 != st2)
+		return st1 - st2;
+
+	/*
+	 * Case 4: Service time is equal. Choose higher throughput path.
+	 */
+	return pi2->relative_throughput - pi1->relative_throughput;
+}
+
+static struct dm_path *st_select_path(struct path_selector *ps,
+				      unsigned *repeat_count, size_t nr_bytes)
+{
+	struct selector *s = ps->context;
+	struct path_info *pi = NULL, *best = NULL;
+
+	if (list_empty(&s->valid_paths))
+		return NULL;
+
+	/* Change preferred (first in list) path to evenly balance. */
+	list_move_tail(s->valid_paths.next, &s->valid_paths);
+
+	list_for_each_entry(pi, &s->valid_paths, list)
+		if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
+			best = pi;
+
+	if (!best)
+		return NULL;
+
+	*repeat_count = best->repeat_count;
+
+	return best->path;
+}
+
+static int st_start_io(struct path_selector *ps, struct dm_path *path,
+		       size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_add(nr_bytes, &pi->in_flight_size);
+
+	return 0;
+}
+
+static int st_end_io(struct path_selector *ps, struct dm_path *path,
+		     size_t nr_bytes)
+{
+	struct path_info *pi = path->pscontext;
+
+	atomic_sub(nr_bytes, &pi->in_flight_size);
+
+	return 0;
+}
+
+static struct path_selector_type st_ps = {
+	.name		= "service-time",
+	.module		= THIS_MODULE,
+	.table_args	= 2,
+	.info_args	= 2,
+	.create		= st_create,
+	.destroy	= st_destroy,
+	.status		= st_status,
+	.add_path	= st_add_path,
+	.fail_path	= st_fail_path,
+	.reinstate_path	= st_reinstate_path,
+	.select_path	= st_select_path,
+	.start_io	= st_start_io,
+	.end_io		= st_end_io,
+};
+
+static int __init dm_st_init(void)
+{
+	int r = dm_register_path_selector(&st_ps);
+
+	if (r < 0)
+		DMERR("register failed %d", r);
+
+	DMINFO("version " ST_VERSION " loaded");
+
+	return r;
+}
+
+static void __exit dm_st_exit(void)
+{
+	int r = dm_unregister_path_selector(&st_ps);
+
+	if (r < 0)
+		DMERR("unregister failed %d", r);
+}
+
+module_init(dm_st_init);
+module_exit(dm_st_exit);
+
+MODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
+MODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
+MODULE_LICENSE("GPL");
-- 
cgit v1.2.3


From 1b6da754594e6e26c24e6fbc1a34f9c03e4617a3 Mon Sep 17 00:00:00 2001
From: Jonthan Brassow <jbrassow@redhat.com>
Date: Mon, 22 Jun 2009 10:12:29 +0100
Subject: dm table: improve warning message when devices not freed before
 destruction

Report any devices forgotten to be freed before a table is destroyed.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 0e2210c0c168..af1ceae2582a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -267,6 +267,8 @@ static void free_devices(struct list_head *devices)
 	list_for_each_safe(tmp, next, devices) {
 		struct dm_dev_internal *dd =
 		    list_entry(tmp, struct dm_dev_internal, list);
+		DMWARN("dm_table_destroy: dm_put_device call missing for %s",
+		       dd->dm_dev.name);
 		kfree(dd);
 	}
 }
@@ -296,12 +298,8 @@ void dm_table_destroy(struct dm_table *t)
 	vfree(t->highs);
 
 	/* free the device list */
-	if (t->devices.next != &t->devices) {
-		DMWARN("devices still present during destroy: "
-		       "dm_table_remove_device calls missing");
-
+	if (t->devices.next != &t->devices)
 		free_devices(&t->devices);
-	}
 
 	kfree(t);
 }
-- 
cgit v1.2.3


From 486d220fe4909b5745c4faa67faddd30a707abe2 Mon Sep 17 00:00:00 2001
From: Peter Rajnoha <prajnoha@redhat.com>
Date: Mon, 22 Jun 2009 10:12:29 +0100
Subject: dm: sysfs add suspended attribute

Add a file named 'suspended' to each device-mapper device directory in
sysfs.  It holds the value 1 while the device is suspended.  Otherwise
it holds 0.

Signed-off-by: Peter Rajnoha <prajnoha@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-sysfs.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-sysfs.c b/drivers/md/dm-sysfs.c
index a2a45e6c7c8b..4b045903a4e2 100644
--- a/drivers/md/dm-sysfs.c
+++ b/drivers/md/dm-sysfs.c
@@ -57,12 +57,21 @@ static ssize_t dm_attr_uuid_show(struct mapped_device *md, char *buf)
 	return strlen(buf);
 }
 
+static ssize_t dm_attr_suspended_show(struct mapped_device *md, char *buf)
+{
+	sprintf(buf, "%d\n", dm_suspended(md));
+
+	return strlen(buf);
+}
+
 static DM_ATTR_RO(name);
 static DM_ATTR_RO(uuid);
+static DM_ATTR_RO(suspended);
 
 static struct attribute *dm_attrs[] = {
 	&dm_attr_name.attr,
 	&dm_attr_uuid.attr,
+	&dm_attr_suspended.attr,
 	NULL,
 };
 
-- 
cgit v1.2.3


From 60935eb21d3c5bac79618000f38f92c249d153c4 Mon Sep 17 00:00:00 2001
From: Milan Broz <mbroz@redhat.com>
Date: Mon, 22 Jun 2009 10:12:30 +0100
Subject: dm ioctl: support cookies for udev

Add support for passing a 32 bit "cookie" into the kernel with the
DM_SUSPEND, DM_DEV_RENAME and DM_DEV_REMOVE ioctls.  The (unsigned)
value of this cookie is returned to userspace alongside the uevents
issued by these ioctls in the variable DM_COOKIE.

This means the userspace process issuing these ioctls can be notified
by udev after udev has completed any actions triggered.

To minimise the interface extension, we pass the cookie into the
kernel in the event_nr field which is otherwise unused when calling
these ioctls.  Incrementing the version number allows userspace to
determine in advance whether or not the kernel supports the cookie.
If the kernel does support this but userspace does not, there should
be no impact as the new variable will just get ignored.

Signed-off-by: Milan Broz <mbroz@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c    | 14 ++++++++++----
 drivers/md/dm.c          | 25 +++++++++++++++++++------
 drivers/md/dm.h          |  3 ++-
 include/linux/dm-ioctl.h | 14 ++++++++++++--
 4 files changed, 43 insertions(+), 13 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1128d3fba797..1c871736f48c 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -276,7 +276,7 @@ retry:
 	up_write(&_hash_lock);
 }
 
-static int dm_hash_rename(const char *old, const char *new)
+static int dm_hash_rename(uint32_t cookie, const char *old, const char *new)
 {
 	char *new_name, *old_name;
 	struct hash_cell *hc;
@@ -333,7 +333,7 @@ static int dm_hash_rename(const char *old, const char *new)
 		dm_table_put(table);
 	}
 
-	dm_kobject_uevent(hc->md);
+	dm_kobject_uevent(hc->md, KOBJ_CHANGE, cookie);
 
 	dm_put(hc->md);
 	up_write(&_hash_lock);
@@ -680,6 +680,9 @@ static int dev_remove(struct dm_ioctl *param, size_t param_size)
 
 	__hash_remove(hc);
 	up_write(&_hash_lock);
+
+	dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr);
+
 	dm_put(md);
 	param->data_size = 0;
 	return 0;
@@ -715,7 +718,7 @@ static int dev_rename(struct dm_ioctl *param, size_t param_size)
 		return r;
 
 	param->data_size = 0;
-	return dm_hash_rename(param->name, new_name);
+	return dm_hash_rename(param->event_nr, param->name, new_name);
 }
 
 static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
@@ -842,8 +845,11 @@ static int do_resume(struct dm_ioctl *param)
 	if (dm_suspended(md))
 		r = dm_resume(md);
 
-	if (!r)
+
+	if (!r) {
+		dm_kobject_uevent(md, KOBJ_CHANGE, param->event_nr);
 		r = __dev_status(md, param);
+	}
 
 	dm_put(md);
 	return r;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 36142e947ffc..a9210bb594e7 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -24,6 +24,13 @@
 
 #define DM_MSG_PREFIX "core"
 
+/*
+ * Cookies are numeric values sent with CHANGE and REMOVE
+ * uevents while resuming, removing or renaming the device.
+ */
+#define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
+#define DM_COOKIE_LENGTH 24
+
 static const char *_name = DM_NAME;
 
 static unsigned int major = 0;
@@ -1731,11 +1738,7 @@ int dm_resume(struct mapped_device *md)
 	clear_bit(DMF_SUSPENDED, &md->flags);
 
 	dm_table_unplug_all(map);
-
-	dm_kobject_uevent(md);
-
 	r = 0;
-
 out:
 	dm_table_put(map);
 	mutex_unlock(&md->suspend_lock);
@@ -1746,9 +1749,19 @@ out:
 /*-----------------------------------------------------------------
  * Event notification.
  *---------------------------------------------------------------*/
-void dm_kobject_uevent(struct mapped_device *md)
+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+		       unsigned cookie)
 {
-	kobject_uevent(&disk_to_dev(md->disk)->kobj, KOBJ_CHANGE);
+	char udev_cookie[DM_COOKIE_LENGTH];
+	char *envp[] = { udev_cookie, NULL };
+
+	if (!cookie)
+		kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
+	else {
+		snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
+			 DM_COOKIE_ENV_VAR_NAME, cookie);
+		kobject_uevent_env(&disk_to_dev(md->disk)->kobj, action, envp);
+	}
 }
 
 uint32_t dm_next_uevent_seq(struct mapped_device *md)
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a31506d93e91..b5935c610c44 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -92,7 +92,8 @@ void dm_stripe_exit(void);
 int dm_open_count(struct mapped_device *md);
 int dm_lock_for_deletion(struct mapped_device *md);
 
-void dm_kobject_uevent(struct mapped_device *md);
+void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
+		       unsigned cookie);
 
 int dm_kcopyd_init(void);
 void dm_kcopyd_exit(void);
diff --git a/include/linux/dm-ioctl.h b/include/linux/dm-ioctl.h
index 48e44ee2b466..2ab84c83c31a 100644
--- a/include/linux/dm-ioctl.h
+++ b/include/linux/dm-ioctl.h
@@ -123,6 +123,16 @@ struct dm_ioctl {
 	__u32 target_count;	/* in/out */
 	__s32 open_count;	/* out */
 	__u32 flags;		/* in/out */
+
+	/*
+	 * event_nr holds either the event number (input and output) or the
+	 * udev cookie value (input only).
+	 * The DM_DEV_WAIT ioctl takes an event number as input.
+	 * The DM_SUSPEND, DM_DEV_REMOVE and DM_DEV_RENAME ioctls
+	 * use the field as a cookie to return in the DM_COOKIE
+	 * variable with the uevents they issue.
+	 * For output, the ioctls return the event number, not the cookie.
+	 */
 	__u32 event_nr;      	/* in/out */
 	__u32 padding;
 
@@ -256,9 +266,9 @@ enum {
 #define DM_DEV_SET_GEOMETRY	_IOWR(DM_IOCTL, DM_DEV_SET_GEOMETRY_CMD, struct dm_ioctl)
 
 #define DM_VERSION_MAJOR	4
-#define DM_VERSION_MINOR	14
+#define DM_VERSION_MINOR	15
 #define DM_VERSION_PATCHLEVEL	0
-#define DM_VERSION_EXTRA	"-ioctl (2008-04-23)"
+#define DM_VERSION_EXTRA	"-ioctl (2009-04-01)"
 
 /* Status bits */
 #define DM_READONLY_FLAG	(1 << 0) /* In/Out */
-- 
cgit v1.2.3


From 02acc3a4fa0a6c2a5ccc4fb722b55fb710265882 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:30 +0100
Subject: dm table: ensure targets are aligned to logical_block_size

Ensure I/O is aligned to the logical block size of target devices.

Rename check_device_area() to device_area_is_valid() for clarity and
establish the device limits including the logical block size prior to
calling it.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 58 ++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 44 insertions(+), 14 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index af1ceae2582a..535fdaf2473d 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -383,16 +383,45 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 /*
  * If possible, this checks an area of a destination device is valid.
  */
-static int check_device_area(struct dm_dev_internal *dd, sector_t start,
-			     sector_t len)
+static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev,
+			     sector_t start, sector_t len)
 {
-	sector_t dev_size = i_size_read(dd->dm_dev.bdev->bd_inode) >>
-			    SECTOR_SHIFT;
+	sector_t dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+	unsigned short logical_block_size_sectors =
+		ti->limits.logical_block_size >> SECTOR_SHIFT;
+	char b[BDEVNAME_SIZE];
 
 	if (!dev_size)
 		return 1;
 
-	return ((start < dev_size) && (len <= (dev_size - start)));
+	if ((start >= dev_size) || (start + len > dev_size)) {
+		DMWARN("%s: %s too small for target",
+		       dm_device_name(ti->table->md), bdevname(bdev, b));
+		return 0;
+	}
+
+	if (logical_block_size_sectors <= 1)
+		return 1;
+
+	if (start & (logical_block_size_sectors - 1)) {
+		DMWARN("%s: start=%llu not aligned to h/w "
+		       "logical block size %hu of %s",
+		       dm_device_name(ti->table->md),
+		       (unsigned long long)start,
+		       ti->limits.logical_block_size, bdevname(bdev, b));
+		return 0;
+	}
+
+	if (len & (logical_block_size_sectors - 1)) {
+		DMWARN("%s: len=%llu not aligned to h/w "
+		       "logical block size %hu of %s",
+		       dm_device_name(ti->table->md),
+		       (unsigned long long)len,
+		       ti->limits.logical_block_size, bdevname(bdev, b));
+		return 0;
+	}
+
+	return 1;
 }
 
 /*
@@ -478,14 +507,7 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 	}
 	atomic_inc(&dd->count);
 
-	if (!check_device_area(dd, start, len)) {
-		DMWARN("device %s too small for target", path);
-		dm_put_device(ti, &dd->dm_dev);
-		return -EINVAL;
-	}
-
 	*result = &dd->dm_dev;
-
 	return 0;
 }
 
@@ -554,8 +576,16 @@ int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
 	int r = __table_get_device(ti->table, ti, path,
 				   start, len, mode, result);
 
-	if (!r)
-		dm_set_device_limits(ti, (*result)->bdev);
+	if (r)
+		return r;
+
+	dm_set_device_limits(ti, (*result)->bdev);
+
+	if (!device_area_is_valid(ti, (*result)->bdev, start, len)) {
+		dm_put_device(ti, *result);
+		*result = NULL;
+		return -EINVAL;
+	}
 
 	return r;
 }
-- 
cgit v1.2.3


From be6d4305db093ad1cc623f7dd3d2470b7bd73fa4 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:31 +0100
Subject: dm table: validate device logical_block_size

Impose necessary and sufficient conditions on a devices's table such
that any incoming bio which respects its logical_block_size can be
processed successfully.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 69 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 535fdaf2473d..e3bcfb8b15a1 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -724,6 +724,71 @@ static void check_for_valid_limits(struct io_restrictions *rs)
 		rs->bounce_pfn = -1;
 }
 
+/*
+ * Impose necessary and sufficient conditions on a devices's table such
+ * that any incoming bio which respects its logical_block_size can be
+ * processed successfully.  If it falls across the boundary between
+ * two or more targets, the size of each piece it gets split into must
+ * be compatible with the logical_block_size of the target processing it.
+ */
+static int validate_hardware_logical_block_alignment(struct dm_table *table)
+{
+	/*
+	 * This function uses arithmetic modulo the logical_block_size
+	 * (in units of 512-byte sectors).
+	 */
+	unsigned short device_logical_block_size_sects =
+		table->limits.logical_block_size >> SECTOR_SHIFT;
+
+	/*
+	 * Offset of the start of the next table entry, mod logical_block_size.
+	 */
+	unsigned short next_target_start = 0;
+
+	/*
+	 * Given an aligned bio that extends beyond the end of a
+	 * target, how many sectors must the next target handle?
+	 */
+	unsigned short remaining = 0;
+
+	struct dm_target *uninitialized_var(ti);
+	unsigned i = 0;
+
+	/*
+	 * Check each entry in the table in turn.
+	 */
+	while (i < dm_table_get_num_targets(table)) {
+		ti = dm_table_get_target(table, i++);
+
+		/*
+		 * If the remaining sectors fall entirely within this
+		 * table entry are they compatible with its logical_block_size?
+		 */
+		if (remaining < ti->len &&
+		    remaining & ((ti->limits.logical_block_size >>
+				  SECTOR_SHIFT) - 1))
+			break;	/* Error */
+
+		next_target_start =
+		    (unsigned short) ((next_target_start + ti->len) &
+				      (device_logical_block_size_sects - 1));
+		remaining = next_target_start ?
+		    device_logical_block_size_sects - next_target_start : 0;
+	}
+
+	if (remaining) {
+		DMWARN("%s: table line %u (start sect %llu len %llu) "
+		       "not aligned to hardware logical block size %hu",
+		       dm_device_name(table->md), i,
+		       (unsigned long long) ti->begin,
+		       (unsigned long long) ti->len,
+		       table->limits.logical_block_size);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
 int dm_table_add_target(struct dm_table *t, const char *type,
 			sector_t start, sector_t len, char *params)
 {
@@ -823,6 +888,10 @@ int dm_table_complete(struct dm_table *t)
 
 	check_for_valid_limits(&t->limits);
 
+	r = validate_hardware_logical_block_alignment(t);
+	if (r)
+		return r;
+
 	/* how many indexes will the btree have ? */
 	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
 	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
-- 
cgit v1.2.3


From 5ab97588fb266187b88d1ad893251c94388f18ba Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:32 +0100
Subject: dm table: replace struct io_restrictions with struct queue_limits

Use blk_stack_limits() to stack block limits (including topology) rather
than duplicate the equivalent within Device Mapper.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c         | 138 +++++++++++++-----------------------------
 include/linux/device-mapper.h |  16 +----
 2 files changed, 45 insertions(+), 109 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e3bcfb8b15a1..41ec2bf9fbe9 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -66,7 +66,7 @@ struct dm_table {
 	 * These are optimistic limits taken from all the
 	 * targets, some targets will need smaller limits.
 	 */
-	struct io_restrictions limits;
+	struct queue_limits limits;
 
 	/* events get handed up using this callback */
 	void (*event_fn)(void *);
@@ -88,43 +88,6 @@ static unsigned int int_log(unsigned int n, unsigned int base)
 	return result;
 }
 
-/*
- * Returns the minimum that is _not_ zero, unless both are zero.
- */
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
-
-/*
- * Combine two io_restrictions, always taking the lower value.
- */
-static void combine_restrictions_low(struct io_restrictions *lhs,
-				     struct io_restrictions *rhs)
-{
-	lhs->max_sectors =
-		min_not_zero(lhs->max_sectors, rhs->max_sectors);
-
-	lhs->max_phys_segments =
-		min_not_zero(lhs->max_phys_segments, rhs->max_phys_segments);
-
-	lhs->max_hw_segments =
-		min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments);
-
-	lhs->logical_block_size = max(lhs->logical_block_size,
-				      rhs->logical_block_size);
-
-	lhs->max_segment_size =
-		min_not_zero(lhs->max_segment_size, rhs->max_segment_size);
-
-	lhs->max_hw_sectors =
-		min_not_zero(lhs->max_hw_sectors, rhs->max_hw_sectors);
-
-	lhs->seg_boundary_mask =
-		min_not_zero(lhs->seg_boundary_mask, rhs->seg_boundary_mask);
-
-	lhs->bounce_pfn = min_not_zero(lhs->bounce_pfn, rhs->bounce_pfn);
-
-	lhs->no_cluster |= rhs->no_cluster;
-}
-
 /*
  * Calculate the index of the child node of the n'th node k'th key.
  */
@@ -511,10 +474,14 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
 	return 0;
 }
 
+/*
+ * Returns the minimum that is _not_ zero, unless both are zero.
+ */
+#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
+
 void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
-	struct io_restrictions *rs = &ti->limits;
 	char b[BDEVNAME_SIZE];
 
 	if (unlikely(!q)) {
@@ -523,15 +490,9 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 		return;
 	}
 
-	/*
-	 * Combine the device limits low.
-	 *
-	 * FIXME: if we move an io_restriction struct
-	 *        into q this would just be a call to
-	 *        combine_restrictions_low()
-	 */
-	rs->max_sectors =
-		min_not_zero(rs->max_sectors, queue_max_sectors(q));
+	if (blk_stack_limits(&ti->limits, &q->limits, 0) < 0)
+		DMWARN("%s: target device %s is misaligned",
+		       dm_device_name(ti->table->md), bdevname(bdev, b));
 
 	/*
 	 * Check if merge fn is supported.
@@ -540,33 +501,9 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	 */
 
 	if (q->merge_bvec_fn && !ti->type->merge)
-		rs->max_sectors =
-			min_not_zero(rs->max_sectors,
+		ti->limits.max_sectors =
+			min_not_zero(ti->limits.max_sectors,
 				     (unsigned int) (PAGE_SIZE >> 9));
-
-	rs->max_phys_segments =
-		min_not_zero(rs->max_phys_segments,
-			     queue_max_phys_segments(q));
-
-	rs->max_hw_segments =
-		min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q));
-
-	rs->logical_block_size = max(rs->logical_block_size,
-				     queue_logical_block_size(q));
-
-	rs->max_segment_size =
-		min_not_zero(rs->max_segment_size, queue_max_segment_size(q));
-
-	rs->max_hw_sectors =
-		min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q));
-
-	rs->seg_boundary_mask =
-		min_not_zero(rs->seg_boundary_mask,
-			     queue_segment_boundary(q));
-
-	rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q));
-
-	rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags);
 }
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
@@ -704,24 +641,32 @@ int dm_split_args(int *argc, char ***argvp, char *input)
 	return 0;
 }
 
-static void check_for_valid_limits(struct io_restrictions *rs)
+static void init_valid_queue_limits(struct queue_limits *limits)
 {
-	if (!rs->max_sectors)
-		rs->max_sectors = SAFE_MAX_SECTORS;
-	if (!rs->max_hw_sectors)
-		rs->max_hw_sectors = SAFE_MAX_SECTORS;
-	if (!rs->max_phys_segments)
-		rs->max_phys_segments = MAX_PHYS_SEGMENTS;
-	if (!rs->max_hw_segments)
-		rs->max_hw_segments = MAX_HW_SEGMENTS;
-	if (!rs->logical_block_size)
-		rs->logical_block_size = 1 << SECTOR_SHIFT;
-	if (!rs->max_segment_size)
-		rs->max_segment_size = MAX_SEGMENT_SIZE;
-	if (!rs->seg_boundary_mask)
-		rs->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
-	if (!rs->bounce_pfn)
-		rs->bounce_pfn = -1;
+	if (!limits->max_sectors)
+		limits->max_sectors = SAFE_MAX_SECTORS;
+	if (!limits->max_hw_sectors)
+		limits->max_hw_sectors = SAFE_MAX_SECTORS;
+	if (!limits->max_phys_segments)
+		limits->max_phys_segments = MAX_PHYS_SEGMENTS;
+	if (!limits->max_hw_segments)
+		limits->max_hw_segments = MAX_HW_SEGMENTS;
+	if (!limits->logical_block_size)
+		limits->logical_block_size = 1 << SECTOR_SHIFT;
+	if (!limits->physical_block_size)
+		limits->physical_block_size = 1 << SECTOR_SHIFT;
+	if (!limits->io_min)
+		limits->io_min = 1 << SECTOR_SHIFT;
+	if (!limits->max_segment_size)
+		limits->max_segment_size = MAX_SEGMENT_SIZE;
+	if (!limits->seg_boundary_mask)
+		limits->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+	if (!limits->bounce_pfn)
+		limits->bounce_pfn = -1;
+	/*
+	 * The other fields (alignment_offset, io_opt, misaligned)
+	 * hold 0 from the kzalloc().
+	 */
 }
 
 /*
@@ -841,9 +786,12 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
-	/* FIXME: the plan is to combine high here and then have
-	 * the merge fn apply the target level restrictions. */
-	combine_restrictions_low(&t->limits, &tgt->limits);
+	if (blk_stack_limits(&t->limits, &tgt->limits, 0) < 0)
+		DMWARN("%s: target device (start sect %llu len %llu) "
+		       "is misaligned",
+		       dm_device_name(t->md),
+		       (unsigned long long) tgt->begin,
+		       (unsigned long long) tgt->len);
 	return 0;
 
  bad:
@@ -886,7 +834,7 @@ int dm_table_complete(struct dm_table *t)
 	int r = 0;
 	unsigned int leaf_nodes;
 
-	check_for_valid_limits(&t->limits);
+	init_valid_queue_limits(&t->limits);
 
 	r = validate_hardware_logical_block_alignment(t);
 	if (r)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index fc36a4d07723..236880c1dc3f 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -144,18 +144,6 @@ struct target_type {
 	struct list_head list;
 };
 
-struct io_restrictions {
-	unsigned long bounce_pfn;
-	unsigned long seg_boundary_mask;
-	unsigned max_hw_sectors;
-	unsigned max_sectors;
-	unsigned max_segment_size;
-	unsigned short logical_block_size;
-	unsigned short max_hw_segments;
-	unsigned short max_phys_segments;
-	unsigned char no_cluster; /* inverted so that 0 is default */
-};
-
 struct dm_target {
 	struct dm_table *table;
 	struct target_type *type;
@@ -164,7 +152,7 @@ struct dm_target {
 	sector_t begin;
 	sector_t len;
 
-	/* FIXME: turn this into a mask, and merge with io_restrictions */
+	/* FIXME: turn this into a mask, and merge with queue_limits */
 	/* Always a power of 2 */
 	sector_t split_io;
 
@@ -182,7 +170,7 @@ struct dm_target {
 	 * These are automatically filled in by
 	 * dm_table_get_device.
 	 */
-	struct io_restrictions limits;
+	struct queue_limits limits;
 
 	/* target specific data */
 	void *private;
-- 
cgit v1.2.3


From 1197764e403d97231eb6da2b1e16f511a7fd3101 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:32 +0100
Subject: dm table: establish queue limits by copying table limits

Copy the table's queue_limits to the DM device's request_queue.  This
properly initializes the queue's topology limits and also avoids having
to track the evolution of 'struct queue_limits' in
dm_table_set_restrictions()

Also fixes a bug that was introduced in dm_table_set_restrictions() via
commit ae03bf639a5027d27270123f5f6e3ee6a412781d.  In addition to
establishing 'bounce_pfn' in the queue's limits blk_queue_bounce_limit()
also performs an allocation to setup the ISA DMA pool.  This allocation
resulted in "sleeping function called from invalid context" when called
from dm_table_set_restrictions().

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41ec2bf9fbe9..267817edc844 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -956,17 +956,9 @@ no_integrity:
 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
 {
 	/*
-	 * Make sure we obey the optimistic sub devices
-	 * restrictions.
+	 * Copy table's limits to the DM device's request_queue
 	 */
-	blk_queue_max_sectors(q, t->limits.max_sectors);
-	blk_queue_max_phys_segments(q, t->limits.max_phys_segments);
-	blk_queue_max_hw_segments(q, t->limits.max_hw_segments);
-	blk_queue_logical_block_size(q, t->limits.logical_block_size);
-	blk_queue_max_segment_size(q, t->limits.max_segment_size);
-	blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors);
-	blk_queue_segment_boundary(q, t->limits.seg_boundary_mask);
-	blk_queue_bounce_limit(q, t->limits.bounce_pfn);
+	q->limits = t->limits;
 
 	if (t->limits.no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
-- 
cgit v1.2.3


From af4874e03ed82f050d5872d8c39ce64bf16b5c38 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:33 +0100
Subject: dm target:s introduce iterate devices fn

Add .iterate_devices to 'struct target_type' to allow a function to be
called for all devices in a DM target.  Implemented it for all targets
except those in dm-snap.c (origin and snapshot).

(The raid1 version number jumps to 1.12 because we originally reserved
1.1 to 1.11 for 'block_on_error' but ended up using 'handle_errors'
instead.)

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Cc: martin.petersen@oracle.com
---
 drivers/md/dm-crypt.c         | 11 ++++++++++-
 drivers/md/dm-delay.c         | 20 +++++++++++++++++++-
 drivers/md/dm-linear.c        | 11 ++++++++++-
 drivers/md/dm-mpath.c         | 23 ++++++++++++++++++++++-
 drivers/md/dm-raid1.c         | 17 ++++++++++++++++-
 drivers/md/dm-stripe.c        | 18 +++++++++++++++++-
 include/linux/device-mapper.h | 11 +++++++++++
 7 files changed, 105 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 04db6c4004a8..9933eb861c71 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1313,9 +1313,17 @@ static int crypt_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
+static int crypt_iterate_devices(struct dm_target *ti,
+				 iterate_devices_callout_fn fn, void *data)
+{
+	struct crypt_config *cc = ti->private;
+
+	return fn(ti, cc->dev, cc->start, data);
+}
+
 static struct target_type crypt_target = {
 	.name   = "crypt",
-	.version= {1, 6, 0},
+	.version = {1, 7, 0},
 	.module = THIS_MODULE,
 	.ctr    = crypt_ctr,
 	.dtr    = crypt_dtr,
@@ -1326,6 +1334,7 @@ static struct target_type crypt_target = {
 	.resume = crypt_resume,
 	.message = crypt_message,
 	.merge  = crypt_merge,
+	.iterate_devices = crypt_iterate_devices,
 };
 
 static int __init dm_crypt_init(void)
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index 8ad8a9044bbf..4e5b843cd4d7 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -318,9 +318,26 @@ static int delay_status(struct dm_target *ti, status_type_t type,
 	return 0;
 }
 
+static int delay_iterate_devices(struct dm_target *ti,
+				 iterate_devices_callout_fn fn, void *data)
+{
+	struct delay_c *dc = ti->private;
+	int ret = 0;
+
+	ret = fn(ti, dc->dev_read, dc->start_read, data);
+	if (ret)
+		goto out;
+
+	if (dc->dev_write)
+		ret = fn(ti, dc->dev_write, dc->start_write, data);
+
+out:
+	return ret;
+}
+
 static struct target_type delay_target = {
 	.name	     = "delay",
-	.version     = {1, 0, 2},
+	.version     = {1, 1, 0},
 	.module      = THIS_MODULE,
 	.ctr	     = delay_ctr,
 	.dtr	     = delay_dtr,
@@ -328,6 +345,7 @@ static struct target_type delay_target = {
 	.presuspend  = delay_presuspend,
 	.resume	     = delay_resume,
 	.status	     = delay_status,
+	.iterate_devices = delay_iterate_devices,
 };
 
 static int __init dm_delay_init(void)
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index ecbb17421da4..9184b6deb868 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -134,9 +134,17 @@ static int linear_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
 	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
 }
 
+static int linear_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct linear_c *lc = ti->private;
+
+	return fn(ti, lc->dev, lc->start, data);
+}
+
 static struct target_type linear_target = {
 	.name   = "linear",
-	.version= {1, 0, 3},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr    = linear_ctr,
 	.dtr    = linear_dtr,
@@ -144,6 +152,7 @@ static struct target_type linear_target = {
 	.status = linear_status,
 	.ioctl  = linear_ioctl,
 	.merge  = linear_merge,
+	.iterate_devices = linear_iterate_devices,
 };
 
 int __init dm_linear_init(void)
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 890c0e8ed13e..f8aeaaa54afe 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1450,12 +1450,32 @@ static int multipath_ioctl(struct dm_target *ti, unsigned int cmd,
 	return r ? : __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 }
 
+static int multipath_iterate_devices(struct dm_target *ti,
+				     iterate_devices_callout_fn fn, void *data)
+{
+	struct multipath *m = ti->private;
+	struct priority_group *pg;
+	struct pgpath *p;
+	int ret = 0;
+
+	list_for_each_entry(pg, &m->priority_groups, list) {
+		list_for_each_entry(p, &pg->pgpaths, list) {
+			ret = fn(ti, p->path.dev, ti->begin, data);
+			if (ret)
+				goto out;
+		}
+	}
+
+out:
+	return ret;
+}
+
 /*-----------------------------------------------------------------
  * Module setup
  *---------------------------------------------------------------*/
 static struct target_type multipath_target = {
 	.name = "multipath",
-	.version = {1, 0, 5},
+	.version = {1, 1, 0},
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
@@ -1466,6 +1486,7 @@ static struct target_type multipath_target = {
 	.status = multipath_status,
 	.message = multipath_message,
 	.ioctl  = multipath_ioctl,
+	.iterate_devices = multipath_iterate_devices,
 };
 
 static int __init dm_multipath_init(void)
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 076fbb4e967a..ce8868c768cc 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -1283,9 +1283,23 @@ static int mirror_status(struct dm_target *ti, status_type_t type,
 	return 0;
 }
 
+static int mirror_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct mirror_set *ms = ti->private;
+	int ret = 0;
+	unsigned i;
+
+	for (i = 0; !ret && i < ms->nr_mirrors; i++)
+		ret = fn(ti, ms->mirror[i].dev,
+			 ms->mirror[i].offset, data);
+
+	return ret;
+}
+
 static struct target_type mirror_target = {
 	.name	 = "mirror",
-	.version = {1, 0, 20},
+	.version = {1, 12, 0},
 	.module	 = THIS_MODULE,
 	.ctr	 = mirror_ctr,
 	.dtr	 = mirror_dtr,
@@ -1295,6 +1309,7 @@ static struct target_type mirror_target = {
 	.postsuspend = mirror_postsuspend,
 	.resume	 = mirror_resume,
 	.status	 = mirror_status,
+	.iterate_devices = mirror_iterate_devices,
 };
 
 static int __init dm_mirror_init(void)
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index c64fe827a5f1..b240e85ae39a 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -313,15 +313,31 @@ static int stripe_end_io(struct dm_target *ti, struct bio *bio,
 	return error;
 }
 
+static int stripe_iterate_devices(struct dm_target *ti,
+				  iterate_devices_callout_fn fn, void *data)
+{
+	struct stripe_c *sc = ti->private;
+	int ret = 0;
+	unsigned i = 0;
+
+	do
+		ret = fn(ti, sc->stripe[i].dev,
+			 sc->stripe[i].physical_start, data);
+	while (!ret && ++i < sc->stripes);
+
+	return ret;
+}
+
 static struct target_type stripe_target = {
 	.name   = "striped",
-	.version = {1, 1, 0},
+	.version = {1, 2, 0},
 	.module = THIS_MODULE,
 	.ctr    = stripe_ctr,
 	.dtr    = stripe_dtr,
 	.map    = stripe_map,
 	.end_io = stripe_end_io,
 	.status = stripe_status,
+	.iterate_devices = stripe_iterate_devices,
 };
 
 int __init dm_stripe_init(void)
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 236880c1dc3f..deac3b4e5e18 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -11,6 +11,7 @@
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 
+struct dm_dev;
 struct dm_target;
 struct dm_table;
 struct mapped_device;
@@ -81,6 +82,15 @@ typedef int (*dm_ioctl_fn) (struct dm_target *ti, unsigned int cmd,
 typedef int (*dm_merge_fn) (struct dm_target *ti, struct bvec_merge_data *bvm,
 			    struct bio_vec *biovec, int max_size);
 
+typedef int (*iterate_devices_callout_fn) (struct dm_target *ti,
+					   struct dm_dev *dev,
+					   sector_t physical_start,
+					   void *data);
+
+typedef int (*dm_iterate_devices_fn) (struct dm_target *ti,
+				      iterate_devices_callout_fn fn,
+				      void *data);
+
 /*
  * Returns:
  *    0: The target can handle the next I/O immediately.
@@ -139,6 +149,7 @@ struct target_type {
 	dm_ioctl_fn ioctl;
 	dm_merge_fn merge;
 	dm_busy_fn busy;
+	dm_iterate_devices_fn iterate_devices;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
-- 
cgit v1.2.3


From 18d8594dd93a1ae2fafd591ec026e87d743292bf Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:33 +0100
Subject: dm log: fix create_log_context to use logical_block_size of log
 device

create_log_context() must use the logical_block_size from the log disk,
where the I/O happens, not the target's logical_block_size.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-log.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 6352a9ad4446..9443896ede07 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -412,9 +412,10 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
 		/*
 		 * Buffer holds both header and bitset.
 		 */
-		buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) +
-				       bitset_size,
-				       ti->limits.logical_block_size);
+		buf_size =
+		    dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + bitset_size,
+				bdev_logical_block_size(lc->header_location.
+							    bdev));
 
 		if (buf_size > i_size_read(dev->bdev->bd_inode)) {
 			DMWARN("log device %s too small: need %llu bytes",
-- 
cgit v1.2.3


From 754c5fc7ebb417b23601a6222a6005cc2e7f2913 Mon Sep 17 00:00:00 2001
From: Mike Snitzer <snitzer@redhat.com>
Date: Mon, 22 Jun 2009 10:12:34 +0100
Subject: dm: calculate queue limits during resume not load

Currently, device-mapper maintains a separate instance of 'struct
queue_limits' for each table of each device.  When the configuration of
a device is to be changed, first its table is loaded and this structure
is populated, then the device is 'resumed' and the calculated
queue_limits are applied.

This places restrictions on how userspace may process related devices,
where it is often advantageous to 'load' tables for several devices
at once before 'resuming' them together.  As the new queue_limits
only take effect after the 'resume', if they are changing and one
device uses another, the latter must be 'resumed' before the former
may be 'loaded'.

This patch moves the calculation of these queue_limits out of
the 'load' operation into 'resume'.  Since we are no longer
pre-calculating this struct, we no longer need to maintain copies
within our dm structs.

dm_set_device_limits() now passes the 'start' of the device's
data area (aka pe_start) as the 'offset' to blk_stack_limits().

init_valid_queue_limits() is replaced by blk_set_default_limits().

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: martin.petersen@oracle.com
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c         | 185 +++++++++++++++++++++++-------------------
 drivers/md/dm.c               |  12 ++-
 drivers/md/dm.h               |   5 +-
 include/linux/device-mapper.h |  10 +--
 4 files changed, 117 insertions(+), 95 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 267817edc844..09a57113955e 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -62,12 +62,6 @@ struct dm_table {
 	/* a list of devices used by this table */
 	struct list_head devices;
 
-	/*
-	 * These are optimistic limits taken from all the
-	 * targets, some targets will need smaller limits.
-	 */
-	struct queue_limits limits;
-
 	/* events get handed up using this callback */
 	void (*event_fn)(void *);
 	void *event_context;
@@ -346,18 +340,21 @@ static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
 /*
  * If possible, this checks an area of a destination device is valid.
  */
-static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev,
-			     sector_t start, sector_t len)
+static int device_area_is_valid(struct dm_target *ti, struct dm_dev *dev,
+				sector_t start, void *data)
 {
-	sector_t dev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
+	struct queue_limits *limits = data;
+	struct block_device *bdev = dev->bdev;
+	sector_t dev_size =
+		i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
 	unsigned short logical_block_size_sectors =
-		ti->limits.logical_block_size >> SECTOR_SHIFT;
+		limits->logical_block_size >> SECTOR_SHIFT;
 	char b[BDEVNAME_SIZE];
 
 	if (!dev_size)
 		return 1;
 
-	if ((start >= dev_size) || (start + len > dev_size)) {
+	if ((start >= dev_size) || (start + ti->len > dev_size)) {
 		DMWARN("%s: %s too small for target",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
 		return 0;
@@ -371,16 +368,16 @@ static int device_area_is_valid(struct dm_target *ti, struct block_device *bdev,
 		       "logical block size %hu of %s",
 		       dm_device_name(ti->table->md),
 		       (unsigned long long)start,
-		       ti->limits.logical_block_size, bdevname(bdev, b));
+		       limits->logical_block_size, bdevname(bdev, b));
 		return 0;
 	}
 
-	if (len & (logical_block_size_sectors - 1)) {
+	if (ti->len & (logical_block_size_sectors - 1)) {
 		DMWARN("%s: len=%llu not aligned to h/w "
 		       "logical block size %hu of %s",
 		       dm_device_name(ti->table->md),
-		       (unsigned long long)len,
-		       ti->limits.logical_block_size, bdevname(bdev, b));
+		       (unsigned long long)ti->len,
+		       limits->logical_block_size, bdevname(bdev, b));
 		return 0;
 	}
 
@@ -479,18 +476,21 @@ static int __table_get_device(struct dm_table *t, struct dm_target *ti,
  */
 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
 
-void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
+int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
+			 sector_t start, void *data)
 {
+	struct queue_limits *limits = data;
+	struct block_device *bdev = dev->bdev;
 	struct request_queue *q = bdev_get_queue(bdev);
 	char b[BDEVNAME_SIZE];
 
 	if (unlikely(!q)) {
 		DMWARN("%s: Cannot set limits for nonexistent device %s",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
-		return;
+		return 0;
 	}
 
-	if (blk_stack_limits(&ti->limits, &q->limits, 0) < 0)
+	if (blk_stack_limits(limits, &q->limits, start) < 0)
 		DMWARN("%s: target device %s is misaligned",
 		       dm_device_name(ti->table->md), bdevname(bdev, b));
 
@@ -501,32 +501,21 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev)
 	 */
 
 	if (q->merge_bvec_fn && !ti->type->merge)
-		ti->limits.max_sectors =
-			min_not_zero(ti->limits.max_sectors,
+		limits->max_sectors =
+			min_not_zero(limits->max_sectors,
 				     (unsigned int) (PAGE_SIZE >> 9));
+	return 0;
 }
 EXPORT_SYMBOL_GPL(dm_set_device_limits);
 
 int dm_get_device(struct dm_target *ti, const char *path, sector_t start,
 		  sector_t len, fmode_t mode, struct dm_dev **result)
 {
-	int r = __table_get_device(ti->table, ti, path,
-				   start, len, mode, result);
-
-	if (r)
-		return r;
-
-	dm_set_device_limits(ti, (*result)->bdev);
-
-	if (!device_area_is_valid(ti, (*result)->bdev, start, len)) {
-		dm_put_device(ti, *result);
-		*result = NULL;
-		return -EINVAL;
-	}
-
-	return r;
+	return __table_get_device(ti->table, ti, path,
+				  start, len, mode, result);
 }
 
+
 /*
  * Decrement a devices use count and remove it if necessary.
  */
@@ -641,34 +630,6 @@ int dm_split_args(int *argc, char ***argvp, char *input)
 	return 0;
 }
 
-static void init_valid_queue_limits(struct queue_limits *limits)
-{
-	if (!limits->max_sectors)
-		limits->max_sectors = SAFE_MAX_SECTORS;
-	if (!limits->max_hw_sectors)
-		limits->max_hw_sectors = SAFE_MAX_SECTORS;
-	if (!limits->max_phys_segments)
-		limits->max_phys_segments = MAX_PHYS_SEGMENTS;
-	if (!limits->max_hw_segments)
-		limits->max_hw_segments = MAX_HW_SEGMENTS;
-	if (!limits->logical_block_size)
-		limits->logical_block_size = 1 << SECTOR_SHIFT;
-	if (!limits->physical_block_size)
-		limits->physical_block_size = 1 << SECTOR_SHIFT;
-	if (!limits->io_min)
-		limits->io_min = 1 << SECTOR_SHIFT;
-	if (!limits->max_segment_size)
-		limits->max_segment_size = MAX_SEGMENT_SIZE;
-	if (!limits->seg_boundary_mask)
-		limits->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
-	if (!limits->bounce_pfn)
-		limits->bounce_pfn = -1;
-	/*
-	 * The other fields (alignment_offset, io_opt, misaligned)
-	 * hold 0 from the kzalloc().
-	 */
-}
-
 /*
  * Impose necessary and sufficient conditions on a devices's table such
  * that any incoming bio which respects its logical_block_size can be
@@ -676,14 +637,15 @@ static void init_valid_queue_limits(struct queue_limits *limits)
  * two or more targets, the size of each piece it gets split into must
  * be compatible with the logical_block_size of the target processing it.
  */
-static int validate_hardware_logical_block_alignment(struct dm_table *table)
+static int validate_hardware_logical_block_alignment(struct dm_table *table,
+						 struct queue_limits *limits)
 {
 	/*
 	 * This function uses arithmetic modulo the logical_block_size
 	 * (in units of 512-byte sectors).
 	 */
 	unsigned short device_logical_block_size_sects =
-		table->limits.logical_block_size >> SECTOR_SHIFT;
+		limits->logical_block_size >> SECTOR_SHIFT;
 
 	/*
 	 * Offset of the start of the next table entry, mod logical_block_size.
@@ -697,6 +659,7 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table)
 	unsigned short remaining = 0;
 
 	struct dm_target *uninitialized_var(ti);
+	struct queue_limits ti_limits;
 	unsigned i = 0;
 
 	/*
@@ -705,12 +668,19 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table)
 	while (i < dm_table_get_num_targets(table)) {
 		ti = dm_table_get_target(table, i++);
 
+		blk_set_default_limits(&ti_limits);
+
+		/* combine all target devices' limits */
+		if (ti->type->iterate_devices)
+			ti->type->iterate_devices(ti, dm_set_device_limits,
+						  &ti_limits);
+
 		/*
 		 * If the remaining sectors fall entirely within this
 		 * table entry are they compatible with its logical_block_size?
 		 */
 		if (remaining < ti->len &&
-		    remaining & ((ti->limits.logical_block_size >>
+		    remaining & ((ti_limits.logical_block_size >>
 				  SECTOR_SHIFT) - 1))
 			break;	/* Error */
 
@@ -723,11 +693,11 @@ static int validate_hardware_logical_block_alignment(struct dm_table *table)
 
 	if (remaining) {
 		DMWARN("%s: table line %u (start sect %llu len %llu) "
-		       "not aligned to hardware logical block size %hu",
+		       "not aligned to h/w logical block size %hu",
 		       dm_device_name(table->md), i,
 		       (unsigned long long) ti->begin,
 		       (unsigned long long) ti->len,
-		       table->limits.logical_block_size);
+		       limits->logical_block_size);
 		return -EINVAL;
 	}
 
@@ -786,12 +756,6 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 
 	t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;
 
-	if (blk_stack_limits(&t->limits, &tgt->limits, 0) < 0)
-		DMWARN("%s: target device (start sect %llu len %llu) "
-		       "is misaligned",
-		       dm_device_name(t->md),
-		       (unsigned long long) tgt->begin,
-		       (unsigned long long) tgt->len);
 	return 0;
 
  bad:
@@ -834,12 +798,6 @@ int dm_table_complete(struct dm_table *t)
 	int r = 0;
 	unsigned int leaf_nodes;
 
-	init_valid_queue_limits(&t->limits);
-
-	r = validate_hardware_logical_block_alignment(t);
-	if (r)
-		return r;
-
 	/* how many indexes will the btree have ? */
 	leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
 	t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -914,6 +872,57 @@ struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
 	return &t->targets[(KEYS_PER_NODE * n) + k];
 }
 
+/*
+ * Establish the new table's queue_limits and validate them.
+ */
+int dm_calculate_queue_limits(struct dm_table *table,
+			      struct queue_limits *limits)
+{
+	struct dm_target *uninitialized_var(ti);
+	struct queue_limits ti_limits;
+	unsigned i = 0;
+
+	blk_set_default_limits(limits);
+
+	while (i < dm_table_get_num_targets(table)) {
+		blk_set_default_limits(&ti_limits);
+
+		ti = dm_table_get_target(table, i++);
+
+		if (!ti->type->iterate_devices)
+			goto combine_limits;
+
+		/*
+		 * Combine queue limits of all the devices this target uses.
+		 */
+		ti->type->iterate_devices(ti, dm_set_device_limits,
+					  &ti_limits);
+
+		/*
+		 * Check each device area is consistent with the target's
+		 * overall queue limits.
+		 */
+		if (!ti->type->iterate_devices(ti, device_area_is_valid,
+					       &ti_limits))
+			return -EINVAL;
+
+combine_limits:
+		/*
+		 * Merge this target's queue limits into the overall limits
+		 * for the table.
+		 */
+		if (blk_stack_limits(limits, &ti_limits, 0) < 0)
+			DMWARN("%s: target device "
+			       "(start sect %llu len %llu) "
+			       "is misaligned",
+			       dm_device_name(table->md),
+			       (unsigned long long) ti->begin,
+			       (unsigned long long) ti->len);
+	}
+
+	return validate_hardware_logical_block_alignment(table, limits);
+}
+
 /*
  * Set the integrity profile for this device if all devices used have
  * matching profiles.
@@ -953,14 +962,24 @@ no_integrity:
 	return;
 }
 
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+			       struct queue_limits *limits)
 {
+	/*
+	 * Each target device in the table has a data area that should normally
+	 * be aligned such that the DM device's alignment_offset is 0.
+	 * FIXME: Propagate alignment_offsets up the stack and warn of
+	 *	  sub-optimal or inconsistent settings.
+	 */
+	limits->alignment_offset = 0;
+	limits->misaligned = 0;
+
 	/*
 	 * Copy table's limits to the DM device's request_queue
 	 */
-	q->limits = t->limits;
+	q->limits = *limits;
 
-	if (t->limits.no_cluster)
+	if (limits->no_cluster)
 		queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q);
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index a9210bb594e7..f609793a92d0 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1313,7 +1313,8 @@ static void __set_size(struct mapped_device *md, sector_t size)
 	mutex_unlock(&md->bdev->bd_inode->i_mutex);
 }
 
-static int __bind(struct mapped_device *md, struct dm_table *t)
+static int __bind(struct mapped_device *md, struct dm_table *t,
+		  struct queue_limits *limits)
 {
 	struct request_queue *q = md->queue;
 	sector_t size;
@@ -1337,7 +1338,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t)
 
 	write_lock(&md->map_lock);
 	md->map = t;
-	dm_table_set_restrictions(t, q);
+	dm_table_set_restrictions(t, q, limits);
 	write_unlock(&md->map_lock);
 
 	return 0;
@@ -1562,6 +1563,7 @@ static void dm_queue_flush(struct mapped_device *md)
  */
 int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 {
+	struct queue_limits limits;
 	int r = -EINVAL;
 
 	mutex_lock(&md->suspend_lock);
@@ -1570,8 +1572,12 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	if (!dm_suspended(md))
 		goto out;
 
+	r = dm_calculate_queue_limits(table, &limits);
+	if (r)
+		goto out;
+
 	__unbind(md);
-	r = __bind(md, table);
+	r = __bind(md, table, &limits);
 
 out:
 	mutex_unlock(&md->suspend_lock);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b5935c610c44..604e85caadf6 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -41,7 +41,10 @@ void dm_table_event_callback(struct dm_table *t,
 			     void (*fn)(void *), void *context);
 struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index);
 struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector);
-void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q);
+int dm_calculate_queue_limits(struct dm_table *table,
+			      struct queue_limits *limits);
+void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
+			       struct queue_limits *limits);
 struct list_head *dm_table_get_devices(struct dm_table *t);
 void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index deac3b4e5e18..e6bf3b8c7bf2 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -103,7 +103,8 @@ void dm_error(const char *message);
 /*
  * Combine device limits.
  */
-void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev);
+int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
+			 sector_t start, void *data);
 
 struct dm_dev {
 	struct block_device *bdev;
@@ -163,7 +164,6 @@ struct dm_target {
 	sector_t begin;
 	sector_t len;
 
-	/* FIXME: turn this into a mask, and merge with queue_limits */
 	/* Always a power of 2 */
 	sector_t split_io;
 
@@ -177,12 +177,6 @@ struct dm_target {
 	 */
 	unsigned num_flush_requests;
 
-	/*
-	 * These are automatically filled in by
-	 * dm_table_get_device.
-	 */
-	struct queue_limits limits;
-
 	/* target specific data */
 	void *private;
 
-- 
cgit v1.2.3


From f5db4af466e2dca0fe822019812d586ca910b00c Mon Sep 17 00:00:00 2001
From: Jonthan Brassow <jbrassow@redhat.com>
Date: Mon, 22 Jun 2009 10:12:35 +0100
Subject: dm raid1: add userspace log

This patch contains a device-mapper mirror log module that forwards
requests to userspace for processing.

The structures used for communication between kernel and userspace are
located in include/linux/dm-log-userspace.h.  Due to the frequency,
diversity, and 2-way communication nature of the exchanges between
kernel and userspace, 'connector' was chosen as the interface for
communication.

The first log implementations written in userspace - "clustered-disk"
and "clustered-core" - support clustered shared storage.   A userspace
daemon (in the LVM2 source code repository) uses openAIS/corosync to
process requests in an ordered fashion with the rest of the nodes in the
cluster so as to prevent log state corruption.  Other implementations
with no association to LVM or openAIS/corosync, are certainly possible.

(Imagine if two machines are writing to the same region of a mirror.
They would both mark the region dirty, but you need a cluster-aware
entity that can handle properly marking the region clean when they are
done.  Otherwise, you might clear the region when the first machine is
done, not the second.)

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Cc: Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 Documentation/device-mapper/dm-log.txt |  54 +++
 drivers/md/Kconfig                     |  11 +
 drivers/md/Makefile                    |   3 +
 drivers/md/dm-log-userspace-base.c     | 696 +++++++++++++++++++++++++++++++++
 drivers/md/dm-log-userspace-transfer.c | 276 +++++++++++++
 drivers/md/dm-log-userspace-transfer.h |  18 +
 include/linux/Kbuild                   |   1 +
 include/linux/connector.h              |   4 +-
 include/linux/dm-log-userspace.h       | 386 ++++++++++++++++++
 9 files changed, 1448 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/device-mapper/dm-log.txt
 create mode 100644 drivers/md/dm-log-userspace-base.c
 create mode 100644 drivers/md/dm-log-userspace-transfer.c
 create mode 100644 drivers/md/dm-log-userspace-transfer.h
 create mode 100644 include/linux/dm-log-userspace.h

(limited to 'drivers/md')

diff --git a/Documentation/device-mapper/dm-log.txt b/Documentation/device-mapper/dm-log.txt
new file mode 100644
index 000000000000..994dd75475a6
--- /dev/null
+++ b/Documentation/device-mapper/dm-log.txt
@@ -0,0 +1,54 @@
+Device-Mapper Logging
+=====================
+The device-mapper logging code is used by some of the device-mapper
+RAID targets to track regions of the disk that are not consistent.
+A region (or portion of the address space) of the disk may be
+inconsistent because a RAID stripe is currently being operated on or
+a machine died while the region was being altered.  In the case of
+mirrors, a region would be considered dirty/inconsistent while you
+are writing to it because the writes need to be replicated for all
+the legs of the mirror and may not reach the legs at the same time.
+Once all writes are complete, the region is considered clean again.
+
+There is a generic logging interface that the device-mapper RAID
+implementations use to perform logging operations (see
+dm_dirty_log_type in include/linux/dm-dirty-log.h).  Various different
+logging implementations are available and provide different
+capabilities.  The list includes:
+
+Type		Files
+====		=====
+disk		drivers/md/dm-log.c
+core		drivers/md/dm-log.c
+userspace	drivers/md/dm-log-userspace* include/linux/dm-log-userspace.h
+
+The "disk" log type
+-------------------
+This log implementation commits the log state to disk.  This way, the
+logging state survives reboots/crashes.
+
+The "core" log type
+-------------------
+This log implementation keeps the log state in memory.  The log state
+will not survive a reboot or crash, but there may be a small boost in
+performance.  This method can also be used if no storage device is
+available for storing log state.
+
+The "userspace" log type
+------------------------
+This log type simply provides a way to export the log API to userspace,
+so log implementations can be done there.  This is done by forwarding most
+logging requests to userspace, where a daemon receives and processes the
+request.
+
+The structure used for communication between kernel and userspace are
+located in include/linux/dm-log-userspace.h.  Due to the frequency,
+diversity, and 2-way communication nature of the exchanges between
+kernel and userspace, 'connector' is used as the interface for
+communication.
+
+There are currently two userspace log implementations that leverage this
+framework - "clustered_disk" and "clustered_core".  These implementations
+provide a cluster-coherent log for shared-storage.  Device-mapper mirroring
+can be used in a shared-storage environment when the cluster log implementations
+are employed.
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index 09f93fa68912..020f9573fd82 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -231,6 +231,17 @@ config DM_MIRROR
          Allow volume managers to mirror logical volumes, also
          needed for live data migration tools such as 'pvmove'.
 
+config DM_LOG_USERSPACE
+	tristate "Mirror userspace logging (EXPERIMENTAL)"
+	depends on DM_MIRROR && EXPERIMENTAL && NET
+	select CONNECTOR
+	---help---
+	  The userspace logging module provides a mechanism for
+	  relaying the dm-dirty-log API to userspace.  Log designs
+	  which are more suited to userspace implementation (e.g.
+	  shared storage logs) or experimental logs can be implemented
+	  by leveraging this framework.
+
 config DM_ZERO
 	tristate "Zero target"
 	depends on BLK_DEV_DM
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index dade52f60733..1dc4185bd781 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -8,6 +8,8 @@ dm-multipath-y	+= dm-path-selector.o dm-mpath.o
 dm-snapshot-y	+= dm-snap.o dm-exception-store.o dm-snap-transient.o \
 		    dm-snap-persistent.o
 dm-mirror-y	+= dm-raid1.o
+dm-log-userspace-y \
+		+= dm-log-userspace-base.o dm-log-userspace-transfer.o
 md-mod-y	+= md.o bitmap.o
 raid456-y	+= raid5.o
 raid6_pq-y	+= raid6algos.o raid6recov.o raid6tables.o \
@@ -40,6 +42,7 @@ obj-$(CONFIG_DM_MULTIPATH_QL)	+= dm-queue-length.o
 obj-$(CONFIG_DM_MULTIPATH_ST)	+= dm-service-time.o
 obj-$(CONFIG_DM_SNAPSHOT)	+= dm-snapshot.o
 obj-$(CONFIG_DM_MIRROR)		+= dm-mirror.o dm-log.o dm-region-hash.o
+obj-$(CONFIG_DM_LOG_USERSPACE)	+= dm-log-userspace.o
 obj-$(CONFIG_DM_ZERO)		+= dm-zero.o
 
 quiet_cmd_unroll = UNROLL  $@
diff --git a/drivers/md/dm-log-userspace-base.c b/drivers/md/dm-log-userspace-base.c
new file mode 100644
index 000000000000..e69b96560997
--- /dev/null
+++ b/drivers/md/dm-log-userspace-base.c
@@ -0,0 +1,696 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/bio.h>
+#include <linux/dm-dirty-log.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+
+#include "dm-log-userspace-transfer.h"
+
+struct flush_entry {
+	int type;
+	region_t region;
+	struct list_head list;
+};
+
+struct log_c {
+	struct dm_target *ti;
+	uint32_t region_size;
+	region_t region_count;
+	char uuid[DM_UUID_LEN];
+
+	char *usr_argv_str;
+	uint32_t usr_argc;
+
+	/*
+	 * in_sync_hint gets set when doing is_remote_recovering.  It
+	 * represents the first region that needs recovery.  IOW, the
+	 * first zero bit of sync_bits.  This can be useful for to limit
+	 * traffic for calls like is_remote_recovering and get_resync_work,
+	 * but be take care in its use for anything else.
+	 */
+	uint64_t in_sync_hint;
+
+	spinlock_t flush_lock;
+	struct list_head flush_list;  /* only for clear and mark requests */
+};
+
+static mempool_t *flush_entry_pool;
+
+static void *flush_entry_alloc(gfp_t gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct flush_entry), gfp_mask);
+}
+
+static void flush_entry_free(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
+static int userspace_do_request(struct log_c *lc, const char *uuid,
+				int request_type, char *data, size_t data_size,
+				char *rdata, size_t *rdata_size)
+{
+	int r;
+
+	/*
+	 * If the server isn't there, -ESRCH is returned,
+	 * and we must keep trying until the server is
+	 * restored.
+	 */
+retry:
+	r = dm_consult_userspace(uuid, request_type, data,
+				 data_size, rdata, rdata_size);
+
+	if (r != -ESRCH)
+		return r;
+
+	DMERR(" Userspace log server not found.");
+	while (1) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(2*HZ);
+		DMWARN("Attempting to contact userspace log server...");
+		r = dm_consult_userspace(uuid, DM_ULOG_CTR, lc->usr_argv_str,
+					 strlen(lc->usr_argv_str) + 1,
+					 NULL, NULL);
+		if (!r)
+			break;
+	}
+	DMINFO("Reconnected to userspace log server... DM_ULOG_CTR complete");
+	r = dm_consult_userspace(uuid, DM_ULOG_RESUME, NULL,
+				 0, NULL, NULL);
+	if (!r)
+		goto retry;
+
+	DMERR("Error trying to resume userspace log: %d", r);
+
+	return -ESRCH;
+}
+
+static int build_constructor_string(struct dm_target *ti,
+				    unsigned argc, char **argv,
+				    char **ctr_str)
+{
+	int i, str_size;
+	char *str = NULL;
+
+	*ctr_str = NULL;
+
+	for (i = 0, str_size = 0; i < argc; i++)
+		str_size += strlen(argv[i]) + 1; /* +1 for space between args */
+
+	str_size += 20; /* Max number of chars in a printed u64 number */
+
+	str = kzalloc(str_size, GFP_KERNEL);
+	if (!str) {
+		DMWARN("Unable to allocate memory for constructor string");
+		return -ENOMEM;
+	}
+
+	for (i = 0, str_size = 0; i < argc; i++)
+		str_size += sprintf(str + str_size, "%s ", argv[i]);
+	str_size += sprintf(str + str_size, "%llu",
+			    (unsigned long long)ti->len);
+
+	*ctr_str = str;
+	return str_size;
+}
+
+/*
+ * userspace_ctr
+ *
+ * argv contains:
+ *	<UUID> <other args>
+ * Where 'other args' is the userspace implementation specific log
+ * arguments.  An example might be:
+ *	<UUID> clustered_disk <arg count> <log dev> <region_size> [[no]sync]
+ *
+ * So, this module will strip off the <UUID> for identification purposes
+ * when communicating with userspace about a log; but will pass on everything
+ * else.
+ */
+static int userspace_ctr(struct dm_dirty_log *log, struct dm_target *ti,
+			 unsigned argc, char **argv)
+{
+	int r = 0;
+	int str_size;
+	char *ctr_str = NULL;
+	struct log_c *lc = NULL;
+	uint64_t rdata;
+	size_t rdata_size = sizeof(rdata);
+
+	if (argc < 3) {
+		DMWARN("Too few arguments to userspace dirty log");
+		return -EINVAL;
+	}
+
+	lc = kmalloc(sizeof(*lc), GFP_KERNEL);
+	if (!lc) {
+		DMWARN("Unable to allocate userspace log context.");
+		return -ENOMEM;
+	}
+
+	lc->ti = ti;
+
+	if (strlen(argv[0]) > (DM_UUID_LEN - 1)) {
+		DMWARN("UUID argument too long.");
+		kfree(lc);
+		return -EINVAL;
+	}
+
+	strncpy(lc->uuid, argv[0], DM_UUID_LEN);
+	spin_lock_init(&lc->flush_lock);
+	INIT_LIST_HEAD(&lc->flush_list);
+
+	str_size = build_constructor_string(ti, argc - 1, argv + 1, &ctr_str);
+	if (str_size < 0) {
+		kfree(lc);
+		return str_size;
+	}
+
+	/* Send table string */
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_CTR,
+				 ctr_str, str_size, NULL, NULL);
+
+	if (r == -ESRCH) {
+		DMERR("Userspace log server not found");
+		goto out;
+	}
+
+	/* Since the region size does not change, get it now */
+	rdata_size = sizeof(rdata);
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_GET_REGION_SIZE,
+				 NULL, 0, (char *)&rdata, &rdata_size);
+
+	if (r) {
+		DMERR("Failed to get region size of dirty log");
+		goto out;
+	}
+
+	lc->region_size = (uint32_t)rdata;
+	lc->region_count = dm_sector_div_up(ti->len, lc->region_size);
+
+out:
+	if (r) {
+		kfree(lc);
+		kfree(ctr_str);
+	} else {
+		lc->usr_argv_str = ctr_str;
+		lc->usr_argc = argc;
+		log->context = lc;
+	}
+
+	return r;
+}
+
+static void userspace_dtr(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_DTR,
+				 NULL, 0,
+				 NULL, NULL);
+
+	kfree(lc->usr_argv_str);
+	kfree(lc);
+
+	return;
+}
+
+static int userspace_presuspend(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_PRESUSPEND,
+				 NULL, 0,
+				 NULL, NULL);
+
+	return r;
+}
+
+static int userspace_postsuspend(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_POSTSUSPEND,
+				 NULL, 0,
+				 NULL, NULL);
+
+	return r;
+}
+
+static int userspace_resume(struct dm_dirty_log *log)
+{
+	int r;
+	struct log_c *lc = log->context;
+
+	lc->in_sync_hint = 0;
+	r = dm_consult_userspace(lc->uuid, DM_ULOG_RESUME,
+				 NULL, 0,
+				 NULL, NULL);
+
+	return r;
+}
+
+static uint32_t userspace_get_region_size(struct dm_dirty_log *log)
+{
+	struct log_c *lc = log->context;
+
+	return lc->region_size;
+}
+
+/*
+ * userspace_is_clean
+ *
+ * Check whether a region is clean.  If there is any sort of
+ * failure when consulting the server, we return not clean.
+ *
+ * Returns: 1 if clean, 0 otherwise
+ */
+static int userspace_is_clean(struct dm_dirty_log *log, region_t region)
+{
+	int r;
+	uint64_t region64 = (uint64_t)region;
+	int64_t is_clean;
+	size_t rdata_size;
+	struct log_c *lc = log->context;
+
+	rdata_size = sizeof(is_clean);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_CLEAN,
+				 (char *)&region64, sizeof(region64),
+				 (char *)&is_clean, &rdata_size);
+
+	return (r) ? 0 : (int)is_clean;
+}
+
+/*
+ * userspace_in_sync
+ *
+ * Check if the region is in-sync.  If there is any sort
+ * of failure when consulting the server, we assume that
+ * the region is not in sync.
+ *
+ * If 'can_block' is set, return immediately
+ *
+ * Returns: 1 if in-sync, 0 if not-in-sync, -EWOULDBLOCK
+ */
+static int userspace_in_sync(struct dm_dirty_log *log, region_t region,
+			     int can_block)
+{
+	int r;
+	uint64_t region64 = region;
+	int64_t in_sync;
+	size_t rdata_size;
+	struct log_c *lc = log->context;
+
+	/*
+	 * We can never respond directly - even if in_sync_hint is
+	 * set.  This is because another machine could see a device
+	 * failure and mark the region out-of-sync.  If we don't go
+	 * to userspace to ask, we might think the region is in-sync
+	 * and allow a read to pick up data that is stale.  (This is
+	 * very unlikely if a device actually fails; but it is very
+	 * likely if a connection to one device from one machine fails.)
+	 *
+	 * There still might be a problem if the mirror caches the region
+	 * state as in-sync... but then this call would not be made.  So,
+	 * that is a mirror problem.
+	 */
+	if (!can_block)
+		return -EWOULDBLOCK;
+
+	rdata_size = sizeof(in_sync);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IN_SYNC,
+				 (char *)&region64, sizeof(region64),
+				 (char *)&in_sync, &rdata_size);
+	return (r) ? 0 : (int)in_sync;
+}
+
+/*
+ * userspace_flush
+ *
+ * This function is ok to block.
+ * The flush happens in two stages.  First, it sends all
+ * clear/mark requests that are on the list.  Then it
+ * tells the server to commit them.  This gives the
+ * server a chance to optimise the commit, instead of
+ * doing it for every request.
+ *
+ * Additionally, we could implement another thread that
+ * sends the requests up to the server - reducing the
+ * load on flush.  Then the flush would have less in
+ * the list and be responsible for the finishing commit.
+ *
+ * Returns: 0 on success, < 0 on failure
+ */
+static int userspace_flush(struct dm_dirty_log *log)
+{
+	int r = 0;
+	unsigned long flags;
+	struct log_c *lc = log->context;
+	LIST_HEAD(flush_list);
+	struct flush_entry *fe, *tmp_fe;
+
+	spin_lock_irqsave(&lc->flush_lock, flags);
+	list_splice_init(&lc->flush_list, &flush_list);
+	spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+	if (list_empty(&flush_list))
+		return 0;
+
+	/*
+	 * FIXME: Count up requests, group request types,
+	 * allocate memory to stick all requests in and
+	 * send to server in one go.  Failing the allocation,
+	 * do it one by one.
+	 */
+
+	list_for_each_entry(fe, &flush_list, list) {
+		r = userspace_do_request(lc, lc->uuid, fe->type,
+					 (char *)&fe->region,
+					 sizeof(fe->region),
+					 NULL, NULL);
+		if (r)
+			goto fail;
+	}
+
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_FLUSH,
+				 NULL, 0, NULL, NULL);
+
+fail:
+	/*
+	 * We can safely remove these entries, even if failure.
+	 * Calling code will receive an error and will know that
+	 * the log facility has failed.
+	 */
+	list_for_each_entry_safe(fe, tmp_fe, &flush_list, list) {
+		list_del(&fe->list);
+		mempool_free(fe, flush_entry_pool);
+	}
+
+	if (r)
+		dm_table_event(lc->ti->table);
+
+	return r;
+}
+
+/*
+ * userspace_mark_region
+ *
+ * This function should avoid blocking unless absolutely required.
+ * (Memory allocation is valid for blocking.)
+ */
+static void userspace_mark_region(struct dm_dirty_log *log, region_t region)
+{
+	unsigned long flags;
+	struct log_c *lc = log->context;
+	struct flush_entry *fe;
+
+	/* Wait for an allocation, but _never_ fail */
+	fe = mempool_alloc(flush_entry_pool, GFP_NOIO);
+	BUG_ON(!fe);
+
+	spin_lock_irqsave(&lc->flush_lock, flags);
+	fe->type = DM_ULOG_MARK_REGION;
+	fe->region = region;
+	list_add(&fe->list, &lc->flush_list);
+	spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+	return;
+}
+
+/*
+ * userspace_clear_region
+ *
+ * This function must not block.
+ * So, the alloc can't block.  In the worst case, it is ok to
+ * fail.  It would simply mean we can't clear the region.
+ * Does nothing to current sync context, but does mean
+ * the region will be re-sync'ed on a reload of the mirror
+ * even though it is in-sync.
+ */
+static void userspace_clear_region(struct dm_dirty_log *log, region_t region)
+{
+	unsigned long flags;
+	struct log_c *lc = log->context;
+	struct flush_entry *fe;
+
+	/*
+	 * If we fail to allocate, we skip the clearing of
+	 * the region.  This doesn't hurt us in any way, except
+	 * to cause the region to be resync'ed when the
+	 * device is activated next time.
+	 */
+	fe = mempool_alloc(flush_entry_pool, GFP_ATOMIC);
+	if (!fe) {
+		DMERR("Failed to allocate memory to clear region.");
+		return;
+	}
+
+	spin_lock_irqsave(&lc->flush_lock, flags);
+	fe->type = DM_ULOG_CLEAR_REGION;
+	fe->region = region;
+	list_add(&fe->list, &lc->flush_list);
+	spin_unlock_irqrestore(&lc->flush_lock, flags);
+
+	return;
+}
+
+/*
+ * userspace_get_resync_work
+ *
+ * Get a region that needs recovery.  It is valid to return
+ * an error for this function.
+ *
+ * Returns: 1 if region filled, 0 if no work, <0 on error
+ */
+static int userspace_get_resync_work(struct dm_dirty_log *log, region_t *region)
+{
+	int r;
+	size_t rdata_size;
+	struct log_c *lc = log->context;
+	struct {
+		int64_t i; /* 64-bit for mix arch compatibility */
+		region_t r;
+	} pkg;
+
+	if (lc->in_sync_hint >= lc->region_count)
+		return 0;
+
+	rdata_size = sizeof(pkg);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_RESYNC_WORK,
+				 NULL, 0,
+				 (char *)&pkg, &rdata_size);
+
+	*region = pkg.r;
+	return (r) ? r : (int)pkg.i;
+}
+
+/*
+ * userspace_set_region_sync
+ *
+ * Set the sync status of a given region.  This function
+ * must not fail.
+ */
+static void userspace_set_region_sync(struct dm_dirty_log *log,
+				      region_t region, int in_sync)
+{
+	int r;
+	struct log_c *lc = log->context;
+	struct {
+		region_t r;
+		int64_t i;
+	} pkg;
+
+	pkg.r = region;
+	pkg.i = (int64_t)in_sync;
+
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_SET_REGION_SYNC,
+				 (char *)&pkg, sizeof(pkg),
+				 NULL, NULL);
+
+	/*
+	 * It would be nice to be able to report failures.
+	 * However, it is easy emough to detect and resolve.
+	 */
+	return;
+}
+
+/*
+ * userspace_get_sync_count
+ *
+ * If there is any sort of failure when consulting the server,
+ * we assume that the sync count is zero.
+ *
+ * Returns: sync count on success, 0 on failure
+ */
+static region_t userspace_get_sync_count(struct dm_dirty_log *log)
+{
+	int r;
+	size_t rdata_size;
+	uint64_t sync_count;
+	struct log_c *lc = log->context;
+
+	rdata_size = sizeof(sync_count);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_GET_SYNC_COUNT,
+				 NULL, 0,
+				 (char *)&sync_count, &rdata_size);
+
+	if (r)
+		return 0;
+
+	if (sync_count >= lc->region_count)
+		lc->in_sync_hint = lc->region_count;
+
+	return (region_t)sync_count;
+}
+
+/*
+ * userspace_status
+ *
+ * Returns: amount of space consumed
+ */
+static int userspace_status(struct dm_dirty_log *log, status_type_t status_type,
+			    char *result, unsigned maxlen)
+{
+	int r = 0;
+	size_t sz = (size_t)maxlen;
+	struct log_c *lc = log->context;
+
+	switch (status_type) {
+	case STATUSTYPE_INFO:
+		r = userspace_do_request(lc, lc->uuid, DM_ULOG_STATUS_INFO,
+					 NULL, 0,
+					 result, &sz);
+
+		if (r) {
+			sz = 0;
+			DMEMIT("%s 1 COM_FAILURE", log->type->name);
+		}
+		break;
+	case STATUSTYPE_TABLE:
+		sz = 0;
+		DMEMIT("%s %u %s %s", log->type->name, lc->usr_argc + 1,
+		       lc->uuid, lc->usr_argv_str);
+		break;
+	}
+	return (r) ? 0 : (int)sz;
+}
+
+/*
+ * userspace_is_remote_recovering
+ *
+ * Returns: 1 if region recovering, 0 otherwise
+ */
+static int userspace_is_remote_recovering(struct dm_dirty_log *log,
+					  region_t region)
+{
+	int r;
+	uint64_t region64 = region;
+	struct log_c *lc = log->context;
+	static unsigned long long limit;
+	struct {
+		int64_t is_recovering;
+		uint64_t in_sync_hint;
+	} pkg;
+	size_t rdata_size = sizeof(pkg);
+
+	/*
+	 * Once the mirror has been reported to be in-sync,
+	 * it will never again ask for recovery work.  So,
+	 * we can safely say there is not a remote machine
+	 * recovering if the device is in-sync.  (in_sync_hint
+	 * must be reset at resume time.)
+	 */
+	if (region < lc->in_sync_hint)
+		return 0;
+	else if (jiffies < limit)
+		return 1;
+
+	limit = jiffies + (HZ / 4);
+	r = userspace_do_request(lc, lc->uuid, DM_ULOG_IS_REMOTE_RECOVERING,
+				 (char *)&region64, sizeof(region64),
+				 (char *)&pkg, &rdata_size);
+	if (r)
+		return 1;
+
+	lc->in_sync_hint = pkg.in_sync_hint;
+
+	return (int)pkg.is_recovering;
+}
+
+static struct dm_dirty_log_type _userspace_type = {
+	.name = "userspace",
+	.module = THIS_MODULE,
+	.ctr = userspace_ctr,
+	.dtr = userspace_dtr,
+	.presuspend = userspace_presuspend,
+	.postsuspend = userspace_postsuspend,
+	.resume = userspace_resume,
+	.get_region_size = userspace_get_region_size,
+	.is_clean = userspace_is_clean,
+	.in_sync = userspace_in_sync,
+	.flush = userspace_flush,
+	.mark_region = userspace_mark_region,
+	.clear_region = userspace_clear_region,
+	.get_resync_work = userspace_get_resync_work,
+	.set_region_sync = userspace_set_region_sync,
+	.get_sync_count = userspace_get_sync_count,
+	.status = userspace_status,
+	.is_remote_recovering = userspace_is_remote_recovering,
+};
+
+static int __init userspace_dirty_log_init(void)
+{
+	int r = 0;
+
+	flush_entry_pool = mempool_create(100, flush_entry_alloc,
+					  flush_entry_free, NULL);
+
+	if (!flush_entry_pool) {
+		DMWARN("Unable to create flush_entry_pool:  No memory.");
+		return -ENOMEM;
+	}
+
+	r = dm_ulog_tfr_init();
+	if (r) {
+		DMWARN("Unable to initialize userspace log communications");
+		mempool_destroy(flush_entry_pool);
+		return r;
+	}
+
+	r = dm_dirty_log_type_register(&_userspace_type);
+	if (r) {
+		DMWARN("Couldn't register userspace dirty log type");
+		dm_ulog_tfr_exit();
+		mempool_destroy(flush_entry_pool);
+		return r;
+	}
+
+	DMINFO("version 1.0.0 loaded");
+	return 0;
+}
+
+static void __exit userspace_dirty_log_exit(void)
+{
+	dm_dirty_log_type_unregister(&_userspace_type);
+	dm_ulog_tfr_exit();
+	mempool_destroy(flush_entry_pool);
+
+	DMINFO("version 1.0.0 unloaded");
+	return;
+}
+
+module_init(userspace_dirty_log_init);
+module_exit(userspace_dirty_log_exit);
+
+MODULE_DESCRIPTION(DM_NAME " userspace dirty log link");
+MODULE_AUTHOR("Jonathan Brassow <dm-devel@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/md/dm-log-userspace-transfer.c b/drivers/md/dm-log-userspace-transfer.c
new file mode 100644
index 000000000000..0ca1ee768a1f
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <net/sock.h>
+#include <linux/workqueue.h>
+#include <linux/connector.h>
+#include <linux/device-mapper.h>
+#include <linux/dm-log-userspace.h>
+
+#include "dm-log-userspace-transfer.h"
+
+static uint32_t dm_ulog_seq;
+
+/*
+ * Netlink/Connector is an unreliable protocol.  How long should
+ * we wait for a response before assuming it was lost and retrying?
+ * (If we do receive a response after this time, it will be discarded
+ * and the response to the resent request will be waited for.
+ */
+#define DM_ULOG_RETRY_TIMEOUT (15 * HZ)
+
+/*
+ * Pre-allocated space for speed
+ */
+#define DM_ULOG_PREALLOCED_SIZE 512
+static struct cn_msg *prealloced_cn_msg;
+static struct dm_ulog_request *prealloced_ulog_tfr;
+
+static struct cb_id ulog_cn_id = {
+	.idx = CN_IDX_DM,
+	.val = CN_VAL_DM_USERSPACE_LOG
+};
+
+static DEFINE_MUTEX(dm_ulog_lock);
+
+struct receiving_pkg {
+	struct list_head list;
+	struct completion complete;
+
+	uint32_t seq;
+
+	int error;
+	size_t *data_size;
+	char *data;
+};
+
+static DEFINE_SPINLOCK(receiving_list_lock);
+static struct list_head receiving_list;
+
+static int dm_ulog_sendto_server(struct dm_ulog_request *tfr)
+{
+	int r;
+	struct cn_msg *msg = prealloced_cn_msg;
+
+	memset(msg, 0, sizeof(struct cn_msg));
+
+	msg->id.idx = ulog_cn_id.idx;
+	msg->id.val = ulog_cn_id.val;
+	msg->ack = 0;
+	msg->seq = tfr->seq;
+	msg->len = sizeof(struct dm_ulog_request) + tfr->data_size;
+
+	r = cn_netlink_send(msg, 0, gfp_any());
+
+	return r;
+}
+
+/*
+ * Parameters for this function can be either msg or tfr, but not
+ * both.  This function fills in the reply for a waiting request.
+ * If just msg is given, then the reply is simply an ACK from userspace
+ * that the request was received.
+ *
+ * Returns: 0 on success, -ENOENT on failure
+ */
+static int fill_pkg(struct cn_msg *msg, struct dm_ulog_request *tfr)
+{
+	uint32_t rtn_seq = (msg) ? msg->seq : (tfr) ? tfr->seq : 0;
+	struct receiving_pkg *pkg;
+
+	/*
+	 * The 'receiving_pkg' entries in this list are statically
+	 * allocated on the stack in 'dm_consult_userspace'.
+	 * Each process that is waiting for a reply from the user
+	 * space server will have an entry in this list.
+	 *
+	 * We are safe to do it this way because the stack space
+	 * is unique to each process, but still addressable by
+	 * other processes.
+	 */
+	list_for_each_entry(pkg, &receiving_list, list) {
+		if (rtn_seq != pkg->seq)
+			continue;
+
+		if (msg) {
+			pkg->error = -msg->ack;
+			/*
+			 * If we are trying again, we will need to know our
+			 * storage capacity.  Otherwise, along with the
+			 * error code, we make explicit that we have no data.
+			 */
+			if (pkg->error != -EAGAIN)
+				*(pkg->data_size) = 0;
+		} else if (tfr->data_size > *(pkg->data_size)) {
+			DMERR("Insufficient space to receive package [%u] "
+			      "(%u vs %lu)", tfr->request_type,
+			      tfr->data_size, *(pkg->data_size));
+
+			*(pkg->data_size) = 0;
+			pkg->error = -ENOSPC;
+		} else {
+			pkg->error = tfr->error;
+			memcpy(pkg->data, tfr->data, tfr->data_size);
+			*(pkg->data_size) = tfr->data_size;
+		}
+		complete(&pkg->complete);
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+/*
+ * This is the connector callback that delivers data
+ * that was sent from userspace.
+ */
+static void cn_ulog_callback(void *data)
+{
+	struct cn_msg *msg = (struct cn_msg *)data;
+	struct dm_ulog_request *tfr = (struct dm_ulog_request *)(msg + 1);
+
+	spin_lock(&receiving_list_lock);
+	if (msg->len == 0)
+		fill_pkg(msg, NULL);
+	else if (msg->len < sizeof(*tfr))
+		DMERR("Incomplete message received (expected %u, got %u): [%u]",
+		      (unsigned)sizeof(*tfr), msg->len, msg->seq);
+	else
+		fill_pkg(NULL, tfr);
+	spin_unlock(&receiving_list_lock);
+}
+
+/**
+ * dm_consult_userspace
+ * @uuid: log's uuid (must be DM_UUID_LEN in size)
+ * @request_type:  found in include/linux/dm-log-userspace.h
+ * @data: data to tx to the server
+ * @data_size: size of data in bytes
+ * @rdata: place to put return data from server
+ * @rdata_size: value-result (amount of space given/amount of space used)
+ *
+ * rdata_size is undefined on failure.
+ *
+ * Memory used to communicate with userspace is zero'ed
+ * before populating to ensure that no unwanted bits leak
+ * from kernel space to user-space.  All userspace log communications
+ * between kernel and user space go through this function.
+ *
+ * Returns: 0 on success, -EXXX on failure
+ **/
+int dm_consult_userspace(const char *uuid, int request_type,
+			 char *data, size_t data_size,
+			 char *rdata, size_t *rdata_size)
+{
+	int r = 0;
+	size_t dummy = 0;
+	int overhead_size =
+		sizeof(struct dm_ulog_request *) + sizeof(struct cn_msg);
+	struct dm_ulog_request *tfr = prealloced_ulog_tfr;
+	struct receiving_pkg pkg;
+
+	if (data_size > (DM_ULOG_PREALLOCED_SIZE - overhead_size)) {
+		DMINFO("Size of tfr exceeds preallocated size");
+		return -EINVAL;
+	}
+
+	if (!rdata_size)
+		rdata_size = &dummy;
+resend:
+	/*
+	 * We serialize the sending of requests so we can
+	 * use the preallocated space.
+	 */
+	mutex_lock(&dm_ulog_lock);
+
+	memset(tfr, 0, DM_ULOG_PREALLOCED_SIZE - overhead_size);
+	memcpy(tfr->uuid, uuid, DM_UUID_LEN);
+	tfr->seq = dm_ulog_seq++;
+
+	/*
+	 * Must be valid request type (all other bits set to
+	 * zero).  This reserves other bits for possible future
+	 * use.
+	 */
+	tfr->request_type = request_type & DM_ULOG_REQUEST_MASK;
+
+	tfr->data_size = data_size;
+	if (data && data_size)
+		memcpy(tfr->data, data, data_size);
+
+	memset(&pkg, 0, sizeof(pkg));
+	init_completion(&pkg.complete);
+	pkg.seq = tfr->seq;
+	pkg.data_size = rdata_size;
+	pkg.data = rdata;
+	spin_lock(&receiving_list_lock);
+	list_add(&(pkg.list), &receiving_list);
+	spin_unlock(&receiving_list_lock);
+
+	r = dm_ulog_sendto_server(tfr);
+
+	mutex_unlock(&dm_ulog_lock);
+
+	if (r) {
+		DMERR("Unable to send log request [%u] to userspace: %d",
+		      request_type, r);
+		spin_lock(&receiving_list_lock);
+		list_del_init(&(pkg.list));
+		spin_unlock(&receiving_list_lock);
+
+		goto out;
+	}
+
+	r = wait_for_completion_timeout(&(pkg.complete), DM_ULOG_RETRY_TIMEOUT);
+	spin_lock(&receiving_list_lock);
+	list_del_init(&(pkg.list));
+	spin_unlock(&receiving_list_lock);
+	if (!r) {
+		DMWARN("[%s] Request timed out: [%u/%u] - retrying",
+		       (strlen(uuid) > 8) ?
+		       (uuid + (strlen(uuid) - 8)) : (uuid),
+		       request_type, pkg.seq);
+		goto resend;
+	}
+
+	r = pkg.error;
+	if (r == -EAGAIN)
+		goto resend;
+
+out:
+	return r;
+}
+
+int dm_ulog_tfr_init(void)
+{
+	int r;
+	void *prealloced;
+
+	INIT_LIST_HEAD(&receiving_list);
+
+	prealloced = kmalloc(DM_ULOG_PREALLOCED_SIZE, GFP_KERNEL);
+	if (!prealloced)
+		return -ENOMEM;
+
+	prealloced_cn_msg = prealloced;
+	prealloced_ulog_tfr = prealloced + sizeof(struct cn_msg);
+
+	r = cn_add_callback(&ulog_cn_id, "dmlogusr", cn_ulog_callback);
+	if (r) {
+		cn_del_callback(&ulog_cn_id);
+		return r;
+	}
+
+	return 0;
+}
+
+void dm_ulog_tfr_exit(void)
+{
+	cn_del_callback(&ulog_cn_id);
+	kfree(prealloced_cn_msg);
+}
diff --git a/drivers/md/dm-log-userspace-transfer.h b/drivers/md/dm-log-userspace-transfer.h
new file mode 100644
index 000000000000..c26d8e4e2710
--- /dev/null
+++ b/drivers/md/dm-log-userspace-transfer.h
@@ -0,0 +1,18 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef __DM_LOG_USERSPACE_TRANSFER_H__
+#define __DM_LOG_USERSPACE_TRANSFER_H__
+
+#define DM_MSG_PREFIX "dm-log-userspace"
+
+int dm_ulog_tfr_init(void);
+void dm_ulog_tfr_exit(void);
+int dm_consult_userspace(const char *uuid, int request_type,
+			 char *data, size_t data_size,
+			 char *rdata, size_t *rdata_size);
+
+#endif /* __DM_LOG_USERSPACE_TRANSFER_H__ */
diff --git a/include/linux/Kbuild b/include/linux/Kbuild
index 03f22076381f..334a3593cdfd 100644
--- a/include/linux/Kbuild
+++ b/include/linux/Kbuild
@@ -57,6 +57,7 @@ header-y += dlmconstants.h
 header-y += dlm_device.h
 header-y += dlm_netlink.h
 header-y += dm-ioctl.h
+header-y += dm-log-userspace.h
 header-y += dn.h
 header-y += dqblk_xfs.h
 header-y += efs_fs_sb.h
diff --git a/include/linux/connector.h b/include/linux/connector.h
index b9966e64604e..b68d27850d51 100644
--- a/include/linux/connector.h
+++ b/include/linux/connector.h
@@ -41,8 +41,10 @@
 #define CN_IDX_BB			0x5	/* BlackBoard, from the TSP GPL sampling framework */
 #define CN_DST_IDX			0x6
 #define CN_DST_VAL			0x1
+#define CN_IDX_DM			0x7	/* Device Mapper */
+#define CN_VAL_DM_USERSPACE_LOG		0x1
 
-#define CN_NETLINK_USERS		7
+#define CN_NETLINK_USERS		8
 
 /*
  * Maximum connector's message size.
diff --git a/include/linux/dm-log-userspace.h b/include/linux/dm-log-userspace.h
new file mode 100644
index 000000000000..642e3017b51f
--- /dev/null
+++ b/include/linux/dm-log-userspace.h
@@ -0,0 +1,386 @@
+/*
+ * Copyright (C) 2006-2009 Red Hat, Inc.
+ *
+ * This file is released under the LGPL.
+ */
+
+#ifndef __DM_LOG_USERSPACE_H__
+#define __DM_LOG_USERSPACE_H__
+
+#include <linux/dm-ioctl.h> /* For DM_UUID_LEN */
+
+/*
+ * The device-mapper userspace log module consists of a kernel component and
+ * a user-space component.  The kernel component implements the API defined
+ * in dm-dirty-log.h.  Its purpose is simply to pass the parameters and
+ * return values of those API functions between kernel and user-space.
+ *
+ * Below are defined the 'request_types' - DM_ULOG_CTR, DM_ULOG_DTR, etc.
+ * These request types represent the different functions in the device-mapper
+ * dirty log API.  Each of these is described in more detail below.
+ *
+ * The user-space program must listen for requests from the kernel (representing
+ * the various API functions) and process them.
+ *
+ * User-space begins by setting up the communication link (error checking
+ * removed for clarity):
+ *	fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
+ *	addr.nl_family = AF_NETLINK;
+ *	addr.nl_groups = CN_IDX_DM;
+ *	addr.nl_pid = 0;
+ *	r = bind(fd, (struct sockaddr *) &addr, sizeof(addr));
+ *	opt = addr.nl_groups;
+ *	setsockopt(fd, SOL_NETLINK, NETLINK_ADD_MEMBERSHIP, &opt, sizeof(opt));
+ *
+ * User-space will then wait to receive requests form the kernel, which it
+ * will process as described below.  The requests are received in the form,
+ * ((struct dm_ulog_request) + (additional data)).  Depending on the request
+ * type, there may or may not be 'additional data'.  In the descriptions below,
+ * you will see 'Payload-to-userspace' and 'Payload-to-kernel'.  The
+ * 'Payload-to-userspace' is what the kernel sends in 'additional data' as
+ * necessary parameters to complete the request.  The 'Payload-to-kernel' is
+ * the 'additional data' returned to the kernel that contains the necessary
+ * results of the request.  The 'data_size' field in the dm_ulog_request
+ * structure denotes the availability and amount of payload data.
+ */
+
+/*
+ * DM_ULOG_CTR corresponds to (found in dm-dirty-log.h):
+ * int (*ctr)(struct dm_dirty_log *log, struct dm_target *ti,
+ *	      unsigned argc, char **argv);
+ *
+ * Payload-to-userspace:
+ *	A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * The UUID contained in the dm_ulog_request structure is the reference that
+ * will be used by all request types to a specific log.  The constructor must
+ * record this assotiation with instance created.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_CTR                    1
+
+/*
+ * DM_ULOG_DTR corresponds to (found in dm-dirty-log.h):
+ * void (*dtr)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being destroyed.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_DTR                    2
+
+/*
+ * DM_ULOG_PRESUSPEND corresponds to (found in dm-dirty-log.h):
+ * int (*presuspend)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being presuspended.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_PRESUSPEND             3
+
+/*
+ * DM_ULOG_POSTSUSPEND corresponds to (found in dm-dirty-log.h):
+ * int (*postsuspend)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being postsuspended.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_POSTSUSPEND            4
+
+/*
+ * DM_ULOG_RESUME corresponds to (found in dm-dirty-log.h):
+ * int (*resume)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * The UUID contained in the dm_ulog_request structure is all that is
+ * necessary to identify the log instance being resumed.  There is no
+ * payload data.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_RESUME                 5
+
+/*
+ * DM_ULOG_GET_REGION_SIZE corresponds to (found in dm-dirty-log.h):
+ * uint32_t (*get_region_size)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	uint64_t - contains the region size
+ *
+ * The region size is something that was determined at constructor time.
+ * It is returned in the payload area and 'data_size' is set to
+ * reflect this.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field appropriately.
+ */
+#define DM_ULOG_GET_REGION_SIZE        6
+
+/*
+ * DM_ULOG_IS_CLEAN corresponds to (found in dm-dirty-log.h):
+ * int (*is_clean)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *	uint64_t - the region to get clean status on
+ * Payload-to-kernel:
+ *	int64_t  - 1 if clean, 0 otherwise
+ *
+ * Payload is sizeof(uint64_t) and contains the region for which the clean
+ * status is being made.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - filling the payload with 0 (not clean) or
+ * 1 (clean), setting 'data_size' and 'error' appropriately.
+ */
+#define DM_ULOG_IS_CLEAN               7
+
+/*
+ * DM_ULOG_IN_SYNC corresponds to (found in dm-dirty-log.h):
+ * int (*in_sync)(struct dm_dirty_log *log, region_t region,
+ *		  int can_block);
+ *
+ * Payload-to-userspace:
+ *	uint64_t - the region to get sync status on
+ * Payload-to-kernel:
+ *	int64_t - 1 if in-sync, 0 otherwise
+ *
+ * Exactly the same as 'is_clean' above, except this time asking "has the
+ * region been recovered?" vs. "is the region not being modified?"
+ */
+#define DM_ULOG_IN_SYNC                8
+
+/*
+ * DM_ULOG_FLUSH corresponds to (found in dm-dirty-log.h):
+ * int (*flush)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	None.
+ *
+ * No incoming or outgoing payload.  Simply flush log state to disk.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_FLUSH                  9
+
+/*
+ * DM_ULOG_MARK_REGION corresponds to (found in dm-dirty-log.h):
+ * void (*mark_region)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *	uint64_t [] - region(s) to mark
+ * Payload-to-kernel:
+ *	None.
+ *
+ * Incoming payload contains the one or more regions to mark dirty.
+ * The number of regions contained in the payload can be determined from
+ * 'data_size/sizeof(uint64_t)'.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_MARK_REGION           10
+
+/*
+ * DM_ULOG_CLEAR_REGION corresponds to (found in dm-dirty-log.h):
+ * void (*clear_region)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *	uint64_t [] - region(s) to clear
+ * Payload-to-kernel:
+ *	None.
+ *
+ * Incoming payload contains the one or more regions to mark clean.
+ * The number of regions contained in the payload can be determined from
+ * 'data_size/sizeof(uint64_t)'.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_CLEAR_REGION          11
+
+/*
+ * DM_ULOG_GET_RESYNC_WORK corresponds to (found in dm-dirty-log.h):
+ * int (*get_resync_work)(struct dm_dirty_log *log, region_t *region);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	{
+ *		int64_t i; -- 1 if recovery necessary, 0 otherwise
+ *		uint64_t r; -- The region to recover if i=1
+ *	}
+ * 'data_size' should be set appropriately.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field appropriately.
+ */
+#define DM_ULOG_GET_RESYNC_WORK       12
+
+/*
+ * DM_ULOG_SET_REGION_SYNC corresponds to (found in dm-dirty-log.h):
+ * void (*set_region_sync)(struct dm_dirty_log *log,
+ *			   region_t region, int in_sync);
+ *
+ * Payload-to-userspace:
+ *	{
+ *		uint64_t - region to set sync state on
+ *		int64_t  - 0 if not-in-sync, 1 if in-sync
+ *	}
+ * Payload-to-kernel:
+ *	None.
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and clearing
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_SET_REGION_SYNC       13
+
+/*
+ * DM_ULOG_GET_SYNC_COUNT corresponds to (found in dm-dirty-log.h):
+ * region_t (*get_sync_count)(struct dm_dirty_log *log);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	uint64_t - the number of in-sync regions
+ *
+ * No incoming payload.  Kernel-bound payload contains the number of
+ * regions that are in-sync (in a size_t).
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_GET_SYNC_COUNT        14
+
+/*
+ * DM_ULOG_STATUS_INFO corresponds to (found in dm-dirty-log.h):
+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_INFO,
+ *		 char *result, unsigned maxlen);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	Character string containing STATUSTYPE_INFO
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_STATUS_INFO           15
+
+/*
+ * DM_ULOG_STATUS_TABLE corresponds to (found in dm-dirty-log.h):
+ * int (*status)(struct dm_dirty_log *log, STATUSTYPE_TABLE,
+ *		 char *result, unsigned maxlen);
+ *
+ * Payload-to-userspace:
+ *	None.
+ * Payload-to-kernel:
+ *	Character string containing STATUSTYPE_TABLE
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_STATUS_TABLE          16
+
+/*
+ * DM_ULOG_IS_REMOTE_RECOVERING corresponds to (found in dm-dirty-log.h):
+ * int (*is_remote_recovering)(struct dm_dirty_log *log, region_t region);
+ *
+ * Payload-to-userspace:
+ *	uint64_t - region to determine recovery status on
+ * Payload-to-kernel:
+ *	{
+ *		int64_t is_recovering;  -- 0 if no, 1 if yes
+ *		uint64_t in_sync_hint;  -- lowest region still needing resync
+ *	}
+ *
+ * When the request has been processed, user-space must return the
+ * dm_ulog_request to the kernel - setting the 'error' field and
+ * 'data_size' appropriately.
+ */
+#define DM_ULOG_IS_REMOTE_RECOVERING  17
+
+/*
+ * (DM_ULOG_REQUEST_MASK & request_type) to get the request type
+ *
+ * Payload-to-userspace:
+ *	A single string containing all the argv arguments separated by ' 's
+ * Payload-to-kernel:
+ *	None.  ('data_size' in the dm_ulog_request struct should be 0.)
+ *
+ * We are reserving 8 bits of the 32-bit 'request_type' field for the
+ * various request types above.  The remaining 24-bits are currently
+ * set to zero and are reserved for future use and compatibility concerns.
+ *
+ * User-space should always use DM_ULOG_REQUEST_TYPE to aquire the
+ * request type from the 'request_type' field to maintain forward compatibility.
+ */
+#define DM_ULOG_REQUEST_MASK 0xFF
+#define DM_ULOG_REQUEST_TYPE(request_type) \
+	(DM_ULOG_REQUEST_MASK & (request_type))
+
+struct dm_ulog_request {
+	char uuid[DM_UUID_LEN]; /* Ties a request to a specific mirror log */
+	char padding[7];        /* Padding because DM_UUID_LEN = 129 */
+
+	int32_t error;          /* Used to report back processing errors */
+
+	uint32_t seq;           /* Sequence number for request */
+	uint32_t request_type;  /* DM_ULOG_* defined above */
+	uint32_t data_size;     /* How much data (not including this struct) */
+
+	char data[0];
+};
+
+#endif /* __DM_LOG_USERSPACE_H__ */
-- 
cgit v1.2.3


From cec47e3d4a861e1d942b3a580d0bbef2700d2bb2 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Mon, 22 Jun 2009 10:12:35 +0100
Subject: dm: prepare for request based option

This patch adds core functions for request-based dm.

When struct mapped device (md) is initialized, md->queue has
an I/O scheduler and the following functions are used for
request-based dm as the queue functions:
    make_request_fn: dm_make_request()
    pref_fn:         dm_prep_fn()
    request_fn:      dm_request_fn()
    softirq_done_fn: dm_softirq_done()
    lld_busy_fn:     dm_lld_busy()
Actual initializations are done in another patch (PATCH 2).

Below is a brief summary of how request-based dm behaves, including:
  - making request from bio
  - cloning, mapping and dispatching request
  - completing request and bio
  - suspending md
  - resuming md

  bio to request
  ==============
  md->queue->make_request_fn() (dm_make_request()) calls __make_request()
  for a bio submitted to the md.
  Then, the bio is kept in the queue as a new request or merged into
  another request in the queue if possible.

  Cloning and Mapping
  ===================
  Cloning and mapping are done in md->queue->request_fn() (dm_request_fn()),
  when requests are dispatched after they are sorted by the I/O scheduler.

  dm_request_fn() checks busy state of underlying devices using
  target's busy() function and stops dispatching requests to keep them
  on the dm device's queue if busy.
  It helps better I/O merging, since no merge is done for a request
  once it is dispatched to underlying devices.

  Actual cloning and mapping are done in dm_prep_fn() and map_request()
  called from dm_request_fn().
  dm_prep_fn() clones not only request but also bios of the request
  so that dm can hold bio completion in error cases and prevent
  the bio submitter from noticing the error.
  (See the "Completion" section below for details.)

  After the cloning, the clone is mapped by target's map_rq() function
    and inserted to underlying device's queue using
    blk_insert_cloned_request().

  Completion
  ==========
  Request completion can be hooked by rq->end_io(), but then, all bios
  in the request will have been completed even error cases, and the bio
  submitter will have noticed the error.
  To prevent the bio completion in error cases, request-based dm clones
  both bio and request and hooks both bio->bi_end_io() and rq->end_io():
      bio->bi_end_io(): end_clone_bio()
      rq->end_io():     end_clone_request()

  Summary of the request completion flow is below:
  blk_end_request() for a clone request
    => blk_update_request()
       => bio->bi_end_io() == end_clone_bio() for each clone bio
          => Free the clone bio
          => Success: Complete the original bio (blk_update_request())
             Error:   Don't complete the original bio
    => blk_finish_request()
       => rq->end_io() == end_clone_request()
          => blk_complete_request()
             => dm_softirq_done()
                => Free the clone request
                => Success: Complete the original request (blk_end_request())
                   Error:   Requeue the original request

  end_clone_bio() completes the original request on the size of
  the original bio in successful cases.
  Even if all bios in the original request are completed by that
  completion, the original request must not be completed yet to keep
  the ordering of request completion for the stacking.
  So end_clone_bio() uses blk_update_request() instead of
  blk_end_request().
  In error cases, end_clone_bio() doesn't complete the original bio.
  It just frees the cloned bio and gives over the error handling to
  end_clone_request().

  end_clone_request(), which is called with queue lock held, completes
  the clone request and the original request in a softirq context
  (dm_softirq_done()), which has no queue lock, to avoid a deadlock
  issue on submission of another request during the completion:
      - The submitted request may be mapped to the same device
      - Request submission requires queue lock, but the queue lock
        has been held by itself and it doesn't know that

  The clone request has no clone bio when dm_softirq_done() is called.
  So target drivers can't resubmit it again even error cases.
  Instead, they can ask dm core for requeueing and remapping
  the original request in that cases.

  suspend
  =======
  Request-based dm uses stopping md->queue as suspend of the md.
  For noflush suspend, just stops md->queue.

  For flush suspend, inserts a marker request to the tail of md->queue.
  And dispatches all requests in md->queue until the marker comes to
  the front of md->queue.  Then, stops dispatching request and waits
  for the all dispatched requests to complete.
  After that, completes the marker request, stops md->queue and
  wake up the waiter on the suspend queue, md->wait.

  resume
  ======
  Starts md->queue.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c         |  14 +
 drivers/md/dm.c               | 705 +++++++++++++++++++++++++++++++++++++++++-
 drivers/md/dm.h               |   1 +
 include/linux/device-mapper.h |   9 +
 4 files changed, 725 insertions(+), 4 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 09a57113955e..c5f784419f23 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1080,6 +1080,20 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits)
 	return r;
 }
 
+int dm_table_any_busy_target(struct dm_table *t)
+{
+	unsigned i;
+	struct dm_target *ti;
+
+	for (i = 0; i < t->num_targets; i++) {
+		ti = t->targets + i;
+		if (ti->type->busy && ti->type->busy(ti))
+			return 1;
+	}
+
+	return 0;
+}
+
 void dm_table_unplug_all(struct dm_table *t)
 {
 	struct dm_dev_internal *dd;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index f609793a92d0..be003e5fea3d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -78,7 +78,7 @@ struct dm_rq_target_io {
  */
 struct dm_rq_clone_bio_info {
 	struct bio *orig;
-	struct request *rq;
+	struct dm_rq_target_io *tio;
 };
 
 union map_info *dm_get_mapinfo(struct bio *bio)
@@ -88,6 +88,14 @@ union map_info *dm_get_mapinfo(struct bio *bio)
 	return NULL;
 }
 
+union map_info *dm_get_rq_mapinfo(struct request *rq)
+{
+	if (rq && rq->end_io_data)
+		return &((struct dm_rq_target_io *)rq->end_io_data)->info;
+	return NULL;
+}
+EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
+
 #define MINOR_ALLOCED ((void *)-1)
 
 /*
@@ -169,6 +177,12 @@ struct mapped_device {
 	/* forced geometry settings */
 	struct hd_geometry geometry;
 
+	/* marker of flush suspend for request-based dm */
+	struct request suspend_rq;
+
+	/* For saving the address of __make_request for request based dm */
+	make_request_fn *saved_make_request_fn;
+
 	/* sysfs handle */
 	struct kobject kobj;
 
@@ -406,6 +420,26 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
 	mempool_free(tio, md->tio_pool);
 }
 
+static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md)
+{
+	return mempool_alloc(md->tio_pool, GFP_ATOMIC);
+}
+
+static void free_rq_tio(struct dm_rq_target_io *tio)
+{
+	mempool_free(tio, tio->md->tio_pool);
+}
+
+static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
+{
+	return mempool_alloc(md->io_pool, GFP_ATOMIC);
+}
+
+static void free_bio_info(struct dm_rq_clone_bio_info *info)
+{
+	mempool_free(info, info->tio->md->io_pool);
+}
+
 static void start_io_acct(struct dm_io *io)
 {
 	struct mapped_device *md = io->md;
@@ -615,6 +649,262 @@ static void clone_endio(struct bio *bio, int error)
 	dec_pending(io, error);
 }
 
+/*
+ * Partial completion handling for request-based dm
+ */
+static void end_clone_bio(struct bio *clone, int error)
+{
+	struct dm_rq_clone_bio_info *info = clone->bi_private;
+	struct dm_rq_target_io *tio = info->tio;
+	struct bio *bio = info->orig;
+	unsigned int nr_bytes = info->orig->bi_size;
+
+	bio_put(clone);
+
+	if (tio->error)
+		/*
+		 * An error has already been detected on the request.
+		 * Once error occurred, just let clone->end_io() handle
+		 * the remainder.
+		 */
+		return;
+	else if (error) {
+		/*
+		 * Don't notice the error to the upper layer yet.
+		 * The error handling decision is made by the target driver,
+		 * when the request is completed.
+		 */
+		tio->error = error;
+		return;
+	}
+
+	/*
+	 * I/O for the bio successfully completed.
+	 * Notice the data completion to the upper layer.
+	 */
+
+	/*
+	 * bios are processed from the head of the list.
+	 * So the completing bio should always be rq->bio.
+	 * If it's not, something wrong is happening.
+	 */
+	if (tio->orig->bio != bio)
+		DMERR("bio completion is going in the middle of the request");
+
+	/*
+	 * Update the original request.
+	 * Do not use blk_end_request() here, because it may complete
+	 * the original request before the clone, and break the ordering.
+	 */
+	blk_update_request(tio->orig, 0, nr_bytes);
+}
+
+/*
+ * Don't touch any member of the md after calling this function because
+ * the md may be freed in dm_put() at the end of this function.
+ * Or do dm_get() before calling this function and dm_put() later.
+ */
+static void rq_completed(struct mapped_device *md, int run_queue)
+{
+	int wakeup_waiters = 0;
+	struct request_queue *q = md->queue;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!queue_in_flight(q))
+		wakeup_waiters = 1;
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	/* nudge anyone waiting on suspend queue */
+	if (wakeup_waiters)
+		wake_up(&md->wait);
+
+	if (run_queue)
+		blk_run_queue(q);
+
+	/*
+	 * dm_put() must be at the end of this function. See the comment above
+	 */
+	dm_put(md);
+}
+
+static void dm_unprep_request(struct request *rq)
+{
+	struct request *clone = rq->special;
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	rq->special = NULL;
+	rq->cmd_flags &= ~REQ_DONTPREP;
+
+	blk_rq_unprep_clone(clone);
+	free_rq_tio(tio);
+}
+
+/*
+ * Requeue the original request of a clone.
+ */
+void dm_requeue_unmapped_request(struct request *clone)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
+	struct request_queue *q = rq->q;
+	unsigned long flags;
+
+	dm_unprep_request(rq);
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (elv_queue_empty(q))
+		blk_plug_device(q);
+	blk_requeue_request(q, rq);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	rq_completed(md, 0);
+}
+EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
+
+static void __stop_queue(struct request_queue *q)
+{
+	blk_stop_queue(q);
+}
+
+static void stop_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	__stop_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void __start_queue(struct request_queue *q)
+{
+	if (blk_queue_stopped(q))
+		blk_start_queue(q);
+}
+
+static void start_queue(struct request_queue *q)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	__start_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+/*
+ * Complete the clone and the original request.
+ * Must be called without queue lock.
+ */
+static void dm_end_request(struct request *clone, int error)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct mapped_device *md = tio->md;
+	struct request *rq = tio->orig;
+
+	if (blk_pc_request(rq)) {
+		rq->errors = clone->errors;
+		rq->resid_len = clone->resid_len;
+
+		if (rq->sense)
+			/*
+			 * We are using the sense buffer of the original
+			 * request.
+			 * So setting the length of the sense data is enough.
+			 */
+			rq->sense_len = clone->sense_len;
+	}
+
+	BUG_ON(clone->bio);
+	free_rq_tio(tio);
+
+	blk_end_request_all(rq, error);
+
+	rq_completed(md, 1);
+}
+
+/*
+ * Request completion handler for request-based dm
+ */
+static void dm_softirq_done(struct request *rq)
+{
+	struct request *clone = rq->completion_data;
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
+	int error = tio->error;
+
+	if (!(rq->cmd_flags & REQ_FAILED) && rq_end_io)
+		error = rq_end_io(tio->ti, clone, error, &tio->info);
+
+	if (error <= 0)
+		/* The target wants to complete the I/O */
+		dm_end_request(clone, error);
+	else if (error == DM_ENDIO_INCOMPLETE)
+		/* The target will handle the I/O */
+		return;
+	else if (error == DM_ENDIO_REQUEUE)
+		/* The target wants to requeue the I/O */
+		dm_requeue_unmapped_request(clone);
+	else {
+		DMWARN("unimplemented target endio return value: %d", error);
+		BUG();
+	}
+}
+
+/*
+ * Complete the clone and the original request with the error status
+ * through softirq context.
+ */
+static void dm_complete_request(struct request *clone, int error)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct request *rq = tio->orig;
+
+	tio->error = error;
+	rq->completion_data = clone;
+	blk_complete_request(rq);
+}
+
+/*
+ * Complete the not-mapped clone and the original request with the error status
+ * through softirq context.
+ * Target's rq_end_io() function isn't called.
+ * This may be used when the target's map_rq() function fails.
+ */
+void dm_kill_unmapped_request(struct request *clone, int error)
+{
+	struct dm_rq_target_io *tio = clone->end_io_data;
+	struct request *rq = tio->orig;
+
+	rq->cmd_flags |= REQ_FAILED;
+	dm_complete_request(clone, error);
+}
+EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
+
+/*
+ * Called with the queue lock held
+ */
+static void end_clone_request(struct request *clone, int error)
+{
+	/*
+	 * For just cleaning up the information of the queue in which
+	 * the clone was dispatched.
+	 * The clone is *NOT* freed actually here because it is alloced from
+	 * dm own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
+	 */
+	__blk_put_request(clone->q, clone);
+
+	/*
+	 * Actual request completion is done in a softirq context which doesn't
+	 * hold the queue lock.  Otherwise, deadlock could occur because:
+	 *     - another request may be submitted by the upper level driver
+	 *       of the stacking during the completion
+	 *     - the submission which requires queue lock may be done
+	 *       against this queue
+	 */
+	dm_complete_request(clone, error);
+}
+
 static sector_t max_io_len(struct mapped_device *md,
 			   sector_t sector, struct dm_target *ti)
 {
@@ -998,7 +1288,7 @@ out:
  * The request function that just remaps the bio built up by
  * dm_merge_bvec.
  */
-static int dm_request(struct request_queue *q, struct bio *bio)
+static int _dm_request(struct request_queue *q, struct bio *bio)
 {
 	int rw = bio_data_dir(bio);
 	struct mapped_device *md = q->queuedata;
@@ -1035,12 +1325,274 @@ static int dm_request(struct request_queue *q, struct bio *bio)
 	return 0;
 }
 
+static int dm_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct mapped_device *md = q->queuedata;
+
+	if (unlikely(bio_barrier(bio))) {
+		bio_endio(bio, -EOPNOTSUPP);
+		return 0;
+	}
+
+	return md->saved_make_request_fn(q, bio); /* call __make_request() */
+}
+
+static int dm_request_based(struct mapped_device *md)
+{
+	return blk_queue_stackable(md->queue);
+}
+
+static int dm_request(struct request_queue *q, struct bio *bio)
+{
+	struct mapped_device *md = q->queuedata;
+
+	if (dm_request_based(md))
+		return dm_make_request(q, bio);
+
+	return _dm_request(q, bio);
+}
+
+void dm_dispatch_request(struct request *rq)
+{
+	int r;
+
+	if (blk_queue_io_stat(rq->q))
+		rq->cmd_flags |= REQ_IO_STAT;
+
+	rq->start_time = jiffies;
+	r = blk_insert_cloned_request(rq->q, rq);
+	if (r)
+		dm_complete_request(rq, r);
+}
+EXPORT_SYMBOL_GPL(dm_dispatch_request);
+
+static void dm_rq_bio_destructor(struct bio *bio)
+{
+	struct dm_rq_clone_bio_info *info = bio->bi_private;
+	struct mapped_device *md = info->tio->md;
+
+	free_bio_info(info);
+	bio_free(bio, md->bs);
+}
+
+static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
+				 void *data)
+{
+	struct dm_rq_target_io *tio = data;
+	struct mapped_device *md = tio->md;
+	struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
+
+	if (!info)
+		return -ENOMEM;
+
+	info->orig = bio_orig;
+	info->tio = tio;
+	bio->bi_end_io = end_clone_bio;
+	bio->bi_private = info;
+	bio->bi_destructor = dm_rq_bio_destructor;
+
+	return 0;
+}
+
+static int setup_clone(struct request *clone, struct request *rq,
+		       struct dm_rq_target_io *tio)
+{
+	int r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
+				  dm_rq_bio_constructor, tio);
+
+	if (r)
+		return r;
+
+	clone->cmd = rq->cmd;
+	clone->cmd_len = rq->cmd_len;
+	clone->sense = rq->sense;
+	clone->buffer = rq->buffer;
+	clone->end_io = end_clone_request;
+	clone->end_io_data = tio;
+
+	return 0;
+}
+
+static int dm_rq_flush_suspending(struct mapped_device *md)
+{
+	return !md->suspend_rq.special;
+}
+
+/*
+ * Called with the queue lock held.
+ */
+static int dm_prep_fn(struct request_queue *q, struct request *rq)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_rq_target_io *tio;
+	struct request *clone;
+
+	if (unlikely(rq == &md->suspend_rq)) {
+		if (dm_rq_flush_suspending(md))
+			return BLKPREP_OK;
+		else
+			/* The flush suspend was interrupted */
+			return BLKPREP_KILL;
+	}
+
+	if (unlikely(rq->special)) {
+		DMWARN("Already has something in rq->special.");
+		return BLKPREP_KILL;
+	}
+
+	tio = alloc_rq_tio(md); /* Only one for each original request */
+	if (!tio)
+		/* -ENOMEM */
+		return BLKPREP_DEFER;
+
+	tio->md = md;
+	tio->ti = NULL;
+	tio->orig = rq;
+	tio->error = 0;
+	memset(&tio->info, 0, sizeof(tio->info));
+
+	clone = &tio->clone;
+	if (setup_clone(clone, rq, tio)) {
+		/* -ENOMEM */
+		free_rq_tio(tio);
+		return BLKPREP_DEFER;
+	}
+
+	rq->special = clone;
+	rq->cmd_flags |= REQ_DONTPREP;
+
+	return BLKPREP_OK;
+}
+
+static void map_request(struct dm_target *ti, struct request *rq,
+			struct mapped_device *md)
+{
+	int r;
+	struct request *clone = rq->special;
+	struct dm_rq_target_io *tio = clone->end_io_data;
+
+	/*
+	 * Hold the md reference here for the in-flight I/O.
+	 * We can't rely on the reference count by device opener,
+	 * because the device may be closed during the request completion
+	 * when all bios are completed.
+	 * See the comment in rq_completed() too.
+	 */
+	dm_get(md);
+
+	tio->ti = ti;
+	r = ti->type->map_rq(ti, clone, &tio->info);
+	switch (r) {
+	case DM_MAPIO_SUBMITTED:
+		/* The target has taken the I/O to submit by itself later */
+		break;
+	case DM_MAPIO_REMAPPED:
+		/* The target has remapped the I/O so dispatch it */
+		dm_dispatch_request(clone);
+		break;
+	case DM_MAPIO_REQUEUE:
+		/* The target wants to requeue the I/O */
+		dm_requeue_unmapped_request(clone);
+		break;
+	default:
+		if (r > 0) {
+			DMWARN("unimplemented target map return value: %d", r);
+			BUG();
+		}
+
+		/* The target wants to complete the I/O */
+		dm_kill_unmapped_request(clone, r);
+		break;
+	}
+}
+
+/*
+ * q->request_fn for request-based dm.
+ * Called with the queue lock held.
+ */
+static void dm_request_fn(struct request_queue *q)
+{
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+	struct dm_target *ti;
+	struct request *rq;
+
+	/*
+	 * For noflush suspend, check blk_queue_stopped() to immediately
+	 * quit I/O dispatching.
+	 */
+	while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
+		rq = blk_peek_request(q);
+		if (!rq)
+			goto plug_and_out;
+
+		if (unlikely(rq == &md->suspend_rq)) { /* Flush suspend maker */
+			if (queue_in_flight(q))
+				/* Not quiet yet.  Wait more */
+				goto plug_and_out;
+
+			/* This device should be quiet now */
+			__stop_queue(q);
+			blk_start_request(rq);
+			__blk_end_request_all(rq, 0);
+			wake_up(&md->wait);
+			goto out;
+		}
+
+		ti = dm_table_find_target(map, blk_rq_pos(rq));
+		if (ti->type->busy && ti->type->busy(ti))
+			goto plug_and_out;
+
+		blk_start_request(rq);
+		spin_unlock(q->queue_lock);
+		map_request(ti, rq, md);
+		spin_lock_irq(q->queue_lock);
+	}
+
+	goto out;
+
+plug_and_out:
+	if (!elv_queue_empty(q))
+		/* Some requests still remain, retry later */
+		blk_plug_device(q);
+
+out:
+	dm_table_put(map);
+
+	return;
+}
+
+int dm_underlying_device_busy(struct request_queue *q)
+{
+	return blk_lld_busy(q);
+}
+EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
+
+static int dm_lld_busy(struct request_queue *q)
+{
+	int r;
+	struct mapped_device *md = q->queuedata;
+	struct dm_table *map = dm_get_table(md);
+
+	if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
+		r = 1;
+	else
+		r = dm_table_any_busy_target(map);
+
+	dm_table_put(map);
+
+	return r;
+}
+
 static void dm_unplug_all(struct request_queue *q)
 {
 	struct mapped_device *md = q->queuedata;
 	struct dm_table *map = dm_get_table(md);
 
 	if (map) {
+		if (dm_request_based(md))
+			generic_unplug_device(q);
+
 		dm_table_unplug_all(map);
 		dm_table_put(map);
 	}
@@ -1055,7 +1607,16 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
 	if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
 		map = dm_get_table(md);
 		if (map) {
-			r = dm_table_any_congested(map, bdi_bits);
+			/*
+			 * Request-based dm cares about only own queue for
+			 * the query about congestion status of request_queue
+			 */
+			if (dm_request_based(md))
+				r = md->queue->backing_dev_info.state &
+				    bdi_bits;
+			else
+				r = dm_table_any_congested(map, bdi_bits);
+
 			dm_table_put(map);
 		}
 	}
@@ -1458,6 +2019,8 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 {
 	int r = 0;
 	DECLARE_WAITQUEUE(wait, current);
+	struct request_queue *q = md->queue;
+	unsigned long flags;
 
 	dm_unplug_all(md->queue);
 
@@ -1467,7 +2030,14 @@ static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
 		set_current_state(interruptible);
 
 		smp_mb();
-		if (!atomic_read(&md->pending))
+		if (dm_request_based(md)) {
+			spin_lock_irqsave(q->queue_lock, flags);
+			if (!queue_in_flight(q) && blk_queue_stopped(q)) {
+				spin_unlock_irqrestore(q->queue_lock, flags);
+				break;
+			}
+			spin_unlock_irqrestore(q->queue_lock, flags);
+		} else if (!atomic_read(&md->pending))
 			break;
 
 		if (interruptible == TASK_INTERRUPTIBLE &&
@@ -1584,6 +2154,67 @@ out:
 	return r;
 }
 
+static void dm_rq_invalidate_suspend_marker(struct mapped_device *md)
+{
+	md->suspend_rq.special = (void *)0x1;
+}
+
+static void dm_rq_abort_suspend(struct mapped_device *md, int noflush)
+{
+	struct request_queue *q = md->queue;
+	unsigned long flags;
+
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (!noflush)
+		dm_rq_invalidate_suspend_marker(md);
+	__start_queue(q);
+	spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
+static void dm_rq_start_suspend(struct mapped_device *md, int noflush)
+{
+	struct request *rq = &md->suspend_rq;
+	struct request_queue *q = md->queue;
+
+	if (noflush)
+		stop_queue(q);
+	else {
+		blk_rq_init(q, rq);
+		blk_insert_request(q, rq, 0, NULL);
+	}
+}
+
+static int dm_rq_suspend_available(struct mapped_device *md, int noflush)
+{
+	int r = 1;
+	struct request *rq = &md->suspend_rq;
+	struct request_queue *q = md->queue;
+	unsigned long flags;
+
+	if (noflush)
+		return r;
+
+	/* The marker must be protected by queue lock if it is in use */
+	spin_lock_irqsave(q->queue_lock, flags);
+	if (unlikely(rq->ref_count)) {
+		/*
+		 * This can happen, when the previous flush suspend was
+		 * interrupted, the marker is still in the queue and
+		 * this flush suspend has been invoked, because we don't
+		 * remove the marker at the time of suspend interruption.
+		 * We have only one marker per mapped_device, so we can't
+		 * start another flush suspend while it is in use.
+		 */
+		BUG_ON(!rq->special); /* The marker should be invalidated */
+		DMWARN("Invalidating the previous flush suspend is still in"
+		       " progress.  Please retry later.");
+		r = 0;
+	}
+	spin_unlock_irqrestore(q->queue_lock, flags);
+
+	return r;
+}
+
 /*
  * Functions to lock and unlock any filesystem running on the
  * device.
@@ -1623,6 +2254,53 @@ static void unlock_fs(struct mapped_device *md)
  * dm_bind_table, dm_suspend must be called to flush any in
  * flight bios and ensure that any further io gets deferred.
  */
+/*
+ * Suspend mechanism in request-based dm.
+ *
+ * After the suspend starts, further incoming requests are kept in
+ * the request_queue and deferred.
+ * Remaining requests in the request_queue at the start of suspend are flushed
+ * if it is flush suspend.
+ * The suspend completes when the following conditions have been satisfied,
+ * so wait for it:
+ *    1. q->in_flight is 0 (which means no in_flight request)
+ *    2. queue has been stopped (which means no request dispatching)
+ *
+ *
+ * Noflush suspend
+ * ---------------
+ * Noflush suspend doesn't need to dispatch remaining requests.
+ * So stop the queue immediately.  Then, wait for all in_flight requests
+ * to be completed or requeued.
+ *
+ * To abort noflush suspend, start the queue.
+ *
+ *
+ * Flush suspend
+ * -------------
+ * Flush suspend needs to dispatch remaining requests.  So stop the queue
+ * after the remaining requests are completed. (Requeued request must be also
+ * re-dispatched and completed.  Until then, we can't stop the queue.)
+ *
+ * During flushing the remaining requests, further incoming requests are also
+ * inserted to the same queue.  To distinguish which requests are to be
+ * flushed, we insert a marker request to the queue at the time of starting
+ * flush suspend, like a barrier.
+ * The dispatching is blocked when the marker is found on the top of the queue.
+ * And the queue is stopped when all in_flight requests are completed, since
+ * that means the remaining requests are completely flushed.
+ * Then, the marker is removed from the queue.
+ *
+ * To abort flush suspend, we also need to take care of the marker, not only
+ * starting the queue.
+ * We don't remove the marker forcibly from the queue since it's against
+ * the block-layer manner.  Instead, we put a invalidated mark on the marker.
+ * When the invalidated marker is found on the top of the queue, it is
+ * immediately removed from the queue, so it doesn't block dispatching.
+ * Because we have only one marker per mapped_device, we can't start another
+ * flush suspend until the invalidated marker is removed from the queue.
+ * So fail and return with -EBUSY in such a case.
+ */
 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 {
 	struct dm_table *map = NULL;
@@ -1637,6 +2315,11 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 		goto out_unlock;
 	}
 
+	if (dm_request_based(md) && !dm_rq_suspend_available(md, noflush)) {
+		r = -EBUSY;
+		goto out_unlock;
+	}
+
 	map = dm_get_table(md);
 
 	/*
@@ -1682,6 +2365,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 
 	flush_workqueue(md->wq);
 
+	if (dm_request_based(md))
+		dm_rq_start_suspend(md, noflush);
+
 	/*
 	 * At this point no more requests are entering target request routines.
 	 * We call dm_wait_for_completion to wait for all existing requests
@@ -1698,6 +2384,9 @@ int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
 	if (r < 0) {
 		dm_queue_flush(md);
 
+		if (dm_request_based(md))
+			dm_rq_abort_suspend(md, noflush);
+
 		unlock_fs(md);
 		goto out; /* pushback list is already flushed, so skip flush */
 	}
@@ -1739,6 +2428,14 @@ int dm_resume(struct mapped_device *md)
 
 	dm_queue_flush(md);
 
+	/*
+	 * Flushing deferred I/Os must be done after targets are resumed
+	 * so that mapping of targets can work correctly.
+	 * Request-based dm is queueing the deferred I/Os in its request_queue.
+	 */
+	if (dm_request_based(md))
+		start_queue(md->queue);
+
 	unlock_fs(md);
 
 	clear_bit(DMF_SUSPENDED, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 604e85caadf6..8dcabb1caff1 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -50,6 +50,7 @@ void dm_table_presuspend_targets(struct dm_table *t);
 void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
+int dm_table_any_busy_target(struct dm_table *t);
 
 /*
  * To check the return value from dm_table_find_target().
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index e6bf3b8c7bf2..0d6310657f32 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -234,6 +234,7 @@ struct gendisk *dm_disk(struct mapped_device *md);
 int dm_suspended(struct mapped_device *md);
 int dm_noflush_suspending(struct dm_target *ti);
 union map_info *dm_get_mapinfo(struct bio *bio);
+union map_info *dm_get_rq_mapinfo(struct request *rq);
 
 /*
  * Geometry functions.
@@ -396,4 +397,12 @@ static inline unsigned long to_bytes(sector_t n)
 	return (n << SECTOR_SHIFT);
 }
 
+/*-----------------------------------------------------------------
+ * Helper for block layer and dm core operations
+ *---------------------------------------------------------------*/
+void dm_dispatch_request(struct request *rq);
+void dm_requeue_unmapped_request(struct request *rq);
+void dm_kill_unmapped_request(struct request *rq, int error);
+int dm_underlying_device_busy(struct request_queue *q);
+
 #endif	/* _LINUX_DEVICE_MAPPER_H */
-- 
cgit v1.2.3


From e6ee8c0b767540f59e20da3ced282601db8aa502 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Mon, 22 Jun 2009 10:12:36 +0100
Subject: dm: enable request based option

This patch enables request-based dm.

o Request-based dm and bio-based dm coexist, since there are
  some target drivers which are more fitting to bio-based dm.
  Also, there are other bio-based devices in the kernel
  (e.g. md, loop).
  Since bio-based device can't receive struct request,
  there are some limitations on device stacking between
  bio-based and request-based.

                     type of underlying device
                   bio-based      request-based
   ----------------------------------------------
    bio-based         OK                OK
    request-based     --                OK

  The device type is recognized by the queue flag in the kernel,
  so dm follows that.

o The type of a dm device is decided at the first table binding time.
  Once the type of a dm device is decided, the type can't be changed.

o Mempool allocations are deferred to at the table loading time, since
  mempools for request-based dm are different from those for bio-based
  dm and needed mempool type is fixed by the type of table.

o Currently, request-based dm supports only tables that have a single
  target.  To support multiple targets, we need to support request
  splitting or prevent bio/request from spanning multiple targets.
  The former needs lots of changes in the block layer, and the latter
  needs that all target drivers support merge() function.
  Both will take a time.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-ioctl.c |  13 ++++
 drivers/md/dm-table.c | 111 ++++++++++++++++++++++++++++++++++
 drivers/md/dm.c       | 162 ++++++++++++++++++++++++++++++++++++++++++--------
 drivers/md/dm.h       |  25 ++++++++
 4 files changed, 285 insertions(+), 26 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 1c871736f48c..7f77f18fcafa 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1050,6 +1050,12 @@ static int populate_table(struct dm_table *table,
 		next = spec->next;
 	}
 
+	r = dm_table_set_type(table);
+	if (r) {
+		DMWARN("unable to set table type");
+		return r;
+	}
+
 	return dm_table_complete(table);
 }
 
@@ -1095,6 +1101,13 @@ static int table_load(struct dm_ioctl *param, size_t param_size)
 		goto out;
 	}
 
+	r = dm_table_alloc_md_mempools(t);
+	if (r) {
+		DMWARN("unable to allocate mempools for this table");
+		dm_table_destroy(t);
+		goto out;
+	}
+
 	down_write(&_hash_lock);
 	hc = dm_get_mdptr(md);
 	if (!hc || hc->md != md) {
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index c5f784419f23..aaeb82ed2852 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -41,6 +41,7 @@
 struct dm_table {
 	struct mapped_device *md;
 	atomic_t holders;
+	unsigned type;
 
 	/* btree table */
 	unsigned int depth;
@@ -65,6 +66,8 @@ struct dm_table {
 	/* events get handed up using this callback */
 	void (*event_fn)(void *);
 	void *event_context;
+
+	struct dm_md_mempools *mempools;
 };
 
 /*
@@ -258,6 +261,8 @@ void dm_table_destroy(struct dm_table *t)
 	if (t->devices.next != &t->devices)
 		free_devices(&t->devices);
 
+	dm_free_md_mempools(t->mempools);
+
 	kfree(t);
 }
 
@@ -764,6 +769,99 @@ int dm_table_add_target(struct dm_table *t, const char *type,
 	return r;
 }
 
+int dm_table_set_type(struct dm_table *t)
+{
+	unsigned i;
+	unsigned bio_based = 0, request_based = 0;
+	struct dm_target *tgt;
+	struct dm_dev_internal *dd;
+	struct list_head *devices;
+
+	for (i = 0; i < t->num_targets; i++) {
+		tgt = t->targets + i;
+		if (dm_target_request_based(tgt))
+			request_based = 1;
+		else
+			bio_based = 1;
+
+		if (bio_based && request_based) {
+			DMWARN("Inconsistent table: different target types"
+			       " can't be mixed up");
+			return -EINVAL;
+		}
+	}
+
+	if (bio_based) {
+		/* We must use this table as bio-based */
+		t->type = DM_TYPE_BIO_BASED;
+		return 0;
+	}
+
+	BUG_ON(!request_based); /* No targets in this table */
+
+	/* Non-request-stackable devices can't be used for request-based dm */
+	devices = dm_table_get_devices(t);
+	list_for_each_entry(dd, devices, list) {
+		if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) {
+			DMWARN("table load rejected: including"
+			       " non-request-stackable devices");
+			return -EINVAL;
+		}
+	}
+
+	/*
+	 * Request-based dm supports only tables that have a single target now.
+	 * To support multiple targets, request splitting support is needed,
+	 * and that needs lots of changes in the block-layer.
+	 * (e.g. request completion process for partial completion.)
+	 */
+	if (t->num_targets > 1) {
+		DMWARN("Request-based dm doesn't support multiple targets yet");
+		return -EINVAL;
+	}
+
+	t->type = DM_TYPE_REQUEST_BASED;
+
+	return 0;
+}
+
+unsigned dm_table_get_type(struct dm_table *t)
+{
+	return t->type;
+}
+
+bool dm_table_request_based(struct dm_table *t)
+{
+	return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
+}
+
+int dm_table_alloc_md_mempools(struct dm_table *t)
+{
+	unsigned type = dm_table_get_type(t);
+
+	if (unlikely(type == DM_TYPE_NONE)) {
+		DMWARN("no table type is set, can't allocate mempools");
+		return -EINVAL;
+	}
+
+	t->mempools = dm_alloc_md_mempools(type);
+	if (!t->mempools)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void dm_table_free_md_mempools(struct dm_table *t)
+{
+	dm_free_md_mempools(t->mempools);
+	t->mempools = NULL;
+}
+
+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
+{
+	return t->mempools;
+}
+
 static int setup_indexes(struct dm_table *t)
 {
 	int i;
@@ -985,6 +1083,19 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
 		queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
 
 	dm_table_set_integrity(t);
+
+	/*
+	 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
+	 * visible to other CPUs because, once the flag is set, incoming bios
+	 * are processed by request-based dm, which refers to the queue
+	 * settings.
+	 * Until the flag set, bios are passed to bio-based dm and queued to
+	 * md->deferred where queue settings are not needed yet.
+	 * Those bios are passed to request-based dm at the resume time.
+	 */
+	smp_mb();
+	if (dm_table_request_based(t))
+		queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
 }
 
 unsigned int dm_table_get_num_targets(struct dm_table *t)
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index be003e5fea3d..5a843c1f4d64 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -190,6 +190,15 @@ struct mapped_device {
 	struct bio barrier_bio;
 };
 
+/*
+ * For mempools pre-allocation at the table loading time.
+ */
+struct dm_md_mempools {
+	mempool_t *io_pool;
+	mempool_t *tio_pool;
+	struct bio_set *bs;
+};
+
 #define MIN_IOS 256
 static struct kmem_cache *_io_cache;
 static struct kmem_cache *_tio_cache;
@@ -1739,10 +1748,22 @@ static struct mapped_device *alloc_dev(int minor)
 	INIT_LIST_HEAD(&md->uevent_list);
 	spin_lock_init(&md->uevent_lock);
 
-	md->queue = blk_alloc_queue(GFP_KERNEL);
+	md->queue = blk_init_queue(dm_request_fn, NULL);
 	if (!md->queue)
 		goto bad_queue;
 
+	/*
+	 * Request-based dm devices cannot be stacked on top of bio-based dm
+	 * devices.  The type of this dm device has not been decided yet,
+	 * although we initialized the queue using blk_init_queue().
+	 * The type is decided at the first table loading time.
+	 * To prevent problematic device stacking, clear the queue flag
+	 * for request stacking support until then.
+	 *
+	 * This queue is new, so no concurrency on the queue_flags.
+	 */
+	queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
+	md->saved_make_request_fn = md->queue->make_request_fn;
 	md->queue->queuedata = md;
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
@@ -1751,18 +1772,9 @@ static struct mapped_device *alloc_dev(int minor)
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
-
-	md->io_pool = mempool_create_slab_pool(MIN_IOS, _io_cache);
-	if (!md->io_pool)
-		goto bad_io_pool;
-
-	md->tio_pool = mempool_create_slab_pool(MIN_IOS, _tio_cache);
-	if (!md->tio_pool)
-		goto bad_tio_pool;
-
-	md->bs = bioset_create(16, 0);
-	if (!md->bs)
-		goto bad_no_bioset;
+	blk_queue_softirq_done(md->queue, dm_softirq_done);
+	blk_queue_prep_rq(md->queue, dm_prep_fn);
+	blk_queue_lld_busy(md->queue, dm_lld_busy);
 
 	md->disk = alloc_disk(1);
 	if (!md->disk)
@@ -1804,12 +1816,6 @@ bad_bdev:
 bad_thread:
 	put_disk(md->disk);
 bad_disk:
-	bioset_free(md->bs);
-bad_no_bioset:
-	mempool_destroy(md->tio_pool);
-bad_tio_pool:
-	mempool_destroy(md->io_pool);
-bad_io_pool:
 	blk_cleanup_queue(md->queue);
 bad_queue:
 	free_minor(minor);
@@ -1829,9 +1835,12 @@ static void free_dev(struct mapped_device *md)
 	unlock_fs(md);
 	bdput(md->bdev);
 	destroy_workqueue(md->wq);
-	mempool_destroy(md->tio_pool);
-	mempool_destroy(md->io_pool);
-	bioset_free(md->bs);
+	if (md->tio_pool)
+		mempool_destroy(md->tio_pool);
+	if (md->io_pool)
+		mempool_destroy(md->io_pool);
+	if (md->bs)
+		bioset_free(md->bs);
 	blk_integrity_unregister(md->disk);
 	del_gendisk(md->disk);
 	free_minor(minor);
@@ -1846,6 +1855,29 @@ static void free_dev(struct mapped_device *md)
 	kfree(md);
 }
 
+static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
+{
+	struct dm_md_mempools *p;
+
+	if (md->io_pool && md->tio_pool && md->bs)
+		/* the md already has necessary mempools */
+		goto out;
+
+	p = dm_table_get_md_mempools(t);
+	BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
+
+	md->io_pool = p->io_pool;
+	p->io_pool = NULL;
+	md->tio_pool = p->tio_pool;
+	p->tio_pool = NULL;
+	md->bs = p->bs;
+	p->bs = NULL;
+
+out:
+	/* mempool bind completed, now no need any mempools in the table */
+	dm_table_free_md_mempools(t);
+}
+
 /*
  * Bind a table to the device.
  */
@@ -1897,6 +1929,18 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
 
 	dm_table_event_callback(t, event_callback, md);
 
+	/*
+	 * The queue hasn't been stopped yet, if the old table type wasn't
+	 * for request-based during suspension.  So stop it to prevent
+	 * I/O mapping before resume.
+	 * This must be done before setting the queue restrictions,
+	 * because request-based dm may be run just after the setting.
+	 */
+	if (dm_table_request_based(t) && !blk_queue_stopped(q))
+		stop_queue(q);
+
+	__bind_mempools(md, t);
+
 	write_lock(&md->map_lock);
 	md->map = t;
 	dm_table_set_restrictions(t, q, limits);
@@ -2110,10 +2154,14 @@ static void dm_wq_work(struct work_struct *work)
 
 		up_write(&md->io_lock);
 
-		if (bio_barrier(c))
-			process_barrier(md, c);
-		else
-			__split_and_process_bio(md, c);
+		if (dm_request_based(md))
+			generic_make_request(c);
+		else {
+			if (bio_barrier(c))
+				process_barrier(md, c);
+			else
+				__split_and_process_bio(md, c);
+		}
 
 		down_write(&md->io_lock);
 	}
@@ -2146,6 +2194,13 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 	if (r)
 		goto out;
 
+	/* cannot change the device type, once a table is bound */
+	if (md->map &&
+	    (dm_table_get_type(md->map) != dm_table_get_type(table))) {
+		DMWARN("can't change the device type after a table is bound");
+		goto out;
+	}
+
 	__unbind(md);
 	r = __bind(md, table, &limits);
 
@@ -2542,6 +2597,61 @@ int dm_noflush_suspending(struct dm_target *ti)
 }
 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
 
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
+{
+	struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
+
+	if (!pools)
+		return NULL;
+
+	pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
+			 mempool_create_slab_pool(MIN_IOS, _io_cache) :
+			 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
+	if (!pools->io_pool)
+		goto free_pools_and_out;
+
+	pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
+			  mempool_create_slab_pool(MIN_IOS, _tio_cache) :
+			  mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
+	if (!pools->tio_pool)
+		goto free_io_pool_and_out;
+
+	pools->bs = (type == DM_TYPE_BIO_BASED) ?
+		    bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
+	if (!pools->bs)
+		goto free_tio_pool_and_out;
+
+	return pools;
+
+free_tio_pool_and_out:
+	mempool_destroy(pools->tio_pool);
+
+free_io_pool_and_out:
+	mempool_destroy(pools->io_pool);
+
+free_pools_and_out:
+	kfree(pools);
+
+	return NULL;
+}
+
+void dm_free_md_mempools(struct dm_md_mempools *pools)
+{
+	if (!pools)
+		return;
+
+	if (pools->io_pool)
+		mempool_destroy(pools->io_pool);
+
+	if (pools->tio_pool)
+		mempool_destroy(pools->tio_pool);
+
+	if (pools->bs)
+		bioset_free(pools->bs);
+
+	kfree(pools);
+}
+
 static struct block_device_operations dm_blk_dops = {
 	.open = dm_blk_open,
 	.release = dm_blk_close,
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index 8dcabb1caff1..a7663eba17e2 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -22,6 +22,13 @@
 #define DM_SUSPEND_LOCKFS_FLAG		(1 << 0)
 #define DM_SUSPEND_NOFLUSH_FLAG		(1 << 1)
 
+/*
+ * Type of table and mapped_device's mempool
+ */
+#define DM_TYPE_NONE		0
+#define DM_TYPE_BIO_BASED	1
+#define DM_TYPE_REQUEST_BASED	2
+
 /*
  * List of devices that a metadevice uses and should open/close.
  */
@@ -32,6 +39,7 @@ struct dm_dev_internal {
 };
 
 struct dm_table;
+struct dm_md_mempools;
 
 /*-----------------------------------------------------------------
  * Internal table functions.
@@ -51,12 +59,23 @@ void dm_table_postsuspend_targets(struct dm_table *t);
 int dm_table_resume_targets(struct dm_table *t);
 int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 int dm_table_any_busy_target(struct dm_table *t);
+int dm_table_set_type(struct dm_table *t);
+unsigned dm_table_get_type(struct dm_table *t);
+bool dm_table_request_based(struct dm_table *t);
+int dm_table_alloc_md_mempools(struct dm_table *t);
+void dm_table_free_md_mempools(struct dm_table *t);
+struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
 
 /*
  * To check the return value from dm_table_find_target().
  */
 #define dm_target_is_valid(t) ((t)->table)
 
+/*
+ * To check whether the target type is request-based or not (bio-based).
+ */
+#define dm_target_request_based(t) ((t)->type->map_rq != NULL)
+
 /*-----------------------------------------------------------------
  * A registry of target types.
  *---------------------------------------------------------------*/
@@ -102,4 +121,10 @@ void dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
 int dm_kcopyd_init(void);
 void dm_kcopyd_exit(void);
 
+/*
+ * Mempool operations
+ */
+struct dm_md_mempools *dm_alloc_md_mempools(unsigned type);
+void dm_free_md_mempools(struct dm_md_mempools *pools);
+
 #endif
-- 
cgit v1.2.3


From 5d67aa2366ccb8257d103d0b43df855605c3c086 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Mon, 22 Jun 2009 10:12:36 +0100
Subject: dm: do not set QUEUE_ORDERED_DRAIN if request based

Request-based dm doesn't have barrier support yet.
So we need to set QUEUE_ORDERED_DRAIN only for bio-based dm.
Since the device type is decided at the first table loading time,
the flag set is deferred until then.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-table.c |  5 +++++
 drivers/md/dm.c       | 11 ++++++++++-
 drivers/md/dm.h       |  1 +
 3 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index aaeb82ed2852..4899ebe767c8 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -830,6 +830,11 @@ unsigned dm_table_get_type(struct dm_table *t)
 	return t->type;
 }
 
+bool dm_table_bio_based(struct dm_table *t)
+{
+	return dm_table_get_type(t) == DM_TYPE_BIO_BASED;
+}
+
 bool dm_table_request_based(struct dm_table *t)
 {
 	return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5a843c1f4d64..00c768860818 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1768,7 +1768,6 @@ static struct mapped_device *alloc_dev(int minor)
 	md->queue->backing_dev_info.congested_fn = dm_any_congested;
 	md->queue->backing_dev_info.congested_data = md;
 	blk_queue_make_request(md->queue, dm_request);
-	blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
 	blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
 	md->queue->unplug_fn = dm_unplug_all;
 	blk_queue_merge_bvec(md->queue, dm_merge_bvec);
@@ -2201,6 +2200,16 @@ int dm_swap_table(struct mapped_device *md, struct dm_table *table)
 		goto out;
 	}
 
+	/*
+	 * It is enought that blk_queue_ordered() is called only once when
+	 * the first bio-based table is bound.
+	 *
+	 * This setting should be moved to alloc_dev() when request-based dm
+	 * supports barrier.
+	 */
+	if (!md->map && dm_table_bio_based(table))
+		blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN, NULL);
+
 	__unbind(md);
 	r = __bind(md, table, &limits);
 
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index a7663eba17e2..23278ae80f08 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -61,6 +61,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits);
 int dm_table_any_busy_target(struct dm_table *t);
 int dm_table_set_type(struct dm_table *t);
 unsigned dm_table_get_type(struct dm_table *t);
+bool dm_table_bio_based(struct dm_table *t);
 bool dm_table_request_based(struct dm_table *t);
 int dm_table_alloc_md_mempools(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
-- 
cgit v1.2.3


From 523d9297d43cce3fa6de6474b7674329e98743b1 Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Mon, 22 Jun 2009 10:12:37 +0100
Subject: dm: disable interrupt when taking map_lock

This patch disables interrupt when taking map_lock to avoid
lockdep warnings in request-based dm.

request-based dm takes map_lock after taking queue_lock with
disabling interrupt:
  spin_lock_irqsave(queue_lock)
  q->request_fn() == dm_request_fn()
    => dm_get_table()
         => read_lock(map_lock)
while queue_lock could be (but isn't) taken in interrupt context.

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Christof Schmitt <christof.schmitt@de.ibm.com>
Acked-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 00c768860818..3c6d4ee8921d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -512,12 +512,13 @@ static void queue_io(struct mapped_device *md, struct bio *bio)
 struct dm_table *dm_get_table(struct mapped_device *md)
 {
 	struct dm_table *t;
+	unsigned long flags;
 
-	read_lock(&md->map_lock);
+	read_lock_irqsave(&md->map_lock, flags);
 	t = md->map;
 	if (t)
 		dm_table_get(t);
-	read_unlock(&md->map_lock);
+	read_unlock_irqrestore(&md->map_lock, flags);
 
 	return t;
 }
@@ -1910,6 +1911,7 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
 {
 	struct request_queue *q = md->queue;
 	sector_t size;
+	unsigned long flags;
 
 	size = dm_table_get_size(t);
 
@@ -1940,10 +1942,10 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
 
 	__bind_mempools(md, t);
 
-	write_lock(&md->map_lock);
+	write_lock_irqsave(&md->map_lock, flags);
 	md->map = t;
 	dm_table_set_restrictions(t, q, limits);
-	write_unlock(&md->map_lock);
+	write_unlock_irqrestore(&md->map_lock, flags);
 
 	return 0;
 }
@@ -1951,14 +1953,15 @@ static int __bind(struct mapped_device *md, struct dm_table *t,
 static void __unbind(struct mapped_device *md)
 {
 	struct dm_table *map = md->map;
+	unsigned long flags;
 
 	if (!map)
 		return;
 
 	dm_table_event_callback(map, NULL, NULL);
-	write_lock(&md->map_lock);
+	write_lock_irqsave(&md->map_lock, flags);
 	md->map = NULL;
-	write_unlock(&md->map_lock);
+	write_unlock_irqrestore(&md->map_lock, flags);
 	dm_table_destroy(map);
 }
 
-- 
cgit v1.2.3


From f40c67f0f7e2767f80f7cbcbc1ab86c4113c202e Mon Sep 17 00:00:00 2001
From: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Date: Mon, 22 Jun 2009 10:12:37 +0100
Subject: dm mpath: change to be request based

This patch converts dm-multipath target to request-based from bio-based.

Basically, the patch just converts the I/O unit from struct bio
to struct request.
In the course of the conversion, it also changes the I/O queueing
mechanism.  The change in the I/O queueing is described in details
as follows.

I/O queueing mechanism change
-----------------------------
In I/O submission, map_io(), there is no mechanism change from
bio-based, since the clone request is ready for retry as it is.
However, in I/O complition, do_end_io(), there is a mechanism change
from bio-based, since the clone request is not ready for retry.

In do_end_io() of bio-based, the clone bio has all needed memory
for resubmission.  So the target driver can queue it and resubmit
it later without memory allocations.
The mechanism has almost no overhead.

On the other hand, in do_end_io() of request-based, the clone request
doesn't have clone bios, so the target driver can't resubmit it
as it is.  To resubmit the clone request, memory allocation for
clone bios is needed, and it takes some overheads.
To avoid the overheads just for queueing, the target driver doesn't
queue the clone request inside itself.
Instead, the target driver asks dm core for queueing and remapping
the original request of the clone request, since the overhead for
queueing is just a freeing memory for the clone request.

As a result, the target driver doesn't need to record/restore
the information of the original request for resubmitting
the clone request.  So dm_bio_details in dm_mpath_io is removed.

multipath_busy()
---------------------
The target driver returns "busy", only when the following case:
  o The target driver will map I/Os, if map() function is called
  and
  o The mapped I/Os will wait on underlying device's queue due to
    their congestions, if map() function is called now.

In other cases, the target driver doesn't return "busy".
Otherwise, dm core will keep the I/Os and the target driver can't
do what it wants.
(e.g. the target driver can't map I/Os now, so wants to kill I/Os.)

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Acked-by: Hannes Reinecke <hare@suse.de>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
---
 drivers/md/dm-mpath.c | 193 +++++++++++++++++++++++++++++++++-----------------
 1 file changed, 128 insertions(+), 65 deletions(-)

(limited to 'drivers/md')

diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f8aeaaa54afe..c70604a20897 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -8,7 +8,6 @@
 #include <linux/device-mapper.h>
 
 #include "dm-path-selector.h"
-#include "dm-bio-record.h"
 #include "dm-uevent.h"
 
 #include <linux/ctype.h>
@@ -83,7 +82,7 @@ struct multipath {
 	unsigned pg_init_count;		/* Number of times pg_init called */
 
 	struct work_struct process_queued_ios;
-	struct bio_list queued_ios;
+	struct list_head queued_ios;
 	unsigned queue_size;
 
 	struct work_struct trigger_event;
@@ -100,7 +99,6 @@ struct multipath {
  */
 struct dm_mpath_io {
 	struct pgpath *pgpath;
-	struct dm_bio_details details;
 	size_t nr_bytes;
 };
 
@@ -194,6 +192,7 @@ static struct multipath *alloc_multipath(struct dm_target *ti)
 	m = kzalloc(sizeof(*m), GFP_KERNEL);
 	if (m) {
 		INIT_LIST_HEAD(&m->priority_groups);
+		INIT_LIST_HEAD(&m->queued_ios);
 		spin_lock_init(&m->lock);
 		m->queue_io = 1;
 		INIT_WORK(&m->process_queued_ios, process_queued_ios);
@@ -318,13 +317,14 @@ static int __must_push_back(struct multipath *m)
 		dm_noflush_suspending(m->ti));
 }
 
-static int map_io(struct multipath *m, struct bio *bio,
+static int map_io(struct multipath *m, struct request *clone,
 		  struct dm_mpath_io *mpio, unsigned was_queued)
 {
 	int r = DM_MAPIO_REMAPPED;
-	size_t nr_bytes = bio->bi_size;
+	size_t nr_bytes = blk_rq_bytes(clone);
 	unsigned long flags;
 	struct pgpath *pgpath;
+	struct block_device *bdev;
 
 	spin_lock_irqsave(&m->lock, flags);
 
@@ -341,16 +341,18 @@ static int map_io(struct multipath *m, struct bio *bio,
 	if ((pgpath && m->queue_io) ||
 	    (!pgpath && m->queue_if_no_path)) {
 		/* Queue for the daemon to resubmit */
-		bio_list_add(&m->queued_ios, bio);
+		list_add_tail(&clone->queuelist, &m->queued_ios);
 		m->queue_size++;
 		if ((m->pg_init_required && !m->pg_init_in_progress) ||
 		    !m->queue_io)
 			queue_work(kmultipathd, &m->process_queued_ios);
 		pgpath = NULL;
 		r = DM_MAPIO_SUBMITTED;
-	} else if (pgpath)
-		bio->bi_bdev = pgpath->path.dev->bdev;
-	else if (__must_push_back(m))
+	} else if (pgpath) {
+		bdev = pgpath->path.dev->bdev;
+		clone->q = bdev_get_queue(bdev);
+		clone->rq_disk = bdev->bd_disk;
+	} else if (__must_push_back(m))
 		r = DM_MAPIO_REQUEUE;
 	else
 		r = -EIO;	/* Failed */
@@ -398,30 +400,31 @@ static void dispatch_queued_ios(struct multipath *m)
 {
 	int r;
 	unsigned long flags;
-	struct bio *bio = NULL, *next;
 	struct dm_mpath_io *mpio;
 	union map_info *info;
+	struct request *clone, *n;
+	LIST_HEAD(cl);
 
 	spin_lock_irqsave(&m->lock, flags);
-	bio = bio_list_get(&m->queued_ios);
+	list_splice_init(&m->queued_ios, &cl);
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	while (bio) {
-		next = bio->bi_next;
-		bio->bi_next = NULL;
+	list_for_each_entry_safe(clone, n, &cl, queuelist) {
+		list_del_init(&clone->queuelist);
 
-		info = dm_get_mapinfo(bio);
+		info = dm_get_rq_mapinfo(clone);
 		mpio = info->ptr;
 
-		r = map_io(m, bio, mpio, 1);
-		if (r < 0)
-			bio_endio(bio, r);
-		else if (r == DM_MAPIO_REMAPPED)
-			generic_make_request(bio);
-		else if (r == DM_MAPIO_REQUEUE)
-			bio_endio(bio, -EIO);
-
-		bio = next;
+		r = map_io(m, clone, mpio, 1);
+		if (r < 0) {
+			mempool_free(mpio, m->mpio_pool);
+			dm_kill_unmapped_request(clone, r);
+		} else if (r == DM_MAPIO_REMAPPED)
+			dm_dispatch_request(clone);
+		else if (r == DM_MAPIO_REQUEUE) {
+			mempool_free(mpio, m->mpio_pool);
+			dm_requeue_unmapped_request(clone);
+		}
 	}
 }
 
@@ -863,21 +866,24 @@ static void multipath_dtr(struct dm_target *ti)
 }
 
 /*
- * Map bios, recording original fields for later in case we have to resubmit
+ * Map cloned requests
  */
-static int multipath_map(struct dm_target *ti, struct bio *bio,
+static int multipath_map(struct dm_target *ti, struct request *clone,
 			 union map_info *map_context)
 {
 	int r;
 	struct dm_mpath_io *mpio;
 	struct multipath *m = (struct multipath *) ti->private;
 
-	mpio = mempool_alloc(m->mpio_pool, GFP_NOIO);
-	dm_bio_record(&mpio->details, bio);
+	mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
+	if (!mpio)
+		/* ENOMEM, requeue */
+		return DM_MAPIO_REQUEUE;
+	memset(mpio, 0, sizeof(*mpio));
 
 	map_context->ptr = mpio;
-	bio->bi_rw |= (1 << BIO_RW_FAILFAST_TRANSPORT);
-	r = map_io(m, bio, mpio, 0);
+	clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
+	r = map_io(m, clone, mpio, 0);
 	if (r < 0 || r == DM_MAPIO_REQUEUE)
 		mempool_free(mpio, m->mpio_pool);
 
@@ -1158,53 +1164,41 @@ static void activate_path(struct work_struct *work)
 /*
  * end_io handling
  */
-static int do_end_io(struct multipath *m, struct bio *bio,
+static int do_end_io(struct multipath *m, struct request *clone,
 		     int error, struct dm_mpath_io *mpio)
 {
+	/*
+	 * We don't queue any clone request inside the multipath target
+	 * during end I/O handling, since those clone requests don't have
+	 * bio clones.  If we queue them inside the multipath target,
+	 * we need to make bio clones, that requires memory allocation.
+	 * (See drivers/md/dm.c:end_clone_bio() about why the clone requests
+	 *  don't have bio clones.)
+	 * Instead of queueing the clone request here, we queue the original
+	 * request into dm core, which will remake a clone request and
+	 * clone bios for it and resubmit it later.
+	 */
+	int r = DM_ENDIO_REQUEUE;
 	unsigned long flags;
 
-	if (!error)
+	if (!error && !clone->errors)
 		return 0;	/* I/O complete */
 
-	if ((error == -EWOULDBLOCK) && bio_rw_ahead(bio))
-		return error;
-
 	if (error == -EOPNOTSUPP)
 		return error;
 
-	spin_lock_irqsave(&m->lock, flags);
-	if (!m->nr_valid_paths) {
-		if (__must_push_back(m)) {
-			spin_unlock_irqrestore(&m->lock, flags);
-			return DM_ENDIO_REQUEUE;
-		} else if (!m->queue_if_no_path) {
-			spin_unlock_irqrestore(&m->lock, flags);
-			return -EIO;
-		} else {
-			spin_unlock_irqrestore(&m->lock, flags);
-			goto requeue;
-		}
-	}
-	spin_unlock_irqrestore(&m->lock, flags);
-
 	if (mpio->pgpath)
 		fail_path(mpio->pgpath);
 
-      requeue:
-	dm_bio_restore(&mpio->details, bio);
-
-	/* queue for the daemon to resubmit or fail */
 	spin_lock_irqsave(&m->lock, flags);
-	bio_list_add(&m->queued_ios, bio);
-	m->queue_size++;
-	if (!m->queue_io)
-		queue_work(kmultipathd, &m->process_queued_ios);
+	if (!m->nr_valid_paths && !m->queue_if_no_path && !__must_push_back(m))
+		r = -EIO;
 	spin_unlock_irqrestore(&m->lock, flags);
 
-	return DM_ENDIO_INCOMPLETE;	/* io not complete */
+	return r;
 }
 
-static int multipath_end_io(struct dm_target *ti, struct bio *bio,
+static int multipath_end_io(struct dm_target *ti, struct request *clone,
 			    int error, union map_info *map_context)
 {
 	struct multipath *m = ti->private;
@@ -1213,14 +1207,13 @@ static int multipath_end_io(struct dm_target *ti, struct bio *bio,
 	struct path_selector *ps;
 	int r;
 
-	r  = do_end_io(m, bio, error, mpio);
+	r  = do_end_io(m, clone, error, mpio);
 	if (pgpath) {
 		ps = &pgpath->pg->ps;
 		if (ps->type->end_io)
 			ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
 	}
-	if (r != DM_ENDIO_INCOMPLETE)
-		mempool_free(mpio, m->mpio_pool);
+	mempool_free(mpio, m->mpio_pool);
 
 	return r;
 }
@@ -1470,6 +1463,75 @@ out:
 	return ret;
 }
 
+static int __pgpath_busy(struct pgpath *pgpath)
+{
+	struct request_queue *q = bdev_get_queue(pgpath->path.dev->bdev);
+
+	return dm_underlying_device_busy(q);
+}
+
+/*
+ * We return "busy", only when we can map I/Os but underlying devices
+ * are busy (so even if we map I/Os now, the I/Os will wait on
+ * the underlying queue).
+ * In other words, if we want to kill I/Os or queue them inside us
+ * due to map unavailability, we don't return "busy".  Otherwise,
+ * dm core won't give us the I/Os and we can't do what we want.
+ */
+static int multipath_busy(struct dm_target *ti)
+{
+	int busy = 0, has_active = 0;
+	struct multipath *m = ti->private;
+	struct priority_group *pg;
+	struct pgpath *pgpath;
+	unsigned long flags;
+
+	spin_lock_irqsave(&m->lock, flags);
+
+	/* Guess which priority_group will be used at next mapping time */
+	if (unlikely(!m->current_pgpath && m->next_pg))
+		pg = m->next_pg;
+	else if (likely(m->current_pg))
+		pg = m->current_pg;
+	else
+		/*
+		 * We don't know which pg will be used at next mapping time.
+		 * We don't call __choose_pgpath() here to avoid to trigger
+		 * pg_init just by busy checking.
+		 * So we don't know whether underlying devices we will be using
+		 * at next mapping time are busy or not. Just try mapping.
+		 */
+		goto out;
+
+	/*
+	 * If there is one non-busy active path at least, the path selector
+	 * will be able to select it. So we consider such a pg as not busy.
+	 */
+	busy = 1;
+	list_for_each_entry(pgpath, &pg->pgpaths, list)
+		if (pgpath->is_active) {
+			has_active = 1;
+
+			if (!__pgpath_busy(pgpath)) {
+				busy = 0;
+				break;
+			}
+		}
+
+	if (!has_active)
+		/*
+		 * No active path in this pg, so this pg won't be used and
+		 * the current_pg will be changed at next mapping time.
+		 * We need to try mapping to determine it.
+		 */
+		busy = 0;
+
+out:
+	spin_unlock_irqrestore(&m->lock, flags);
+
+	return busy;
+}
+
 /*-----------------------------------------------------------------
  * Module setup
  *---------------------------------------------------------------*/
@@ -1479,14 +1541,15 @@ static struct target_type multipath_target = {
 	.module = THIS_MODULE,
 	.ctr = multipath_ctr,
 	.dtr = multipath_dtr,
-	.map = multipath_map,
-	.end_io = multipath_end_io,
+	.map_rq = multipath_map,
+	.rq_end_io = multipath_end_io,
 	.presuspend = multipath_presuspend,
 	.resume = multipath_resume,
 	.status = multipath_status,
 	.message = multipath_message,
 	.ioctl  = multipath_ioctl,
 	.iterate_devices = multipath_iterate_devices,
+	.busy = multipath_busy,
 };
 
 static int __init dm_multipath_init(void)
-- 
cgit v1.2.3