author    Linus Torvalds <torvalds@linux-foundation.org>  2024-03-11 11:43:44 -0700
committer Linus Torvalds <torvalds@linux-foundation.org>  2024-03-11 11:43:44 -0700
commit    1ddeeb2a058d7b2a58ed9e820396b4ceb715d529 (patch)
tree      32a27b8eb1c538239b641292d77dc1a8cee8ee97
parent    d2c84bdce25a678c1e1f116d65b58790bd241af0 (diff)
parent    5205a4aa8fc9454853b705b69611c80e9c644283 (diff)
Merge tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe:

 - MD pull requests via Song:
     - Cleanup redundant checks (Yu Kuai)
     - Remove deprecated headers (Marc Zyngier, Song Liu)
     - Concurrency fixes (Li Lingfeng)
     - Memory leak fix (Li Nan)
     - Refactor raid1 read_balance (Yu Kuai, Paul Luse)
     - Clean up and fix for md_ioctl (Li Nan)
     - Other small fixes (Gui-Dong Han, Heming Zhao)
     - MD atomic limits (Christoph)

 - NVMe pull request via Keith:
     - RDMA target enhancements (Max)
     - Fabrics fixes (Max, Guixin, Hannes)
     - Atomic queue_limits usage (Christoph)
     - Const use for class_register (Ricardo)
     - Identification error handling fixes (Shin'ichiro, Keith)

 - Improvement and cleanup for cached request handling (Christoph)

 - Moving towards atomic queue limits. Core changes and driver bits so
   far (Christoph)

 - Fix UAF issues in aoeblk (Chun-Yi)

 - Zoned fix and cleanups (Damien)

 - s390 dasd cleanups and fixes (Jan, Miroslav)

 - Block issue timestamp caching (me)

 - noio scope guarding for zoned IO (Johannes)

 - block/nvme PI improvements (Kanchan)

 - Ability to terminate long running discard loop (Keith)

 - bdev revalidation fix (Li)

 - Get rid of old nr_queues hack for kdump kernels (Ming)

 - Support for async deletion of ublk (Ming)

 - Improve IRQ bio recycling (Pavel)

 - Factor in CPU capacity for remote vs local completion (Qais)

 - Add shared_tags configfs entry for null_blk (Shin'ichiro)

 - Fix for a regression in page refcounts introduced by the folio
   unification (Tony)

 - Misc fixes and cleanups (Arnd, Colin, John, Kunwu, Li, Navid,
   Ricardo, Roman, Tang, Uwe)

* tag 'for-6.9/block-20240310' of git://git.kernel.dk/linux: (221 commits)
  block: partitions: only define function mac_fix_string for CONFIG_PPC_PMAC
  block/swim: Convert to platform remove callback returning void
  cdrom: gdrom: Convert to platform remove callback returning void
  block: remove disk_stack_limits
  md: remove mddev->queue
  md: don't initialize queue limits
  md/raid10: use the atomic queue limit update APIs
  md/raid5: use the atomic queue limit update APIs
  md/raid1: use the atomic queue limit update APIs
  md/raid0: use the atomic queue limit update APIs
  md: add queue limit helpers
  md: add a mddev_is_dm helper
  md: add a mddev_add_trace_msg helper
  md: add a mddev_trace_remap helper
  bcache: move calculation of stripe_size and io_opt into bcache_device_init
  virtio_blk: Do not use disk_set_max_open/active_zones()
  aoe: fix the potential use-after-free problem in aoecmd_cfg_pkts
  block: move capacity validation to blkpg_do_ioctl()
  block: prevent division by zero in blk_rq_stat_sum()
  drbd: atomically update queue limits in drbd_reconsider_queue_parameters
  ...
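The headline change in this pull is the move to atomic queue limits: instead of calling individual blk_queue_*() setters after allocating a queue, drivers describe their limits in a queue_limits structure that is validated and applied as a whole, either at allocation time or through a locked update transaction. A minimal driver-side sketch of both patterns follows; blk_mq_alloc_disk() and queue_limits_commit_update() appear in the diffs below, queue_limits_start_update() is assumed (from the commit_update kernel-doc) to return a locked snapshot by value, and the surrounding scaffolding is illustrative only:

        #include <linux/blk-mq.h>
        #include <linux/blkdev.h>

        /* Describe hardware limits up front; unset fields get validated defaults. */
        static struct gendisk *example_alloc_disk(struct blk_mq_tag_set *set)
        {
                struct queue_limits lim = {
                        .logical_block_size     = 4096,
                        .max_hw_sectors         = 256,
                };

                /* Limits are validated before the disk ever becomes visible. */
                return blk_mq_alloc_disk(set, &lim, NULL);
        }

        /* Later changes are transactional: snapshot, modify, commit under
         * q->limits_lock (taken by start_update, released by commit_update). */
        static int example_disable_discard(struct request_queue *q)
        {
                struct queue_limits lim = queue_limits_start_update(q);

                lim.max_hw_discard_sectors = 0;
                return queue_limits_commit_update(q, &lim);
        }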
-rw-r--r--  arch/m68k/emu/nfblock.c | 10
-rw-r--r--  arch/um/drivers/ubd_kern.c | 135
-rw-r--r--  arch/xtensa/platforms/iss/simdisk.c | 8
-rw-r--r--  block/bdev.c | 2
-rw-r--r--  block/bfq-cgroup.c | 14
-rw-r--r--  block/bfq-iosched.c | 28
-rw-r--r--  block/bio-integrity.c | 1
-rw-r--r--  block/bio.c | 45
-rw-r--r--  block/blk-cgroup.c | 2
-rw-r--r--  block/blk-cgroup.h | 1
-rw-r--r--  block/blk-core.c | 33
-rw-r--r--  block/blk-flush.c | 2
-rw-r--r--  block/blk-integrity.c | 1
-rw-r--r--  block/blk-iocost.c | 8
-rw-r--r--  block/blk-iolatency.c | 6
-rw-r--r--  block/blk-lib.c | 70
-rw-r--r--  block/blk-mq.c | 186
-rw-r--r--  block/blk-settings.c | 329
-rw-r--r--  block/blk-stat.c | 2
-rw-r--r--  block/blk-sysfs.c | 59
-rw-r--r--  block/blk-throttle.c | 10
-rw-r--r--  block/blk-wbt.c | 6
-rw-r--r--  block/blk-zoned.c | 20
-rw-r--r--  block/blk.h | 85
-rw-r--r--  block/bsg-lib.c | 2
-rw-r--r--  block/genhd.c | 14
-rw-r--r--  block/holder.c | 12
-rw-r--r--  block/ioctl.c | 9
-rw-r--r--  block/partitions/core.c | 11
-rw-r--r--  block/partitions/mac.c | 2
-rw-r--r--  block/sed-opal.c | 16
-rw-r--r--  block/t10-pi.c | 72
-rw-r--r--  drivers/base/base.h | 2
-rw-r--r--  drivers/block/amiflop.c | 2
-rw-r--r--  drivers/block/aoe/aoeblk.c | 15
-rw-r--r--  drivers/block/aoe/aoecmd.c | 12
-rw-r--r--  drivers/block/aoe/aoenet.c | 1
-rw-r--r--  drivers/block/ataflop.c | 2
-rw-r--r--  drivers/block/brd.c | 26
-rw-r--r--  drivers/block/drbd/drbd_main.c | 17
-rw-r--r--  drivers/block/drbd/drbd_nl.c | 210
-rw-r--r--  drivers/block/drbd/drbd_state.c | 24
-rw-r--r--  drivers/block/drbd/drbd_state_change.h | 8
-rw-r--r--  drivers/block/floppy.c | 17
-rw-r--r--  drivers/block/loop.c | 75
-rw-r--r--  drivers/block/mtip32xx/mtip32xx.c | 13
-rw-r--r--  drivers/block/n64cart.c | 12
-rw-r--r--  drivers/block/nbd.c | 49
-rw-r--r--  drivers/block/null_blk/main.c | 535
-rw-r--r--  drivers/block/null_blk/null_blk.h | 24
-rw-r--r--  drivers/block/null_blk/trace.h | 5
-rw-r--r--  drivers/block/null_blk/zoned.c | 25
-rw-r--r--  drivers/block/pktcdvd.c | 41
-rw-r--r--  drivers/block/ps3disk.c | 17
-rw-r--r--  drivers/block/ps3vram.c | 6
-rw-r--r--  drivers/block/rbd.c | 31
-rw-r--r--  drivers/block/rnbd/rnbd-clt.c | 64
-rw-r--r--  drivers/block/sunvdc.c | 18
-rw-r--r--  drivers/block/swim.c | 8
-rw-r--r--  drivers/block/swim3.c | 2
-rw-r--r--  drivers/block/ublk_drv.c | 111
-rw-r--r--  drivers/block/virtio_blk.c | 303
-rw-r--r--  drivers/block/xen-blkfront.c | 53
-rw-r--r--  drivers/block/z2ram.c | 2
-rw-r--r--  drivers/block/zram/zram_drv.c | 51
-rw-r--r--  drivers/cdrom/gdrom.c | 20
-rw-r--r--  drivers/md/bcache/super.c | 59
-rw-r--r--  drivers/md/dm-raid.c | 93
-rw-r--r--  drivers/md/dm-table.c | 27
-rw-r--r--  drivers/md/dm-zoned-metadata.c | 5
-rw-r--r--  drivers/md/dm.c | 4
-rw-r--r--  drivers/md/md-bitmap.c | 18
-rw-r--r--  drivers/md/md-linear.h | 17
-rw-r--r--  drivers/md/md-multipath.h | 32
-rw-r--r--  drivers/md/md.c | 400
-rw-r--r--  drivers/md/md.h | 77
-rw-r--r--  drivers/md/raid0.c | 42
-rw-r--r--  drivers/md/raid1-10.c | 69
-rw-r--r--  drivers/md/raid1.c | 601
-rw-r--r--  drivers/md/raid1.h | 1
-rw-r--r--  drivers/md/raid10.c | 143
-rw-r--r--  drivers/md/raid5-ppl.c | 3
-rw-r--r--  drivers/md/raid5.c | 273
-rw-r--r--  drivers/memstick/core/ms_block.c | 14
-rw-r--r--  drivers/memstick/core/mspro_block.c | 15
-rw-r--r--  drivers/mmc/core/queue.c | 97
-rw-r--r--  drivers/mtd/mtd_blkdevs.c | 12
-rw-r--r--  drivers/mtd/ubi/block.c | 6
-rw-r--r--  drivers/nvdimm/btt.c | 14
-rw-r--r--  drivers/nvdimm/pmem.c | 14
-rw-r--r--  drivers/nvme/host/apple.c | 2
-rw-r--r--  drivers/nvme/host/core.c | 458
-rw-r--r--  drivers/nvme/host/fabrics.c | 22
-rw-r--r--  drivers/nvme/host/multipath.c | 17
-rw-r--r--  drivers/nvme/host/nvme.h | 12
-rw-r--r--  drivers/nvme/host/rdma.c | 14
-rw-r--r--  drivers/nvme/host/sysfs.c | 7
-rw-r--r--  drivers/nvme/host/zns.c | 24
-rw-r--r--  drivers/nvme/target/admin-cmd.c | 2
-rw-r--r--  drivers/nvme/target/configfs.c | 28
-rw-r--r--  drivers/nvme/target/core.c | 18
-rw-r--r--  drivers/nvme/target/discovery.c | 2
-rw-r--r--  drivers/nvme/target/fabrics-cmd.c | 5
-rw-r--r--  drivers/nvme/target/fcloop.c | 17
-rw-r--r--  drivers/nvme/target/nvmet.h | 6
-rw-r--r--  drivers/nvme/target/passthru.c | 2
-rw-r--r--  drivers/nvme/target/rdma.c | 10
-rw-r--r--  drivers/nvme/target/zns.c | 5
-rw-r--r--  drivers/s390/block/dasd.c | 180
-rw-r--r--  drivers/s390/block/dasd_3990_erp.c | 80
-rw-r--r--  drivers/s390/block/dasd_alias.c | 8
-rw-r--r--  drivers/s390/block/dasd_devmap.c | 34
-rw-r--r--  drivers/s390/block/dasd_diag.c | 26
-rw-r--r--  drivers/s390/block/dasd_eckd.c | 186
-rw-r--r--  drivers/s390/block/dasd_eer.c | 7
-rw-r--r--  drivers/s390/block/dasd_erp.c | 9
-rw-r--r--  drivers/s390/block/dasd_fba.c | 88
-rw-r--r--  drivers/s390/block/dasd_genhd.c | 18
-rw-r--r--  drivers/s390/block/dasd_int.h | 35
-rw-r--r--  drivers/s390/block/dasd_ioctl.c | 6
-rw-r--r--  drivers/s390/block/dasd_proc.c | 5
-rw-r--r--  drivers/s390/block/dcssblk.c | 10
-rw-r--r--  drivers/s390/block/scm_blk.c | 17
-rw-r--r--  drivers/scsi/scsi_scan.c | 2
-rw-r--r--  drivers/ufs/core/ufshcd.c | 2
-rw-r--r--  fs/btrfs/zoned.c | 35
-rw-r--r--  fs/f2fs/segment.c | 15
-rw-r--r--  fs/zonefs/super.c | 2
-rw-r--r--  include/linux/blk-integrity.h | 1
-rw-r--r--  include/linux/blk-mq.h | 10
-rw-r--r--  include/linux/blk_types.h | 42
-rw-r--r--  include/linux/blkdev.h | 73
-rw-r--r--  include/linux/nvme-rdma.h | 6
-rw-r--r--  include/linux/nvme.h | 2
-rw-r--r--  include/linux/sched.h | 2
-rw-r--r--  include/linux/sched/topology.h | 6
-rw-r--r--  include/uapi/linux/ublk_cmd.h | 2
138 files changed, 3444 insertions(+), 3171 deletions(-)
-rw-r--r--  kernel/sched/core.c | 17
diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c
index a708fbd5a844..642fb80c5c4e 100644
--- a/arch/m68k/emu/nfblock.c
+++ b/arch/m68k/emu/nfblock.c
@@ -96,6 +96,9 @@ static const struct block_device_operations nfhd_ops = {
static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
{
+ struct queue_limits lim = {
+ .logical_block_size = bsize,
+ };
struct nfhd_device *dev;
int dev_id = id - NFHD_DEV_OFFSET;
int err = -ENOMEM;
@@ -117,9 +120,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
dev->bsize = bsize;
dev->bshift = ffs(bsize) - 10;
- dev->disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!dev->disk)
+ dev->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(dev->disk)) {
+ err = PTR_ERR(dev->disk);
goto free_dev;
+ }
dev->disk->major = major_num;
dev->disk->first_minor = dev_id * 16;
@@ -128,7 +133,6 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize)
dev->disk->private_data = dev;
sprintf(dev->disk->disk_name, "nfhd%u", dev_id);
set_capacity(dev->disk, (sector_t)blocks * (bsize / 512));
- blk_queue_logical_block_size(dev->disk->queue, bsize);
err = add_disk(dev->disk);
if (err)
goto out_cleanup_disk;
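The nfblock conversion above captures the new allocation convention: blk_alloc_disk() now takes the initial queue_limits and reports failure via ERR_PTR() instead of NULL, so the old NULL check plus the post-allocation blk_queue_logical_block_size() call collapse into one step. A sketch of the calling convention (bsize stands in for the driver's block size):

        struct queue_limits lim = {
                .logical_block_size = bsize,    /* was blk_queue_logical_block_size() */
        };
        struct gendisk *disk = blk_alloc_disk(&lim, NUMA_NO_NODE);

        if (IS_ERR(disk))
                return PTR_ERR(disk);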
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 92ee2697ff39..63fc062add70 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -108,8 +108,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data)
static DEFINE_MUTEX(ubd_lock);
static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */
-static int ubd_open(struct gendisk *disk, blk_mode_t mode);
-static void ubd_release(struct gendisk *disk);
static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
@@ -118,16 +116,11 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo);
static const struct block_device_operations ubd_blops = {
.owner = THIS_MODULE,
- .open = ubd_open,
- .release = ubd_release,
.ioctl = ubd_ioctl,
.compat_ioctl = blkdev_compat_ptr_ioctl,
.getgeo = ubd_getgeo,
};
-/* Protected by ubd_lock */
-static struct gendisk *ubd_gendisk[MAX_DEV];
-
#ifdef CONFIG_BLK_DEV_UBD_SYNC
#define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \
.cl = 1 })
@@ -155,7 +148,6 @@ struct ubd {
* backing or the cow file. */
char *file;
char *serial;
- int count;
int fd;
__u64 size;
struct openflags boot_openflags;
@@ -165,7 +157,7 @@ struct ubd {
unsigned no_trim:1;
struct cow cow;
struct platform_device pdev;
- struct request_queue *queue;
+ struct gendisk *disk;
struct blk_mq_tag_set tag_set;
spinlock_t lock;
};
@@ -181,7 +173,6 @@ struct ubd {
#define DEFAULT_UBD { \
.file = NULL, \
.serial = NULL, \
- .count = 0, \
.fd = -1, \
.size = -1, \
.boot_openflags = OPEN_FLAGS, \
@@ -774,8 +765,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
ubd_dev->fd = fd;
if(ubd_dev->cow.file != NULL){
- blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long));
-
err = -ENOMEM;
ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len);
if(ubd_dev->cow.bitmap == NULL){
@@ -797,11 +786,6 @@ static int ubd_open_dev(struct ubd *ubd_dev)
if(err < 0) goto error;
ubd_dev->cow.fd = err;
}
- if (ubd_dev->no_trim == 0) {
- blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
- blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST);
- }
- blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue);
return 0;
error:
os_close_file(ubd_dev->fd);
@@ -851,27 +835,6 @@ static const struct attribute_group *ubd_attr_groups[] = {
NULL,
};
-static int ubd_disk_register(int major, u64 size, int unit,
- struct gendisk *disk)
-{
- disk->major = major;
- disk->first_minor = unit << UBD_SHIFT;
- disk->minors = 1 << UBD_SHIFT;
- disk->fops = &ubd_blops;
- set_capacity(disk, size / 512);
- sprintf(disk->disk_name, "ubd%c", 'a' + unit);
-
- ubd_devs[unit].pdev.id = unit;
- ubd_devs[unit].pdev.name = DRIVER_NAME;
- ubd_devs[unit].pdev.dev.release = ubd_device_release;
- dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]);
- platform_device_register(&ubd_devs[unit].pdev);
-
- disk->private_data = &ubd_devs[unit];
- disk->queue = ubd_devs[unit].queue;
- return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups);
-}
-
#define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE))
static const struct blk_mq_ops ubd_mq_ops = {
@@ -881,18 +844,36 @@ static const struct blk_mq_ops ubd_mq_ops = {
static int ubd_add(int n, char **error_out)
{
struct ubd *ubd_dev = &ubd_devs[n];
+ struct queue_limits lim = {
+ .max_segments = MAX_SG,
+ .seg_boundary_mask = PAGE_SIZE - 1,
+ };
struct gendisk *disk;
int err = 0;
if(ubd_dev->file == NULL)
goto out;
+ if (ubd_dev->cow.file)
+ lim.max_hw_sectors = 8 * sizeof(long);
+ if (!ubd_dev->no_trim) {
+ lim.max_hw_discard_sectors = UBD_MAX_REQUEST;
+ lim.max_write_zeroes_sectors = UBD_MAX_REQUEST;
+ }
+
err = ubd_file_size(ubd_dev, &ubd_dev->size);
if(err < 0){
*error_out = "Couldn't determine size of device's file";
goto out;
}
+ err = ubd_open_dev(ubd_dev);
+ if (err) {
+ pr_err("ubd%c: Can't open \"%s\": errno = %d\n",
+ 'a' + n, ubd_dev->file, -err);
+ goto out;
+ }
+
ubd_dev->size = ROUND_BLOCK(ubd_dev->size);
ubd_dev->tag_set.ops = &ubd_mq_ops;
@@ -904,29 +885,43 @@ static int ubd_add(int n, char **error_out)
err = blk_mq_alloc_tag_set(&ubd_dev->tag_set);
if (err)
- goto out;
+ goto out_close;
- disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev);
+ disk = blk_mq_alloc_disk(&ubd_dev->tag_set, &lim, ubd_dev);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_cleanup_tags;
}
- ubd_dev->queue = disk->queue;
- blk_queue_write_cache(ubd_dev->queue, true, false);
- blk_queue_max_segments(ubd_dev->queue, MAX_SG);
- blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1);
- err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk);
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+ blk_queue_write_cache(disk->queue, true, false);
+ disk->major = UBD_MAJOR;
+ disk->first_minor = n << UBD_SHIFT;
+ disk->minors = 1 << UBD_SHIFT;
+ disk->fops = &ubd_blops;
+ set_capacity(disk, ubd_dev->size / 512);
+ sprintf(disk->disk_name, "ubd%c", 'a' + n);
+ disk->private_data = ubd_dev;
+ set_disk_ro(disk, !ubd_dev->openflags.w);
+
+ ubd_dev->pdev.id = n;
+ ubd_dev->pdev.name = DRIVER_NAME;
+ ubd_dev->pdev.dev.release = ubd_device_release;
+ dev_set_drvdata(&ubd_dev->pdev.dev, ubd_dev);
+ platform_device_register(&ubd_dev->pdev);
+
+ err = device_add_disk(&ubd_dev->pdev.dev, disk, ubd_attr_groups);
if (err)
goto out_cleanup_disk;
- ubd_gendisk[n] = disk;
return 0;
out_cleanup_disk:
put_disk(disk);
out_cleanup_tags:
blk_mq_free_tag_set(&ubd_dev->tag_set);
+out_close:
+ ubd_close_dev(ubd_dev);
out:
return err;
}
@@ -1012,7 +1007,6 @@ static int ubd_id(char **str, int *start_out, int *end_out)
static int ubd_remove(int n, char **error_out)
{
- struct gendisk *disk = ubd_gendisk[n];
struct ubd *ubd_dev;
int err = -ENODEV;
@@ -1023,15 +1017,15 @@ static int ubd_remove(int n, char **error_out)
if(ubd_dev->file == NULL)
goto out;
- /* you cannot remove a open disk */
- err = -EBUSY;
- if(ubd_dev->count > 0)
- goto out;
+ if (ubd_dev->disk) {
+ /* you cannot remove an open disk */
+ err = -EBUSY;
+ if (disk_openers(ubd_dev->disk))
+ goto out;
- ubd_gendisk[n] = NULL;
- if(disk != NULL){
- del_gendisk(disk);
- put_disk(disk);
+ del_gendisk(ubd_dev->disk);
+ ubd_close_dev(ubd_dev);
+ put_disk(ubd_dev->disk);
}
err = 0;
@@ -1153,37 +1147,6 @@ static int __init ubd_driver_init(void){
device_initcall(ubd_driver_init);
-static int ubd_open(struct gendisk *disk, blk_mode_t mode)
-{
- struct ubd *ubd_dev = disk->private_data;
- int err = 0;
-
- mutex_lock(&ubd_mutex);
- if(ubd_dev->count == 0){
- err = ubd_open_dev(ubd_dev);
- if(err){
- printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n",
- disk->disk_name, ubd_dev->file, -err);
- goto out;
- }
- }
- ubd_dev->count++;
- set_disk_ro(disk, !ubd_dev->openflags.w);
-out:
- mutex_unlock(&ubd_mutex);
- return err;
-}
-
-static void ubd_release(struct gendisk *disk)
-{
- struct ubd *ubd_dev = disk->private_data;
-
- mutex_lock(&ubd_mutex);
- if(--ubd_dev->count == 0)
- ubd_close_dev(ubd_dev);
- mutex_unlock(&ubd_mutex);
-}
-
static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask,
__u64 *cow_offset, unsigned long *bitmap,
__u64 bitmap_offset, unsigned long *bitmap_words,
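Note how the ubd conversion drops the hand-rolled ->count open counter along with the ->open/->release methods: the backing file is now opened once in ubd_add(), and removal safety comes from the core's disk_openers() accounting instead. The resulting teardown shape, roughly (the function name and close step are illustrative):

        static int example_remove(struct gendisk *disk)
        {
                if (disk_openers(disk))         /* core-maintained open count */
                        return -EBUSY;

                del_gendisk(disk);
                /* close backing resources, then drop the last disk reference */
                put_disk(disk);
                return 0;
        }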
diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c
index 178cf96ca10a..defc67909a9c 100644
--- a/arch/xtensa/platforms/iss/simdisk.c
+++ b/arch/xtensa/platforms/iss/simdisk.c
@@ -264,16 +264,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which,
struct proc_dir_entry *procdir)
{
char tmp[2] = { '0' + which, 0 };
- int err = -ENOMEM;
+ int err;
dev->fd = -1;
dev->filename = NULL;
spin_lock_init(&dev->lock);
dev->users = 0;
- dev->gd = blk_alloc_disk(NUMA_NO_NODE);
- if (!dev->gd)
+ dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE);
+ if (IS_ERR(dev->gd)) {
+ err = PTR_ERR(dev->gd);
goto out;
+ }
dev->gd->major = simdisk_major;
dev->gd->first_minor = which;
dev->gd->minors = SIMDISK_MINORS;
diff --git a/block/bdev.c b/block/bdev.c
index 140093c99bdc..e7adaaf1c219 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -383,7 +383,7 @@ void __init bdev_cache_init(void)
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
+ SLAB_ACCOUNT|SLAB_PANIC),
init_once);
err = register_filesystem(&bd_type);
if (err)
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 2c90e5de0acd..d442ee358fc2 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
if (!bfqg_stats_waiting(stats))
return;
- now = ktime_get_ns();
+ now = blk_time_get_ns();
if (now > stats->start_group_wait_time)
bfq_stat_add(&stats->group_wait_time,
now - stats->start_group_wait_time);
@@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg,
return;
if (bfqg == curr_bfqg)
return;
- stats->start_group_wait_time = ktime_get_ns();
+ stats->start_group_wait_time = blk_time_get_ns();
bfqg_stats_mark_waiting(stats);
}
@@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
if (!bfqg_stats_empty(stats))
return;
- now = ktime_get_ns();
+ now = blk_time_get_ns();
if (now > stats->start_empty_time)
bfq_stat_add(&stats->empty_time,
now - stats->start_empty_time);
@@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
if (bfqg_stats_empty(stats))
return;
- stats->start_empty_time = ktime_get_ns();
+ stats->start_empty_time = blk_time_get_ns();
bfqg_stats_mark_empty(stats);
}
@@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
struct bfqg_stats *stats = &bfqg->stats;
if (bfqg_stats_idling(stats)) {
- u64 now = ktime_get_ns();
+ u64 now = blk_time_get_ns();
if (now > stats->start_idle_time)
bfq_stat_add(&stats->idle_time,
@@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg)
{
struct bfqg_stats *stats = &bfqg->stats;
- stats->start_idle_time = ktime_get_ns();
+ stats->start_idle_time = blk_time_get_ns();
bfqg_stats_mark_idling(stats);
}
@@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
u64 io_start_time_ns, blk_opf_t opf)
{
struct bfqg_stats *stats = &bfqg->stats;
- u64 now = ktime_get_ns();
+ u64 now = blk_time_get_ns();
if (now > io_start_time_ns)
blkg_rwstat_add(&stats->service_time, opf,
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 3cce6de464a7..4b88a54a9b76 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
rq = rq_entry_fifo(bfqq->fifo.next);
- if (rq == last || ktime_get_ns() < rq->fifo_time)
+ if (rq == last || blk_time_get_ns() < rq->fifo_time)
return NULL;
bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq);
@@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
* bfq_bfqq_update_budg_for_activation for
* details on the usage of the next variable.
*/
- arrived_in_time = ktime_get_ns() <=
+ arrived_in_time = blk_time_get_ns() <=
bfqq->ttime.last_end_request +
bfqd->bfq_slice_idle * 3;
unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio);
@@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq)
struct request *next_rq, *prev;
unsigned int old_wr_coeff = bfqq->wr_coeff;
bool interactive = false;
- u64 now_ns = ktime_get_ns();
+ u64 now_ns = blk_time_get_ns();
bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
bfqq->queued[rq_is_sync(rq)]++;
@@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq)
bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) &&
time_is_before_eq_jiffies(bfqq->decrease_time_jif +
msecs_to_jiffies(10))) {
- bfqd->last_empty_occupied_ns = ktime_get_ns();
+ bfqd->last_empty_occupied_ns = blk_time_get_ns();
/*
* Start the state machine for measuring the
* total service time of rq: setting
@@ -3294,7 +3294,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd,
else
timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight;
- bfqd->last_budget_start = ktime_get();
+ bfqd->last_budget_start = blk_time_get();
bfqq->budget_timeout = jiffies +
bfqd->bfq_timeout * timeout_coeff;
@@ -3394,7 +3394,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd)
else if (bfqq->wr_coeff > 1)
sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC);
- bfqd->last_idling_start = ktime_get();
+ bfqd->last_idling_start = blk_time_get();
bfqd->last_idling_start_jiffies = jiffies;
hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl),
@@ -3433,7 +3433,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd,
struct request *rq)
{
if (rq != NULL) { /* new rq dispatch now, reset accordingly */
- bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns();
+ bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns();
bfqd->peak_rate_samples = 1;
bfqd->sequential_samples = 0;
bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size =
@@ -3590,7 +3590,7 @@ reset_computation:
*/
static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
{
- u64 now_ns = ktime_get_ns();
+ u64 now_ns = blk_time_get_ns();
if (bfqd->peak_rate_samples == 0) { /* first dispatch */
bfq_log(bfqd, "update_peak_rate: goto reset, samples %d",
@@ -4162,7 +4162,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (compensate)
delta_ktime = bfqd->last_idling_start;
else
- delta_ktime = ktime_get();
+ delta_ktime = blk_time_get();
delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start);
delta_usecs = ktime_to_us(delta_ktime);
@@ -5591,7 +5591,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_io_cq *bic, pid_t pid, int is_sync,
unsigned int act_idx)
{
- u64 now_ns = ktime_get_ns();
+ u64 now_ns = blk_time_get_ns();
bfqq->actuator_idx = act_idx;
RB_CLEAR_NODE(&bfqq->entity.rb_node);
@@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
*/
if (bfqq->dispatched || bfq_bfqq_busy(bfqq))
return;
- elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
+ elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request;
elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
@@ -6194,7 +6194,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bfq_add_request(rq);
idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
- rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
+ rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)];
list_add_tail(&rq->queuelist, &bfqq->fifo);
bfq_rq_enqueued(bfqd, bfqq, rq);
@@ -6370,7 +6370,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
bfq_weights_tree_remove(bfqq);
}
- now_ns = ktime_get_ns();
+ now_ns = blk_time_get_ns();
bfqq->ttime.last_end_request = now_ns;
@@ -6585,7 +6585,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
static void bfq_update_inject_limit(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
- u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
+ u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns;
unsigned int old_limit = bfqq->inject_limit;
if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
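All of the bfq ktime_get_ns() call sites above become blk_time_get_ns(), the cached-timestamp helper behind the "block issue timestamp caching" item in the merge summary. The helper itself lives in block/blk.h, outside this excerpt; judging from the plug hooks visible in the blk-core.c hunks later in this series (plug->cur_ktime reset in blk_start_plug_nr_ios() and PF_BLOCK_TS cleared in __blk_flush_plug()), its shape is plausibly:

        /* Assumed sketch, not the verbatim block/blk.h implementation: read the
         * clock once per plug batch instead of on every request. */
        static inline u64 blk_time_get_ns(void)
        {
                struct blk_plug *plug = current->plug;

                if (!plug || !in_task())
                        return ktime_get_ns();
                if (!plug->cur_ktime) {
                        plug->cur_ktime = ktime_get_ns();
                        /* cleared again when the plug is flushed */
                        current->flags |= PF_BLOCK_TS;
                }
                return plug->cur_ktime;
        }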
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index c9a16fba58b9..2e3e8e04961e 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -395,6 +395,7 @@ static blk_status_t bio_integrity_process(struct bio *bio,
iter.tuple_size = bi->tuple_size;
iter.seed = proc_iter->bi_sector;
iter.prot_buf = bvec_virt(bip->bip_vec);
+ iter.pi_offset = bi->pi_offset;
__bio_for_each_segment(bv, bio, bviter, *proc_iter) {
void *kaddr = bvec_kmap_local(&bv);
diff --git a/block/bio.c b/block/bio.c
index c9223e9d31da..d24420ed1c4c 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -16,7 +16,6 @@
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/highmem.h>
-#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
#include <linux/xarray.h>
@@ -763,29 +762,31 @@ static inline void bio_put_percpu_cache(struct bio *bio)
struct bio_alloc_cache *cache;
cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
- if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) {
- put_cpu();
- bio_free(bio);
- return;
- }
+ if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX)
+ goto out_free;
- bio_uninit(bio);
-
- if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
+ if (in_task()) {
+ bio_uninit(bio);
bio->bi_next = cache->free_list;
+ /* Not necessary, but helps avoid iopolling already freed bios */
bio->bi_bdev = NULL;
cache->free_list = bio;
cache->nr++;
- } else {
- unsigned long flags;
+ } else if (in_hardirq()) {
+ lockdep_assert_irqs_disabled();
- local_irq_save(flags);
+ bio_uninit(bio);
bio->bi_next = cache->free_list_irq;
cache->free_list_irq = bio;
cache->nr_irq++;
- local_irq_restore(flags);
+ } else {
+ goto out_free;
}
put_cpu();
+ return;
+out_free:
+ put_cpu();
+ bio_free(bio);
}
/**
@@ -1154,7 +1155,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
bio_for_each_folio_all(fi, bio) {
struct page *page;
- size_t done = 0;
+ size_t nr_pages;
if (mark_dirty) {
folio_lock(fi.folio);
@@ -1162,10 +1163,11 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
folio_unlock(fi.folio);
}
page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
+ nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
+ fi.offset / PAGE_SIZE + 1;
do {
bio_release_page(bio, page++);
- done += PAGE_SIZE;
- } while (done < fi.length);
+ } while (--nr_pages != 0);
}
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
@@ -1371,21 +1373,12 @@ int submit_bio_wait(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done,
bio->bi_bdev->bd_disk->lockdep_map);
- unsigned long hang_check;
bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio;
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);
-
- /* Prevent hang_check timer from firing at us during very long I/O */
- hang_check = sysctl_hung_task_timeout_secs;
- if (hang_check)
- while (!wait_for_completion_io_timeout(&done,
- hang_check * (HZ/2)))
- ;
- else
- wait_for_completion_io(&done);
+ blk_wait_io(&done);
return blk_status_to_errno(bio->bi_status);
}
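The submit_bio_wait() hunk above (and the matching blk_execute_rq() hunk in blk-mq.c further down) replaces the open-coded hung-task-timeout workaround with a new blk_wait_io() helper in block/blk.h, which is not part of this excerpt. Its presumed shape simply centralizes the logic being deleted here, a sketch:

        /* Assumed sketch of the block/blk.h helper these hunks call into;
         * needs <linux/sched/sysctl.h>, which is why the include moves out
         * of bio.c and blk-mq.c. */
        static inline void blk_wait_io(struct completion *done)
        {
                /* Prevent hang_check timer from firing during very long I/O */
                unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;

                if (timeout)
                        while (!wait_for_completion_io_timeout(done, timeout))
                                ;
                else
                        wait_for_completion_io(done);
        }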
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ff93c385ba5a..bdbb557feb5a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -1846,7 +1846,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
unsigned long pflags;
bool clamp;
- u64 now = ktime_to_ns(ktime_get());
+ u64 now = blk_time_get_ns();
u64 exp;
u64 delay_nsec = 0;
int tok;
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index b927a4a0ad03..78b74106bf10 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -19,6 +19,7 @@
#include <linux/kthread.h>
#include <linux/blk-mq.h>
#include <linux/llist.h>
+#include "blk.h"
struct blkcg_gq;
struct blkg_policy_data;
diff --git a/block/blk-core.c b/block/blk-core.c
index de771093b526..a16b5abdbbf5 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -394,24 +394,34 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-struct request_queue *blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id)
{
struct request_queue *q;
+ int error;
q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
node_id);
if (!q)
- return NULL;
+ return ERR_PTR(-ENOMEM);
q->last_merge = NULL;
q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
- if (q->id < 0)
+ if (q->id < 0) {
+ error = q->id;
goto fail_q;
+ }
q->stats = blk_alloc_queue_stats();
- if (!q->stats)
+ if (!q->stats) {
+ error = -ENOMEM;
goto fail_id;
+ }
+
+ error = blk_set_default_limits(lim);
+ if (error)
+ goto fail_stats;
+ q->limits = *lim;
q->node = node_id;
@@ -425,6 +435,7 @@ struct request_queue *blk_alloc_queue(int node_id)
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
+ mutex_init(&q->limits_lock);
mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock);
@@ -435,12 +446,12 @@ struct request_queue *blk_alloc_queue(int node_id)
* Init percpu_ref in atomic mode so that it's faster to shutdown.
* See blk_register_queue() for details.
*/
- if (percpu_ref_init(&q->q_usage_counter,
+ error = percpu_ref_init(&q->q_usage_counter,
blk_queue_usage_counter_release,
- PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
+ PERCPU_REF_INIT_ATOMIC, GFP_KERNEL);
+ if (error)
goto fail_stats;
- blk_set_default_limits(&q->limits);
q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
@@ -451,7 +462,7 @@ fail_id:
ida_free(&blk_queue_ida, q->id);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
- return NULL;
+ return ERR_PTR(error);
}
/**
@@ -1083,6 +1094,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
if (tsk->plug)
return;
+ plug->cur_ktime = 0;
plug->mq_list = NULL;
plug->cached_rq = NULL;
plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
@@ -1182,6 +1194,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
*/
if (unlikely(!rq_list_empty(plug->cached_rq)))
blk_mq_free_plug_rqs(plug);
+
+ current->flags &= ~PF_BLOCK_TS;
}
/**
@@ -1229,8 +1243,7 @@ int __init blk_dev_init(void)
if (!kblockd_workqueue)
panic("Failed to create kblockd\n");
- blk_requestq_cachep = kmem_cache_create("request_queue",
- sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
+ blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC);
blk_debugfs_root = debugfs_create_dir("block", NULL);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 3f4d41952ef2..b0f314f4bc14 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq)
part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
part_stat_add(part, nsecs[STAT_FLUSH],
- ktime_get_ns() - rq->start_time_ns);
+ blk_time_get_ns() - rq->start_time_ns);
part_stat_unlock();
}
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index d4e9b4556d14..ccbeb6dfa87a 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -370,6 +370,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->profile = template->profile ? template->profile : &nop_profile;
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
+ bi->pi_offset = template->pi_offset;
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 04d44f0bcbc8..9a85bfbbc45a 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
/* step up/down based on the vrate */
vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
- now_ns = ktime_get_ns();
+ now_ns = blk_time_get_ns();
if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
if (!ioc->autop_too_fast_at)
@@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
unsigned seq;
u64 vrate;
- now->now_ns = ktime_get();
+ now->now_ns = blk_time_get_ns();
now->now = ktime_to_us(now->now_ns);
vrate = atomic64_read(&ioc->vtime_rate);
@@ -2817,7 +2817,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
return;
}
- on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
+ on_q_ns = blk_time_get_ns() - rq->alloc_time_ns;
rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
@@ -2900,7 +2900,7 @@ static int blk_iocost_init(struct gendisk *disk)
ioc->vtime_base_rate = VTIME_PER_USEC;
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
- ioc->period_at = ktime_to_us(ktime_get());
+ ioc->period_at = ktime_to_us(blk_time_get());
atomic64_set(&ioc->cur_period, 0);
atomic_set(&ioc->hweight_gen, 0);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index c1a6aba1d59e..ebb522788d97 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
if (!iolat->blkiolat->enabled)
return;
- now = ktime_to_ns(ktime_get());
+ now = blk_time_get_ns();
while (blkg && blkg->parent) {
iolat = blkg_to_lat(blkg);
if (!iolat) {
@@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
struct blkcg_gq *blkg;
struct cgroup_subsys_state *pos_css;
- u64 now = ktime_to_ns(ktime_get());
+ u64 now = blk_time_get_ns();
rcu_read_lock();
blkg_for_each_descendant_pre(blkg, pos_css,
@@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
struct blkcg_gq *blkg = lat_to_blkg(iolat);
struct rq_qos *rqos = iolat_rq_qos(blkg->q);
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
- u64 now = ktime_to_ns(ktime_get());
+ u64 now = blk_time_get_ns();
int cpu;
if (blk_queue_nonrot(blkg->q))
diff --git a/block/blk-lib.c b/block/blk-lib.c
index e59c3069e835..dc8e35d0a51d 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -35,6 +35,26 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
}
+static void await_bio_endio(struct bio *bio)
+{
+ complete(bio->bi_private);
+ bio_put(bio);
+}
+
+/*
+ * await_bio_chain - ends @bio and waits for every chained bio to complete
+ */
+static void await_bio_chain(struct bio *bio)
+{
+ DECLARE_COMPLETION_ONSTACK_MAP(done,
+ bio->bi_bdev->bd_disk->lockdep_map);
+
+ bio->bi_private = &done;
+ bio->bi_end_io = await_bio_endio;
+ bio_endio(bio);
+ blk_wait_io(&done);
+}
+
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
{
@@ -77,6 +97,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
* is disabled.
*/
cond_resched();
+ if (fatal_signal_pending(current)) {
+ await_bio_chain(bio);
+ return -EINTR;
+ }
}
*biop = bio;
@@ -120,32 +144,33 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
struct bio **biop, unsigned flags)
{
struct bio *bio = *biop;
- unsigned int max_write_zeroes_sectors;
+ unsigned int max_sectors;
if (bdev_read_only(bdev))
return -EPERM;
- /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */
- max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev);
+ /* Ensure that max_sectors doesn't overflow bi_size */
+ max_sectors = bdev_write_zeroes_sectors(bdev);
- if (max_write_zeroes_sectors == 0)
+ if (max_sectors == 0)
return -EOPNOTSUPP;
while (nr_sects) {
+ unsigned int len = min_t(sector_t, nr_sects, max_sectors);
+
bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
bio->bi_iter.bi_sector = sector;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP;
- if (nr_sects > max_write_zeroes_sectors) {
- bio->bi_iter.bi_size = max_write_zeroes_sectors << 9;
- nr_sects -= max_write_zeroes_sectors;
- sector += max_write_zeroes_sectors;
- } else {
- bio->bi_iter.bi_size = nr_sects << 9;
- nr_sects = 0;
- }
+ bio->bi_iter.bi_size = len << SECTOR_SHIFT;
+ nr_sects -= len;
+ sector += len;
cond_resched();
+ if (fatal_signal_pending(current)) {
+ await_bio_chain(bio);
+ return -EINTR;
+ }
}
*biop = bio;
@@ -190,6 +215,10 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev,
break;
}
cond_resched();
+ if (fatal_signal_pending(current)) {
+ await_bio_chain(bio);
+ return -EINTR;
+ }
}
*biop = bio;
@@ -280,7 +309,7 @@ retry:
bio_put(bio);
}
blk_finish_plug(&plug);
- if (ret && try_write_zeroes) {
+ if (ret && ret != -EINTR && try_write_zeroes) {
if (!(flags & BLKDEV_ZERO_NOFALLBACK)) {
try_write_zeroes = false;
goto retry;
@@ -322,7 +351,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
return -EPERM;
blk_start_plug(&plug);
- for (;;) {
+ while (nr_sects) {
unsigned int len = min_t(sector_t, nr_sects, max_sectors);
bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp);
@@ -331,12 +360,17 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
sector += len;
nr_sects -= len;
- if (!nr_sects) {
- ret = submit_bio_wait(bio);
- bio_put(bio);
+ cond_resched();
+ if (fatal_signal_pending(current)) {
+ await_bio_chain(bio);
+ ret = -EINTR;
+ bio = NULL;
break;
}
- cond_resched();
+ }
+ if (bio) {
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
}
blk_finish_plug(&plug);
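The blk-lib.c changes above implement the "terminate long running discard loop" item: each bio-building loop now checks fatal_signal_pending(), drains the already-submitted chain via await_bio_chain(), and bails out with -EINTR. From a caller's point of view the variables here are illustrative, but the error code is the one the hunks introduce:

        /* e.g. in an ioctl path: a task killed mid-discard no longer has to
         * wait for the whole range to be built and submitted. */
        int err = blkdev_issue_discard(bdev, start_sector, nr_sectors, GFP_KERNEL);

        if (err == -EINTR)
                return err;     /* chain drained; remaining sectors untouched */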
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 9c475a89e3af..555ada922cf0 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -21,7 +21,6 @@
#include <linux/llist.h>
#include <linux/cpu.h>
#include <linux/cache.h>
-#include <linux/sched/sysctl.h>
#include <linux/sched/topology.h>
#include <linux/sched/signal.h>
#include <linux/delay.h>
@@ -322,7 +321,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = BLK_MQ_NO_TAG;
- rq->start_time_ns = ktime_get_ns();
+ rq->start_time_ns = blk_time_get_ns();
rq->part = NULL;
blk_crypto_rq_set_defaults(rq);
}
@@ -332,7 +331,7 @@ EXPORT_SYMBOL(blk_rq_init);
static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
if (blk_mq_need_time_stamp(rq))
- rq->start_time_ns = ktime_get_ns();
+ rq->start_time_ns = blk_time_get_ns();
else
rq->start_time_ns = 0;
@@ -443,7 +442,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
- alloc_time_ns = ktime_get_ns();
+ alloc_time_ns = blk_time_get_ns();
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -628,7 +627,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
/* alloc_time includes depth and tag waits */
if (blk_queue_rq_alloc_time(q))
- alloc_time_ns = ktime_get_ns();
+ alloc_time_ns = blk_time_get_ns();
/*
* If the tag allocator sleeps we could get an allocation for a
@@ -1041,7 +1040,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
{
if (blk_mq_need_time_stamp(rq))
- __blk_mq_end_request_acct(rq, ktime_get_ns());
+ __blk_mq_end_request_acct(rq, blk_time_get_ns());
blk_mq_finish_request(rq);
@@ -1084,7 +1083,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob)
u64 now = 0;
if (iob->need_ts)
- now = ktime_get_ns();
+ now = blk_time_get_ns();
while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
prefetch(rq->bio);
@@ -1167,10 +1166,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq)
if (force_irqthreads())
return false;
- /* same CPU or cache domain? Complete locally */
+ /* same CPU or cache domain and capacity? Complete locally */
if (cpu == rq->mq_ctx->cpu ||
(!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
- cpus_share_cache(cpu, rq->mq_ctx->cpu)))
+ cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
+ cpus_equal_capacity(cpu, rq->mq_ctx->cpu)))
return false;
/* don't try to IPI to an offline CPU */
@@ -1254,7 +1254,7 @@ void blk_mq_start_request(struct request *rq)
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
!blk_rq_is_passthrough(rq)) {
- rq->io_start_time_ns = ktime_get_ns();
+ rq->io_start_time_ns = blk_time_get_ns();
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq);
@@ -1409,22 +1409,10 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head)
blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
blk_mq_run_hw_queue(hctx, false);
- if (blk_rq_is_poll(rq)) {
+ if (blk_rq_is_poll(rq))
blk_rq_poll_completion(rq, &wait.done);
- } else {
- /*
- * Prevent hang_check timer from firing at us during very long
- * I/O
- */
- unsigned long hang_check = sysctl_hung_task_timeout_secs;
-
- if (hang_check)
- while (!wait_for_completion_io_timeout(&wait.done,
- hang_check * (HZ/2)))
- ;
- else
- wait_for_completion_io(&wait.done);
- }
+ else
+ blk_wait_io(&wait.done);
return wait.ret;
}
@@ -2892,9 +2880,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
};
struct request *rq;
- if (blk_mq_attempt_bio_merge(q, bio, nsegs))
- return NULL;
-
rq_qos_throttle(q, bio);
if (plug) {
@@ -2913,22 +2898,31 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
}
/*
- * Check if we can use the passed on request for submitting the passed in bio,
- * and remove it from the request list if it can be used.
+ * Check if there is a suitable cached request and return it.
*/
-static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
- struct bio *bio)
+static struct request *blk_mq_peek_cached_request(struct blk_plug *plug,
+ struct request_queue *q, blk_opf_t opf)
{
- enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
- enum hctx_type hctx_type = rq->mq_hctx->type;
+ enum hctx_type type = blk_mq_get_hctx_type(opf);
+ struct request *rq;
- WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+ if (!plug)
+ return NULL;
+ rq = rq_list_peek(&plug->cached_rq);
+ if (!rq || rq->q != q)
+ return NULL;
+ if (type != rq->mq_hctx->type &&
+ (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT))
+ return NULL;
+ if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
+ return NULL;
+ return rq;
+}
- if (type != hctx_type &&
- !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT))
- return false;
- if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
- return false;
+static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
+ struct bio *bio)
+{
+ WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
/*
* If any qos ->throttle() end up blocking, we will have flushed the
@@ -2941,7 +2935,6 @@ static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
blk_mq_rq_time_init(rq, 0);
rq->cmd_flags = bio->bi_opf;
INIT_LIST_HEAD(&rq->queuelist);
- return true;
}
/**
@@ -2963,50 +2956,43 @@ void blk_mq_submit_bio(struct bio *bio)
struct blk_plug *plug = blk_mq_plug(bio);
const int is_sync = op_is_sync(bio->bi_opf);
struct blk_mq_hw_ctx *hctx;
- struct request *rq = NULL;
unsigned int nr_segs = 1;
+ struct request *rq;
blk_status_t ret;
bio = blk_queue_bounce(bio, q);
- if (plug) {
- rq = rq_list_peek(&plug->cached_rq);
- if (rq && rq->q != q)
- rq = NULL;
- }
- if (rq) {
- if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
- bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
- if (!bio)
- return;
- }
- if (!bio_integrity_prep(bio))
- return;
- if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
- return;
- if (blk_mq_use_cached_rq(rq, plug, bio))
- goto done;
- percpu_ref_get(&q->q_usage_counter);
- } else {
+ /*
+ * If the plug has a cached request for this queue, try to use it.
+ *
+ * The cached request already holds a q_usage_counter reference and we
+ * don't have to acquire a new one if we use it.
+ */
+ rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf);
+ if (!rq) {
if (unlikely(bio_queue_enter(bio)))
return;
- if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
- bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
- if (!bio)
- goto fail;
- }
- if (!bio_integrity_prep(bio))
- goto fail;
}
- rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
- if (unlikely(!rq)) {
-fail:
- blk_queue_exit(q);
- return;
+ if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ goto queue_exit;
+ }
+ if (!bio_integrity_prep(bio))
+ goto queue_exit;
+
+ if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
+ goto queue_exit;
+
+ if (!rq) {
+ rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
+ if (unlikely(!rq))
+ goto queue_exit;
+ } else {
+ blk_mq_use_cached_rq(rq, plug, bio);
}
-done:
trace_block_getrq(bio);
rq_qos_track(q, rq, bio);
@@ -3037,6 +3023,15 @@ done:
} else {
blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
}
+ return;
+
+queue_exit:
+ /*
+ * Don't drop the queue reference if we were trying to use a cached
+ * request and thus didn't acquire one.
+ */
+ if (!rq)
+ blk_queue_exit(q);
}
#ifdef CONFIG_BLK_MQ_STACKING
@@ -3098,7 +3093,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
blk_mq_run_dispatch_ops(q,
ret = blk_mq_request_issue_directly(rq, true));
if (ret)
- blk_account_io_done(rq, ktime_get_ns());
+ blk_account_io_done(rq, blk_time_get_ns());
return ret;
}
EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
@@ -4078,15 +4073,16 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q);
}
-static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
- void *queuedata)
+struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
+ struct queue_limits *lim, void *queuedata)
{
+ struct queue_limits default_lim = { };
struct request_queue *q;
int ret;
- q = blk_alloc_queue(set->numa_node);
- if (!q)
- return ERR_PTR(-ENOMEM);
+ q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node);
+ if (IS_ERR(q))
+ return q;
q->queuedata = queuedata;
ret = blk_mq_init_allocated_queue(set, q);
if (ret) {
@@ -4095,20 +4091,15 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
}
return q;
}
-
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
-{
- return blk_mq_init_queue_data(set, NULL);
-}
-EXPORT_SYMBOL(blk_mq_init_queue);
+EXPORT_SYMBOL(blk_mq_alloc_queue);
/**
* blk_mq_destroy_queue - shutdown a request queue
* @q: request queue to shutdown
*
- * This shuts down a request queue allocated by blk_mq_init_queue(). All future
+ * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future
* requests will be failed with -ENODEV. The caller is responsible for dropping
- * the reference from blk_mq_init_queue() by calling blk_put_queue().
+ * the reference from blk_mq_alloc_queue() by calling blk_put_queue().
*
* Context: can sleep
*/
@@ -4129,13 +4120,14 @@ void blk_mq_destroy_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_mq_destroy_queue);
-struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
+struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
+ struct queue_limits *lim, void *queuedata,
struct lock_class_key *lkclass)
{
struct request_queue *q;
struct gendisk *disk;
- q = blk_mq_init_queue_data(set, queuedata);
+ q = blk_mq_alloc_queue(set, lim, queuedata);
if (IS_ERR(q))
return ERR_CAST(q);
@@ -4389,7 +4381,7 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
if (set->nr_maps == 1)
set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
- if (set->ops->map_queues && !is_kdump_kernel()) {
+ if (set->ops->map_queues) {
int i;
/*
@@ -4488,14 +4480,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
/*
* If a crashdump is active, then we are potentially in a very
- * memory constrained environment. Limit us to 1 queue and
- * 64 tags to prevent using too much memory.
+ * memory constrained environment. Limit us to 64 tags to prevent
+ * using too much memory.
*/
- if (is_kdump_kernel()) {
- set->nr_hw_queues = 1;
- set->nr_maps = 1;
+ if (is_kdump_kernel())
set->queue_depth = min(64U, set->queue_depth);
- }
+
/*
* There is no use for more h/w queues than cpus if we just have
* a single map
@@ -4525,7 +4515,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
GFP_KERNEL, set->numa_node);
if (!set->map[i].mq_map)
goto out_free_mq_map;
- set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
+ set->map[i].nr_queues = set->nr_hw_queues;
}
blk_mq_update_queue_map(set);
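For queues without a gendisk (NVMe admin and fabrics connect queues and the like), blk_mq_init_queue() is folded into the exported blk_mq_alloc_queue() above, which takes an optional limits argument. Migration is mechanical; a sketch:

        /* Before: q = blk_mq_init_queue(set);  (no limits argument)
         * After:  pass NULL to accept validated default limits. */
        struct request_queue *q = blk_mq_alloc_queue(set, NULL, NULL);

        if (IS_ERR(q))
                return PTR_ERR(q);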
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 06ea91e51b8b..3c7d8d638ab5 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -26,52 +26,21 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
/**
- * blk_set_default_limits - reset limits to default values
- * @lim: the queue_limits structure to reset
- *
- * Description:
- * Returns a queue_limit struct to its default state.
- */
-void blk_set_default_limits(struct queue_limits *lim)
-{
- lim->max_segments = BLK_MAX_SEGMENTS;
- lim->max_discard_segments = 1;
- lim->max_integrity_segments = 0;
- lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
- lim->virt_boundary_mask = 0;
- lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
- lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
- lim->max_user_sectors = lim->max_dev_sectors = 0;
- lim->chunk_sectors = 0;
- lim->max_write_zeroes_sectors = 0;
- lim->max_zone_append_sectors = 0;
- lim->max_discard_sectors = 0;
- lim->max_hw_discard_sectors = 0;
- lim->max_secure_erase_sectors = 0;
- lim->discard_granularity = 512;
- lim->discard_alignment = 0;
- lim->discard_misaligned = 0;
- lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
- lim->bounce = BLK_BOUNCE_NONE;
- lim->alignment_offset = 0;
- lim->io_opt = 0;
- lim->misaligned = 0;
- lim->zoned = false;
- lim->zone_write_granularity = 0;
- lim->dma_alignment = 511;
-}
-
-/**
* blk_set_stacking_limits - set default limits for stacking devices
* @lim: the queue_limits structure to reset
*
- * Description:
- * Returns a queue_limit struct to its default state. Should be used
- * by stacking drivers like DM that have no internal limits.
+ * Prepare queue limits for applying limits from underlying devices using
+ * blk_stack_limits().
*/
void blk_set_stacking_limits(struct queue_limits *lim)
{
- blk_set_default_limits(lim);
+ memset(lim, 0, sizeof(*lim));
+ lim->logical_block_size = SECTOR_SIZE;
+ lim->physical_block_size = SECTOR_SIZE;
+ lim->io_min = SECTOR_SIZE;
+ lim->discard_granularity = SECTOR_SIZE;
+ lim->dma_alignment = SECTOR_SIZE - 1;
+ lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
/* Inherit limits from component devices */
lim->max_segments = USHRT_MAX;
@@ -82,9 +51,239 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX;
+ lim->max_user_discard_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
+static void blk_apply_bdi_limits(struct backing_dev_info *bdi,
+ struct queue_limits *lim)
+{
+ /*
+ * For read-ahead of large files to be effective, we need to read ahead
+ * at least twice the optimal I/O size.
+ */
+ bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
+ bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT;
+}
+
+static int blk_validate_zoned_limits(struct queue_limits *lim)
+{
+ if (!lim->zoned) {
+ if (WARN_ON_ONCE(lim->max_open_zones) ||
+ WARN_ON_ONCE(lim->max_active_zones) ||
+ WARN_ON_ONCE(lim->zone_write_granularity) ||
+ WARN_ON_ONCE(lim->max_zone_append_sectors))
+ return -EINVAL;
+ return 0;
+ }
+
+ if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)))
+ return -EINVAL;
+
+ if (lim->zone_write_granularity < lim->logical_block_size)
+ lim->zone_write_granularity = lim->logical_block_size;
+
+ if (lim->max_zone_append_sectors) {
+ /*
+ * The Zone Append size is limited by the maximum I/O size
+ * and the zone size given that it can't span zones.
+ */
+ lim->max_zone_append_sectors =
+ min3(lim->max_hw_sectors,
+ lim->max_zone_append_sectors,
+ lim->chunk_sectors);
+ }
+
+ return 0;
+}
+
+/*
+ * Check that the limits in lim are valid, initialize defaults for unset
+ * values, and cap values based on others where needed.
+ */
+static int blk_validate_limits(struct queue_limits *lim)
+{
+ unsigned int max_hw_sectors;
+
+ /*
+ * Unless otherwise specified, default to 512 byte logical blocks and a
+ * physical block size equal to the logical block size.
+ */
+ if (!lim->logical_block_size)
+ lim->logical_block_size = SECTOR_SIZE;
+ if (lim->physical_block_size < lim->logical_block_size)
+ lim->physical_block_size = lim->logical_block_size;
+
+ /*
+ * The minimum I/O size defaults to the physical block size unless
+ * explicitly overridden.
+ */
+ if (lim->io_min < lim->physical_block_size)
+ lim->io_min = lim->physical_block_size;
+
+ /*
+ * max_hw_sectors has a somewhat weird default for historical reasons,
+ * but drivers really should set their own instead of relying on this
+ * value.
+ *
+ * The block layer relies on the fact that every driver can
+ * handle at least a page worth of data per I/O, and needs the value
+ * aligned to the logical block size.
+ */
+ if (!lim->max_hw_sectors)
+ lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+ if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS))
+ return -EINVAL;
+ lim->max_hw_sectors = round_down(lim->max_hw_sectors,
+ lim->logical_block_size >> SECTOR_SHIFT);
+
+ /*
+ * The actual max_sectors value is a complex beast and also takes the
+ * max_dev_sectors value (set by SCSI ULPs) and a user configurable
+ * value into account. The ->max_sectors value is always calculated
+ * from these, so directly setting it won't have any effect.
+ */
+ max_hw_sectors = min_not_zero(lim->max_hw_sectors,
+ lim->max_dev_sectors);
+ if (lim->max_user_sectors) {
+ if (lim->max_user_sectors > max_hw_sectors ||
+ lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE)
+ return -EINVAL;
+ lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors);
+ } else {
+ lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP);
+ }
+ lim->max_sectors = round_down(lim->max_sectors,
+ lim->logical_block_size >> SECTOR_SHIFT);
+
+ /*
+ * Random default for the maximum number of segments. Drivers should not
+ * rely on this and should set their own.
+ */
+ if (!lim->max_segments)
+ lim->max_segments = BLK_MAX_SEGMENTS;
+
+ lim->max_discard_sectors =
+ min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors);
+
+ if (!lim->max_discard_segments)
+ lim->max_discard_segments = 1;
+
+ if (lim->discard_granularity < lim->physical_block_size)
+ lim->discard_granularity = lim->physical_block_size;
+
+ /*
+ * By default there is no limit on the segment boundary alignment,
+ * but if there is one it can't be smaller than the page size as
+ * that would break all the normal I/O patterns.
+ */
+ if (!lim->seg_boundary_mask)
+ lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+ if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1))
+ return -EINVAL;
+
+ /*
+ * Devices that require a virtual boundary do not support scatter/gather
+ * I/O natively, but instead require a descriptor list entry for each
+ * page (which might not be identical to the Linux PAGE_SIZE). Because
+ * of that they are not limited by our notion of "segment size".
+ */
+ if (lim->virt_boundary_mask) {
+ if (WARN_ON_ONCE(lim->max_segment_size &&
+ lim->max_segment_size != UINT_MAX))
+ return -EINVAL;
+ lim->max_segment_size = UINT_MAX;
+ } else {
+ /*
+ * The maximum segment size has an odd historic 64k default that
+ * drivers probably should override. Just like the I/O size we
+ * require drivers to at least handle a full page per segment.
+ */
+ if (!lim->max_segment_size)
+ lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
+ if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE))
+ return -EINVAL;
+ }
+
+ /*
+ * We require drivers to at least do logical block aligned I/O, but
+ * historically could not check for that due to the separate calls
+ * to set the limits. Once the transition is finished the check
+ * below should be narrowed down to check the logical block size.
+ */
+ if (!lim->dma_alignment)
+ lim->dma_alignment = SECTOR_SIZE - 1;
+ if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE))
+ return -EINVAL;
+
+ if (lim->alignment_offset) {
+ lim->alignment_offset &= (lim->physical_block_size - 1);
+ lim->misaligned = 0;
+ }
+
+ return blk_validate_zoned_limits(lim);
+}
+
+/*
+ * Set the default limits for a newly allocated queue. @lim contains the
+ * initial limits set by the driver, which could specify no limits, in
+ * which case all fields are cleared to zero.
+ */
+int blk_set_default_limits(struct queue_limits *lim)
+{
+ /*
+ * Most defaults are set by capping the bounds in blk_validate_limits,
+ * but max_user_discard_sectors is special and needs an explicit
+ * initialization to the max value here.
+ */
+ lim->max_user_discard_sectors = UINT_MAX;
+ return blk_validate_limits(lim);
+}
+
+/**
+ * queue_limits_commit_update - commit an atomic update of queue limits
+ * @q: queue to update
+ * @lim: limits to apply
+ *
+ * Apply the limits in @lim that were obtained from queue_limits_start_update()
+ * and updated by the caller to @q.
+ *
+ * Returns 0 if successful, else a negative error code.
+ */
+int queue_limits_commit_update(struct request_queue *q,
+ struct queue_limits *lim)
+ __releases(q->limits_lock)
+{
+ int error = blk_validate_limits(lim);
+
+ if (!error) {
+ q->limits = *lim;
+ if (q->disk)
+ blk_apply_bdi_limits(q->disk->bdi, lim);
+ }
+ mutex_unlock(&q->limits_lock);
+ return error;
+}
+EXPORT_SYMBOL_GPL(queue_limits_commit_update);
+
+/**
+ * queue_limits_set - apply queue limits to queue
+ * @q: queue to update
+ * @lim: limits to apply
+ *
+ * Apply the limits in @lim that were freshly initialized to @q.
+ * To update existing limits use queue_limits_start_update() and
+ * queue_limits_commit_update() instead.
+ *
+ * Returns 0 if successful, else a negative error code.
+ */
+int queue_limits_set(struct request_queue *q, struct queue_limits *lim)
+{
+ mutex_lock(&q->limits_lock);
+ return queue_limits_commit_update(q, lim);
+}
+EXPORT_SYMBOL_GPL(queue_limits_set);
+
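The new API follows a read-modify-write pattern: queue_limits_start_update() takes q->limits_lock and returns a snapshot of the current limits, the caller adjusts the copy, and queue_limits_commit_update() validates and publishes it before dropping the lock. A minimal driver-side sketch (the helper name and values are illustrative, not part of this series):

	static int example_update_block_size(struct request_queue *q,
			unsigned int bsize)
	{
		struct queue_limits lim;

		lim = queue_limits_start_update(q);
		lim.logical_block_size = bsize;
		lim.physical_block_size = bsize;
		lim.io_min = bsize;
		/* Validates, applies and unlocks; 0 or a negative errno. */
		return queue_limits_commit_update(q, &lim);
	}

The loop driver conversion later in this series uses exactly this shape in loop_reconfigure_limits().
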
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
@@ -177,8 +376,11 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors);
void blk_queue_max_discard_sectors(struct request_queue *q,
unsigned int max_discard_sectors)
{
- q->limits.max_hw_discard_sectors = max_discard_sectors;
- q->limits.max_discard_sectors = max_discard_sectors;
+ struct queue_limits *lim = &q->limits;
+
+ lim->max_hw_discard_sectors = max_discard_sectors;
+ lim->max_discard_sectors =
+ min(max_discard_sectors, lim->max_user_discard_sectors);
}
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
@@ -393,15 +595,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset);
void disk_update_readahead(struct gendisk *disk)
{
- struct request_queue *q = disk->queue;
-
- /*
- * For read-ahead of large files to be effective, we need to read ahead
- * at least twice the optimal I/O size.
- */
- disk->bdi->ra_pages =
- max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
- disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9);
+ blk_apply_bdi_limits(disk->bdi, &disk->queue->limits);
}
EXPORT_SYMBOL_GPL(disk_update_readahead);
@@ -689,33 +883,38 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity);
t->zoned = max(t->zoned, b->zoned);
+ if (!t->zoned) {
+ t->zone_write_granularity = 0;
+ t->max_zone_append_sectors = 0;
+ }
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
/**
- * disk_stack_limits - adjust queue limits for stacked drivers
- * @disk: MD/DM gendisk (top)
+ * queue_limits_stack_bdev - adjust queue_limits for stacked devices
+ * @t: the stacking driver limits (top device)
* @bdev: the underlying block device (bottom)
* @offset: offset to beginning of data within component device
+ * @pfx: prefix to use for warnings logged
*
* Description:
- * Merges the limits for a top level gendisk and a bottom level
- * block_device.
+ * This function is used by stacking drivers like MD and DM to ensure
+ * that all component devices have compatible block sizes and
+ * alignments. The stacking driver must provide a queue_limits
+ * struct (top) and then iteratively call the stacking function for
+ * all component (bottom) devices. The stacking function will
+ * attempt to combine the values and ensure proper alignment.
*/
-void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
- sector_t offset)
+void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
+ sector_t offset, const char *pfx)
{
- struct request_queue *t = disk->queue;
-
- if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits,
- get_start_sect(bdev) + (offset >> 9)) < 0)
+ if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits,
+ get_start_sect(bdev) + offset))
pr_notice("%s: Warning: Device %pg is misaligned\n",
- disk->disk_name, bdev);
-
- disk_update_readahead(disk);
+ pfx, bdev);
}
-EXPORT_SYMBOL(disk_stack_limits);
+EXPORT_SYMBOL_GPL(queue_limits_stack_bdev);
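
A hedged sketch of the iteration the description implies (the helper and device array are hypothetical):

	static int example_stack(struct request_queue *q,
			struct block_device **parts, int nr_parts)
	{
		struct queue_limits lim;
		int i;

		blk_set_stacking_limits(&lim);
		for (i = 0; i < nr_parts; i++)
			queue_limits_stack_bdev(&lim, parts[i], 0, "example");
		return queue_limits_set(q, &lim);
	}
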
/**
* blk_queue_update_dma_pad - update pad mask
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 7ff76ae6c76a..e42c263e53fb 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -27,7 +27,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat)
/* src is a per-cpu stat, mean isn't initialized */
void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
- if (!src->nr_samples)
+ if (dst->nr_samples + src->nr_samples <= dst->nr_samples)
return;
dst->min = min(dst->min, src->min);
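
The rewritten check catches two cases at once: src->nr_samples == 0 and a u32 wrap of the sum, either of which would feed a zero or wrapped divisor to the later mean calculation. As a standalone predicate (illustrative only):

	static bool example_sum_invalid(u32 dst_samples, u32 src_samples)
	{
		/* True when src is empty or dst + src overflows u32. */
		return dst_samples + src_samples <= dst_samples;
	}
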
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 6b2429cad81a..8c8f69d8ba48 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -174,23 +174,29 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page)
static ssize_t queue_discard_max_store(struct request_queue *q,
const char *page, size_t count)
{
- unsigned long max_discard;
- ssize_t ret = queue_var_store(&max_discard, page, count);
+ unsigned long max_discard_bytes;
+ struct queue_limits lim;
+ ssize_t ret;
+ int err;
+ ret = queue_var_store(&max_discard_bytes, page, count);
if (ret < 0)
return ret;
- if (max_discard & (q->limits.discard_granularity - 1))
+ if (max_discard_bytes & (q->limits.discard_granularity - 1))
return -EINVAL;
- max_discard >>= 9;
- if (max_discard > UINT_MAX)
+ if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX)
return -EINVAL;
- if (max_discard > q->limits.max_hw_discard_sectors)
- max_discard = q->limits.max_hw_discard_sectors;
+ blk_mq_freeze_queue(q);
+ lim = queue_limits_start_update(q);
+ lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT;
+ err = queue_limits_commit_update(q, &lim);
+ blk_mq_unfreeze_queue(q);
- q->limits.max_discard_sectors = max_discard;
+ if (err)
+ return err;
return ret;
}
@@ -226,35 +232,22 @@ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
- unsigned long var;
- unsigned int max_sectors_kb,
- max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
- page_kb = 1 << (PAGE_SHIFT - 10);
- ssize_t ret = queue_var_store(&var, page, count);
+ unsigned long max_sectors_kb;
+ struct queue_limits lim;
+ ssize_t ret;
+ int err;
+ ret = queue_var_store(&max_sectors_kb, page, count);
if (ret < 0)
return ret;
- max_sectors_kb = (unsigned int)var;
- max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb,
- q->limits.max_dev_sectors >> 1);
- if (max_sectors_kb == 0) {
- q->limits.max_user_sectors = 0;
- max_sectors_kb = min(max_hw_sectors_kb,
- BLK_DEF_MAX_SECTORS_CAP >> 1);
- } else {
- if (max_sectors_kb > max_hw_sectors_kb ||
- max_sectors_kb < page_kb)
- return -EINVAL;
- q->limits.max_user_sectors = max_sectors_kb << 1;
- }
-
- spin_lock_irq(&q->queue_lock);
- q->limits.max_sectors = max_sectors_kb << 1;
- if (q->disk)
- q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
- spin_unlock_irq(&q->queue_lock);
-
+ blk_mq_freeze_queue(q);
+ lim = queue_limits_start_update(q);
+ lim.max_user_sectors = max_sectors_kb << 1;
+ err = queue_limits_commit_update(q, &lim);
+ blk_mq_unfreeze_queue(q);
+ if (err)
+ return err;
return ret;
}
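
Both sysfs stores now share one shape: freeze the queue, take a limits snapshot, set only the max_user_* field, and commit; the freeze drains in-flight requests so the new limits only apply to subsequent I/O. A condensed sketch (hypothetical helper):

	static int example_store_user_limit(struct request_queue *q,
			unsigned int sectors)
	{
		struct queue_limits lim;
		int err;

		blk_mq_freeze_queue(q);
		lim = queue_limits_start_update(q);
		lim.max_user_sectors = sectors;
		err = queue_limits_commit_update(q, &lim);
		blk_mq_unfreeze_queue(q);
		return err;
	}
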
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 16f5766620a4..f4850a6f860b 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -1098,7 +1098,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
tg_may_dispatch(tg, bio, NULL)) {
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
+ tg_dispatch_one_bio(tg, READ);
nr_reads++;
if (nr_reads >= max_nr_reads)
@@ -1108,7 +1108,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
tg_may_dispatch(tg, bio, NULL)) {
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
+ tg_dispatch_one_bio(tg, WRITE);
nr_writes++;
if (nr_writes >= max_nr_writes)
@@ -1815,7 +1815,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
ret = tg->latency_target == DFL_LATENCY_TARGET ||
tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
- (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
+ (blk_time_get_ns() >> 10) - tg->last_finish_time > time ||
tg->avg_idletime > tg->idletime_threshold ||
(tg->latency_target && tg->bio_cnt &&
tg->bad_bio_cnt * 5 < tg->bio_cnt);
@@ -2060,7 +2060,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
if (last_finish_time == 0)
return;
- now = ktime_get_ns() >> 10;
+ now = blk_time_get_ns() >> 10;
if (now <= last_finish_time ||
last_finish_time == tg->checked_last_finish_time)
return;
@@ -2327,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio)
if (!tg->td->limit_valid[LIMIT_LOW])
return;
- finish_time_ns = ktime_get_ns();
+ finish_time_ns = blk_time_get_ns();
tg->last_finish_time = finish_time_ns >> 10;
start_time = bio_issue_time(&bio->bi_issue) >> 10;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0c0e270a8265..64472134dd26 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -29,6 +29,7 @@
#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"
+#include "blk.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
@@ -274,13 +275,12 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat)
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
- u64 now, issue = READ_ONCE(rwb->sync_issue);
+ u64 issue = READ_ONCE(rwb->sync_issue);
if (!issue || !rwb->sync_cookie)
return 0;
- now = ktime_to_ns(ktime_get());
- return now - issue;
+ return blk_time_get_ns() - issue;
}
static inline unsigned int wbt_inflight(struct rq_wb *rwb)
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index d343e5756a9c..da0f4b2a8fa0 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -11,7 +11,6 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/mm.h>
@@ -177,8 +176,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
}
}
-static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
- gfp_t gfp_mask)
+static int blkdev_zone_reset_all_emulated(struct block_device *bdev)
{
struct gendisk *disk = bdev->bd_disk;
sector_t capacity = bdev_nr_sectors(bdev);
@@ -205,7 +203,7 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
}
bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
- gfp_mask);
+ GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
@@ -223,7 +221,7 @@ out_free_need_reset:
return ret;
}
-static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
+static int blkdev_zone_reset_all(struct block_device *bdev)
{
struct bio bio;
@@ -238,7 +236,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* @sector: Start sector of the first zone to operate on
* @nr_sectors: Number of sectors, should be at least the length of one zone and
* must be zone size aligned.
- * @gfp_mask: Memory allocation flags (for bio_alloc)
*
* Description:
* Perform the specified operation on the range of zones specified by
@@ -248,7 +245,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
* or finish request.
*/
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
- sector_t sector, sector_t nr_sectors, gfp_t gfp_mask)
+ sector_t sector, sector_t nr_sectors)
{
struct request_queue *q = bdev_get_queue(bdev);
sector_t zone_sectors = bdev_zone_sectors(bdev);
@@ -285,12 +282,12 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
*/
if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
if (!blk_queue_zone_resetall(q))
- return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
- return blkdev_zone_reset_all(bdev, gfp_mask);
+ return blkdev_zone_reset_all_emulated(bdev);
+ return blkdev_zone_reset_all(bdev);
}
while (sector < end_sector) {
- bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
@@ -419,8 +416,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
return -ENOTTY;
}
- ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
- GFP_KERNEL);
+ ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors);
fail:
if (cmd == BLKRESETZONE)
diff --git a/block/blk.h b/block/blk.h
index f02b25f22e8b..a19b7b42e650 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,6 +4,8 @@
#include <linux/blk-crypto.h>
#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
+#include <linux/sched/sysctl.h>
+#include <linux/timekeeping.h>
#include <xen/xen.h>
#include "blk-crypto-internal.h"
@@ -70,6 +72,18 @@ static inline int bio_queue_enter(struct bio *bio)
return __bio_queue_enter(q, bio);
}
+static inline void blk_wait_io(struct completion *done)
+{
+ /* Prevent hang_check timer from firing at us during very long I/O */
+ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
+
+ if (timeout)
+ while (!wait_for_completion_io_timeout(done, timeout))
+ ;
+ else
+ wait_for_completion_io(done);
+}
+
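A hedged usage sketch (both helpers are hypothetical): the submitter parks on a completion, and blk_wait_io() re-arms the wait every half hung-task period so multi-minute discards or zone operations do not trip the detector.

	static void example_end_io(struct bio *bio)
	{
		complete(bio->bi_private);
	}

	static void example_submit_and_wait(struct bio *bio)
	{
		DECLARE_COMPLETION_ONSTACK(done);

		bio->bi_private = &done;
		bio->bi_end_io = example_end_io;
		submit_bio(bio);
		blk_wait_io(&done);	/* chunked wait_for_completion_io() */
	}
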
#define BIO_INLINE_VECS 4
struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
gfp_t gfp_mask);
@@ -329,7 +343,7 @@ void blk_rq_set_mixed_merge(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
-void blk_set_default_limits(struct queue_limits *lim);
+int blk_set_default_limits(struct queue_limits *lim);
int blk_dev_init(void);
/*
@@ -447,7 +461,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page)
unpin_user_page(page);
}
-struct request_queue *blk_alloc_queue(int node_id);
+struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id);
int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
@@ -516,8 +530,75 @@ static inline int req_ref_read(struct request *req)
return atomic_read(&req->ref);
}
+static inline u64 blk_time_get_ns(void)
+{
+ struct blk_plug *plug = current->plug;
+
+ if (!plug)
+ return ktime_get_ns();
+
+ /*
+ * 0 could very well be a valid time, but rather than flag "this is
+ * a valid timestamp" separately, just accept that we'll do an extra
+ * ktime_get_ns() if we just happen to get 0 as the current time.
+ */
+ if (!plug->cur_ktime) {
+ plug->cur_ktime = ktime_get_ns();
+ current->flags |= PF_BLOCK_TS;
+ }
+ return plug->cur_ktime;
+}
+
+static inline ktime_t blk_time_get(void)
+{
+ return ns_to_ktime(blk_time_get_ns());
+}
+
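Illustratively, two calls under one plug return the same cached value, assuming nothing flushes the plug in between (blk_start_plug()/blk_finish_plug() are the long-standing plugging API, not new here):

	static void example_cached_time(void)
	{
		struct blk_plug plug;
		u64 t1, t2;

		blk_start_plug(&plug);
		t1 = blk_time_get_ns();		/* does one real ktime_get_ns() */
		t2 = blk_time_get_ns();		/* served from plug->cur_ktime */
		WARN_ON(t1 != t2);
		blk_finish_plug(&plug);
	}
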
+/*
+ * From most significant bit:
+ * 1 bit: reserved for other usage, see below
+ * 12 bits: original size of bio
+ * 51 bits: issue time of bio
+ */
+#define BIO_ISSUE_RES_BITS 1
+#define BIO_ISSUE_SIZE_BITS 12
+#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS)
+#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
+#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
+#define BIO_ISSUE_SIZE_MASK \
+ (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
+#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
+
+/* Reserved bit for blk-throtl */
+#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
+
+static inline u64 __bio_issue_time(u64 time)
+{
+ return time & BIO_ISSUE_TIME_MASK;
+}
+
+static inline u64 bio_issue_time(struct bio_issue *issue)
+{
+ return __bio_issue_time(issue->value);
+}
+
+static inline sector_t bio_issue_size(struct bio_issue *issue)
+{
+ return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
+}
+
+static inline void bio_issue_init(struct bio_issue *issue,
+ sector_t size)
+{
+ size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
+ issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
+ (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) |
+ ((u64)size << BIO_ISSUE_SIZE_SHIFT));
+}
+
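A small round-trip check of the packing (illustrative: 2048 fits in the 12 size bits, and the reserved top bit survives re-initialization):

	static void example_bio_issue(void)
	{
		struct bio_issue issue = {
			.value = BIO_ISSUE_THROTL_SKIP_LATENCY,
		};

		bio_issue_init(&issue, 2048);
		WARN_ON(bio_issue_size(&issue) != 2048);
		WARN_ON(!(issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY));
		/* bio_issue_time() returns the low 51 bits of the init time */
	}
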
void bdev_release(struct file *bdev_file);
int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder,
const struct blk_holder_ops *hops, struct file *bdev_file);
int bdev_permission(dev_t dev, blk_mode_t mode, void *holder);
+
#endif /* BLK_INTERNAL_H */
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index b3acdbdb6e7e..bcc7dee6abce 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -383,7 +383,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
if (blk_mq_alloc_tag_set(set))
goto out_tag_set;
- q = blk_mq_init_queue(set);
+ q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(q)) {
ret = PTR_ERR(q);
goto out_queue;
diff --git a/block/genhd.c b/block/genhd.c
index a911d2969c07..bb29a68e1d67 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1201,7 +1201,7 @@ static int block_uevent(const struct device *dev, struct kobj_uevent_env *env)
return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}
-struct class block_class = {
+const struct class block_class = {
.name = "block",
.dev_uevent = block_uevent,
};
@@ -1391,19 +1391,21 @@ out_free_disk:
return NULL;
}
-struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
+struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
+ struct lock_class_key *lkclass)
{
+ struct queue_limits default_lim = { };
struct request_queue *q;
struct gendisk *disk;
- q = blk_alloc_queue(node);
- if (!q)
- return NULL;
+ q = blk_alloc_queue(lim ? lim : &default_lim, node);
+ if (IS_ERR(q))
+ return ERR_CAST(q);
disk = __alloc_disk_node(q, node, lkclass);
if (!disk) {
blk_put_queue(q);
- return NULL;
+ return ERR_PTR(-ENOMEM);
}
set_bit(GD_OWNS_QUEUE, &disk->state);
return disk;
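
Callers now pass limits at allocation time and must switch from NULL checks to the ERR_PTR convention, as the driver conversions below show. A minimal sketch (values illustrative):

	static struct gendisk *example_alloc_disk(void)
	{
		struct queue_limits lim = {
			.logical_block_size	= 4096,
			.physical_block_size	= 4096,
		};

		/* Returns an ERR_PTR() on failure, never NULL. */
		return blk_alloc_disk(&lim, NUMA_NO_NODE);
	}
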
diff --git a/block/holder.c b/block/holder.c
index 37d18c13d958..791091a7eac2 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -8,6 +8,8 @@ struct bd_holder_disk {
int refcnt;
};
+static DEFINE_MUTEX(blk_holder_mutex);
+
static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
struct gendisk *disk)
{
@@ -80,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
kobject_get(bdev->bd_holder_dir);
mutex_unlock(&bdev->bd_disk->open_mutex);
- mutex_lock(&disk->open_mutex);
+ mutex_lock(&blk_holder_mutex);
WARN_ON_ONCE(!bdev->bd_holder);
holder = bd_find_holder_disk(bdev, disk);
@@ -108,7 +110,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
goto out_del_symlink;
list_add(&holder->list, &disk->slave_bdevs);
- mutex_unlock(&disk->open_mutex);
+ mutex_unlock(&blk_holder_mutex);
return 0;
out_del_symlink:
@@ -116,7 +118,7 @@ out_del_symlink:
out_free_holder:
kfree(holder);
out_unlock:
- mutex_unlock(&disk->open_mutex);
+ mutex_unlock(&blk_holder_mutex);
if (ret)
kobject_put(bdev->bd_holder_dir);
return ret;
@@ -140,7 +142,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
if (WARN_ON_ONCE(!disk->slave_dir))
return;
- mutex_lock(&disk->open_mutex);
+ mutex_lock(&blk_holder_mutex);
holder = bd_find_holder_disk(bdev, disk);
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
del_symlink(disk->slave_dir, bdev_kobj(bdev));
@@ -149,6 +151,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
list_del_init(&holder->list);
kfree(holder);
}
- mutex_unlock(&disk->open_mutex);
+ mutex_unlock(&blk_holder_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
diff --git a/block/ioctl.c b/block/ioctl.c
index 4c8aebee595f..0c76137adcaa 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -18,7 +18,7 @@ static int blkpg_do_ioctl(struct block_device *bdev,
{
struct gendisk *disk = bdev->bd_disk;
struct blkpg_partition p;
- sector_t start, length;
+ sector_t start, length, capacity, end;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -41,6 +41,13 @@ static int blkpg_do_ioctl(struct block_device *bdev,
start = p.start >> SECTOR_SHIFT;
length = p.length >> SECTOR_SHIFT;
+ capacity = get_capacity(disk);
+
+ if (check_add_overflow(start, length, &end))
+ return -EINVAL;
+
+ if (start >= capacity || end > capacity)
+ return -EINVAL;
switch (op) {
case BLKPG_ADD_PARTITION:
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 5f5ed5c75f04..b11e88c82c8c 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -419,21 +419,10 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start,
int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
sector_t length)
{
- sector_t capacity = get_capacity(disk), end;
struct block_device *part;
int ret;
mutex_lock(&disk->open_mutex);
- if (check_add_overflow(start, length, &end)) {
- ret = -EINVAL;
- goto out;
- }
-
- if (start >= capacity || end > capacity) {
- ret = -EINVAL;
- goto out;
- }
-
if (!disk_live(disk)) {
ret = -ENXIO;
goto out;
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index 7b521df00a39..c80183156d68 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -20,6 +20,7 @@ extern void note_bootable_part(dev_t dev, int part, int goodness);
* Code to understand MacOS partition tables.
*/
+#ifdef CONFIG_PPC_PMAC
static inline void mac_fix_string(char *stg, int len)
{
int i;
@@ -27,6 +28,7 @@ static inline void mac_fix_string(char *stg, int len)
for (i = len - 1; i >= 0 && stg[i] == ' '; i--)
stg[i] = 0;
}
+#endif
int mac_partition(struct parsed_partitions *state)
{
diff --git a/block/sed-opal.c b/block/sed-opal.c
index fa4dba5d8531..14fe0fef811c 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1212,7 +1212,7 @@ static int cmd_start(struct opal_dev *dev, const u8 *uid, const u8 *method)
static int start_opal_session_cont(struct opal_dev *dev)
{
u32 hsn, tsn;
- int error = 0;
+ int error;
error = parse_and_check_status(dev);
if (error)
@@ -1354,7 +1354,7 @@ static int get_active_key_cont(struct opal_dev *dev)
{
const char *activekey;
size_t keylen;
- int error = 0;
+ int error;
error = parse_and_check_status(dev);
if (error)
@@ -2157,7 +2157,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
u8 lr_buffer[OPAL_UID_LENGTH];
struct opal_lock_unlock *lkul = data;
u8 read_locked = 1, write_locked = 1;
- int err = 0;
+ int err;
if (build_locking_range(lr_buffer, sizeof(lr_buffer),
lkul->session.opal_key.lr) < 0)
@@ -2580,7 +2580,7 @@ static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv)
const struct opal_step discovery0_step = {
opal_discovery0, discv
};
- int ret = 0;
+ int ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
@@ -3069,7 +3069,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
{
struct opal_suspend_data *suspend;
bool was_failure = false;
- int ret = 0;
+ int ret;
if (!dev)
return false;
@@ -3112,10 +3112,9 @@ static int opal_read_table(struct opal_dev *dev,
{ read_table_data, rw_tbl },
{ end_opal_session, }
};
- int ret = 0;
if (!rw_tbl->size)
- return ret;
+ return 0;
return execute_steps(dev, read_table_steps,
ARRAY_SIZE(read_table_steps));
@@ -3129,10 +3128,9 @@ static int opal_write_table(struct opal_dev *dev,
{ write_table_data, rw_tbl },
{ end_opal_session, }
};
- int ret = 0;
if (!rw_tbl->size)
- return ret;
+ return 0;
return execute_steps(dev, write_table_steps,
ARRAY_SIZE(write_table_steps));
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 914d8cddd43a..d90892fd6f2a 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -12,14 +12,14 @@
#include <net/checksum.h>
#include <asm/unaligned.h>
-typedef __be16 (csum_fn) (void *, unsigned int);
+typedef __be16 (csum_fn) (__be16, void *, unsigned int);
-static __be16 t10_pi_crc_fn(void *data, unsigned int len)
+static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len)
{
- return cpu_to_be16(crc_t10dif(data, len));
+ return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len));
}
-static __be16 t10_pi_ip_fn(void *data, unsigned int len)
+static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len)
{
return (__force __be16)ip_compute_csum(data, len);
}
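
The new seed argument lets the guard tag chain over two buffers when the PI tuple does not sit at offset 0 of the metadata: first the data interval, then the metadata bytes preceding the tuple, as the generate/verify hunks below do. In sketch form:

	static __be16 example_guard_tag(struct blk_integrity_iter *iter,
			csum_fn *fn)
	{
		__be16 csum = fn(0, iter->data_buf, iter->interval);

		if (iter->pi_offset)	/* metadata before the PI tuple */
			csum = fn(csum, iter->prot_buf, iter->pi_offset);
		return csum;
	}
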
@@ -32,12 +32,16 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len)
static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
csum_fn *fn, enum t10_dif_type type)
{
+ u8 offset = iter->pi_offset;
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
- struct t10_pi_tuple *pi = iter->prot_buf;
+ struct t10_pi_tuple *pi = iter->prot_buf + offset;
- pi->guard_tag = fn(iter->data_buf, iter->interval);
+ pi->guard_tag = fn(0, iter->data_buf, iter->interval);
+ if (offset)
+ pi->guard_tag = fn(pi->guard_tag, iter->prot_buf,
+ offset);
pi->app_tag = 0;
if (type == T10_PI_TYPE1_PROTECTION)
@@ -56,12 +60,13 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
csum_fn *fn, enum t10_dif_type type)
{
+ u8 offset = iter->pi_offset;
unsigned int i;
BUG_ON(type == T10_PI_TYPE0_PROTECTION);
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
- struct t10_pi_tuple *pi = iter->prot_buf;
+ struct t10_pi_tuple *pi = iter->prot_buf + offset;
__be16 csum;
if (type == T10_PI_TYPE1_PROTECTION ||
@@ -83,7 +88,9 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
goto next;
}
- csum = fn(iter->data_buf, iter->interval);
+ csum = fn(0, iter->data_buf, iter->interval);
+ if (offset)
+ csum = fn(csum, iter->prot_buf, offset);
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
@@ -134,8 +141,10 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
*/
static void t10_pi_type1_prepare(struct request *rq)
{
- const int tuple_sz = rq->q->integrity.tuple_size;
+ struct blk_integrity *bi = &rq->q->integrity;
+ const int tuple_sz = bi->tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq);
+ u8 offset = bi->pi_offset;
struct bio *bio;
__rq_for_each_bio(bio, rq) {
@@ -154,7 +163,7 @@ static void t10_pi_type1_prepare(struct request *rq)
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) {
- struct t10_pi_tuple *pi = p;
+ struct t10_pi_tuple *pi = p + offset;
if (be32_to_cpu(pi->ref_tag) == virt)
pi->ref_tag = cpu_to_be32(ref_tag);
@@ -183,9 +192,11 @@ static void t10_pi_type1_prepare(struct request *rq)
*/
static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
- unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
- const int tuple_sz = rq->q->integrity.tuple_size;
+ struct blk_integrity *bi = &rq->q->integrity;
+ unsigned intervals = nr_bytes >> bi->interval_exp;
+ const int tuple_sz = bi->tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq);
+ u8 offset = bi->pi_offset;
struct bio *bio;
__rq_for_each_bio(bio, rq) {
@@ -200,7 +211,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
- struct t10_pi_tuple *pi = p;
+ struct t10_pi_tuple *pi = p + offset;
if (be32_to_cpu(pi->ref_tag) == ref_tag)
pi->ref_tag = cpu_to_be32(virt);
@@ -280,20 +291,24 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
};
EXPORT_SYMBOL(t10_pi_type3_ip);
-static __be64 ext_pi_crc64(void *data, unsigned int len)
+static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len)
{
- return cpu_to_be64(crc64_rocksoft(data, len));
+ return cpu_to_be64(crc64_rocksoft_update(crc, data, len));
}
static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter,
enum t10_dif_type type)
{
+ u8 offset = iter->pi_offset;
unsigned int i;
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
- struct crc64_pi_tuple *pi = iter->prot_buf;
+ struct crc64_pi_tuple *pi = iter->prot_buf + offset;
- pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval);
+ pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval);
+ if (offset)
+ pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag),
+ iter->prot_buf, offset);
pi->app_tag = 0;
if (type == T10_PI_TYPE1_PROTECTION)
@@ -319,10 +334,11 @@ static bool ext_pi_ref_escape(u8 *ref_tag)
static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
enum t10_dif_type type)
{
+ u8 offset = iter->pi_offset;
unsigned int i;
for (i = 0; i < iter->data_size; i += iter->interval) {
- struct crc64_pi_tuple *pi = iter->prot_buf;
+ struct crc64_pi_tuple *pi = iter->prot_buf + offset;
u64 ref, seed;
__be64 csum;
@@ -343,7 +359,11 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
goto next;
}
- csum = ext_pi_crc64(iter->data_buf, iter->interval);
+ csum = ext_pi_crc64(0, iter->data_buf, iter->interval);
+ if (offset)
+ csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf,
+ offset);
+
if (pi->guard_tag != csum) {
pr_err("%s: guard tag error at sector %llu " \
"(rcvd %016llx, want %016llx)\n",
@@ -373,8 +393,10 @@ static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter)
static void ext_pi_type1_prepare(struct request *rq)
{
- const int tuple_sz = rq->q->integrity.tuple_size;
+ struct blk_integrity *bi = &rq->q->integrity;
+ const int tuple_sz = bi->tuple_size;
u64 ref_tag = ext_pi_ref_tag(rq);
+ u8 offset = bi->pi_offset;
struct bio *bio;
__rq_for_each_bio(bio, rq) {
@@ -393,7 +415,7 @@ static void ext_pi_type1_prepare(struct request *rq)
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) {
- struct crc64_pi_tuple *pi = p;
+ struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag);
if (ref == virt)
@@ -411,9 +433,11 @@ static void ext_pi_type1_prepare(struct request *rq)
static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
- unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
- const int tuple_sz = rq->q->integrity.tuple_size;
+ struct blk_integrity *bi = &rq->q->integrity;
+ unsigned intervals = nr_bytes >> bi->interval_exp;
+ const int tuple_sz = bi->tuple_size;
u64 ref_tag = ext_pi_ref_tag(rq);
+ u8 offset = bi->pi_offset;
struct bio *bio;
__rq_for_each_bio(bio, rq) {
@@ -428,7 +452,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
- struct crc64_pi_tuple *pi = p;
+ struct crc64_pi_tuple *pi = p + offset;
u64 ref = get_unaligned_be48(pi->ref_tag);
if (ref == ref_tag)
diff --git a/drivers/base/base.h b/drivers/base/base.h
index eb4c0ace9242..0738ccad08b2 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -207,7 +207,7 @@ static inline int devtmpfs_init(void) { return 0; }
#endif
#ifdef CONFIG_BLOCK
-extern struct class block_class;
+extern const struct class block_class;
static inline bool is_blockdev(struct device *dev)
{
return dev->class == &block_class;
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 2b98114a9fe0..a25414228e47 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -1779,7 +1779,7 @@ static int fd_alloc_disk(int drive, int system)
struct gendisk *disk;
int err;
- disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL);
+ disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL);
if (IS_ERR(disk))
return PTR_ERR(disk);
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index b1b47d88f5db..b6dac8cee70f 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -24,8 +24,8 @@ static DEFINE_MUTEX(aoeblk_mutex);
static struct kmem_cache *buf_pool_cache;
static struct dentry *aoe_debugfs_dir;
-/* GPFS needs a larger value than the default. */
-static int aoe_maxsectors;
+/* random default picked from the historic block max_sectors cap */
+static int aoe_maxsectors = 2560;
module_param(aoe_maxsectors, int, 0644);
MODULE_PARM_DESC(aoe_maxsectors,
"When nonzero, set the maximum number of sectors per I/O request");
@@ -334,6 +334,10 @@ aoeblk_gdalloc(void *vp)
mempool_t *mp;
struct blk_mq_tag_set *set;
sector_t ssize;
+ struct queue_limits lim = {
+ .max_hw_sectors = aoe_maxsectors,
+ .io_opt = SZ_2M,
+ };
ulong flags;
int late = 0;
int err;
@@ -371,7 +375,7 @@ aoeblk_gdalloc(void *vp)
goto err_mempool;
}
- gd = blk_mq_alloc_disk(set, d);
+ gd = blk_mq_alloc_disk(set, &lim, d);
if (IS_ERR(gd)) {
pr_err("aoe: cannot allocate block queue for %ld.%d\n",
d->aoemajor, d->aoeminor);
@@ -384,14 +388,9 @@ aoeblk_gdalloc(void *vp)
WARN_ON(d->flags & DEVFL_TKILL);
WARN_ON(d->gd);
WARN_ON(d->flags & DEVFL_UP);
- /* random number picked from the history block max_sectors cap */
- blk_queue_max_hw_sectors(gd->queue, 2560u);
- blk_queue_io_opt(gd->queue, SZ_2M);
d->bufpool = mp;
d->blkq = gd->queue;
d->gd = gd;
- if (aoe_maxsectors)
- blk_queue_max_hw_sectors(gd->queue, aoe_maxsectors);
gd->major = AOE_MAJOR;
gd->first_minor = d->sysminor;
gd->minors = AOE_PARTITIONS;
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index d7317425be51..cc9077b588d7 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -419,13 +419,16 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
rcu_read_lock();
for_each_netdev_rcu(&init_net, ifp) {
dev_hold(ifp);
- if (!is_aoe_netif(ifp))
- goto cont;
+ if (!is_aoe_netif(ifp)) {
+ dev_put(ifp);
+ continue;
+ }
skb = new_skb(sizeof *h + sizeof *ch);
if (skb == NULL) {
printk(KERN_INFO "aoe: skb alloc failure\n");
- goto cont;
+ dev_put(ifp);
+ continue;
}
skb_put(skb, sizeof *h + sizeof *ch);
skb->dev = ifp;
@@ -440,9 +443,6 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu
h->major = cpu_to_be16(aoemajor);
h->minor = aoeminor;
h->cmd = AOECMD_CFG;
-
-cont:
- dev_put(ifp);
}
rcu_read_unlock();
}
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index c51ea95bc2ce..923a134fd766 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -63,6 +63,7 @@ tx(int id) __must_hold(&txlock)
pr_warn("aoe: packet could not be sent on %s. %s\n",
ifp ? ifp->name : "netif",
"consider increasing tx_queue_len");
+ dev_put(ifp);
spin_lock_irq(&txlock);
}
return 0;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index 50949207798d..cacc4ba942a8 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -1994,7 +1994,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type)
{
struct gendisk *disk;
- disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL);
+ disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL);
if (IS_ERR(disk))
return PTR_ERR(disk);
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 970bd6ff38c4..e322cef6596b 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -318,6 +318,16 @@ static int brd_alloc(int i)
struct gendisk *disk;
char buf[DISK_NAME_LEN];
int err = -ENOMEM;
+ struct queue_limits lim = {
+ /*
+ * This is so fdisk will align partitions on 4k, because the
+ * direct_access API needs 4k alignment and returns a PFN.
+ * (This is only a problem on very small devices <= 4M;
+ * otherwise fdisk will align on 1M. Regardless, this call
+ * is harmless.)
+ */
+ .physical_block_size = PAGE_SIZE,
+ };
list_for_each_entry(brd, &brd_devices, brd_list)
if (brd->brd_number == i)
@@ -335,10 +345,11 @@ static int brd_alloc(int i)
debugfs_create_u64(buf, 0444, brd_debugfs_dir,
&brd->brd_nr_pages);
- disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!disk)
+ disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(disk)) {
+ err = PTR_ERR(disk);
goto out_free_dev;
-
+ }
disk->major = RAMDISK_MAJOR;
disk->first_minor = i * max_part;
disk->minors = max_part;
@@ -347,15 +358,6 @@ static int brd_alloc(int i)
strscpy(disk->disk_name, buf, DISK_NAME_LEN);
set_capacity(disk, rd_size * 2);
- /*
- * This is so fdisk will align partitions on 4k, because of
- * direct_access API needing 4k alignment, returning a PFN
- * (This is only a problem on very small devices <= 4M,
- * otherwise fdisk will align on 1M. Regardless this call
- * is harmless)
- */
- blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
-
/* Tell the block layer that this is not a rotational device */
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 6bc86106c7b2..113b441d4d36 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2690,6 +2690,14 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
int id;
int vnr = adm_ctx->volume;
enum drbd_ret_code err = ERR_NOMEM;
+ struct queue_limits lim = {
+ /*
+ * Set max_hw_sectors to an odd value of 8 KiB here.
+ * This triggers a max_bio_size message upon first attach or
+ * connect.
+ */
+ .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8,
+ };
device = minor_to_device(minor);
if (device)
@@ -2708,9 +2716,11 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
drbd_init_set_defaults(device);
- disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!disk)
+ disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(disk)) {
+ err = PTR_ERR(disk);
goto out_no_disk;
+ }
device->vdisk = disk;
device->rq_queue = disk->queue;
@@ -2727,9 +2737,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
blk_queue_write_cache(disk->queue, true, true);
- /* Setting the max_hw_sectors to an odd value of 8kibyte here
- This triggers a max_bio_size message upon first attach or connect */
- blk_queue_max_hw_sectors(disk->queue, DRBD_MAX_BIO_SIZE_SAFE >> 8);
device->md_io.page = alloc_page(GFP_KERNEL);
if (!device->md_io.page)
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 6aed67278e8b..5d65c9754d83 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1189,9 +1189,31 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
return 0;
}
-static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity)
+static unsigned int drbd_max_peer_bio_size(struct drbd_device *device)
{
- q->limits.discard_granularity = granularity;
+ /*
+ * We may ignore peer limits if the peer is modern enough. From 8.3.8
+ * onwards the peer can use multiple BIOs for a single peer_request.
+ */
+ if (device->state.conn < C_WF_REPORT_PARAMS)
+ return device->peer_max_bio_size;
+
+ if (first_peer_device(device)->connection->agreed_pro_version < 94)
+ return min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+
+ /*
+ * Correct old drbd (up to 8.3.7) if it believes it can do more than
+ * 32KiB.
+ */
+ if (first_peer_device(device)->connection->agreed_pro_version == 94)
+ return DRBD_MAX_SIZE_H80_PACKET;
+
+ /*
+ * drbd 8.3.8 onwards, before 8.4.0
+ */
+ if (first_peer_device(device)->connection->agreed_pro_version < 100)
+ return DRBD_MAX_BIO_SIZE_P95;
+ return DRBD_MAX_BIO_SIZE;
}
static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
@@ -1204,149 +1226,119 @@ static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
return AL_EXTENT_SIZE >> 9;
}
-static void decide_on_discard_support(struct drbd_device *device,
+static bool drbd_discard_supported(struct drbd_connection *connection,
struct drbd_backing_dev *bdev)
{
- struct drbd_connection *connection =
- first_peer_device(device)->connection;
- struct request_queue *q = device->rq_queue;
- unsigned int max_discard_sectors;
-
if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev))
- goto not_supported;
+ return false;
if (connection->cstate >= C_CONNECTED &&
!(connection->agreed_features & DRBD_FF_TRIM)) {
drbd_info(connection,
"peer DRBD too old, does not support TRIM: disabling discards\n");
- goto not_supported;
+ return false;
}
- /*
- * We don't care for the granularity, really.
- *
- * Stacking limits below should fix it for the local device. Whether or
- * not it is a suitable granularity on the remote device is not our
- * problem, really. If you care, you need to use devices with similar
- * topology on all peers.
- */
- blk_queue_discard_granularity(q, 512);
- max_discard_sectors = drbd_max_discard_sectors(connection);
- blk_queue_max_discard_sectors(q, max_discard_sectors);
- blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
- return;
-
-not_supported:
- blk_queue_discard_granularity(q, 0);
- blk_queue_max_discard_sectors(q, 0);
+ return true;
}
-static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q)
+/* This is the workaround for "bio would need to, but cannot, be split" */
+static unsigned int drbd_backing_dev_max_segments(struct drbd_device *device)
{
- /* Fixup max_write_zeroes_sectors after blk_stack_limits():
- * if we can handle "zeroes" efficiently on the protocol,
- * we want to do that, even if our backend does not announce
- * max_write_zeroes_sectors itself. */
- struct drbd_connection *connection = first_peer_device(device)->connection;
- /* If the peer announces WZEROES support, use it. Otherwise, rather
- * send explicit zeroes than rely on some discard-zeroes-data magic. */
- if (connection->agreed_features & DRBD_FF_WZEROES)
- q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS;
- else
- q->limits.max_write_zeroes_sectors = 0;
-}
+ unsigned int max_segments;
-static void fixup_discard_support(struct drbd_device *device, struct request_queue *q)
-{
- unsigned int max_discard = device->rq_queue->limits.max_discard_sectors;
- unsigned int discard_granularity =
- device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT;
+ rcu_read_lock();
+ max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs;
+ rcu_read_unlock();
- if (discard_granularity > max_discard) {
- blk_queue_discard_granularity(q, 0);
- blk_queue_max_discard_sectors(q, 0);
- }
+ if (!max_segments)
+ return BLK_MAX_SEGMENTS;
+ return max_segments;
}
-static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
- unsigned int max_bio_size, struct o_qlim *o)
+void drbd_reconsider_queue_parameters(struct drbd_device *device,
+ struct drbd_backing_dev *bdev, struct o_qlim *o)
{
+ struct drbd_connection *connection =
+ first_peer_device(device)->connection;
struct request_queue * const q = device->rq_queue;
- unsigned int max_hw_sectors = max_bio_size >> 9;
- unsigned int max_segments = 0;
+ unsigned int now = queue_max_hw_sectors(q) << 9;
+ struct queue_limits lim;
struct request_queue *b = NULL;
- struct disk_conf *dc;
+ unsigned int new;
if (bdev) {
b = bdev->backing_bdev->bd_disk->queue;
- max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
- rcu_read_lock();
- dc = rcu_dereference(device->ldev->disk_conf);
- max_segments = dc->max_bio_bvecs;
- rcu_read_unlock();
-
- blk_set_stacking_limits(&q->limits);
+ device->local_max_bio_size =
+ queue_max_hw_sectors(b) << SECTOR_SHIFT;
}
- blk_queue_max_hw_sectors(q, max_hw_sectors);
- /* This is the workaround for "bio would need to, but cannot, be split" */
- blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
- blk_queue_segment_boundary(q, PAGE_SIZE-1);
- decide_on_discard_support(device, bdev);
-
- if (b) {
- blk_stack_limits(&q->limits, &b->limits, 0);
- disk_update_readahead(device->vdisk);
+ /*
+ * We may later detach and re-attach on a disconnected Primary. Avoid
+ * decreasing the value in this case.
+ *
+ * We want to store what we know the peer DRBD can handle, not what the
+ * peer IO backend can handle.
+ */
+ new = min3(DRBD_MAX_BIO_SIZE, device->local_max_bio_size,
+ max(drbd_max_peer_bio_size(device), device->peer_max_bio_size));
+ if (new != now) {
+ if (device->state.role == R_PRIMARY && new < now)
+ drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n",
+ new, now);
+ drbd_info(device, "max BIO size = %u\n", new);
}
- fixup_write_zeroes(device, q);
- fixup_discard_support(device, q);
-}
-
-void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
-{
- unsigned int now, new, local, peer;
-
- now = queue_max_hw_sectors(device->rq_queue) << 9;
- local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
- peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
+ lim = queue_limits_start_update(q);
if (bdev) {
- local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
- device->local_max_bio_size = local;
+ blk_set_stacking_limits(&lim);
+ lim.max_segments = drbd_backing_dev_max_segments(device);
+ } else {
+ lim.max_segments = BLK_MAX_SEGMENTS;
}
- local = min(local, DRBD_MAX_BIO_SIZE);
- /* We may ignore peer limits if the peer is modern enough.
- Because new from 8.3.8 onwards the peer can use multiple
- BIOs for a single peer_request */
- if (device->state.conn >= C_WF_REPORT_PARAMS) {
- if (first_peer_device(device)->connection->agreed_pro_version < 94)
- peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
- /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
- else if (first_peer_device(device)->connection->agreed_pro_version == 94)
- peer = DRBD_MAX_SIZE_H80_PACKET;
- else if (first_peer_device(device)->connection->agreed_pro_version < 100)
- peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
- else
- peer = DRBD_MAX_BIO_SIZE;
+ lim.max_hw_sectors = new >> SECTOR_SHIFT;
+ lim.seg_boundary_mask = PAGE_SIZE - 1;
- /* We may later detach and re-attach on a disconnected Primary.
- * Avoid this setting to jump back in that case.
- * We want to store what we know the peer DRBD can handle,
- * not what the peer IO backend can handle. */
- if (peer > device->peer_max_bio_size)
- device->peer_max_bio_size = peer;
+ /*
+ * We don't care for the granularity, really.
+ *
+ * Stacking limits below should fix it for the local device. Whether or
+ * not it is a suitable granularity on the remote device is not our
+ * problem, really. If you care, you need to use devices with similar
+ * topology on all peers.
+ */
+ if (drbd_discard_supported(connection, bdev)) {
+ lim.discard_granularity = 512;
+ lim.max_hw_discard_sectors =
+ drbd_max_discard_sectors(connection);
+ } else {
+ lim.discard_granularity = 0;
+ lim.max_hw_discard_sectors = 0;
}
- new = min(local, peer);
- if (device->state.role == R_PRIMARY && new < now)
- drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
+ if (bdev)
+ blk_stack_limits(&lim, &b->limits, 0);
- if (new != now)
- drbd_info(device, "max BIO size = %u\n", new);
+ /*
+ * If we can handle "zeroes" efficiently on the protocol, we want to do
+ * that, even if our backend does not announce max_write_zeroes_sectors
+ * itself.
+ */
+ if (connection->agreed_features & DRBD_FF_WZEROES)
+ lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS;
+ else
+ lim.max_write_zeroes_sectors = 0;
+
+ if ((lim.discard_granularity >> SECTOR_SHIFT) >
+ lim.max_hw_discard_sectors) {
+ lim.discard_granularity = 0;
+ lim.max_hw_discard_sectors = 0;
+ }
- drbd_setup_queue_param(device, bdev, new, o);
+ if (queue_limits_commit_update(q, &lim))
+ drbd_err(device, "setting new queue limits failed\n");
}
/* Starts the worker thread */
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 287a8d1d3f70..e858e7e0383f 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1542,9 +1542,10 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device,
int notify_resource_state_change(struct sk_buff *skb,
unsigned int seq,
- struct drbd_resource_state_change *resource_state_change,
+ void *state_change,
enum drbd_notification_type type)
{
+ struct drbd_resource_state_change *resource_state_change = state_change;
struct drbd_resource *resource = resource_state_change->resource;
struct resource_info resource_info = {
.res_role = resource_state_change->role[NEW],
@@ -1558,13 +1559,14 @@ int notify_resource_state_change(struct sk_buff *skb,
int notify_connection_state_change(struct sk_buff *skb,
unsigned int seq,
- struct drbd_connection_state_change *connection_state_change,
+ void *state_change,
enum drbd_notification_type type)
{
- struct drbd_connection *connection = connection_state_change->connection;
+ struct drbd_connection_state_change *p = state_change;
+ struct drbd_connection *connection = p->connection;
struct connection_info connection_info = {
- .conn_connection_state = connection_state_change->cstate[NEW],
- .conn_role = connection_state_change->peer_role[NEW],
+ .conn_connection_state = p->cstate[NEW],
+ .conn_role = p->peer_role[NEW],
};
return notify_connection_state(skb, seq, connection, &connection_info, type);
@@ -1572,9 +1574,10 @@ int notify_connection_state_change(struct sk_buff *skb,
int notify_device_state_change(struct sk_buff *skb,
unsigned int seq,
- struct drbd_device_state_change *device_state_change,
+ void *state_change,
enum drbd_notification_type type)
{
+ struct drbd_device_state_change *device_state_change = state_change;
struct drbd_device *device = device_state_change->device;
struct device_info device_info = {
.dev_disk_state = device_state_change->disk_state[NEW],
@@ -1585,9 +1588,10 @@ int notify_device_state_change(struct sk_buff *skb,
int notify_peer_device_state_change(struct sk_buff *skb,
unsigned int seq,
- struct drbd_peer_device_state_change *p,
+ void *state_change,
enum drbd_notification_type type)
{
+ struct drbd_peer_device_state_change *p = state_change;
struct drbd_peer_device *peer_device = p->peer_device;
struct peer_device_info peer_device_info = {
.peer_repl_state = p->repl_state[NEW],
@@ -1605,8 +1609,8 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
bool resource_state_has_changed;
unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
- int (*last_func)(struct sk_buff *, unsigned int, void *,
- enum drbd_notification_type) = NULL;
+ int (*last_func)(struct sk_buff *, unsigned int,
+ void *, enum drbd_notification_type) = NULL;
void *last_arg = NULL;
#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
@@ -1616,7 +1620,7 @@ static void broadcast_state_change(struct drbd_state_change *state_change)
})
#define REMEMBER_STATE_CHANGE(func, arg, type) \
({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
- last_func = (typeof(last_func))func; \
+ last_func = func; \
last_arg = arg; \
})
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h
index 9d78d8e3912e..a56a57d67686 100644
--- a/drivers/block/drbd/drbd_state_change.h
+++ b/drivers/block/drbd/drbd_state_change.h
@@ -46,19 +46,19 @@ extern void forget_state_change(struct drbd_state_change *);
extern int notify_resource_state_change(struct sk_buff *,
unsigned int,
- struct drbd_resource_state_change *,
+ void *,
enum drbd_notification_type type);
extern int notify_connection_state_change(struct sk_buff *,
unsigned int,
- struct drbd_connection_state_change *,
+ void *,
enum drbd_notification_type type);
extern int notify_device_state_change(struct sk_buff *,
unsigned int,
- struct drbd_device_state_change *,
+ void *,
enum drbd_notification_type type);
extern int notify_peer_device_state_change(struct sk_buff *,
unsigned int,
- struct drbd_peer_device_state_change *,
+ void *,
enum drbd_notification_type type);
#endif /* DRBD_STATE_CHANGE_H */
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index d0e41d52d6a9..1b399ec8c07d 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -530,14 +530,13 @@ static struct format_descr format_req;
static char *floppy_track_buffer;
static int max_buffer_sectors;
-typedef void (*done_f)(int);
static const struct cont_t {
void (*interrupt)(void);
/* this is called after the interrupt of the
* main command */
void (*redo)(void); /* this is called to retry the operation */
void (*error)(void); /* this is called to tally an error */
- done_f done; /* this is called to say if the operation has
+ void (*done)(int); /* this is called to say if the operation has
* succeeded/failed */
} *cont;
@@ -985,6 +984,10 @@ static void empty(void)
{
}
+static void empty_done(int result)
+{
+}
+
static void (*floppy_work_fn)(void);
static void floppy_work_workfn(struct work_struct *work)
@@ -1998,14 +2001,14 @@ static const struct cont_t wakeup_cont = {
.interrupt = empty,
.redo = do_wakeup,
.error = empty,
- .done = (done_f)empty
+ .done = empty_done,
};
static const struct cont_t intr_cont = {
.interrupt = empty,
.redo = process_fd_request,
.error = empty,
- .done = (done_f)empty
+ .done = empty_done,
};
/* schedules handler, waiting for completion. May be interrupted, will then
@@ -4513,13 +4516,15 @@ static bool floppy_available(int drive)
static int floppy_alloc_disk(unsigned int drive, unsigned int type)
{
+ struct queue_limits lim = {
+ .max_hw_sectors = 64,
+ };
struct gendisk *disk;
- disk = blk_mq_alloc_disk(&tag_sets[drive], NULL);
+ disk = blk_mq_alloc_disk(&tag_sets[drive], &lim, NULL);
if (IS_ERR(disk))
return PTR_ERR(disk);
- blk_queue_max_hw_sectors(disk->queue, 64);
disk->major = FLOPPY_MAJOR;
disk->first_minor = TOMINOR(drive) | (type << 2);
disk->minors = 1;
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index f8145499da38..28a95fd366fe 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -750,12 +750,13 @@ static void loop_sysfs_exit(struct loop_device *lo)
&loop_attribute_group);
}
-static void loop_config_discard(struct loop_device *lo)
+static void loop_config_discard(struct loop_device *lo,
+ struct queue_limits *lim)
{
struct file *file = lo->lo_backing_file;
struct inode *inode = file->f_mapping->host;
- struct request_queue *q = lo->lo_queue;
- u32 granularity, max_discard_sectors;
+ u32 granularity = 0, max_discard_sectors = 0;
+ struct kstatfs sbuf;
/*
* If the backing device is a block device, mirror its zeroing
@@ -775,29 +776,17 @@ static void loop_config_discard(struct loop_device *lo)
* We use punch hole to reclaim the free space used by the
* image a.k.a. discard.
*/
- } else if (!file->f_op->fallocate) {
- max_discard_sectors = 0;
- granularity = 0;
-
- } else {
- struct kstatfs sbuf;
-
+ } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) {
max_discard_sectors = UINT_MAX >> 9;
- if (!vfs_statfs(&file->f_path, &sbuf))
- granularity = sbuf.f_bsize;
- else
- max_discard_sectors = 0;
+ granularity = sbuf.f_bsize;
}
- if (max_discard_sectors) {
- q->limits.discard_granularity = granularity;
- blk_queue_max_discard_sectors(q, max_discard_sectors);
- blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
- } else {
- q->limits.discard_granularity = 0;
- blk_queue_max_discard_sectors(q, 0);
- blk_queue_max_write_zeroes_sectors(q, 0);
- }
+ lim->max_hw_discard_sectors = max_discard_sectors;
+ lim->max_write_zeroes_sectors = max_discard_sectors;
+ if (max_discard_sectors)
+ lim->discard_granularity = granularity;
+ else
+ lim->discard_granularity = 0;
}
struct loop_worker {
@@ -986,6 +975,20 @@ loop_set_status_from_info(struct loop_device *lo,
return 0;
}
+static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize,
+ bool update_discard_settings)
+{
+ struct queue_limits lim;
+
+ lim = queue_limits_start_update(lo->lo_queue);
+ lim.logical_block_size = bsize;
+ lim.physical_block_size = bsize;
+ lim.io_min = bsize;
+ if (update_discard_settings)
+ loop_config_discard(lo, &lim);
+ return queue_limits_commit_update(lo->lo_queue, &lim);
+}
+
static int loop_configure(struct loop_device *lo, blk_mode_t mode,
struct block_device *bdev,
const struct loop_config *config)
@@ -1083,11 +1086,10 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode,
else
bsize = 512;
- blk_queue_logical_block_size(lo->lo_queue, bsize);
- blk_queue_physical_block_size(lo->lo_queue, bsize);
- blk_queue_io_min(lo->lo_queue, bsize);
+ error = loop_reconfigure_limits(lo, bsize, true);
+ if (WARN_ON_ONCE(error))
+ goto out_unlock;
- loop_config_discard(lo);
loop_update_rotational(lo);
loop_update_dio(lo);
loop_sysfs_init(lo);
@@ -1154,9 +1156,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release)
lo->lo_offset = 0;
lo->lo_sizelimit = 0;
memset(lo->lo_file_name, 0, LO_NAME_SIZE);
- blk_queue_logical_block_size(lo->lo_queue, 512);
- blk_queue_physical_block_size(lo->lo_queue, 512);
- blk_queue_io_min(lo->lo_queue, 512);
+ loop_reconfigure_limits(lo, 512, false);
invalidate_disk(lo->lo_disk);
loop_sysfs_exit(lo);
/* let user-space know about this change */
@@ -1488,9 +1488,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
invalidate_bdev(lo->lo_device);
blk_mq_freeze_queue(lo->lo_queue);
- blk_queue_logical_block_size(lo->lo_queue, arg);
- blk_queue_physical_block_size(lo->lo_queue, arg);
- blk_queue_io_min(lo->lo_queue, arg);
+ err = loop_reconfigure_limits(lo, arg, false);
loop_update_dio(lo);
blk_mq_unfreeze_queue(lo->lo_queue);
@@ -1982,6 +1980,12 @@ static const struct blk_mq_ops loop_mq_ops = {
static int loop_add(int i)
{
+ struct queue_limits lim = {
+ /*
+ * Random number picked from the historic block max_sectors cap.
+ */
+ .max_hw_sectors = 2560u,
+ };
struct loop_device *lo;
struct gendisk *disk;
int err;
@@ -2025,16 +2029,13 @@ static int loop_add(int i)
if (err)
goto out_free_idr;
- disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo);
+ disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_cleanup_tags;
}
lo->lo_queue = lo->lo_disk->queue;
- /* random number picked from the history block max_sectors cap */
- blk_queue_max_hw_sectors(lo->lo_queue, 2560u);
-
/*
* By default, we do buffer IO, so it doesn't make sense to enable
* merge because the I/O submitted to backing file is handled page by
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index b200950e8fb5..43a187609ef7 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -3401,6 +3401,12 @@ static const struct blk_mq_ops mtip_mq_ops = {
*/
static int mtip_block_initialize(struct driver_data *dd)
{
+ struct queue_limits lim = {
+ .physical_block_size = 4096,
+ .max_hw_sectors = 0xffff,
+ .max_segments = MTIP_MAX_SG,
+ .max_segment_size = 0x400000,
+ };
int rv = 0, wait_for_rebuild = 0;
sector_t capacity;
unsigned int index = 0;
@@ -3431,7 +3437,7 @@ static int mtip_block_initialize(struct driver_data *dd)
goto block_queue_alloc_tag_error;
}
- dd->disk = blk_mq_alloc_disk(&dd->tags, dd);
+ dd->disk = blk_mq_alloc_disk(&dd->tags, &lim, dd);
if (IS_ERR(dd->disk)) {
dev_err(&dd->pdev->dev,
"Unable to allocate request queue\n");
@@ -3481,12 +3487,7 @@ skip_create_disk:
/* Set device limits. */
blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue);
- blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
- blk_queue_physical_block_size(dd->queue, 4096);
- blk_queue_max_hw_sectors(dd->queue, 0xffff);
- blk_queue_max_segment_size(dd->queue, 0x400000);
dma_set_max_seg_size(&dd->pdev->dev, 0x400000);
- blk_queue_io_min(dd->queue, 4096);
/* Set the capacity of the device in 512 byte sectors. */
if (!(mtip_hw_get_capacity(dd, &capacity))) {
diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c
index d914156db2d8..27b2187e7a6d 100644
--- a/drivers/block/n64cart.c
+++ b/drivers/block/n64cart.c
@@ -114,6 +114,10 @@ static const struct block_device_operations n64cart_fops = {
*/
static int __init n64cart_probe(struct platform_device *pdev)
{
+ struct queue_limits lim = {
+ .physical_block_size = 4096,
+ .logical_block_size = 4096,
+ };
struct gendisk *disk;
int err = -ENOMEM;
@@ -131,9 +135,11 @@ static int __init n64cart_probe(struct platform_device *pdev)
if (IS_ERR(reg_base))
return PTR_ERR(reg_base);
- disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!disk)
+ disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(disk)) {
+ err = PTR_ERR(disk);
goto out;
+ }
disk->first_minor = 0;
disk->flags = GENHD_FL_NO_PART;
@@ -145,8 +151,6 @@ static int __init n64cart_probe(struct platform_device *pdev)
set_disk_ro(disk, 1);
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_physical_block_size(disk->queue, 4096);
- blk_queue_logical_block_size(disk->queue, 4096);
err = add_disk(disk);
if (err)
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 33a8f37bb6a1..9d4ec9273bf9 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -316,9 +316,12 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
nsock->sent = 0;
}
-static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
+static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
loff_t blksize)
{
+ struct queue_limits lim;
+ int error;
+
if (!blksize)
blksize = 1u << NBD_DEF_BLKSIZE_BITS;
@@ -334,10 +337,16 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
if (!nbd->pid)
return 0;
+ lim = queue_limits_start_update(nbd->disk->queue);
if (nbd->config->flags & NBD_FLAG_SEND_TRIM)
- blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
- blk_queue_logical_block_size(nbd->disk->queue, blksize);
- blk_queue_physical_block_size(nbd->disk->queue, blksize);
+ lim.max_hw_discard_sectors = UINT_MAX;
+ else
+ lim.max_hw_discard_sectors = 0;
+ lim.logical_block_size = blksize;
+ lim.physical_block_size = blksize;
+ error = queue_limits_commit_update(nbd->disk->queue, &lim);
+ if (error)
+ return error;
if (max_part)
set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
@@ -346,6 +355,18 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
return 0;
}
+static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
+ loff_t blksize)
+{
+ int error;
+
+ blk_mq_freeze_queue(nbd->disk->queue);
+ error = __nbd_set_size(nbd, bytesize, blksize);
+ blk_mq_unfreeze_queue(nbd->disk->queue);
+
+ return error;
+}
+
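Splitting nbd_set_size() into a small wrapper keeps the limit commit from racing with in-flight I/O: blk_mq_freeze_queue() drains outstanding requests before the limits change, and blk_mq_unfreeze_queue() restarts the queue afterwards. The shape of that wrapper, with a hypothetical __my_set_size() standing in for the real worker:

/* Sketch of the freeze-around-update idiom. */
static int my_set_size(struct request_queue *q, loff_t bytesize)
{
	int error;

	blk_mq_freeze_queue(q);		/* drain in-flight requests */
	error = __my_set_size(q, bytesize);	/* hypothetical worker */
	blk_mq_unfreeze_queue(q);
	return error;
}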
static void nbd_complete_rq(struct request *req)
{
struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
@@ -1351,7 +1372,6 @@ static void nbd_config_put(struct nbd_device *nbd)
nbd->config = NULL;
nbd->tag_set.timeout = 0;
- blk_queue_max_discard_sectors(nbd->disk->queue, 0);
mutex_unlock(&nbd->config_lock);
nbd_put(nbd);
@@ -1783,6 +1803,12 @@ static const struct blk_mq_ops nbd_mq_ops = {
static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
{
+ struct queue_limits lim = {
+ .max_hw_sectors = 65536,
+ .max_user_sectors = 256,
+ .max_segments = USHRT_MAX,
+ .max_segment_size = UINT_MAX,
+ };
struct nbd_device *nbd;
struct gendisk *disk;
int err = -ENOMEM;
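Note the two sector caps in the nbd limits: the old code poked disk->queue->limits.max_sectors = 256 directly, which the atomic API expresses as .max_user_sectors, the soft default that sysfs can raise, while .max_hw_sectors stays the hard ceiling a request may ever reach. In queue_limits terms, with the values copied from the hunk above:

struct queue_limits nbd_like_lim = {
	.max_hw_sectors		= 65536,	/* hard cap: 32 MiB */
	.max_user_sectors	= 256,		/* soft default: 128 KiB */
	.max_segments		= USHRT_MAX,
	.max_segment_size	= UINT_MAX,
};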
@@ -1823,7 +1849,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
if (err < 0)
goto out_free_tags;
- disk = blk_mq_alloc_disk(&nbd->tag_set, NULL);
+ disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_free_idr;
@@ -1843,11 +1869,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
* Tell the block layer that we are not a rotational device
*/
blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
- blk_queue_max_discard_sectors(disk->queue, 0);
- blk_queue_max_segment_size(disk->queue, UINT_MAX);
- blk_queue_max_segments(disk->queue, USHRT_MAX);
- blk_queue_max_hw_sectors(disk->queue, 65536);
- disk->queue->limits.max_sectors = 256;
mutex_init(&nbd->config_lock);
refcount_set(&nbd->config_refs, 0);
@@ -2433,6 +2454,12 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
}
dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
+ if (!dev_list) {
+ nlmsg_free(reply);
+ ret = -EMSGSIZE;
+ goto out;
+ }
+
if (index == -1) {
ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
if (ret) {
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 36755f263e8e..71c39bcd872c 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -115,6 +115,18 @@ module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
#endif
+/*
+ * Historic queue modes.
+ *
+ * These days nothing but NULL_Q_MQ is actually supported, but we keep the
+ * enum around for error reporting.
+ */
+enum {
+ NULL_Q_BIO = 0,
+ NULL_Q_RQ = 1,
+ NULL_Q_MQ = 2,
+};
+
static int g_queue_mode = NULL_Q_MQ;
static int null_param_store_val(const char *str, int *val, int min, int max)
@@ -165,8 +177,8 @@ static bool g_blocking;
module_param_named(blocking, g_blocking, bool, 0444);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
-static bool shared_tags;
-module_param(shared_tags, bool, 0444);
+static bool g_shared_tags;
+module_param_named(shared_tags, g_shared_tags, bool, 0444);
MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
static bool g_shared_tag_bitmap;
@@ -426,6 +438,7 @@ NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
NULLB_DEVICE_ATTR(no_sched, bool, NULL);
+NULLB_DEVICE_ATTR(shared_tags, bool, NULL);
NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
static ssize_t nullb_device_power_show(struct config_item *item, char *page)
@@ -571,6 +584,7 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_zone_offline,
&nullb_device_attr_virt_boundary,
&nullb_device_attr_no_sched,
+ &nullb_device_attr_shared_tags,
&nullb_device_attr_shared_tag_bitmap,
NULL,
};
@@ -653,10 +667,11 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page)
"badblocks,blocking,blocksize,cache_size,"
"completion_nsec,discard,home_node,hw_queue_depth,"
"irqmode,max_sectors,mbps,memory_backed,no_sched,"
- "poll_queues,power,queue_mode,shared_tag_bitmap,size,"
- "submit_queues,use_per_node_hctx,virt_boundary,zoned,"
- "zone_capacity,zone_max_active,zone_max_open,"
- "zone_nr_conv,zone_offline,zone_readonly,zone_size\n");
+ "poll_queues,power,queue_mode,shared_tag_bitmap,"
+ "shared_tags,size,submit_queues,use_per_node_hctx,"
+ "virt_boundary,zoned,zone_capacity,zone_max_active,"
+ "zone_max_open,zone_nr_conv,zone_offline,zone_readonly,"
+ "zone_size\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
@@ -738,6 +753,7 @@ static struct nullb_device *null_alloc_dev(void)
dev->zone_max_active = g_zone_max_active;
dev->virt_boundary = g_virt_boundary;
dev->no_sched = g_no_sched;
+ dev->shared_tags = g_shared_tags;
dev->shared_tag_bitmap = g_shared_tag_bitmap;
return dev;
}
@@ -752,98 +768,11 @@ static void null_free_dev(struct nullb_device *dev)
kfree(dev);
}
-static void put_tag(struct nullb_queue *nq, unsigned int tag)
-{
- clear_bit_unlock(tag, nq->tag_map);
-
- if (waitqueue_active(&nq->wait))
- wake_up(&nq->wait);
-}
-
-static unsigned int get_tag(struct nullb_queue *nq)
-{
- unsigned int tag;
-
- do {
- tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
- if (tag >= nq->queue_depth)
- return -1U;
- } while (test_and_set_bit_lock(tag, nq->tag_map));
-
- return tag;
-}
-
-static void free_cmd(struct nullb_cmd *cmd)
-{
- put_tag(cmd->nq, cmd->tag);
-}
-
-static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
-
-static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
-{
- struct nullb_cmd *cmd;
- unsigned int tag;
-
- tag = get_tag(nq);
- if (tag != -1U) {
- cmd = &nq->cmds[tag];
- cmd->tag = tag;
- cmd->error = BLK_STS_OK;
- cmd->nq = nq;
- if (nq->dev->irqmode == NULL_IRQ_TIMER) {
- hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
- HRTIMER_MODE_REL);
- cmd->timer.function = null_cmd_timer_expired;
- }
- return cmd;
- }
-
- return NULL;
-}
-
-static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio)
-{
- struct nullb_cmd *cmd;
- DEFINE_WAIT(wait);
-
- do {
- /*
- * This avoids multiple return statements, multiple calls to
- * __alloc_cmd() and a fast path call to prepare_to_wait().
- */
- cmd = __alloc_cmd(nq);
- if (cmd) {
- cmd->bio = bio;
- return cmd;
- }
- prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
- io_schedule();
- finish_wait(&nq->wait, &wait);
- } while (1);
-}
-
-static void end_cmd(struct nullb_cmd *cmd)
-{
- int queue_mode = cmd->nq->dev->queue_mode;
-
- switch (queue_mode) {
- case NULL_Q_MQ:
- blk_mq_end_request(cmd->rq, cmd->error);
- return;
- case NULL_Q_BIO:
- cmd->bio->bi_status = cmd->error;
- bio_endio(cmd->bio);
- break;
- }
-
- free_cmd(cmd);
-}
-
static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
{
- end_cmd(container_of(timer, struct nullb_cmd, timer));
+ struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer);
+ blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error);
return HRTIMER_NORESTART;
}
@@ -856,7 +785,9 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd)
static void null_complete_rq(struct request *rq)
{
- end_cmd(blk_mq_rq_to_pdu(rq));
+ struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+ blk_mq_end_request(rq, cmd->error);
}
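Dropping cmd->rq works because blk-mq allocates the driver's per-command data inline behind each request (sized by tag_set->cmd_size), so the request pointer is always recoverable by pointer arithmetic: blk_mq_rq_to_pdu() is effectively rq + 1 and blk_mq_rq_from_pdu() the inverse. The round trip, as a sketch:

/* Sketch: request <-> per-driver pdu, no cached rq pointer needed. */
static void my_complete(struct request *rq)
{
	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);	/* rq -> pdu */

	blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error); /* pdu -> rq */
}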
static struct nullb_page *null_alloc_page(void)
@@ -1273,7 +1204,7 @@ static int null_transfer(struct nullb *nullb, struct page *page,
static int null_handle_rq(struct nullb_cmd *cmd)
{
- struct request *rq = cmd->rq;
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb *nullb = cmd->nq->dev->nullb;
int err;
unsigned int len;
@@ -1298,63 +1229,21 @@ static int null_handle_rq(struct nullb_cmd *cmd)
return 0;
}
-static int null_handle_bio(struct nullb_cmd *cmd)
-{
- struct bio *bio = cmd->bio;
- struct nullb *nullb = cmd->nq->dev->nullb;
- int err;
- unsigned int len;
- sector_t sector = bio->bi_iter.bi_sector;
- struct bio_vec bvec;
- struct bvec_iter iter;
-
- spin_lock_irq(&nullb->lock);
- bio_for_each_segment(bvec, bio, iter) {
- len = bvec.bv_len;
- err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
- op_is_write(bio_op(bio)), sector,
- bio->bi_opf & REQ_FUA);
- if (err) {
- spin_unlock_irq(&nullb->lock);
- return err;
- }
- sector += len >> SECTOR_SHIFT;
- }
- spin_unlock_irq(&nullb->lock);
- return 0;
-}
-
-static void null_stop_queue(struct nullb *nullb)
-{
- struct request_queue *q = nullb->q;
-
- if (nullb->dev->queue_mode == NULL_Q_MQ)
- blk_mq_stop_hw_queues(q);
-}
-
-static void null_restart_queue_async(struct nullb *nullb)
-{
- struct request_queue *q = nullb->q;
-
- if (nullb->dev->queue_mode == NULL_Q_MQ)
- blk_mq_start_stopped_hw_queues(q, true);
-}
-
static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
{
struct nullb_device *dev = cmd->nq->dev;
struct nullb *nullb = dev->nullb;
blk_status_t sts = BLK_STS_OK;
- struct request *rq = cmd->rq;
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
if (!hrtimer_active(&nullb->bw_timer))
hrtimer_restart(&nullb->bw_timer);
if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
- null_stop_queue(nullb);
+ blk_mq_stop_hw_queues(nullb->q);
/* race with timer */
if (atomic_long_read(&nullb->cur_bytes) > 0)
- null_restart_queue_async(nullb);
+ blk_mq_start_stopped_hw_queues(nullb->q, true);
/* requeue request */
sts = BLK_STS_DEV_RESOURCE;
}
@@ -1381,37 +1270,29 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
sector_t nr_sectors)
{
struct nullb_device *dev = cmd->nq->dev;
- int err;
if (op == REQ_OP_DISCARD)
return null_handle_discard(dev, sector, nr_sectors);
+ return errno_to_blk_status(null_handle_rq(cmd));
- if (dev->queue_mode == NULL_Q_BIO)
- err = null_handle_bio(cmd);
- else
- err = null_handle_rq(cmd);
-
- return errno_to_blk_status(err);
}
static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
struct nullb_device *dev = cmd->nq->dev;
struct bio *bio;
- if (dev->memory_backed)
- return;
-
- if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) {
- zero_fill_bio(cmd->bio);
- } else if (req_op(cmd->rq) == REQ_OP_READ) {
- __rq_for_each_bio(bio, cmd->rq)
+ if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) {
+ __rq_for_each_bio(bio, rq)
zero_fill_bio(bio);
}
}
static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
{
+ struct request *rq = blk_mq_rq_from_pdu(cmd);
+
/*
* Since root privileges are required to configure the null_blk
* driver, it is fine that this driver does not initialize the
@@ -1425,20 +1306,10 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
/* Complete IO by inline, softirq or timer */
switch (cmd->nq->dev->irqmode) {
case NULL_IRQ_SOFTIRQ:
- switch (cmd->nq->dev->queue_mode) {
- case NULL_Q_MQ:
- blk_mq_complete_request(cmd->rq);
- break;
- case NULL_Q_BIO:
- /*
- * XXX: no proper submitting cpu information available.
- */
- end_cmd(cmd);
- break;
- }
+ blk_mq_complete_request(rq);
break;
case NULL_IRQ_NONE:
- end_cmd(cmd);
+ blk_mq_end_request(rq, cmd->error);
break;
case NULL_IRQ_TIMER:
null_cmd_end_timer(cmd);
@@ -1499,7 +1370,7 @@ static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
return HRTIMER_NORESTART;
atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
- null_restart_queue_async(nullb);
+ blk_mq_start_stopped_hw_queues(nullb->q, true);
hrtimer_forward_now(&nullb->bw_timer, timer_interval);
@@ -1516,26 +1387,6 @@ static void nullb_setup_bwtimer(struct nullb *nullb)
hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
}
-static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
-{
- int index = 0;
-
- if (nullb->nr_queues != 1)
- index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
-
- return &nullb->queues[index];
-}
-
-static void null_submit_bio(struct bio *bio)
-{
- sector_t sector = bio->bi_iter.bi_sector;
- sector_t nr_sectors = bio_sectors(bio);
- struct nullb *nullb = bio->bi_bdev->bd_disk->private_data;
- struct nullb_queue *nq = nullb_to_queue(nullb);
-
- null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio));
-}
-
#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
static bool should_timeout_request(struct request *rq)
@@ -1655,7 +1506,7 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
blk_rq_sectors(req));
if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error,
blk_mq_end_request_batch))
- end_cmd(cmd);
+ blk_mq_end_request(req, cmd->error);
nr++;
}
@@ -1711,7 +1562,6 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cmd->timer.function = null_cmd_timer_expired;
}
- cmd->rq = rq;
cmd->error = BLK_STS_OK;
cmd->nq = nq;
cmd->fake_timeout = should_timeout_request(rq) ||
@@ -1770,34 +1620,8 @@ static void null_queue_rqs(struct request **rqlist)
*rqlist = requeue_list;
}
-static void cleanup_queue(struct nullb_queue *nq)
-{
- bitmap_free(nq->tag_map);
- kfree(nq->cmds);
-}
-
-static void cleanup_queues(struct nullb *nullb)
-{
- int i;
-
- for (i = 0; i < nullb->nr_queues; i++)
- cleanup_queue(&nullb->queues[i]);
-
- kfree(nullb->queues);
-}
-
-static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
-{
- struct nullb_queue *nq = hctx->driver_data;
- struct nullb *nullb = nq->dev->nullb;
-
- nullb->nr_queues--;
-}
-
static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
{
- init_waitqueue_head(&nq->wait);
- nq->queue_depth = nullb->queue_depth;
nq->dev = nullb->dev;
INIT_LIST_HEAD(&nq->poll_list);
spin_lock_init(&nq->poll_lock);
@@ -1815,7 +1639,6 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
nq = &nullb->queues[hctx_idx];
hctx->driver_data = nq;
null_init_queue(nullb, nq);
- nullb->nr_queues++;
return 0;
}
@@ -1828,7 +1651,6 @@ static const struct blk_mq_ops null_mq_ops = {
.poll = null_poll,
.map_queues = null_map_queues,
.init_hctx = null_init_hctx,
- .exit_hctx = null_exit_hctx,
};
static void null_del_dev(struct nullb *nullb)
@@ -1849,21 +1671,20 @@ static void null_del_dev(struct nullb *nullb)
if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
hrtimer_cancel(&nullb->bw_timer);
atomic_long_set(&nullb->cur_bytes, LONG_MAX);
- null_restart_queue_async(nullb);
+ blk_mq_start_stopped_hw_queues(nullb->q, true);
}
put_disk(nullb->disk);
- if (dev->queue_mode == NULL_Q_MQ &&
- nullb->tag_set == &nullb->__tag_set)
+ if (nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set);
- cleanup_queues(nullb);
+ kfree(nullb->queues);
if (null_cache_active(nullb))
null_free_device_storage(nullb->dev, true);
kfree(nullb);
dev->nullb = NULL;
}
-static void null_config_discard(struct nullb *nullb)
+static void null_config_discard(struct nullb *nullb, struct queue_limits *lim)
{
if (nullb->dev->discard == false)
return;
@@ -1880,43 +1701,14 @@ static void null_config_discard(struct nullb *nullb)
return;
}
- blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
+ lim->max_hw_discard_sectors = UINT_MAX >> 9;
}
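The UINT_MAX >> 9 cap here mirrors the loop hunk earlier: a bio tracks its size in bytes in a 32-bit field (bi_iter.bi_size), so the discard limit in sectors is chosen such that the byte count cannot overflow. The arithmetic:

/*
 * UINT_MAX >> 9 = 0x7fffff sectors; 0x7fffff << 9 = 0xfffffe00 bytes,
 * which still fits in the u32 bi_iter.bi_size of a single bio.
 */
lim->max_hw_discard_sectors = UINT_MAX >> 9;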
-static const struct block_device_operations null_bio_ops = {
- .owner = THIS_MODULE,
- .submit_bio = null_submit_bio,
- .report_zones = null_report_zones,
-};
-
-static const struct block_device_operations null_rq_ops = {
+static const struct block_device_operations null_ops = {
.owner = THIS_MODULE,
.report_zones = null_report_zones,
};
-static int setup_commands(struct nullb_queue *nq)
-{
- struct nullb_cmd *cmd;
- int i;
-
- nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
- if (!nq->cmds)
- return -ENOMEM;
-
- nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
- if (!nq->tag_map) {
- kfree(nq->cmds);
- return -ENOMEM;
- }
-
- for (i = 0; i < nq->queue_depth; i++) {
- cmd = &nq->cmds[i];
- cmd->tag = -1U;
- }
-
- return 0;
-}
-
static int setup_queues(struct nullb *nullb)
{
int nqueues = nr_cpu_ids;
@@ -1929,101 +1721,66 @@ static int setup_queues(struct nullb *nullb)
if (!nullb->queues)
return -ENOMEM;
- nullb->queue_depth = nullb->dev->hw_queue_depth;
return 0;
}
-static int init_driver_queues(struct nullb *nullb)
+static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues)
{
- struct nullb_queue *nq;
- int i, ret = 0;
-
- for (i = 0; i < nullb->dev->submit_queues; i++) {
- nq = &nullb->queues[i];
-
- null_init_queue(nullb, nq);
-
- ret = setup_commands(nq);
- if (ret)
- return ret;
- nullb->nr_queues++;
+ set->ops = &null_mq_ops;
+ set->cmd_size = sizeof(struct nullb_cmd);
+ set->timeout = 5 * HZ;
+ set->nr_maps = 1;
+ if (poll_queues) {
+ set->nr_hw_queues += poll_queues;
+ set->nr_maps += 2;
}
- return 0;
+ return blk_mq_alloc_tag_set(set);
}
-static int null_gendisk_register(struct nullb *nullb)
+static int null_init_global_tag_set(void)
{
- sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
- struct gendisk *disk = nullb->disk;
+ int error;
- set_capacity(disk, size);
-
- disk->major = null_major;
- disk->first_minor = nullb->index;
- disk->minors = 1;
- if (queue_is_mq(nullb->q))
- disk->fops = &null_rq_ops;
- else
- disk->fops = &null_bio_ops;
- disk->private_data = nullb;
- strscpy_pad(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+ if (tag_set.ops)
+ return 0;
- if (nullb->dev->zoned) {
- int ret = null_register_zoned_dev(nullb);
+ tag_set.nr_hw_queues = g_submit_queues;
+ tag_set.queue_depth = g_hw_queue_depth;
+ tag_set.numa_node = g_home_node;
+ tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ if (g_no_sched)
+ tag_set.flags |= BLK_MQ_F_NO_SCHED;
+ if (g_shared_tag_bitmap)
+ tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+ if (g_blocking)
+ tag_set.flags |= BLK_MQ_F_BLOCKING;
- if (ret)
- return ret;
- }
-
- return add_disk(disk);
+ error = null_init_tag_set(&tag_set, g_poll_queues);
+ if (error)
+ tag_set.ops = NULL;
+ return error;
}
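The global tag set is now initialized lazily, and tag_set.ops doubles as the "already set up" flag: a successful first call leaves ops pointing at null_mq_ops, a failing one clears it so a retry is possible, and null_exit() below keys off the same field to decide whether there is anything to free. The idiom in isolation (locking omitted; my_global_set is illustrative):

static struct blk_mq_tag_set my_global_set;

static int my_init_global_set_once(void)
{
	int error;

	if (my_global_set.ops)		/* already initialized */
		return 0;

	my_global_set.ops = &null_mq_ops;	/* doubles as init flag */
	/* ... fill nr_hw_queues, queue_depth, numa_node, flags ... */
	error = blk_mq_alloc_tag_set(&my_global_set);
	if (error)
		my_global_set.ops = NULL;	/* permit a later retry */
	return error;
}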
-static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
+static int null_setup_tagset(struct nullb *nullb)
{
- unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
- int hw_queues, numa_node;
- unsigned int queue_depth;
- int poll_queues;
-
- if (nullb) {
- hw_queues = nullb->dev->submit_queues;
- poll_queues = nullb->dev->poll_queues;
- queue_depth = nullb->dev->hw_queue_depth;
- numa_node = nullb->dev->home_node;
- if (nullb->dev->no_sched)
- flags |= BLK_MQ_F_NO_SCHED;
- if (nullb->dev->shared_tag_bitmap)
- flags |= BLK_MQ_F_TAG_HCTX_SHARED;
- if (nullb->dev->blocking)
- flags |= BLK_MQ_F_BLOCKING;
- } else {
- hw_queues = g_submit_queues;
- poll_queues = g_poll_queues;
- queue_depth = g_hw_queue_depth;
- numa_node = g_home_node;
- if (g_no_sched)
- flags |= BLK_MQ_F_NO_SCHED;
- if (g_shared_tag_bitmap)
- flags |= BLK_MQ_F_TAG_HCTX_SHARED;
- if (g_blocking)
- flags |= BLK_MQ_F_BLOCKING;
- }
-
- set->ops = &null_mq_ops;
- set->cmd_size = sizeof(struct nullb_cmd);
- set->flags = flags;
- set->driver_data = nullb;
- set->nr_hw_queues = hw_queues;
- set->queue_depth = queue_depth;
- set->numa_node = numa_node;
- if (poll_queues) {
- set->nr_hw_queues += poll_queues;
- set->nr_maps = 3;
- } else {
- set->nr_maps = 1;
+ if (nullb->dev->shared_tags) {
+ nullb->tag_set = &tag_set;
+ return null_init_global_tag_set();
}
- return blk_mq_alloc_tag_set(set);
+ nullb->tag_set = &nullb->__tag_set;
+ nullb->tag_set->driver_data = nullb;
+ nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues;
+ nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth;
+ nullb->tag_set->numa_node = nullb->dev->home_node;
+ nullb->tag_set->flags = BLK_MQ_F_SHOULD_MERGE;
+ if (nullb->dev->no_sched)
+ nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED;
+ if (nullb->dev->shared_tag_bitmap)
+ nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+ if (nullb->dev->blocking)
+ nullb->tag_set->flags |= BLK_MQ_F_BLOCKING;
+ return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues);
}
static int null_validate_conf(struct nullb_device *dev)
@@ -2032,11 +1789,15 @@ static int null_validate_conf(struct nullb_device *dev)
pr_err("legacy IO path is no longer available\n");
return -EINVAL;
}
+ if (dev->queue_mode == NULL_Q_BIO) {
+ pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n");
+ dev->queue_mode = NULL_Q_MQ;
+ }
dev->blocksize = round_down(dev->blocksize, 512);
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
- if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
+ if (dev->use_per_node_hctx) {
if (dev->submit_queues != nr_online_nodes)
dev->submit_queues = nr_online_nodes;
} else if (dev->submit_queues > nr_cpu_ids)
@@ -2048,8 +1809,6 @@ static int null_validate_conf(struct nullb_device *dev)
if (dev->poll_queues > g_poll_queues)
dev->poll_queues = g_poll_queues;
dev->prev_poll_queues = dev->poll_queues;
-
- dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
/* Do memory allocation, so set blocking */
@@ -2060,9 +1819,6 @@ static int null_validate_conf(struct nullb_device *dev)
dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
dev->cache_size);
dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
- /* can not stop a queue */
- if (dev->queue_mode == NULL_Q_BIO)
- dev->mbps = 0;
if (dev->zoned &&
(!dev->zone_size || !is_power_of_2(dev->zone_size))) {
@@ -2102,6 +1858,12 @@ static bool null_setup_fault(void)
static int null_add_dev(struct nullb_device *dev)
{
+ struct queue_limits lim = {
+ .logical_block_size = dev->blocksize,
+ .physical_block_size = dev->blocksize,
+ .max_hw_sectors = dev->max_sectors,
+ };
+
struct nullb *nullb;
int rv;
@@ -2123,36 +1885,25 @@ static int null_add_dev(struct nullb_device *dev)
if (rv)
goto out_free_nullb;
- if (dev->queue_mode == NULL_Q_MQ) {
- if (shared_tags) {
- nullb->tag_set = &tag_set;
- rv = 0;
- } else {
- nullb->tag_set = &nullb->__tag_set;
- rv = null_init_tag_set(nullb, nullb->tag_set);
- }
+ rv = null_setup_tagset(nullb);
+ if (rv)
+ goto out_cleanup_queues;
+ if (dev->virt_boundary)
+ lim.virt_boundary_mask = PAGE_SIZE - 1;
+ null_config_discard(nullb, &lim);
+ if (dev->zoned) {
+ rv = null_init_zoned_dev(dev, &lim);
if (rv)
- goto out_cleanup_queues;
-
- nullb->tag_set->timeout = 5 * HZ;
- nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb);
- if (IS_ERR(nullb->disk)) {
- rv = PTR_ERR(nullb->disk);
goto out_cleanup_tags;
- }
- nullb->q = nullb->disk->queue;
- } else if (dev->queue_mode == NULL_Q_BIO) {
- rv = -ENOMEM;
- nullb->disk = blk_alloc_disk(nullb->dev->home_node);
- if (!nullb->disk)
- goto out_cleanup_queues;
+ }
- nullb->q = nullb->disk->queue;
- rv = init_driver_queues(nullb);
- if (rv)
- goto out_cleanup_disk;
+ nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb);
+ if (IS_ERR(nullb->disk)) {
+ rv = PTR_ERR(nullb->disk);
+ goto out_cleanup_zone;
}
+ nullb->q = nullb->disk->queue;
if (dev->mbps) {
set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
@@ -2164,12 +1915,6 @@ static int null_add_dev(struct nullb_device *dev)
blk_queue_write_cache(nullb->q, true, true);
}
- if (dev->zoned) {
- rv = null_init_zoned_dev(dev, nullb->q);
- if (rv)
- goto out_cleanup_disk;
- }
-
nullb->q->queuedata = nullb;
blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
@@ -2177,22 +1922,12 @@ static int null_add_dev(struct nullb_device *dev)
rv = ida_alloc(&nullb_indexes, GFP_KERNEL);
if (rv < 0) {
mutex_unlock(&lock);
- goto out_cleanup_zone;
+ goto out_cleanup_disk;
}
nullb->index = rv;
dev->index = rv;
mutex_unlock(&lock);
- blk_queue_logical_block_size(nullb->q, dev->blocksize);
- blk_queue_physical_block_size(nullb->q, dev->blocksize);
- if (dev->max_sectors)
- blk_queue_max_hw_sectors(nullb->q, dev->max_sectors);
-
- if (dev->virt_boundary)
- blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1);
-
- null_config_discard(nullb);
-
if (config_item_name(&dev->group.cg_item)) {
/* Use configfs dir name as the device name */
snprintf(nullb->disk_name, sizeof(nullb->disk_name),
@@ -2201,7 +1936,22 @@ static int null_add_dev(struct nullb_device *dev)
sprintf(nullb->disk_name, "nullb%d", nullb->index);
}
- rv = null_gendisk_register(nullb);
+ set_capacity(nullb->disk,
+ ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT);
+ nullb->disk->major = null_major;
+ nullb->disk->first_minor = nullb->index;
+ nullb->disk->minors = 1;
+ nullb->disk->fops = &null_ops;
+ nullb->disk->private_data = nullb;
+ strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+
+ if (nullb->dev->zoned) {
+ rv = null_register_zoned_dev(nullb);
+ if (rv)
+ goto out_ida_free;
+ }
+
+ rv = add_disk(nullb->disk);
if (rv)
goto out_ida_free;
@@ -2220,10 +1970,10 @@ out_cleanup_zone:
out_cleanup_disk:
put_disk(nullb->disk);
out_cleanup_tags:
- if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
+ if (nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set);
out_cleanup_queues:
- cleanup_queues(nullb);
+ kfree(nullb->queues);
out_free_nullb:
kfree(nullb);
dev->nullb = NULL;
@@ -2299,7 +2049,7 @@ static int __init null_init(void)
return -EINVAL;
}
- if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
+ if (g_use_per_node_hctx) {
if (g_submit_queues != nr_online_nodes) {
pr_warn("submit_queues param is set to %u.\n",
nr_online_nodes);
@@ -2311,18 +2061,12 @@ static int __init null_init(void)
g_submit_queues = 1;
}
- if (g_queue_mode == NULL_Q_MQ && shared_tags) {
- ret = null_init_tag_set(NULL, &tag_set);
- if (ret)
- return ret;
- }
-
config_group_init(&nullb_subsys.su_group);
mutex_init(&nullb_subsys.su_mutex);
ret = configfs_register_subsystem(&nullb_subsys);
if (ret)
- goto err_tagset;
+ return ret;
mutex_init(&lock);
@@ -2349,9 +2093,6 @@ err_dev:
unregister_blkdev(null_major, "nullb");
err_conf:
configfs_unregister_subsystem(&nullb_subsys);
-err_tagset:
- if (g_queue_mode == NULL_Q_MQ && shared_tags)
- blk_mq_free_tag_set(&tag_set);
return ret;
}
@@ -2370,7 +2111,7 @@ static void __exit null_exit(void)
}
mutex_unlock(&lock);
- if (g_queue_mode == NULL_Q_MQ && shared_tags)
+ if (tag_set.ops)
blk_mq_free_tag_set(&tag_set);
}
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 929f659dd255..477b97746823 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -16,11 +16,6 @@
#include <linux/mutex.h>
struct nullb_cmd {
- union {
- struct request *rq;
- struct bio *bio;
- };
- unsigned int tag;
blk_status_t error;
bool fake_timeout;
struct nullb_queue *nq;
@@ -28,16 +23,11 @@ struct nullb_cmd {
};
struct nullb_queue {
- unsigned long *tag_map;
- wait_queue_head_t wait;
- unsigned int queue_depth;
struct nullb_device *dev;
unsigned int requeue_selection;
struct list_head poll_list;
spinlock_t poll_lock;
-
- struct nullb_cmd *cmds;
};
struct nullb_zone {
@@ -60,13 +50,6 @@ struct nullb_zone {
unsigned int capacity;
};
-/* Queue modes */
-enum {
- NULL_Q_BIO = 0,
- NULL_Q_RQ = 1,
- NULL_Q_MQ = 2,
-};
-
struct nullb_device {
struct nullb *nullb;
struct config_group group;
@@ -119,6 +102,7 @@ struct nullb_device {
bool zoned; /* if device is zoned */
bool virt_boundary; /* virtual boundary on/off for the device */
bool no_sched; /* no IO scheduler for the device */
+ bool shared_tags; /* share tag set between devices for blk-mq */
bool shared_tag_bitmap; /* use hostwide shared tags */
};
@@ -130,14 +114,12 @@ struct nullb {
struct gendisk *disk;
struct blk_mq_tag_set *tag_set;
struct blk_mq_tag_set __tag_set;
- unsigned int queue_depth;
atomic_long_t cur_bytes;
struct hrtimer bw_timer;
unsigned long cache_flush_pos;
spinlock_t lock;
struct nullb_queue *queues;
- unsigned int nr_queues;
char disk_name[DISK_NAME_LEN];
};
@@ -147,7 +129,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, unsigned int nr_sectors);
#ifdef CONFIG_BLK_DEV_ZONED
-int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q);
+int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim);
int null_register_zoned_dev(struct nullb *nullb);
void null_free_zoned_dev(struct nullb_device *dev);
int null_report_zones(struct gendisk *disk, sector_t sector,
@@ -160,7 +142,7 @@ ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
size_t count, enum blk_zone_cond cond);
#else
static inline int null_init_zoned_dev(struct nullb_device *dev,
- struct request_queue *q)
+ struct queue_limits *lim)
{
pr_err("CONFIG_BLK_DEV_ZONED not enabled\n");
return -EINVAL;
diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h
index 6b2b370e786f..ef2d05d5f0df 100644
--- a/drivers/block/null_blk/trace.h
+++ b/drivers/block/null_blk/trace.h
@@ -41,10 +41,11 @@ TRACE_EVENT(nullb_zone_op,
__field(unsigned int, zone_cond)
),
TP_fast_assign(
- __entry->op = req_op(cmd->rq);
+ __entry->op = req_op(blk_mq_rq_from_pdu(cmd));
__entry->zone_no = zone_no;
__entry->zone_cond = zone_cond;
- __assign_disk_name(__entry->disk, cmd->rq->q->disk);
+ __assign_disk_name(__entry->disk,
+ blk_mq_rq_from_pdu(cmd)->q->disk);
),
TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s",
__print_disk_name(__entry->disk),
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 6f5e0994862e..1689e2584104 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -58,7 +58,8 @@ static inline void null_unlock_zone(struct nullb_device *dev,
mutex_unlock(&zone->mutex);
}
-int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
+int null_init_zoned_dev(struct nullb_device *dev,
+ struct queue_limits *lim)
{
sector_t dev_capacity_sects, zone_capacity_sects;
struct nullb_zone *zone;
@@ -151,27 +152,22 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
sector += dev->zone_size_sects;
}
+ lim->zoned = true;
+ lim->chunk_sectors = dev->zone_size_sects;
+ lim->max_zone_append_sectors = dev->zone_size_sects;
+ lim->max_open_zones = dev->zone_max_open;
+ lim->max_active_zones = dev->zone_max_active;
return 0;
}
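Zone geometry follows the same route into queue_limits: instead of disk_set_zoned(), blk_queue_chunk_sectors() and the disk_set_max_*_zones() helpers after allocation, the zoned fields are filled into the limits that blk_mq_alloc_disk() later consumes. A compact sketch, with names mirroring this hunk:

static void my_fill_zoned_limits(struct queue_limits *lim,
				 sector_t zone_size_sects,
				 unsigned int max_open,
				 unsigned int max_active)
{
	lim->zoned = true;
	lim->chunk_sectors = zone_size_sects;	/* zone size, power of 2 */
	lim->max_zone_append_sectors = zone_size_sects;
	lim->max_open_zones = max_open;
	lim->max_active_zones = max_active;
}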
int null_register_zoned_dev(struct nullb *nullb)
{
- struct nullb_device *dev = nullb->dev;
struct request_queue *q = nullb->q;
- disk_set_zoned(nullb->disk);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
- blk_queue_chunk_sectors(q, dev->zone_size_sects);
nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0);
- blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
- disk_set_max_open_zones(nullb->disk, dev->zone_max_open);
- disk_set_max_active_zones(nullb->disk, dev->zone_max_active);
-
- if (queue_is_mq(q))
- return blk_revalidate_disk_zones(nullb->disk, NULL);
-
- return 0;
+ return blk_revalidate_disk_zones(nullb->disk, NULL);
}
void null_free_zoned_dev(struct nullb_device *dev)
@@ -394,10 +390,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
*/
if (append) {
sector = zone->wp;
- if (dev->queue_mode == NULL_Q_MQ)
- cmd->rq->__sector = sector;
- else
- cmd->bio->bi_iter.bi_sector = sector;
+ blk_mq_rq_from_pdu(cmd)->__sector = sector;
} else if (sector != zone->wp) {
ret = BLK_STS_IOERR;
goto unlock;
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index c21444716e43..21728e9ea5c3 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -828,6 +828,12 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
*/
static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
{
+ /*
+ * Some CDRW drives cannot handle writes larger than one packet,
+ * even if the size is a multiple of the packet size.
+ */
+ bio->bi_opf |= REQ_NOMERGE;
+
spin_lock(&pd->iosched.lock);
if (bio_data_dir(bio) == READ)
bio_list_add(&pd->iosched.read_queue, bio);
@@ -2191,11 +2197,6 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write)
ret = pkt_open_write(pd);
if (ret)
goto out_putdev;
- /*
- * Some CDRW drives can not handle writes larger than one packet,
- * even if the size is a multiple of the packet size.
- */
- blk_queue_max_hw_sectors(q, pd->settings.size);
set_bit(PACKET_WRITABLE, &pd->flags);
} else {
pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
@@ -2338,9 +2339,9 @@ static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
pkt_queue_bio(pd, cloned_bio);
}
-static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
+static void pkt_make_request_write(struct bio *bio)
{
- struct pktcdvd_device *pd = q->queuedata;
+ struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
sector_t zone;
struct packet_data *pkt;
int was_empty, blocked_bio;
@@ -2432,7 +2433,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
static void pkt_submit_bio(struct bio *bio)
{
- struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
+ struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data;
struct device *ddev = disk_to_dev(pd->disk);
struct bio *split;
@@ -2476,7 +2477,7 @@ static void pkt_submit_bio(struct bio *bio)
split = bio;
}
- pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split);
+ pkt_make_request_write(split);
} while (split != bio);
return;
@@ -2484,15 +2485,6 @@ end_io:
bio_io_error(bio);
}
-static void pkt_init_queue(struct pktcdvd_device *pd)
-{
- struct request_queue *q = pd->disk->queue;
-
- blk_queue_logical_block_size(q, CD_FRAMESIZE);
- blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS);
- q->queuedata = pd;
-}
-
static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
{
struct device *ddev = disk_to_dev(pd->disk);
@@ -2536,8 +2528,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
pd->bdev_file = bdev_file;
set_blocksize(file_bdev(bdev_file), CD_FRAMESIZE);
- pkt_init_queue(pd);
-
atomic_set(&pd->cdrw.pending_bios, 0);
pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name);
if (IS_ERR(pd->cdrw.thread)) {
@@ -2634,6 +2624,10 @@ static const struct block_device_operations pktcdvd_ops = {
*/
static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
{
+ struct queue_limits lim = {
+ .max_hw_sectors = PACKET_MAX_SECTORS,
+ .logical_block_size = CD_FRAMESIZE,
+ };
int idx;
int ret = -ENOMEM;
struct pktcdvd_device *pd;
@@ -2673,10 +2667,11 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
pd->write_congestion_on = write_congestion_on;
pd->write_congestion_off = write_congestion_off;
- ret = -ENOMEM;
- disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!disk)
+ disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(disk)) {
+ ret = PTR_ERR(disk);
goto out_mem;
+ }
pd->disk = disk;
disk->major = pktdev_major;
disk->first_minor = idx;
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index 36d7b36c60c7..b810ac0a5c4b 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -382,6 +382,14 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
struct ps3disk_private *priv;
int error;
unsigned int devidx;
+ struct queue_limits lim = {
+ .logical_block_size = dev->blk_size,
+ .max_hw_sectors = dev->bounce_size >> 9,
+ .max_segments = -1,
+ .max_segment_size = dev->bounce_size,
+ .dma_alignment = dev->blk_size - 1,
+ };
+
struct request_queue *queue;
struct gendisk *gendisk;
@@ -431,7 +439,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
if (error)
goto fail_teardown;
- gendisk = blk_mq_alloc_disk(&priv->tag_set, dev);
+ gendisk = blk_mq_alloc_disk(&priv->tag_set, &lim, dev);
if (IS_ERR(gendisk)) {
dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n",
__func__, __LINE__);
@@ -441,15 +449,8 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev)
queue = gendisk->queue;
- blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
- blk_queue_dma_alignment(queue, dev->blk_size-1);
- blk_queue_logical_block_size(queue, dev->blk_size);
-
blk_queue_write_cache(queue, true, false);
- blk_queue_max_segments(queue, -1);
- blk_queue_max_segment_size(queue, dev->bounce_size);
-
priv->gendisk = gendisk;
gendisk->major = ps3disk_major;
gendisk->first_minor = devidx * PS3DISK_MINORS;
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
index 38d42af01b25..bdcf083b45e2 100644
--- a/drivers/block/ps3vram.c
+++ b/drivers/block/ps3vram.c
@@ -730,10 +730,10 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev)
ps3vram_proc_init(dev);
- gendisk = blk_alloc_disk(NUMA_NO_NODE);
- if (!gendisk) {
+ gendisk = blk_alloc_disk(NULL, NUMA_NO_NODE);
+ if (IS_ERR(gendisk)) {
dev_err(&dev->core, "blk_alloc_disk failed\n");
- error = -ENOMEM;
+ error = PTR_ERR(gendisk);
goto out_cache_cleanup;
}
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
index 12b5d53ec856..26ff5cd2bf0a 100644
--- a/drivers/block/rbd.c
+++ b/drivers/block/rbd.c
@@ -575,7 +575,7 @@ static const struct attribute_group rbd_bus_group = {
};
__ATTRIBUTE_GROUPS(rbd_bus);
-static struct bus_type rbd_bus_type = {
+static const struct bus_type rbd_bus_type = {
.name = "rbd",
.bus_groups = rbd_bus_groups,
};
@@ -4952,6 +4952,14 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
struct request_queue *q;
unsigned int objset_bytes =
rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
+ struct queue_limits lim = {
+ .max_hw_sectors = objset_bytes >> SECTOR_SHIFT,
+ .max_user_sectors = objset_bytes >> SECTOR_SHIFT,
+ .io_min = rbd_dev->opts->alloc_size,
+ .io_opt = rbd_dev->opts->alloc_size,
+ .max_segments = USHRT_MAX,
+ .max_segment_size = UINT_MAX,
+ };
int err;
memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
@@ -4966,7 +4974,13 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
if (err)
return err;
- disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
+ if (rbd_dev->opts->trim) {
+ lim.discard_granularity = rbd_dev->opts->alloc_size;
+ lim.max_hw_discard_sectors = objset_bytes >> SECTOR_SHIFT;
+ lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT;
+ }
+
+ disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev);
if (IS_ERR(disk)) {
err = PTR_ERR(disk);
goto out_tag_set;
@@ -4987,19 +5001,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev)
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
- blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
- q->limits.max_sectors = queue_max_hw_sectors(q);
- blk_queue_max_segments(q, USHRT_MAX);
- blk_queue_max_segment_size(q, UINT_MAX);
- blk_queue_io_min(q, rbd_dev->opts->alloc_size);
- blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
-
- if (rbd_dev->opts->trim) {
- q->limits.discard_granularity = rbd_dev->opts->alloc_size;
- blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
- blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
- }
-
if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
index 4044c369d22a..b7ffe03c6160 100644
--- a/drivers/block/rnbd/rnbd-clt.c
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -1329,43 +1329,6 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
}
}
-static void setup_request_queue(struct rnbd_clt_dev *dev,
- struct rnbd_msg_open_rsp *rsp)
-{
- blk_queue_logical_block_size(dev->queue,
- le16_to_cpu(rsp->logical_block_size));
- blk_queue_physical_block_size(dev->queue,
- le16_to_cpu(rsp->physical_block_size));
- blk_queue_max_hw_sectors(dev->queue,
- dev->sess->max_io_size / SECTOR_SIZE);
-
- /*
- * we don't support discards to "discontiguous" segments
- * in on request
- */
- blk_queue_max_discard_segments(dev->queue, 1);
-
- blk_queue_max_discard_sectors(dev->queue,
- le32_to_cpu(rsp->max_discard_sectors));
- dev->queue->limits.discard_granularity =
- le32_to_cpu(rsp->discard_granularity);
- dev->queue->limits.discard_alignment =
- le32_to_cpu(rsp->discard_alignment);
- if (le16_to_cpu(rsp->secure_discard))
- blk_queue_max_secure_erase_sectors(dev->queue,
- le32_to_cpu(rsp->max_discard_sectors));
- blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
- blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
- blk_queue_max_segments(dev->queue, dev->sess->max_segments);
- blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
- blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
- blk_queue_write_cache(dev->queue,
- !!(rsp->cache_policy & RNBD_WRITEBACK),
- !!(rsp->cache_policy & RNBD_FUA));
- blk_queue_max_write_zeroes_sectors(dev->queue,
- le32_to_cpu(rsp->max_write_zeroes_sectors));
-}
-
static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp, int idx)
{
@@ -1403,18 +1366,41 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
struct rnbd_msg_open_rsp *rsp)
{
+ struct queue_limits lim = {
+ .logical_block_size = le16_to_cpu(rsp->logical_block_size),
+ .physical_block_size = le16_to_cpu(rsp->physical_block_size),
+ .io_opt = dev->sess->max_io_size,
+ .max_hw_sectors = dev->sess->max_io_size / SECTOR_SIZE,
+ .max_hw_discard_sectors = le32_to_cpu(rsp->max_discard_sectors),
+ .discard_granularity = le32_to_cpu(rsp->discard_granularity),
+ .discard_alignment = le32_to_cpu(rsp->discard_alignment),
+ .max_segments = dev->sess->max_segments,
+ .virt_boundary_mask = SZ_4K - 1,
+ .max_write_zeroes_sectors =
+ le32_to_cpu(rsp->max_write_zeroes_sectors),
+ };
int idx = dev->clt_device_id;
dev->size = le64_to_cpu(rsp->nsectors) *
le16_to_cpu(rsp->logical_block_size);
- dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev);
+ if (rsp->secure_discard) {
+ lim.max_secure_erase_sectors =
+ le32_to_cpu(rsp->max_discard_sectors);
+ }
+
+ dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev);
if (IS_ERR(dev->gd))
return PTR_ERR(dev->gd);
dev->queue = dev->gd->queue;
rnbd_init_mq_hw_queues(dev);
- setup_request_queue(dev, rsp);
+ blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
+ blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
+ blk_queue_write_cache(dev->queue,
+ !!(rsp->cache_policy & RNBD_WRITEBACK),
+ !!(rsp->cache_policy & RNBD_FUA));
+
return rnbd_clt_setup_gen_disk(dev, rsp, idx);
}
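rnbd shows the same conversion for a network client: every limit advertised by the server in rnbd_msg_open_rsp is endian-converted once into the initializer, and conditional capabilities (secure erase) are patched in before allocation. A reduced sketch of that shape, trimmed to two fields:

/* Sketch: wire response -> queue_limits; field names follow the hunk. */
static int my_setup_device(struct rnbd_clt_dev *dev,
			   struct rnbd_msg_open_rsp *rsp)
{
	struct queue_limits lim = {
		.logical_block_size = le16_to_cpu(rsp->logical_block_size),
		.max_hw_discard_sectors = le32_to_cpu(rsp->max_discard_sectors),
	};

	if (rsp->secure_discard)	/* optional server capability */
		lim.max_secure_erase_sectors =
			le32_to_cpu(rsp->max_discard_sectors);

	dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev);
	return PTR_ERR_OR_ZERO(dev->gd);
}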
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 7bf4b48e2282..c99dd6698977 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -784,6 +784,14 @@ static const struct blk_mq_ops vdc_mq_ops = {
static int probe_disk(struct vdc_port *port)
{
+ struct queue_limits lim = {
+ .physical_block_size = port->vdisk_phys_blksz,
+ .max_hw_sectors = port->max_xfer_size,
+ /* Each segment in a request is up to an aligned page in size. */
+ .seg_boundary_mask = PAGE_SIZE - 1,
+ .max_segment_size = PAGE_SIZE,
+ .max_segments = port->ring_cookies,
+ };
struct request_queue *q;
struct gendisk *g;
int err;
@@ -824,7 +832,7 @@ static int probe_disk(struct vdc_port *port)
if (err)
return err;
- g = blk_mq_alloc_disk(&port->tag_set, port);
+ g = blk_mq_alloc_disk(&port->tag_set, &lim, port);
if (IS_ERR(g)) {
printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
port->vio.name);
@@ -835,12 +843,6 @@ static int probe_disk(struct vdc_port *port)
port->disk = g;
q = g->queue;
- /* Each segment in a request is up to an aligned page in size. */
- blk_queue_segment_boundary(q, PAGE_SIZE - 1);
- blk_queue_max_segment_size(q, PAGE_SIZE);
-
- blk_queue_max_segments(q, port->ring_cookies);
- blk_queue_max_hw_sectors(q, port->max_xfer_size);
g->major = vdc_major;
g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT;
g->minors = 1 << PARTITION_SHIFT;
@@ -872,8 +874,6 @@ static int probe_disk(struct vdc_port *port)
}
}
- blk_queue_physical_block_size(q, port->vdisk_phys_blksz);
-
pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n",
g->disk_name,
port->vdisk_size, (port->vdisk_size >> (20 - 9)),
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index f85b6af414b4..6731678f3a41 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -820,7 +820,7 @@ static int swim_floppy_init(struct swim_priv *swd)
goto exit_put_disks;
swd->unit[drive].disk =
- blk_mq_alloc_disk(&swd->unit[drive].tag_set,
+ blk_mq_alloc_disk(&swd->unit[drive].tag_set, NULL,
&swd->unit[drive]);
if (IS_ERR(swd->unit[drive].disk)) {
blk_mq_free_tag_set(&swd->unit[drive].tag_set);
@@ -916,7 +916,7 @@ out:
return ret;
}
-static int swim_remove(struct platform_device *dev)
+static void swim_remove(struct platform_device *dev)
{
struct swim_priv *swd = platform_get_drvdata(dev);
int drive;
@@ -937,13 +937,11 @@ static int swim_remove(struct platform_device *dev)
release_mem_region(res->start, resource_size(res));
kfree(swd);
-
- return 0;
}
static struct platform_driver swim_driver = {
.probe = swim_probe,
- .remove = swim_remove,
+ .remove_new = swim_remove,
.driver = {
.name = CARDNAME,
},
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index c2bc85826358..a04756ac778e 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -1210,7 +1210,7 @@ static int swim3_attach(struct macio_dev *mdev,
if (rc)
goto out_unregister;
- disk = blk_mq_alloc_disk(&fs->tag_set, fs);
+ disk = blk_mq_alloc_disk(&fs->tag_set, NULL, fs);
if (IS_ERR(disk)) {
rc = PTR_ERR(disk);
goto out_free_tag_set;
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index 1dfb2e77898b..bea3d5cf8a83 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -246,21 +246,12 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
return 0;
}
-static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
- const struct ublk_param_zoned *p = &ub->params.zoned;
-
- disk_set_zoned(ub->ub_disk);
blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue);
blk_queue_required_elevator_features(ub->ub_disk->queue,
ELEVATOR_F_ZBD_SEQ_WRITE);
- disk_set_max_active_zones(ub->ub_disk, p->max_active_zones);
- disk_set_max_open_zones(ub->ub_disk, p->max_open_zones);
- blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors);
-
ub->ub_disk->nr_zones = ublk_get_nr_zones(ub);
-
- return 0;
}
/* Based on virtblk_alloc_report_buffer */
@@ -432,9 +423,8 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub)
return -EOPNOTSUPP;
}
-static int ublk_dev_param_zoned_apply(struct ublk_device *ub)
+static void ublk_dev_param_zoned_apply(struct ublk_device *ub)
{
- return -EOPNOTSUPP;
}
static int ublk_revalidate_disk_zones(struct ublk_device *ub)
@@ -498,11 +488,6 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
struct request_queue *q = ub->ub_disk->queue;
const struct ublk_param_basic *p = &ub->params.basic;
- blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
- blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
- blk_queue_io_min(q, 1 << p->io_min_shift);
- blk_queue_io_opt(q, 1 << p->io_opt_shift);
-
blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
p->attrs & UBLK_ATTR_FUA);
if (p->attrs & UBLK_ATTR_ROTATIONAL)
@@ -510,29 +495,12 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub)
else
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
- blk_queue_max_hw_sectors(q, p->max_sectors);
- blk_queue_chunk_sectors(q, p->chunk_sectors);
- blk_queue_virt_boundary(q, p->virt_boundary_mask);
-
if (p->attrs & UBLK_ATTR_READ_ONLY)
set_disk_ro(ub->ub_disk, true);
set_capacity(ub->ub_disk, p->dev_sectors);
}
-static void ublk_dev_param_discard_apply(struct ublk_device *ub)
-{
- struct request_queue *q = ub->ub_disk->queue;
- const struct ublk_param_discard *p = &ub->params.discard;
-
- q->limits.discard_alignment = p->discard_alignment;
- q->limits.discard_granularity = p->discard_granularity;
- blk_queue_max_discard_sectors(q, p->max_discard_sectors);
- blk_queue_max_write_zeroes_sectors(q,
- p->max_write_zeroes_sectors);
- blk_queue_max_discard_segments(q, p->max_discard_segments);
-}
-
static int ublk_validate_params(const struct ublk_device *ub)
{
/* basic param is the only one which must be set */
@@ -576,20 +544,12 @@ static int ublk_validate_params(const struct ublk_device *ub)
return 0;
}
-static int ublk_apply_params(struct ublk_device *ub)
+static void ublk_apply_params(struct ublk_device *ub)
{
- if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
- return -EINVAL;
-
ublk_dev_param_basic_apply(ub);
- if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
- ublk_dev_param_discard_apply(ub);
-
if (ub->params.types & UBLK_PARAM_TYPE_ZONED)
- return ublk_dev_param_zoned_apply(ub);
-
- return 0;
+ ublk_dev_param_zoned_apply(ub);
}
static inline bool ublk_support_user_copy(const struct ublk_queue *ubq)
@@ -645,14 +605,16 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
return ubq->flags & UBLK_F_NEED_GET_DATA;
}
-static struct ublk_device *ublk_get_device(struct ublk_device *ub)
+/* Called in the slow path only; keep it noinline for tracing purposes */
+static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub)
{
if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
return ub;
return NULL;
}
-static void ublk_put_device(struct ublk_device *ub)
+/* Called in the slow path only; keep it noinline for tracing purposes */
+static noinline void ublk_put_device(struct ublk_device *ub)
{
put_device(&ub->cdev_dev);
}
@@ -711,7 +673,7 @@ static void ublk_free_disk(struct gendisk *disk)
struct ublk_device *ub = disk->private_data;
clear_bit(UB_STATE_USED, &ub->state);
- put_device(&ub->cdev_dev);
+ ublk_put_device(ub);
}
static void ublk_store_owner_uid_gid(unsigned int *owner_uid,
@@ -2182,7 +2144,7 @@ static void ublk_remove(struct ublk_device *ub)
cancel_work_sync(&ub->stop_work);
cancel_work_sync(&ub->quiesce_work);
cdev_device_del(&ub->cdev, &ub->cdev_dev);
- put_device(&ub->cdev_dev);
+ ublk_put_device(ub);
ublks_added--;
}
@@ -2205,12 +2167,47 @@ static struct ublk_device *ublk_get_device_from_id(int idx)
static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
{
const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe);
+ const struct ublk_param_basic *p = &ub->params.basic;
int ublksrv_pid = (int)header->data[0];
+ struct queue_limits lim = {
+ .logical_block_size = 1 << p->logical_bs_shift,
+ .physical_block_size = 1 << p->physical_bs_shift,
+ .io_min = 1 << p->io_min_shift,
+ .io_opt = 1 << p->io_opt_shift,
+ .max_hw_sectors = p->max_sectors,
+ .chunk_sectors = p->chunk_sectors,
+ .virt_boundary_mask = p->virt_boundary_mask,
+ };
struct gendisk *disk;
int ret = -EINVAL;
if (ublksrv_pid <= 0)
return -EINVAL;
+ if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
+ return -EINVAL;
+
+ if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
+ const struct ublk_param_discard *pd = &ub->params.discard;
+
+ lim.discard_alignment = pd->discard_alignment;
+ lim.discard_granularity = pd->discard_granularity;
+ lim.max_hw_discard_sectors = pd->max_discard_sectors;
+ lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors;
+ lim.max_discard_segments = pd->max_discard_segments;
+ }
+
+ if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
+ const struct ublk_param_zoned *p = &ub->params.zoned;
+
+ if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
+ return -EOPNOTSUPP;
+
+ lim.zoned = true;
+ lim.max_active_zones = p->max_active_zones;
+ lim.max_open_zones = p->max_open_zones;
+ lim.max_zone_append_sectors = p->max_zone_append_sectors;
+ }
if (wait_for_completion_interruptible(&ub->completion) != 0)
return -EINTR;
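For ublk the limit construction also absorbs the validation that used to live in ublk_apply_params(): the basic parameter group is mandatory, and the optional discard and zoned groups are layered onto the initializer, so apply can no longer fail. The control flow, compressed to its checks:

/* Sketch of the layered parameter handling in ublk_ctrl_start_dev(). */
if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
	return -EINVAL;			/* basic limits are mandatory */

if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
	/* copy ub->params.discard into the lim discard fields */
}

if (ub->params.types & UBLK_PARAM_TYPE_ZONED) {
	if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED))
		return -EOPNOTSUPP;	/* compiled without zoned support */
	/* copy ub->params.zoned into lim.zoned, lim.max_*_zones, ... */
}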
@@ -2222,7 +2219,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
goto out_unlock;
}
- disk = blk_mq_alloc_disk(&ub->tag_set, NULL);
+ disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL);
if (IS_ERR(disk)) {
ret = PTR_ERR(disk);
goto out_unlock;
@@ -2234,15 +2231,13 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
ub->dev_info.ublksrv_pid = ublksrv_pid;
ub->ub_disk = disk;
- ret = ublk_apply_params(ub);
- if (ret)
- goto out_put_disk;
+ ublk_apply_params(ub);
/* don't probe partitions if any one ubq daemon is un-trusted */
if (ub->nr_privileged_daemon != ub->nr_queues_ready)
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
- get_device(&ub->cdev_dev);
+ ublk_get_device(ub);
ub->dev_info.state = UBLK_S_DEV_LIVE;
if (ublk_dev_is_zoned(ub)) {
@@ -2262,7 +2257,6 @@ out_put_cdev:
ub->dev_info.state = UBLK_S_DEV_DEAD;
ublk_put_device(ub);
}
-out_put_disk:
if (ret)
put_disk(disk);
out_unlock:
@@ -2474,7 +2468,7 @@ static inline bool ublk_idr_freed(int id)
return ptr == NULL;
}
-static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
+static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait)
{
struct ublk_device *ub = *p_ub;
int idx = ub->ub_number;
@@ -2508,7 +2502,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
* - the device number is freed already, we will not find this
* device via ublk_get_device_from_id()
*/
- if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
+ if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx)))
return -EINTR;
return 0;
}
@@ -2907,7 +2901,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
ret = ublk_ctrl_add_dev(cmd);
break;
case UBLK_CMD_DEL_DEV:
- ret = ublk_ctrl_del_dev(&ub);
+ ret = ublk_ctrl_del_dev(&ub, true);
+ break;
+ case UBLK_U_CMD_DEL_DEV_ASYNC:
+ ret = ublk_ctrl_del_dev(&ub, false);
break;
case UBLK_CMD_GET_QUEUE_AFFINITY:
ret = ublk_ctrl_get_queue_affinity(ub, cmd);
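The ublk hunks above are the template for this series' central change: a driver now collects its limits in a stack-allocated struct queue_limits and hands them to blk_mq_alloc_disk(), so they are validated and applied atomically at allocation time instead of being poked into the queue one call at a time. A minimal sketch of the pattern, with a hypothetical "foo" driver standing in for ublk:

	#include <linux/blk-mq.h>

	struct foo_device {
		struct blk_mq_tag_set tag_set;
		struct gendisk *disk;
	};

	static int foo_add_disk(struct foo_device *foo)
	{
		/* fill in every limit before any disk or queue exists */
		struct queue_limits lim = {
			.logical_block_size	= 512,
			.max_hw_sectors		= UINT_MAX,
		};
		struct gendisk *disk;

		disk = blk_mq_alloc_disk(&foo->tag_set, &lim, foo);
		if (IS_ERR(disk))
			return PTR_ERR(disk);	/* no half-applied limits to unwind */
		foo->disk = disk;
		return 0;
	}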
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 2bf14a0e2815..42dea7601d87 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -720,25 +720,24 @@ fail_report:
return ret;
}
-static int virtblk_probe_zoned_device(struct virtio_device *vdev,
- struct virtio_blk *vblk,
- struct request_queue *q)
+static int virtblk_read_zoned_limits(struct virtio_blk *vblk,
+ struct queue_limits *lim)
{
+ struct virtio_device *vdev = vblk->vdev;
u32 v, wg;
dev_dbg(&vdev->dev, "probing host-managed zoned device\n");
- disk_set_zoned(vblk->disk);
- blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
+ lim->zoned = true;
virtio_cread(vdev, struct virtio_blk_config,
zoned.max_open_zones, &v);
- disk_set_max_open_zones(vblk->disk, v);
+ lim->max_open_zones = v;
dev_dbg(&vdev->dev, "max open zones = %u\n", v);
virtio_cread(vdev, struct virtio_blk_config,
zoned.max_active_zones, &v);
- disk_set_max_active_zones(vblk->disk, v);
+ lim->max_active_zones = v;
dev_dbg(&vdev->dev, "max active zones = %u\n", v);
virtio_cread(vdev, struct virtio_blk_config,
@@ -747,8 +746,8 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
dev_warn(&vdev->dev, "zero write granularity reported\n");
return -ENODEV;
}
- blk_queue_physical_block_size(q, wg);
- blk_queue_io_min(q, wg);
+ lim->physical_block_size = wg;
+ lim->io_min = wg;
dev_dbg(&vdev->dev, "write granularity = %u\n", wg);
@@ -764,13 +763,13 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
vblk->zone_sectors);
return -ENODEV;
}
- blk_queue_chunk_sectors(q, vblk->zone_sectors);
+ lim->chunk_sectors = vblk->zone_sectors;
dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors);
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
dev_warn(&vblk->vdev->dev,
"ignoring negotiated F_DISCARD for zoned device\n");
- blk_queue_max_discard_sectors(q, 0);
+ lim->max_hw_discard_sectors = 0;
}
virtio_cread(vdev, struct virtio_blk_config,
@@ -785,25 +784,21 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev,
wg, v);
return -ENODEV;
}
- blk_queue_max_zone_append_sectors(q, v);
+ lim->max_zone_append_sectors = v;
dev_dbg(&vdev->dev, "max append sectors = %u\n", v);
- return blk_revalidate_disk_zones(vblk->disk, NULL);
+ return 0;
}
-
#else
-
/*
- * Zoned block device support is not configured in this kernel.
- * Host-managed zoned devices can't be supported, but others are
- * good to go as regular block devices.
+ * Zoned block device support is not configured in this kernel; host-managed
+ * zoned devices can't be supported.
*/
#define virtblk_report_zones NULL
-
-static inline int virtblk_probe_zoned_device(struct virtio_device *vdev,
- struct virtio_blk *vblk, struct request_queue *q)
+static inline int virtblk_read_zoned_limits(struct virtio_blk *vblk,
+ struct queue_limits *lim)
{
- dev_err(&vdev->dev,
+ dev_err(&vblk->vdev->dev,
"virtio_blk: zoned devices are not supported");
return -EOPNOTSUPP;
}
@@ -1248,31 +1243,17 @@ static const struct blk_mq_ops virtio_mq_ops = {
static unsigned int virtblk_queue_depth;
module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
-static int virtblk_probe(struct virtio_device *vdev)
+static int virtblk_read_limits(struct virtio_blk *vblk,
+ struct queue_limits *lim)
{
- struct virtio_blk *vblk;
- struct request_queue *q;
- int err, index;
-
+ struct virtio_device *vdev = vblk->vdev;
u32 v, blk_size, max_size, sg_elems, opt_io_size;
u32 max_discard_segs = 0;
u32 discard_granularity = 0;
u16 min_io_size;
u8 physical_block_exp, alignment_offset;
- unsigned int queue_depth;
size_t max_dma_size;
-
- if (!vdev->config->get) {
- dev_err(&vdev->dev, "%s failure: config access disabled\n",
- __func__);
- return -EINVAL;
- }
-
- err = ida_alloc_range(&vd_index_ida, 0,
- minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
- if (err < 0)
- goto out;
- index = err;
+ int err;
/* We need to know how many segments before we allocate. */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
@@ -1286,78 +1267,11 @@ static int virtblk_probe(struct virtio_device *vdev)
/* Prevent integer overflows and honor max vq size */
sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2);
- vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
- if (!vblk) {
- err = -ENOMEM;
- goto out_free_index;
- }
-
- mutex_init(&vblk->vdev_mutex);
-
- vblk->vdev = vdev;
-
- INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
-
- err = init_vq(vblk);
- if (err)
- goto out_free_vblk;
-
- /* Default queue sizing is to fill the ring. */
- if (!virtblk_queue_depth) {
- queue_depth = vblk->vqs[0].vq->num_free;
- /* ... but without indirect descs, we use 2 descs per req */
- if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
- queue_depth /= 2;
- } else {
- queue_depth = virtblk_queue_depth;
- }
-
- memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
- vblk->tag_set.ops = &virtio_mq_ops;
- vblk->tag_set.queue_depth = queue_depth;
- vblk->tag_set.numa_node = NUMA_NO_NODE;
- vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
- vblk->tag_set.cmd_size =
- sizeof(struct virtblk_req) +
- sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
- vblk->tag_set.driver_data = vblk;
- vblk->tag_set.nr_hw_queues = vblk->num_vqs;
- vblk->tag_set.nr_maps = 1;
- if (vblk->io_queues[HCTX_TYPE_POLL])
- vblk->tag_set.nr_maps = 3;
-
- err = blk_mq_alloc_tag_set(&vblk->tag_set);
- if (err)
- goto out_free_vq;
-
- vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk);
- if (IS_ERR(vblk->disk)) {
- err = PTR_ERR(vblk->disk);
- goto out_free_tags;
- }
- q = vblk->disk->queue;
-
- virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
-
- vblk->disk->major = major;
- vblk->disk->first_minor = index_to_minor(index);
- vblk->disk->minors = 1 << PART_BITS;
- vblk->disk->private_data = vblk;
- vblk->disk->fops = &virtblk_fops;
- vblk->index = index;
-
- /* configure queue flush support */
- virtblk_update_cache_mode(vdev);
-
- /* If disk is read-only in the host, the guest should obey */
- if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
- set_disk_ro(vblk->disk, 1);
-
/* We can handle whatever the host told us to handle. */
- blk_queue_max_segments(q, sg_elems);
+ lim->max_segments = sg_elems;
/* No real sector limit. */
- blk_queue_max_hw_sectors(q, UINT_MAX);
+ lim->max_hw_sectors = UINT_MAX;
max_dma_size = virtio_max_dma_size(vdev);
max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
@@ -1369,7 +1283,7 @@ static int virtblk_probe(struct virtio_device *vdev)
if (!err)
max_size = min(max_size, v);
- blk_queue_max_segment_size(q, max_size);
+ lim->max_segment_size = max_size;
/* Host can optionally specify the block size of the device */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
@@ -1381,38 +1295,37 @@ static int virtblk_probe(struct virtio_device *vdev)
dev_err(&vdev->dev,
"virtio_blk: invalid block size: 0x%x\n",
blk_size);
- goto out_cleanup_disk;
+ return err;
}
- blk_queue_logical_block_size(q, blk_size);
+ lim->logical_block_size = blk_size;
} else
- blk_size = queue_logical_block_size(q);
+ blk_size = lim->logical_block_size;
/* Use topology information if available */
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, physical_block_exp,
&physical_block_exp);
if (!err && physical_block_exp)
- blk_queue_physical_block_size(q,
- blk_size * (1 << physical_block_exp));
+ lim->physical_block_size = blk_size * (1 << physical_block_exp);
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, alignment_offset,
&alignment_offset);
if (!err && alignment_offset)
- blk_queue_alignment_offset(q, blk_size * alignment_offset);
+ lim->alignment_offset = blk_size * alignment_offset;
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, min_io_size,
&min_io_size);
if (!err && min_io_size)
- blk_queue_io_min(q, blk_size * min_io_size);
+ lim->io_min = blk_size * min_io_size;
err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
struct virtio_blk_config, opt_io_size,
&opt_io_size);
if (!err && opt_io_size)
- blk_queue_io_opt(q, blk_size * opt_io_size);
+ lim->io_opt = blk_size * opt_io_size;
if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
virtio_cread(vdev, struct virtio_blk_config,
@@ -1420,7 +1333,7 @@ static int virtblk_probe(struct virtio_device *vdev)
virtio_cread(vdev, struct virtio_blk_config,
max_discard_sectors, &v);
- blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);
+ lim->max_hw_discard_sectors = v ? v : UINT_MAX;
virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
&max_discard_segs);
@@ -1429,7 +1342,7 @@ static int virtblk_probe(struct virtio_device *vdev)
if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
virtio_cread(vdev, struct virtio_blk_config,
max_write_zeroes_sectors, &v);
- blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
+ lim->max_write_zeroes_sectors = v ? v : UINT_MAX;
}
/* The discard and secure erase limits are combined since the Linux
@@ -1455,8 +1368,7 @@ static int virtblk_probe(struct virtio_device *vdev)
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: secure_erase_sector_alignment can't be 0\n");
- err = -EINVAL;
- goto out_cleanup_disk;
+ return -EINVAL;
}
discard_granularity = min_not_zero(discard_granularity, v);
@@ -1470,11 +1382,10 @@ static int virtblk_probe(struct virtio_device *vdev)
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_sectors can't be 0\n");
- err = -EINVAL;
- goto out_cleanup_disk;
+ return -EINVAL;
}
- blk_queue_max_secure_erase_sectors(q, v);
+ lim->max_secure_erase_sectors = v;
virtio_cread(vdev, struct virtio_blk_config,
max_secure_erase_seg, &v);
@@ -1485,8 +1396,7 @@ static int virtblk_probe(struct virtio_device *vdev)
if (!v) {
dev_err(&vdev->dev,
"virtio_blk: max_secure_erase_seg can't be 0\n");
- err = -EINVAL;
- goto out_cleanup_disk;
+ return -EINVAL;
}
max_discard_segs = min_not_zero(max_discard_segs, v);
@@ -1502,45 +1412,142 @@ static int virtblk_probe(struct virtio_device *vdev)
if (!max_discard_segs)
max_discard_segs = sg_elems;
- blk_queue_max_discard_segments(q,
- min(max_discard_segs, MAX_DISCARD_SEGMENTS));
+ lim->max_discard_segments =
+ min(max_discard_segs, MAX_DISCARD_SEGMENTS);
if (discard_granularity)
- q->limits.discard_granularity = discard_granularity << SECTOR_SHIFT;
+ lim->discard_granularity =
+ discard_granularity << SECTOR_SHIFT;
else
- q->limits.discard_granularity = blk_size;
+ lim->discard_granularity = blk_size;
}
- virtblk_update_capacity(vblk, false);
- virtio_device_ready(vdev);
-
- /*
- * All steps that follow use the VQs therefore they need to be
- * placed after the virtio_device_ready() call above.
- */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) {
u8 model;
- virtio_cread(vdev, struct virtio_blk_config, zoned.model,
- &model);
+ virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model);
switch (model) {
case VIRTIO_BLK_Z_NONE:
case VIRTIO_BLK_Z_HA:
- /* Present the host-aware device as non-zoned */
- break;
+ /* treat host-aware devices as non-zoned */
+ return 0;
case VIRTIO_BLK_Z_HM:
- err = virtblk_probe_zoned_device(vdev, vblk, q);
+ err = virtblk_read_zoned_limits(vblk, lim);
if (err)
- goto out_cleanup_disk;
+ return err;
break;
default:
- dev_err(&vdev->dev, "unsupported zone model %d\n",
- model);
- err = -EINVAL;
- goto out_cleanup_disk;
+ dev_err(&vdev->dev, "unsupported zone model %d\n", model);
+ return -EINVAL;
}
}
+ return 0;
+}
+
+static int virtblk_probe(struct virtio_device *vdev)
+{
+ struct virtio_blk *vblk;
+ struct queue_limits lim = { };
+ int err, index;
+ unsigned int queue_depth;
+
+ if (!vdev->config->get) {
+ dev_err(&vdev->dev, "%s failure: config access disabled\n",
+ __func__);
+ return -EINVAL;
+ }
+
+ err = ida_alloc_range(&vd_index_ida, 0,
+ minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
+ if (err < 0)
+ goto out;
+ index = err;
+
+ vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
+ if (!vblk) {
+ err = -ENOMEM;
+ goto out_free_index;
+ }
+
+ mutex_init(&vblk->vdev_mutex);
+
+ vblk->vdev = vdev;
+
+ INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
+
+ err = init_vq(vblk);
+ if (err)
+ goto out_free_vblk;
+
+ /* Default queue sizing is to fill the ring. */
+ if (!virtblk_queue_depth) {
+ queue_depth = vblk->vqs[0].vq->num_free;
+ /* ... but without indirect descs, we use 2 descs per req */
+ if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
+ queue_depth /= 2;
+ } else {
+ queue_depth = virtblk_queue_depth;
+ }
+
+ memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
+ vblk->tag_set.ops = &virtio_mq_ops;
+ vblk->tag_set.queue_depth = queue_depth;
+ vblk->tag_set.numa_node = NUMA_NO_NODE;
+ vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+ vblk->tag_set.cmd_size =
+ sizeof(struct virtblk_req) +
+ sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
+ vblk->tag_set.driver_data = vblk;
+ vblk->tag_set.nr_hw_queues = vblk->num_vqs;
+ vblk->tag_set.nr_maps = 1;
+ if (vblk->io_queues[HCTX_TYPE_POLL])
+ vblk->tag_set.nr_maps = 3;
+
+ err = blk_mq_alloc_tag_set(&vblk->tag_set);
+ if (err)
+ goto out_free_vq;
+
+ err = virtblk_read_limits(vblk, &lim);
+ if (err)
+ goto out_free_tags;
+
+ vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, &lim, vblk);
+ if (IS_ERR(vblk->disk)) {
+ err = PTR_ERR(vblk->disk);
+ goto out_free_tags;
+ }
+
+ virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
+
+ vblk->disk->major = major;
+ vblk->disk->first_minor = index_to_minor(index);
+ vblk->disk->minors = 1 << PART_BITS;
+ vblk->disk->private_data = vblk;
+ vblk->disk->fops = &virtblk_fops;
+ vblk->index = index;
+
+ /* configure queue flush support */
+ virtblk_update_cache_mode(vdev);
+
+ /* If disk is read-only in the host, the guest should obey */
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
+ set_disk_ro(vblk->disk, 1);
+
+ virtblk_update_capacity(vblk, false);
+ virtio_device_ready(vdev);
+
+ /*
+ * All steps that follow use the VQs therefore they need to be
+ * placed after the virtio_device_ready() call above.
+ */
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) {
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue);
+ err = blk_revalidate_disk_zones(vblk->disk, NULL);
+ if (err)
+ goto out_cleanup_disk;
+ }
+
err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
if (err)
goto out_cleanup_disk;
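The virtio_blk refactor above shows why the new calling convention pays off: all limit discovery moves into virtblk_read_limits(), which runs before the gendisk exists, so a bad configuration returns an error directly and the old out_cleanup_disk unwinding for limit failures disappears. A sketch of the same shape, where the "bar" driver and bar_config_read() are hypothetical stand-ins for the virtio config-space reads:

	#include <linux/blk-mq.h>
	#include <linux/log2.h>

	struct bar_device {
		struct blk_mq_tag_set tag_set;
		struct gendisk *disk;
	};

	/* stands in for real config-space reads */
	static u32 bar_config_read(struct bar_device *bar)
	{
		return 512;
	}

	static int bar_read_limits(struct bar_device *bar, struct queue_limits *lim)
	{
		u32 blk_size = bar_config_read(bar);

		if (!is_power_of_2(blk_size))
			return -EINVAL;		/* nothing allocated yet */
		lim->logical_block_size = blk_size;
		return 0;
	}

	static int bar_add_disk(struct bar_device *bar)
	{
		struct queue_limits lim = { };
		int err;

		err = bar_read_limits(bar, &lim);
		if (err)
			return err;
		bar->disk = blk_mq_alloc_disk(&bar->tag_set, &lim, bar);
		return PTR_ERR_OR_ZERO(bar->disk);
	}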
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 434fab306777..fd7c0ff2139c 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -941,39 +941,35 @@ static const struct blk_mq_ops blkfront_mq_ops = {
.complete = blkif_complete_rq,
};
-static void blkif_set_queue_limits(struct blkfront_info *info)
+static void blkif_set_queue_limits(const struct blkfront_info *info,
+ struct queue_limits *lim)
{
- struct request_queue *rq = info->rq;
- struct gendisk *gd = info->gd;
unsigned int segments = info->max_indirect_segments ? :
BLKIF_MAX_SEGMENTS_PER_REQUEST;
- blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
-
if (info->feature_discard) {
- blk_queue_max_discard_sectors(rq, get_capacity(gd));
- rq->limits.discard_granularity = info->discard_granularity ?:
- info->physical_sector_size;
- rq->limits.discard_alignment = info->discard_alignment;
+ lim->max_hw_discard_sectors = UINT_MAX;
+ if (info->discard_granularity)
+ lim->discard_granularity = info->discard_granularity;
+ lim->discard_alignment = info->discard_alignment;
if (info->feature_secdiscard)
- blk_queue_max_secure_erase_sectors(rq,
- get_capacity(gd));
+ lim->max_secure_erase_sectors = UINT_MAX;
}
/* Hard sector size and max sectors impersonate the equiv. hardware. */
- blk_queue_logical_block_size(rq, info->sector_size);
- blk_queue_physical_block_size(rq, info->physical_sector_size);
- blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512);
+ lim->logical_block_size = info->sector_size;
+ lim->physical_block_size = info->physical_sector_size;
+ lim->max_hw_sectors = (segments * XEN_PAGE_SIZE) / 512;
/* Each segment in a request is up to an aligned page in size. */
- blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
- blk_queue_max_segment_size(rq, PAGE_SIZE);
+ lim->seg_boundary_mask = PAGE_SIZE - 1;
+ lim->max_segment_size = PAGE_SIZE;
/* Ensure a merged request will fit in a single I/O ring slot. */
- blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);
+ lim->max_segments = segments / GRANTS_PER_PSEG;
/* Make sure buffer addresses are sector-aligned. */
- blk_queue_dma_alignment(rq, 511);
+ lim->dma_alignment = 511;
}
static const char *flush_info(struct blkfront_info *info)
@@ -1070,6 +1066,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
struct blkfront_info *info, u16 sector_size,
unsigned int physical_sector_size)
{
+ struct queue_limits lim = {};
struct gendisk *gd;
int nr_minors = 1;
int err;
@@ -1136,11 +1133,13 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
if (err)
goto out_release_minors;
- gd = blk_mq_alloc_disk(&info->tag_set, info);
+ blkif_set_queue_limits(info, &lim);
+ gd = blk_mq_alloc_disk(&info->tag_set, &lim, info);
if (IS_ERR(gd)) {
err = PTR_ERR(gd);
goto out_free_tag_set;
}
+ blk_queue_flag_set(QUEUE_FLAG_VIRT, gd->queue);
strcpy(gd->disk_name, DEV_NAME);
ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
@@ -1162,7 +1161,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
info->gd = gd;
info->sector_size = sector_size;
info->physical_sector_size = physical_sector_size;
- blkif_set_queue_limits(info);
xlvbd_flush(info);
@@ -2006,18 +2004,19 @@ static int blkfront_probe(struct xenbus_device *dev,
static int blkif_recover(struct blkfront_info *info)
{
+ struct queue_limits lim;
unsigned int r_index;
struct request *req, *n;
int rc;
struct bio *bio;
- unsigned int segs;
struct blkfront_ring_info *rinfo;
+ lim = queue_limits_start_update(info->rq);
blkfront_gather_backend_features(info);
- /* Reset limits changed by blk_mq_update_nr_hw_queues(). */
- blkif_set_queue_limits(info);
- segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
- blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG);
+ blkif_set_queue_limits(info, &lim);
+ rc = queue_limits_commit_update(info->rq, &lim);
+ if (rc)
+ return rc;
for_each_rinfo(info, rinfo, r_index) {
rc = blkfront_setup_indirect(rinfo);
@@ -2037,7 +2036,9 @@ static int blkif_recover(struct blkfront_info *info)
list_for_each_entry_safe(req, n, &info->requests, queuelist) {
/* Requeue pending requests (flush or discard) */
list_del_init(&req->queuelist);
- BUG_ON(req->nr_phys_segments > segs);
+ BUG_ON(req->nr_phys_segments >
+ (info->max_indirect_segments ? :
+ BLKIF_MAX_SEGMENTS_PER_REQUEST));
blk_mq_requeue_request(req, false);
}
blk_mq_start_stopped_hw_queues(info->rq, true);
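For limits that change after the disk exists, blkif_recover() above switches to the other half of the new API: queue_limits_start_update() snapshots the current limits and takes the queue's limits lock, the caller edits the snapshot, and queue_limits_commit_update() validates and publishes it, or returns an error and leaves the old limits in place. A minimal sketch:

	#include <linux/blkdev.h>

	static int refresh_max_segments(struct request_queue *q,
					unsigned short max_segs)
	{
		struct queue_limits lim;

		lim = queue_limits_start_update(q);	/* locks q's limits */
		lim.max_segments = max_segs;
		return queue_limits_commit_update(q, &lim); /* validate + unlock */
	}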
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index 11493167b0a8..7c5f4e4d9b50 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -318,7 +318,7 @@ static int z2ram_register_disk(int minor)
struct gendisk *disk;
int err;
- disk = blk_mq_alloc_disk(&tag_set, NULL);
+ disk = blk_mq_alloc_disk(&tag_set, NULL, NULL);
if (IS_ERR(disk))
return PTR_ERR(disk);
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d96b3851b5d3..da7a20fa6152 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -2177,6 +2177,28 @@ ATTRIBUTE_GROUPS(zram_disk);
*/
static int zram_add(void)
{
+ struct queue_limits lim = {
+ .logical_block_size = ZRAM_LOGICAL_BLOCK_SIZE,
+ /*
+ * Ensure that we always get PAGE_SIZE-aligned and
+ * n*PAGE_SIZE-sized I/O requests.
+ */
+ .physical_block_size = PAGE_SIZE,
+ .io_min = PAGE_SIZE,
+ .io_opt = PAGE_SIZE,
+ .max_hw_discard_sectors = UINT_MAX,
+ /*
+ * zram_bio_discard() will clear all logical blocks if logical
+ * block size is identical with physical block size(PAGE_SIZE).
+ * But if it is different, we will skip discarding some parts of
+ * logical blocks in the part of the request range which isn't
+ * aligned to physical block size. So we can't ensure that all
+ * discarded logical blocks are zeroed.
+ */
+#if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE
+ .max_write_zeroes_sectors = UINT_MAX,
+#endif
+ };
struct zram *zram;
int ret, device_id;
@@ -2195,11 +2217,11 @@ static int zram_add(void)
#endif
/* gendisk structure */
- zram->disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!zram->disk) {
+ zram->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(zram->disk)) {
pr_err("Error allocating disk structure for device %d\n",
device_id);
- ret = -ENOMEM;
+ ret = PTR_ERR(zram->disk);
goto out_free_idr;
}
@@ -2216,29 +2238,6 @@ static int zram_add(void)
/* zram devices sort of resembles non-rotational disks */
blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue);
-
- /*
- * To ensure that we always get PAGE_SIZE aligned
- * and n*PAGE_SIZED sized I/O requests.
- */
- blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
- blk_queue_logical_block_size(zram->disk->queue,
- ZRAM_LOGICAL_BLOCK_SIZE);
- blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
- blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
- blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
-
- /*
- * zram_bio_discard() will clear all logical blocks if logical block
- * size is identical with physical block size(PAGE_SIZE). But if it is
- * different, we will skip discarding some parts of logical blocks in
- * the part of the request range which isn't aligned to physical block
- * size. So we can't ensure that all discarded logical blocks are
- * zeroed.
- */
- if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
- blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
-
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
if (ret)
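The zram hunks also show the new blk_alloc_disk() contract for bio-based drivers: the limits pointer comes first (NULL asks for default limits, as the dm.c hunk below uses) and failure is now reported via ERR_PTR() rather than NULL. The caller-side change, side by side:

	/* before: NULL on failure, limits set one by one afterwards */
	disk = blk_alloc_disk(NUMA_NO_NODE);
	if (!disk)
		return -ENOMEM;

	/* after: limits passed up front, ERR_PTR() on failure */
	disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
	if (IS_ERR(disk))
		return PTR_ERR(disk);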
diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c
index d668b174ace9..eefdd422ad8e 100644
--- a/drivers/cdrom/gdrom.c
+++ b/drivers/cdrom/gdrom.c
@@ -724,11 +724,6 @@ static void probe_gdrom_setupdisk(void)
static int probe_gdrom_setupqueue(void)
{
- blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR);
- /* using DMA so memory will need to be contiguous */
- blk_queue_max_segments(gd.gdrom_rq, 1);
- /* set a large max size to get most from DMA */
- blk_queue_max_segment_size(gd.gdrom_rq, 0x40000);
gd.disk->queue = gd.gdrom_rq;
return gdrom_init_dma_mode();
}
@@ -743,6 +738,13 @@ static const struct blk_mq_ops gdrom_mq_ops = {
*/
static int probe_gdrom(struct platform_device *devptr)
{
+ struct queue_limits lim = {
+ .logical_block_size = GDROM_HARD_SECTOR,
+ /* using DMA so memory will need to be contiguous */
+ .max_segments = 1,
+ /* set a large max size to get most from DMA */
+ .max_segment_size = 0x40000,
+ };
int err;
/*
@@ -778,7 +780,7 @@ static int probe_gdrom(struct platform_device *devptr)
if (err)
goto probe_fail_free_cd_info;
- gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL);
+ gd.disk = blk_mq_alloc_disk(&gd.tag_set, &lim, NULL);
if (IS_ERR(gd.disk)) {
err = PTR_ERR(gd.disk);
goto probe_fail_free_tag_set;
@@ -829,7 +831,7 @@ probe_fail_no_mem:
return err;
}
-static int remove_gdrom(struct platform_device *devptr)
+static void remove_gdrom(struct platform_device *devptr)
{
blk_mq_free_tag_set(&gd.tag_set);
free_irq(HW_EVENT_GDROM_CMD, &gd);
@@ -840,13 +842,11 @@ static int remove_gdrom(struct platform_device *devptr)
unregister_cdrom(gd.cd_info);
kfree(gd.cd_info);
kfree(gd.toc);
-
- return 0;
}
static struct platform_driver gdrom_driver = {
.probe = probe_gdrom,
- .remove = remove_gdrom,
+ .remove_new = remove_gdrom,
.driver = {
.name = GDROM_DEV_NAME,
},
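The remove_gdrom() change is part of the tree-wide platform-driver conversion: the core always ignored the int returned by .remove, so drivers move to the void-returning .remove_new callback. The shape of the conversion, with a hypothetical driver:

	#include <linux/platform_device.h>

	static int foo_probe(struct platform_device *pdev)
	{
		return 0;
	}

	static void foo_remove(struct platform_device *pdev)
	{
		/* tear everything down; there is nothing useful to return */
	}

	static struct platform_driver foo_driver = {
		.probe		= foo_probe,
		.remove_new	= foo_remove,	/* void, not int */
		.driver		= { .name = "foo" },
	};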
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index d00b3abab133..330bcd9ea4a9 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -900,9 +900,23 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
struct request_queue *q;
const size_t max_stripes = min_t(size_t, INT_MAX,
SIZE_MAX / sizeof(atomic_t));
+ struct queue_limits lim = {
+ .max_hw_sectors = UINT_MAX,
+ .max_sectors = UINT_MAX,
+ .max_segment_size = UINT_MAX,
+ .max_segments = BIO_MAX_VECS,
+ .max_hw_discard_sectors = UINT_MAX,
+ .io_min = block_size,
+ .logical_block_size = block_size,
+ .physical_block_size = block_size,
+ };
uint64_t n;
int idx;
+ if (cached_bdev) {
+ d->stripe_size = bdev_io_opt(cached_bdev) >> SECTOR_SHIFT;
+ lim.io_opt = umax(block_size, bdev_io_opt(cached_bdev));
+ }
if (!d->stripe_size)
d->stripe_size = 1 << 31;
else if (d->stripe_size < BCH_MIN_STRIPE_SZ)
@@ -935,8 +949,21 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER))
goto out_ida_remove;
- d->disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!d->disk)
+ if (lim.logical_block_size > PAGE_SIZE && cached_bdev) {
+ /*
+ * This should only happen with BCACHE_SB_VERSION_BDEV.
+ * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
+ */
+ pr_info("bcache%i: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
+ idx, lim.logical_block_size,
+ PAGE_SIZE, bdev_logical_block_size(cached_bdev));
+
+ /* This also adjusts physical block size/min io size if needed */
+ lim.logical_block_size = bdev_logical_block_size(cached_bdev);
+ }
+
+ d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(d->disk))
goto out_bioset_exit;
set_capacity(d->disk, sectors);
@@ -949,27 +976,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size,
d->disk->private_data = d;
q = d->disk->queue;
- q->limits.max_hw_sectors = UINT_MAX;
- q->limits.max_sectors = UINT_MAX;
- q->limits.max_segment_size = UINT_MAX;
- q->limits.max_segments = BIO_MAX_VECS;
- blk_queue_max_discard_sectors(q, UINT_MAX);
- q->limits.io_min = block_size;
- q->limits.logical_block_size = block_size;
- q->limits.physical_block_size = block_size;
-
- if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) {
- /*
- * This should only happen with BCACHE_SB_VERSION_BDEV.
- * Block/page size is checked for BCACHE_SB_VERSION_CDEV.
- */
- pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n",
- d->disk->disk_name, q->limits.logical_block_size,
- PAGE_SIZE, bdev_logical_block_size(cached_bdev));
-
- /* This also adjusts physical block size/min io size if needed */
- blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev));
- }
blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue);
@@ -1416,9 +1422,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
}
- dc->disk.stripe_size = q->limits.io_opt >> 9;
-
- if (dc->disk.stripe_size)
+ if (bdev_io_opt(dc->bdev))
dc->partial_stripes_expensive =
q->limits.raid_partial_stripes_expensive;
@@ -1428,9 +1432,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size)
if (ret)
return ret;
- blk_queue_io_opt(dc->disk.disk->queue,
- max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q)));
-
atomic_set(&dc->io_errors, 0);
dc->io_disable = false;
dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT;
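The bcache change folds the stripe_size/io_opt calculation into bcache_device_init() so the values derived from the backing device land in the limits before allocation. A sketch of deriving limits from an underlying block device with the bdev_* helpers (the exact bcache policy is in the hunks above):

	#include <linux/blkdev.h>

	static void derive_limits_from_bdev(struct queue_limits *lim,
					    struct block_device *bdev)
	{
		/* prefer the backing device's optimal I/O size, if larger */
		lim->io_opt = max_t(unsigned int, lim->io_opt,
				    bdev_io_opt(bdev));

		/* don't expose a logical block size the page cache can't serve */
		if (lim->logical_block_size > PAGE_SIZE)
			lim->logical_block_size = bdev_logical_block_size(bdev);
	}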
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index eb009d6bb03a..17e9af60bbf7 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -213,6 +213,7 @@ struct raid_dev {
#define RT_FLAG_RS_IN_SYNC 6
#define RT_FLAG_RS_RESYNCING 7
#define RT_FLAG_RS_GROW 8
+#define RT_FLAG_RS_FROZEN 9
/* Array elements of 64 bit needed for rebuild/failed disk bits */
#define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8)
@@ -3240,11 +3241,12 @@ size_check:
rs->md.ro = 1;
rs->md.in_sync = 1;
- /* Keep array frozen until resume. */
- set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery);
-
/* Has to be held on running the array */
mddev_suspend_and_lock_nointr(&rs->md);
+
+ /* Keep array frozen until resume. */
+ md_frozen_sync_thread(&rs->md);
+
r = md_run(&rs->md);
rs->md.in_sync = 0; /* Assume already marked dirty */
if (r) {
@@ -3339,7 +3341,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio)
if (unlikely(bio_end_sector(bio) > mddev->array_sectors))
return DM_MAPIO_REQUEUE;
- md_handle_request(mddev, bio);
+ if (unlikely(!md_handle_request(mddev, bio)))
+ return DM_MAPIO_REQUEUE;
return DM_MAPIO_SUBMITTED;
}
@@ -3718,21 +3721,33 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv,
{
struct raid_set *rs = ti->private;
struct mddev *mddev = &rs->md;
+ int ret = 0;
if (!mddev->pers || !mddev->pers->sync_request)
return -EINVAL;
- if (!strcasecmp(argv[0], "frozen"))
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- else
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags) ||
+ test_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags))
+ return -EBUSY;
- if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) {
- if (mddev->sync_thread) {
- set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_reap_sync_thread(mddev);
- }
- } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
+ if (!strcasecmp(argv[0], "frozen")) {
+ ret = mddev_lock(mddev);
+ if (ret)
+ return ret;
+
+ md_frozen_sync_thread(mddev);
+ mddev_unlock(mddev);
+ } else if (!strcasecmp(argv[0], "idle")) {
+ ret = mddev_lock(mddev);
+ if (ret)
+ return ret;
+
+ md_idle_sync_thread(mddev);
+ mddev_unlock(mddev);
+ }
+
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ if (decipher_sync_action(mddev, mddev->recovery) != st_idle)
return -EBUSY;
else if (!strcasecmp(argv[0], "resync"))
; /* MD_RECOVERY_NEEDED set below */
@@ -3791,15 +3806,46 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
}
+static void raid_presuspend(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+ struct mddev *mddev = &rs->md;
+
+ /*
+ * From now on, disallow raid_message() to change sync_thread until
+ * resume, raid_postsuspend() is too late.
+ */
+ set_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
+
+ if (!reshape_interrupted(mddev))
+ return;
+
+ /*
+ * For raid456, if reshape is interrupted, IO across reshape position
+ * will never make progress, while caller will wait for IO to be done.
+ * Inform raid456 to handle those IO to prevent deadlock.
+ */
+ if (mddev->pers && mddev->pers->prepare_suspend)
+ mddev->pers->prepare_suspend(mddev);
+}
+
+static void raid_presuspend_undo(struct dm_target *ti)
+{
+ struct raid_set *rs = ti->private;
+
+ clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
+}
+
static void raid_postsuspend(struct dm_target *ti)
{
struct raid_set *rs = ti->private;
if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
- /* Writes have to be stopped before suspending to avoid deadlocks. */
- if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery))
- md_stop_writes(&rs->md);
-
+ /*
+ * sync_thread must be stopped during suspend, and writes have
+ * to be stopped before suspending to avoid deadlocks.
+ */
+ md_stop_writes(&rs->md);
mddev_suspend(&rs->md, false);
}
}
@@ -4012,8 +4058,6 @@ static int raid_preresume(struct dm_target *ti)
}
/* Check for any resize/reshape on @rs and adjust/initiate */
- /* Be prepared for mddev_resume() in raid_resume() */
- set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) {
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
mddev->resync_min = mddev->recovery_cp;
@@ -4047,7 +4091,9 @@ static void raid_resume(struct dm_target *ti)
* Take this opportunity to check whether any failed
* devices are reachable again.
*/
+ mddev_lock_nointr(mddev);
attempt_restore_of_faulty_devices(rs);
+ mddev_unlock(mddev);
}
if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) {
@@ -4055,10 +4101,13 @@ static void raid_resume(struct dm_target *ti)
if (mddev->delta_disks < 0)
rs_set_capacity(rs);
+ WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery));
+ WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery));
+ clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags);
mddev_lock_nointr(mddev);
- clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
mddev->ro = 0;
mddev->in_sync = 0;
+ md_unfrozen_sync_thread(mddev);
mddev_unlock_and_resume(mddev);
}
}
@@ -4074,6 +4123,8 @@ static struct target_type raid_target = {
.message = raid_message,
.iterate_devices = raid_iterate_devices,
.io_hints = raid_io_hints,
+ .presuspend = raid_presuspend,
+ .presuspend_undo = raid_presuspend_undo,
.postsuspend = raid_postsuspend,
.preresume = raid_preresume,
.resume = raid_resume,
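The dm-raid changes replace raw MD_RECOVERY_FROZEN bit-flipping with the new md_frozen_sync_thread()/md_unfrozen_sync_thread() helpers, bracketing suspend and resume so the sync thread cannot race a table reload. Roughly, the intended pairing (a sketch; both helpers assert that reconfig_mutex is held):

	mddev_lock_nointr(mddev);
	md_frozen_sync_thread(mddev);	/* presuspend/ctr: park the sync thread */
	mddev_unlock(mddev);

	/* ... array suspended, table swapped, device resumed ... */

	mddev_lock_nointr(mddev);
	md_unfrozen_sync_thread(mddev);	/* resume: kick recovery if needed */
	mddev_unlock(mddev);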
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 41f1d731ae5a..88114719fe18 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1963,26 +1963,27 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
bool wc = false, fua = false;
int r;
- /*
- * Copy table's limits to the DM device's request_queue
- */
- q->limits = *limits;
-
if (dm_table_supports_nowait(t))
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
else
blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q);
if (!dm_table_supports_discards(t)) {
- q->limits.max_discard_sectors = 0;
- q->limits.max_hw_discard_sectors = 0;
- q->limits.discard_granularity = 0;
- q->limits.discard_alignment = 0;
- q->limits.discard_misaligned = 0;
+ limits->max_hw_discard_sectors = 0;
+ limits->discard_granularity = 0;
+ limits->discard_alignment = 0;
+ limits->discard_misaligned = 0;
}
+ if (!dm_table_supports_write_zeroes(t))
+ limits->max_write_zeroes_sectors = 0;
+
if (!dm_table_supports_secure_erase(t))
- q->limits.max_secure_erase_sectors = 0;
+ limits->max_secure_erase_sectors = 0;
+
+ r = queue_limits_set(q, limits);
+ if (r)
+ return r;
if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) {
wc = true;
@@ -2007,9 +2008,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
else
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
- if (!dm_table_supports_write_zeroes(t))
- q->limits.max_write_zeroes_sectors = 0;
-
dm_table_verify_integrity(t);
/*
@@ -2047,7 +2045,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
}
dm_update_crypto_profile(q, t);
- disk_update_readahead(t->md->disk);
/*
* Check for request-based device is left to
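dm_table_set_restrictions() above is the poster child for the third new helper: instead of copying a queue_limits struct wholesale with "q->limits = *limits", queue_limits_set() validates and applies a complete set atomically and refreshes the backing-device information as part of the commit, which is why the separate disk_update_readahead() call can go. A sketch of the call pattern:

	#include <linux/blkdev.h>

	static int apply_target_limits(struct request_queue *q,
				       struct queue_limits *limits)
	{
		/* mask out features the target cannot support first */
		limits->max_write_zeroes_sectors = 0;

		return queue_limits_set(q, limits);	/* validate + apply */
	}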
diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c
index fdfe30f7b697..8156881a31de 100644
--- a/drivers/md/dm-zoned-metadata.c
+++ b/drivers/md/dm-zoned-metadata.c
@@ -1655,10 +1655,13 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone)
if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) {
struct dmz_dev *dev = zone->dev;
+ unsigned int noio_flag;
+ noio_flag = memalloc_noio_save();
ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET,
dmz_start_sect(zmd, zone),
- zmd->zone_nr_sectors, GFP_NOIO);
+ zmd->zone_nr_sectors);
+ memalloc_noio_restore(noio_flag);
if (ret) {
dmz_dev_err(dev, "Reset zone %u failed %d",
zone->id, ret);
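With the gfp_t argument dropped from blkdev_zone_mgmt(), callers that must avoid I/O recursion now express it through the task context, as the dmz_reset_zone() hunk does. The scoped-noio pattern looks like this (bdev, sector and nr_sectors are assumed to be in scope):

	#include <linux/sched/mm.h>

	unsigned int noio_flag;
	int ret;

	noio_flag = memalloc_noio_save();	/* allocations become GFP_NOIO */
	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, sector, nr_sectors);
	memalloc_noio_restore(noio_flag);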
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 87de5b5682ad..447e132d09b5 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -2101,8 +2101,8 @@ static struct mapped_device *alloc_dev(int minor)
* established. If request-based table is loaded: blk-mq will
* override accordingly.
*/
- md->disk = blk_alloc_disk(md->numa_node_id);
- if (!md->disk)
+ md->disk = blk_alloc_disk(NULL, md->numa_node_id);
+ if (IS_ERR(md->disk))
goto bad;
md->queue = md->disk->queue;
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index 9672f75c3050..059afc24c08b 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -234,7 +234,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
sector_t doff;
bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
- if (pg_index == store->file_pages - 1) {
+ /* we compare lengths (numbers of pages), not page offsets. */
+ if ((pg_index - store->sb_index) == store->file_pages - 1) {
unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
if (last_page_size == 0)
@@ -438,8 +439,8 @@ static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index,
struct page *page = store->filemap[pg_index];
if (mddev_is_clustered(bitmap->mddev)) {
- pg_index += bitmap->cluster_slot *
- DIV_ROUND_UP(store->bytes, PAGE_SIZE);
+ /* go to node bitmap area starting point */
+ pg_index += store->sb_index;
}
if (store->file)
@@ -952,6 +953,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
+ index += store->sb_index;
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
@@ -982,6 +984,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block)
unsigned long index = file_page_index(store, chunk);
unsigned long node_offset = 0;
+ index += store->sb_index;
if (mddev_is_clustered(bitmap->mddev))
node_offset = bitmap->cluster_slot * store->file_pages;
@@ -1043,9 +1046,8 @@ void md_bitmap_unplug(struct bitmap *bitmap)
if (dirty || need_write) {
if (!writing) {
md_bitmap_wait_writes(bitmap);
- if (bitmap->mddev->queue)
- blk_add_trace_msg(bitmap->mddev->queue,
- "md bitmap_unplug");
+ mddev_add_trace_msg(bitmap->mddev,
+ "md bitmap_unplug");
}
clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING);
filemap_write_page(bitmap, i, false);
@@ -1316,9 +1318,7 @@ void md_bitmap_daemon_work(struct mddev *mddev)
}
bitmap->allclean = 1;
- if (bitmap->mddev->queue)
- blk_add_trace_msg(bitmap->mddev->queue,
- "md bitmap_daemon_work");
+ mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work");
/* Any file-page which is PENDING now needs to be written.
* So set NEEDWRITE now, then after we make any last-minute changes
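mddev_add_trace_msg() used above is one of the small helpers this series adds to drivers/md/md.h so callers stop open-coding the "is this a dm-raid array without its own queue" check. Per the "md: add a mddev_add_trace_msg helper" patch it is roughly:

	#define mddev_add_trace_msg(mddev, fmt, args...)		\
	do {								\
		if (!mddev_is_dm(mddev))				\
			blk_add_trace_msg((mddev)->gendisk->queue,	\
					  fmt, ##args);			\
	} while (0)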
diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h
deleted file mode 100644
index 5587eeedb882..000000000000
--- a/drivers/md/md-linear.h
+++ /dev/null
@@ -1,17 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _LINEAR_H
-#define _LINEAR_H
-
-struct dev_info {
- struct md_rdev *rdev;
- sector_t end_sector;
-};
-
-struct linear_conf
-{
- struct rcu_head rcu;
- sector_t array_sectors;
- int raid_disks; /* a copy of mddev->raid_disks */
- struct dev_info disks[] __counted_by(raid_disks);
-};
-#endif
diff --git a/drivers/md/md-multipath.h b/drivers/md/md-multipath.h
deleted file mode 100644
index b3099e5fc4d7..000000000000
--- a/drivers/md/md-multipath.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef _MULTIPATH_H
-#define _MULTIPATH_H
-
-struct multipath_info {
- struct md_rdev *rdev;
-};
-
-struct mpconf {
- struct mddev *mddev;
- struct multipath_info *multipaths;
- int raid_disks;
- spinlock_t device_lock;
- struct list_head retry_list;
-
- mempool_t pool;
-};
-
-/*
- * this is our 'private' 'collective' MULTIPATH buffer head.
- * it contains information about what kind of IO operations were started
- * for this MULTIPATH operation, and about their status:
- */
-
-struct multipath_bh {
- struct mddev *mddev;
- struct bio *master_bio;
- struct bio bio;
- int path;
- struct list_head retry_list;
-};
-#endif
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 3bf7eab94efd..e575e74aabf5 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -65,7 +65,6 @@
#include <linux/percpu-refcount.h>
#include <linux/part_stat.h>
-#include <trace/events/block.h>
#include "md.h"
#include "md-bitmap.h"
#include "md-cluster.h"
@@ -99,18 +98,6 @@ static void mddev_detach(struct mddev *mddev);
static void export_rdev(struct md_rdev *rdev, struct mddev *mddev);
static void md_wakeup_thread_directly(struct md_thread __rcu *thread);
-enum md_ro_state {
- MD_RDWR,
- MD_RDONLY,
- MD_AUTO_READ,
- MD_MAX_STATE
-};
-
-static bool md_is_rdwr(struct mddev *mddev)
-{
- return (mddev->ro == MD_RDWR);
-}
-
/*
* Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error
@@ -378,7 +365,7 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio)
return true;
}
-void md_handle_request(struct mddev *mddev, struct bio *bio)
+bool md_handle_request(struct mddev *mddev, struct bio *bio)
{
check_suspended:
if (is_suspended(mddev, bio)) {
@@ -386,7 +373,7 @@ check_suspended:
/* Bail out if REQ_NOWAIT is set for the bio */
if (bio->bi_opf & REQ_NOWAIT) {
bio_wouldblock_error(bio);
- return;
+ return true;
}
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
@@ -402,10 +389,13 @@ check_suspended:
if (!mddev->pers->make_request(mddev, bio)) {
percpu_ref_put(&mddev->active_io);
+ if (!mddev->gendisk && mddev->pers->prepare_suspend)
+ return false;
goto check_suspended;
}
percpu_ref_put(&mddev->active_io);
+ return true;
}
EXPORT_SYMBOL(md_handle_request);
@@ -529,6 +519,24 @@ void mddev_resume(struct mddev *mddev)
}
EXPORT_SYMBOL_GPL(mddev_resume);
+/* sync bdev before setting device to readonly or stopping raid */
+static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num)
+{
+ mutex_lock(&mddev->open_mutex);
+ if (mddev->pers && atomic_read(&mddev->openers) > opener_num) {
+ mutex_unlock(&mddev->open_mutex);
+ return -EBUSY;
+ }
+ if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
+ mutex_unlock(&mddev->open_mutex);
+ return -EBUSY;
+ }
+ mutex_unlock(&mddev->open_mutex);
+
+ sync_blockdev(mddev->gendisk->part0);
+ return 0;
+}
+
/*
* Generic flush handling for md
*/
@@ -2406,7 +2414,7 @@ int md_integrity_register(struct mddev *mddev)
if (list_empty(&mddev->disks))
return 0; /* nothing to do */
- if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
+ if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk))
return 0; /* shouldn't register, or already is */
rdev_for_each(rdev, mddev) {
/* skip spares and non-functional disks */
@@ -2459,7 +2467,7 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
{
struct blk_integrity *bi_mddev;
- if (!mddev->gendisk)
+ if (mddev_is_dm(mddev))
return 0;
bi_mddev = blk_get_integrity(mddev->gendisk);
@@ -2566,6 +2574,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev)
fail:
pr_warn("md: failed to register dev-%s for %s\n",
b, mdname(mddev));
+ mddev_destroy_serial_pool(mddev, rdev);
return err;
}
@@ -2595,7 +2604,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev)
list_del_rcu(&rdev->same_set);
pr_debug("md: unbind<%pg>\n", rdev->bdev);
mddev_destroy_serial_pool(rdev->mddev, rdev);
- rdev->mddev = NULL;
+ WRITE_ONCE(rdev->mddev, NULL);
sysfs_remove_link(&rdev->kobj, "block");
sysfs_put(rdev->sysfs_state);
sysfs_put(rdev->sysfs_unack_badblocks);
@@ -2851,8 +2860,7 @@ repeat:
pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
mdname(mddev), mddev->in_sync);
- if (mddev->queue)
- blk_add_trace_msg(mddev->queue, "md md_update_sb");
+ mddev_add_trace_msg(mddev, "md md_update_sb");
rewrite:
md_bitmap_update_sb(mddev->bitmap);
rdev_for_each(rdev, mddev) {
@@ -2933,7 +2941,6 @@ static int add_bound_rdev(struct md_rdev *rdev)
set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_new_event();
- md_wakeup_thread(mddev->thread);
return 0;
}
@@ -3048,10 +3055,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
if (err == 0) {
md_kick_rdev_from_array(rdev);
- if (mddev->pers) {
+ if (mddev->pers)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
- md_wakeup_thread(mddev->thread);
- }
md_new_event();
}
}
@@ -3081,7 +3086,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
clear_bit(BlockedBadBlocks, &rdev->flags);
wake_up(&rdev->blocked_wait);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
- md_wakeup_thread(rdev->mddev->thread);
err = 0;
} else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
@@ -3119,7 +3123,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)
!test_bit(Replacement, &rdev->flags))
set_bit(WantReplacement, &rdev->flags);
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
- md_wakeup_thread(rdev->mddev->thread);
err = 0;
} else if (cmd_match(buf, "-want_replacement")) {
/* Clearing 'want_replacement' is always allowed.
@@ -3249,7 +3252,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len)
if (rdev->raid_disk >= 0)
return -EBUSY;
set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
- md_wakeup_thread(rdev->mddev->thread);
} else if (rdev->mddev->pers) {
/* Activating a spare .. or possibly reactivating
* if we ever get bitmaps working here.
@@ -3343,8 +3345,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev,
if (kstrtoull(buf, 10, &new_offset) < 0)
return -EINVAL;
- if (mddev->sync_thread ||
- test_bit(MD_RECOVERY_RUNNING,&mddev->recovery))
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
if (new_offset == rdev->data_offset)
/* reset is always permitted */
@@ -3675,7 +3676,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr,
struct kernfs_node *kn = NULL;
bool suspend = false;
ssize_t rv;
- struct mddev *mddev = rdev->mddev;
+ struct mddev *mddev = READ_ONCE(rdev->mddev);
if (!entry->store)
return -EIO;
@@ -4017,8 +4018,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
*/
rv = -EBUSY;
- if (mddev->sync_thread ||
- test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->reshape_position != MaxSector ||
mddev->sysfs_active)
goto out_unlock;
@@ -4168,7 +4168,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
mddev->in_sync = 1;
del_timer_sync(&mddev->safemode_timer);
}
- blk_set_stacking_limits(&mddev->queue->limits);
pers->run(mddev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
if (!mddev->thread)
@@ -4475,8 +4474,8 @@ array_state_show(struct mddev *mddev, char *page)
return sprintf(page, "%s\n", array_states[st]);
}
-static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev);
-static int md_set_readonly(struct mddev *mddev, struct block_device *bdev);
+static int do_md_stop(struct mddev *mddev, int ro);
+static int md_set_readonly(struct mddev *mddev);
static int restart_array(struct mddev *mddev);
static ssize_t
@@ -4493,6 +4492,17 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case broken: /* cannot be set */
case bad_word:
return -EINVAL;
+ case clear:
+ case readonly:
+ case inactive:
+ case read_auto:
+ if (!mddev->pers || !md_is_rdwr(mddev))
+ break;
+ /* write sysfs will not open mddev and opener should be 0 */
+ err = mddev_set_closing_and_sync_blockdev(mddev, 0);
+ if (err)
+ return err;
+ break;
default:
break;
}
@@ -4526,14 +4536,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case inactive:
/* stop an active array, return 0 otherwise */
if (mddev->pers)
- err = do_md_stop(mddev, 2, NULL);
+ err = do_md_stop(mddev, 2);
break;
case clear:
- err = do_md_stop(mddev, 0, NULL);
+ err = do_md_stop(mddev, 0);
break;
case readonly:
if (mddev->pers)
- err = md_set_readonly(mddev, NULL);
+ err = md_set_readonly(mddev);
else {
mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1);
@@ -4543,7 +4553,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
case read_auto:
if (mddev->pers) {
if (md_is_rdwr(mddev))
- err = md_set_readonly(mddev, NULL);
+ err = md_set_readonly(mddev);
else if (mddev->ro == MD_RDONLY)
err = restart_array(mddev);
if (err == 0) {
@@ -4592,6 +4602,11 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
mddev_unlock(mddev);
+
+ if (st == readonly || st == read_auto || st == inactive ||
+ (err && st == clear))
+ clear_bit(MD_CLOSING, &mddev->flags);
+
return err ?: len;
}
static struct md_sysfs_entry md_array_state =
@@ -4919,6 +4934,35 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq)
mddev_lock_nointr(mddev);
}
+void md_idle_sync_thread(struct mddev *mddev)
+{
+ lockdep_assert_held(&mddev->reconfig_mutex);
+
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev, true, true);
+}
+EXPORT_SYMBOL_GPL(md_idle_sync_thread);
+
+void md_frozen_sync_thread(struct mddev *mddev)
+{
+ lockdep_assert_held(&mddev->reconfig_mutex);
+
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev, true, false);
+}
+EXPORT_SYMBOL_GPL(md_frozen_sync_thread);
+
+void md_unfrozen_sync_thread(struct mddev *mddev)
+{
+ lockdep_assert_held(&mddev->reconfig_mutex);
+
+ clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ md_wakeup_thread(mddev->thread);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+}
+EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread);
+
static void idle_sync_thread(struct mddev *mddev)
{
mutex_lock(&mddev->sync_mutex);
@@ -5710,6 +5754,51 @@ static const struct kobj_type md_ktype = {
int mdp_major = 0;
+/* stack the limit for all rdevs into lim */
+void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim)
+{
+ struct md_rdev *rdev;
+
+ rdev_for_each(rdev, mddev) {
+ queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset,
+ mddev->gendisk->disk_name);
+ }
+}
+EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits);
+
+/* apply the extra stacking limits from a new rdev into mddev */
+int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev)
+{
+ struct queue_limits lim;
+
+ if (mddev_is_dm(mddev))
+ return 0;
+
+ lim = queue_limits_start_update(mddev->gendisk->queue);
+ queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset,
+ mddev->gendisk->disk_name);
+ return queue_limits_commit_update(mddev->gendisk->queue, &lim);
+}
+EXPORT_SYMBOL_GPL(mddev_stack_new_rdev);
+
+/* update the optimal I/O size after a reshape */
+void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes)
+{
+ struct queue_limits lim;
+
+ if (mddev_is_dm(mddev))
+ return;
+
+ /* don't bother updating io_opt if we can't suspend the array */
+ if (mddev_suspend(mddev, false) < 0)
+ return;
+ lim = queue_limits_start_update(mddev->gendisk->queue);
+ lim.io_opt = lim.io_min * nr_stripes;
+ queue_limits_commit_update(mddev->gendisk->queue, &lim);
+ mddev_resume(mddev);
+}
+EXPORT_SYMBOL_GPL(mddev_update_io_opt);
+
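Personalities consume the three helpers above when building their limits; the raid0/raid1/raid10 conversions in this series follow roughly this shape (a sketch, and the field choices vary per level):

	static int level_set_limits(struct mddev *mddev)
	{
		struct queue_limits lim;

		blk_set_stacking_limits(&lim);		/* permissive baseline */
		lim.io_min = mddev->chunk_sectors << 9;	/* level-specific */
		mddev_stack_rdev_limits(mddev, &lim);	/* stack in each rdev */
		return queue_limits_set(mddev->gendisk->queue, &lim);
	}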
static void mddev_delayed_delete(struct work_struct *ws)
{
struct mddev *mddev = container_of(ws, struct mddev, del_work);
@@ -5774,10 +5863,11 @@ struct mddev *md_alloc(dev_t dev, char *name)
*/
mddev->hold_active = UNTIL_STOP;
- error = -ENOMEM;
- disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!disk)
+ disk = blk_alloc_disk(NULL, NUMA_NO_NODE);
+ if (IS_ERR(disk)) {
+ error = PTR_ERR(disk);
goto out_free_mddev;
+ }
disk->major = MAJOR(mddev->unit);
disk->first_minor = unit << shift;
@@ -5791,9 +5881,7 @@ struct mddev *md_alloc(dev_t dev, char *name)
disk->fops = &md_fops;
disk->private_data = mddev;
- mddev->queue = disk->queue;
- blk_set_stacking_limits(&mddev->queue->limits);
- blk_queue_write_cache(mddev->queue, true, true);
+ blk_queue_write_cache(disk->queue, true, true);
disk->events |= DISK_EVENT_MEDIA_CHANGE;
mddev->gendisk = disk;
error = add_disk(disk);
@@ -5935,7 +6023,7 @@ int md_run(struct mddev *mddev)
invalidate_bdev(rdev->bdev);
if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
mddev->ro = MD_RDONLY;
- if (mddev->gendisk)
+ if (!mddev_is_dm(mddev))
set_disk_ro(mddev->gendisk, 1);
}
@@ -6038,7 +6126,10 @@ int md_run(struct mddev *mddev)
pr_warn("True protection against single-disk failure might be compromised.\n");
}
- mddev->recovery = 0;
+ /* dm-raid expects sync_thread to be frozen until resume */
+ if (mddev->gendisk)
+ mddev->recovery = 0;
+
/* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors;
@@ -6094,7 +6185,8 @@ int md_run(struct mddev *mddev)
}
}
- if (mddev->queue) {
+ if (!mddev_is_dm(mddev)) {
+ struct request_queue *q = mddev->gendisk->queue;
bool nonrot = true;
rdev_for_each(rdev, mddev) {
@@ -6106,14 +6198,14 @@ int md_run(struct mddev *mddev)
if (mddev->degraded)
nonrot = false;
if (nonrot)
- blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue);
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
else
- blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue);
- blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
+ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q);
/* Set the NOWAIT flags if all underlying devices support it */
if (nowait)
- blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue);
+ blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q);
}
if (pers->sync_request) {
if (mddev->kobj.sd &&
@@ -6192,7 +6284,6 @@ int do_md_run(struct mddev *mddev)
/* run start up tasks that require md_thread */
md_start(mddev);
- md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
set_capacity_and_notify(mddev->gendisk, mddev->array_sectors);
@@ -6213,7 +6304,6 @@ int md_start(struct mddev *mddev)
if (mddev->pers->start) {
set_bit(MD_RECOVERY_WAIT, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
ret = mddev->pers->start(mddev);
clear_bit(MD_RECOVERY_WAIT, &mddev->recovery);
md_wakeup_thread(mddev->sync_thread);
@@ -6258,7 +6348,6 @@ static int restart_array(struct mddev *mddev)
pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
/* Kick recovery or resync if necessary */
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_state);
return 0;
@@ -6278,7 +6367,15 @@ static void md_clean(struct mddev *mddev)
mddev->persistent = 0;
mddev->level = LEVEL_NONE;
mddev->clevel[0] = 0;
- mddev->flags = 0;
+ /*
+ * Don't clear MD_CLOSING, or mddev can be opened again.
+ * 'hold_active != 0' means mddev is still in the creation
+ * process and will be used later.
+ */
+ if (mddev->hold_active)
+ mddev->flags = 0;
+ else
+ mddev->flags &= BIT_ULL_MASK(MD_CLOSING);
mddev->sb_flags = 0;
mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0;
@@ -6315,7 +6412,6 @@ static void md_clean(struct mddev *mddev)
static void __md_stop_writes(struct mddev *mddev)
{
- stop_sync_thread(mddev, true, false);
del_timer_sync(&mddev->safemode_timer);
if (mddev->pers && mddev->pers->quiesce) {
@@ -6340,6 +6436,8 @@ static void __md_stop_writes(struct mddev *mddev)
void md_stop_writes(struct mddev *mddev)
{
mddev_lock_nointr(mddev);
+ set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
+ stop_sync_thread(mddev, true, false);
__md_stop_writes(mddev);
mddev_unlock(mddev);
}
@@ -6353,8 +6451,10 @@ static void mddev_detach(struct mddev *mddev)
mddev->pers->quiesce(mddev, 0);
}
md_unregister_thread(mddev, &mddev->thread);
- if (mddev->queue)
- blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
+
+ /* the unplug fn references 'conf' */
+ if (!mddev_is_dm(mddev))
+ blk_sync_queue(mddev->gendisk->queue);
}
static void __md_stop(struct mddev *mddev)
@@ -6391,7 +6491,8 @@ void md_stop(struct mddev *mddev)
EXPORT_SYMBOL_GPL(md_stop);
-static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
+/* ensure 'mddev->pers' exist before calling md_set_readonly() */
+static int md_set_readonly(struct mddev *mddev)
{
int err = 0;
int did_freeze = 0;
@@ -6402,7 +6503,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
did_freeze = 1;
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
}
stop_sync_thread(mddev, false, false);
@@ -6410,36 +6510,29 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
mddev_lock_nointr(mddev);
- mutex_lock(&mddev->open_mutex);
- if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
- mddev->sync_thread ||
- test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
pr_warn("md: %s still in use.\n",mdname(mddev));
err = -EBUSY;
goto out;
}
- if (mddev->pers) {
- __md_stop_writes(mddev);
-
- if (mddev->ro == MD_RDONLY) {
- err = -ENXIO;
- goto out;
- }
+ __md_stop_writes(mddev);
- mddev->ro = MD_RDONLY;
- set_disk_ro(mddev->gendisk, 1);
+ if (mddev->ro == MD_RDONLY) {
+ err = -ENXIO;
+ goto out;
}
+ mddev->ro = MD_RDONLY;
+ set_disk_ro(mddev->gendisk, 1);
+
out:
- if ((mddev->pers && !err) || did_freeze) {
+ if (!err || did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
sysfs_notify_dirent_safe(mddev->sysfs_state);
}
- mutex_unlock(&mddev->open_mutex);
return err;
}
@@ -6447,8 +6540,7 @@ out:
* 0 - completely stop and dis-assemble array
* 2 - stop but do not disassemble array
*/
-static int do_md_stop(struct mddev *mddev, int mode,
- struct block_device *bdev)
+static int do_md_stop(struct mddev *mddev, int mode)
{
struct gendisk *disk = mddev->gendisk;
struct md_rdev *rdev;
@@ -6457,22 +6549,16 @@ static int do_md_stop(struct mddev *mddev, int mode,
if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
did_freeze = 1;
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
}
stop_sync_thread(mddev, true, false);
- mutex_lock(&mddev->open_mutex);
- if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) ||
- mddev->sysfs_active ||
- mddev->sync_thread ||
+ if (mddev->sysfs_active ||
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) {
pr_warn("md: %s still in use.\n",mdname(mddev));
- mutex_unlock(&mddev->open_mutex);
if (did_freeze) {
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
}
return -EBUSY;
}
@@ -6491,13 +6577,11 @@ static int do_md_stop(struct mddev *mddev, int mode,
sysfs_unlink_rdev(mddev, rdev);
set_capacity_and_notify(disk, 0);
- mutex_unlock(&mddev->open_mutex);
mddev->changed = 1;
if (!md_is_rdwr(mddev))
mddev->ro = MD_RDWR;
- } else
- mutex_unlock(&mddev->open_mutex);
+ }
/*
* Free resources if final stop
*/
@@ -6543,7 +6627,7 @@ static void autorun_array(struct mddev *mddev)
err = do_md_run(mddev);
if (err) {
pr_warn("md: do_md_run() returned %d\n", err);
- do_md_stop(mddev, 0, NULL);
+ do_md_stop(mddev, 0);
}
}
@@ -7013,9 +7097,7 @@ kick_rdev:
md_kick_rdev_from_array(rdev);
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
- if (mddev->thread)
- md_wakeup_thread(mddev->thread);
- else
+ if (!mddev->thread)
md_update_sb(mddev, 1);
md_new_event();
@@ -7090,14 +7172,13 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev)
if (!bdev_nowait(rdev->bdev)) {
pr_info("%s: Disabling nowait because %pg does not support nowait\n",
mdname(mddev), rdev->bdev);
- blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue);
}
/*
* Kick recovery, maybe this spare has to be added to the
* array immediately.
*/
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- md_wakeup_thread(mddev->thread);
md_new_event();
return 0;
@@ -7311,8 +7392,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
* of each device. If num_sectors is zero, we find the largest size
* that fits.
*/
- if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- mddev->sync_thread)
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return -EBUSY;
if (!md_is_rdwr(mddev))
return -EROFS;
@@ -7329,10 +7409,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
if (!rv) {
if (mddev_is_clustered(mddev))
md_cluster_ops->update_size(mddev, old_dev_sectors);
- else if (mddev->queue) {
+ else if (!mddev_is_dm(mddev))
set_capacity_and_notify(mddev->gendisk,
mddev->array_sectors);
- }
}
return rv;
}
@@ -7349,8 +7428,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
if (raid_disks <= 0 ||
(mddev->max_disks && raid_disks >= mddev->max_disks))
return -EINVAL;
- if (mddev->sync_thread ||
- test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) ||
mddev->reshape_position != MaxSector)
return -EBUSY;
@@ -7546,16 +7624,17 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
-static inline bool md_ioctl_valid(unsigned int cmd)
+static inline int md_ioctl_valid(unsigned int cmd)
{
switch (cmd) {
- case ADD_NEW_DISK:
case GET_ARRAY_INFO:
- case GET_BITMAP_FILE:
case GET_DISK_INFO:
+ case RAID_VERSION:
+ return 0;
+ case ADD_NEW_DISK:
+ case GET_BITMAP_FILE:
case HOT_ADD_DISK:
case HOT_REMOVE_DISK:
- case RAID_VERSION:
case RESTART_ARRAY_RW:
case RUN_ARRAY:
case SET_ARRAY_INFO:
@@ -7564,9 +7643,11 @@ static inline bool md_ioctl_valid(unsigned int cmd)
case STOP_ARRAY:
case STOP_ARRAY_RO:
case CLUSTERED_DISK_NACK:
- return true;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ return 0;
default:
- return false;
+ return -ENOTTY;
}
}
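Returning an errno from md_ioctl_valid() rather than a bool lets the caller fold command validation and the CAP_SYS_ADMIN check into a single early exit, as the next hunk shows. A generic sketch of the pattern, with hypothetical command values:

/* sketch only: validity and permission folded into one errno return */
static int validate_cmd(unsigned int cmd, bool admin)
{
	switch (cmd) {
	case 0x100:		/* read-only query: always allowed */
		return 0;
	case 0x200:		/* state-changing command: needs privilege */
		return admin ? 0 : -EACCES;
	default:
		return -ENOTTY;	/* not one of ours */
	}
}

The call site then collapses to "err = validate_cmd(...); if (err) return err;".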
@@ -7624,31 +7705,17 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
int err = 0;
void __user *argp = (void __user *)arg;
struct mddev *mddev = NULL;
- bool did_set_md_closing = false;
- if (!md_ioctl_valid(cmd))
- return -ENOTTY;
-
- switch (cmd) {
- case RAID_VERSION:
- case GET_ARRAY_INFO:
- case GET_DISK_INFO:
- break;
- default:
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
- }
+ err = md_ioctl_valid(cmd);
+ if (err)
+ return err;
/*
* Commands dealing with the RAID driver but not any
* particular array:
*/
- switch (cmd) {
- case RAID_VERSION:
- err = get_version(argp);
- goto out;
- default:;
- }
+ if (cmd == RAID_VERSION)
+ return get_version(argp);
/*
* Commands creating/starting a new array:
@@ -7656,35 +7723,23 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
mddev = bdev->bd_disk->private_data;
- if (!mddev) {
- BUG();
- goto out;
- }
-
	/* Some actions do not require the mutex */
switch (cmd) {
case GET_ARRAY_INFO:
if (!mddev->raid_disks && !mddev->external)
- err = -ENODEV;
- else
- err = get_array_info(mddev, argp);
- goto out;
+ return -ENODEV;
+ return get_array_info(mddev, argp);
case GET_DISK_INFO:
if (!mddev->raid_disks && !mddev->external)
- err = -ENODEV;
- else
- err = get_disk_info(mddev, argp);
- goto out;
+ return -ENODEV;
+ return get_disk_info(mddev, argp);
case SET_DISK_FAULTY:
- err = set_disk_faulty(mddev, new_decode_dev(arg));
- goto out;
+ return set_disk_faulty(mddev, new_decode_dev(arg));
case GET_BITMAP_FILE:
- err = get_bitmap_file(mddev, argp);
- goto out;
-
+ return get_bitmap_file(mddev, argp);
}
if (cmd == HOT_REMOVE_DISK)
@@ -7697,20 +7752,9 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
/* Need to flush page cache, and ensure no-one else opens
* and writes
*/
- mutex_lock(&mddev->open_mutex);
- if (mddev->pers && atomic_read(&mddev->openers) > 1) {
- mutex_unlock(&mddev->open_mutex);
- err = -EBUSY;
- goto out;
- }
- if (test_and_set_bit(MD_CLOSING, &mddev->flags)) {
- mutex_unlock(&mddev->open_mutex);
- err = -EBUSY;
- goto out;
- }
- did_set_md_closing = true;
- mutex_unlock(&mddev->open_mutex);
- sync_blockdev(bdev);
+ err = mddev_set_closing_and_sync_blockdev(mddev, 1);
+ if (err)
+ return err;
}
if (!md_is_rdwr(mddev))
@@ -7751,11 +7795,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode,
goto unlock;
case STOP_ARRAY:
- err = do_md_stop(mddev, 0, bdev);
+ err = do_md_stop(mddev, 0);
goto unlock;
case STOP_ARRAY_RO:
- err = md_set_readonly(mddev, bdev);
+ if (mddev->pers)
+ err = md_set_readonly(mddev);
goto unlock;
case HOT_REMOVE_DISK:
@@ -7850,7 +7895,7 @@ unlock:
mddev_unlock(mddev);
out:
- if(did_set_md_closing)
+ if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY))
clear_bit(MD_CLOSING, &mddev->flags);
return err;
}
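The new cleanup condition replaces the old did_set_md_closing bookkeeping; my reading of when MD_CLOSING is cleared on the way out (a restatement of the patch, not authoritative):

/*
 * cmd == STOP_ARRAY_RO          -> always clear MD_CLOSING on exit
 * cmd == STOP_ARRAY, err != 0   -> clear MD_CLOSING (stop failed, allow reopen)
 * cmd == STOP_ARRAY, err == 0   -> keep MD_CLOSING; md_clean() preserves it
 *                                  unless ->hold_active is set (see earlier hunk)
 * any other command             -> never sets it, so nothing to clear
 */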
@@ -8687,10 +8732,7 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
bio_chain(discard_bio, bio);
bio_clone_blkg_association(discard_bio, bio);
- if (mddev->gendisk)
- trace_block_bio_remap(discard_bio,
- disk_devt(mddev->gendisk),
- bio->bi_iter.bi_sector);
+ mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector);
submit_bio_noacct(discard_bio);
}
EXPORT_SYMBOL_GPL(md_submit_discard_bio);
@@ -8737,6 +8779,23 @@ void md_account_bio(struct mddev *mddev, struct bio **bio)
}
EXPORT_SYMBOL_GPL(md_account_bio);
+void md_free_cloned_bio(struct bio *bio)
+{
+ struct md_io_clone *md_io_clone = bio->bi_private;
+ struct bio *orig_bio = md_io_clone->orig_bio;
+ struct mddev *mddev = md_io_clone->mddev;
+
+ if (bio->bi_status && !orig_bio->bi_status)
+ orig_bio->bi_status = bio->bi_status;
+
+ if (md_io_clone->start_time)
+ bio_end_io_acct(orig_bio, md_io_clone->start_time);
+
+ bio_put(bio);
+ percpu_ref_put(&mddev->active_io);
+}
+EXPORT_SYMBOL_GPL(md_free_cloned_bio);
+
/* md_allow_write(mddev)
* Calling this ensures that the array is marked 'active' so that writes
* may proceed without blocking. It is important to call this before
@@ -9170,7 +9229,7 @@ void md_do_sync(struct md_thread *thread)
mddev->delta_disks > 0 &&
mddev->pers->finish_reshape &&
mddev->pers->size &&
- mddev->queue) {
+ !mddev_is_dm(mddev)) {
mddev_lock_nointr(mddev);
md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0));
mddev_unlock(mddev);
@@ -9270,9 +9329,14 @@ static bool md_spares_need_change(struct mddev *mddev)
{
struct md_rdev *rdev;
- rdev_for_each(rdev, mddev)
- if (rdev_removeable(rdev) || rdev_addable(rdev))
+ rcu_read_lock();
+ rdev_for_each_rcu(rdev, mddev) {
+ if (rdev_removeable(rdev) || rdev_addable(rdev)) {
+ rcu_read_unlock();
return true;
+ }
+ }
+ rcu_read_unlock();
return false;
}
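The rdev_for_each_rcu() conversion follows the standard early-return shape for RCU list walks: take the read lock once, and drop it on every exit path. An illustrative, non-md version of the same shape:

struct item {
	struct list_head node;
	bool flagged;
};

static bool any_flagged(struct list_head *head)
{
	struct item *it;

	rcu_read_lock();
	list_for_each_entry_rcu(it, head, node) {
		if (it->flagged) {
			rcu_read_unlock();	/* unlock before the early return */
			return true;
		}
	}
	rcu_read_unlock();
	return false;
}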
diff --git a/drivers/md/md.h b/drivers/md/md.h
index a079ee9b6190..097d9dbd69b8 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -18,6 +18,7 @@
#include <linux/timer.h>
#include <linux/wait.h>
#include <linux/workqueue.h>
+#include <trace/events/block.h>
#include "md-cluster.h"
#define MaxSector (~(sector_t)0)
@@ -207,6 +208,7 @@ enum flag_bits {
* check if there is collision between raid1
* serial bios.
*/
+ Nonrot, /* non-rotational device (SSD) */
};
static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
@@ -222,6 +224,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
}
return 0;
}
+
+static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s,
+ int sectors)
+{
+ sector_t first_bad;
+ int bad_sectors;
+
+ return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors);
+}
+
extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
@@ -468,7 +480,6 @@ struct mddev {
struct timer_list safemode_timer;
struct percpu_ref writes_pending;
int sync_checkers; /* # of threads checking writes_pending */
- struct request_queue *queue; /* for plugging ... */
struct bitmap *bitmap; /* the bitmap for the device */
struct {
@@ -558,6 +569,37 @@ enum recovery_flags {
MD_RESYNCING_REMOTE, /* remote node is running resync thread */
};
+enum md_ro_state {
+ MD_RDWR,
+ MD_RDONLY,
+ MD_AUTO_READ,
+ MD_MAX_STATE
+};
+
+static inline bool md_is_rdwr(struct mddev *mddev)
+{
+ return (mddev->ro == MD_RDWR);
+}
+
+static inline bool reshape_interrupted(struct mddev *mddev)
+{
+	/* reshape never started */
+ if (mddev->reshape_position == MaxSector)
+ return false;
+
+ /* interrupted */
+ if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
+ return true;
+
+ /* running reshape will be interrupted soon. */
+ if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_INTR, &mddev->recovery) ||
+ test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
+ return true;
+
+ return false;
+}
+
static inline int __must_check mddev_lock(struct mddev *mddev)
{
return mutex_lock_interruptible(&mddev->reconfig_mutex);
@@ -617,6 +659,7 @@ struct md_personality
int (*start_reshape) (struct mddev *mddev);
void (*finish_reshape) (struct mddev *mddev);
void (*update_reshape_pos) (struct mddev *mddev);
+ void (*prepare_suspend) (struct mddev *mddev);
/* quiesce suspends or resumes internal processing.
* 1 - stop new actions and wait for action io to complete
* 0 - return to normal behaviour
@@ -750,6 +793,7 @@ extern void md_finish_reshape(struct mddev *mddev);
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
struct bio *bio, sector_t start, sector_t size);
void md_account_bio(struct mddev *mddev, struct bio **bio);
+void md_free_cloned_bio(struct bio *bio);
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
@@ -778,9 +822,12 @@ extern void md_stop_writes(struct mddev *mddev);
extern int md_rdev_init(struct md_rdev *rdev);
extern void md_rdev_clear(struct md_rdev *rdev);
-extern void md_handle_request(struct mddev *mddev, struct bio *bio);
+extern bool md_handle_request(struct mddev *mddev, struct bio *bio);
extern int mddev_suspend(struct mddev *mddev, bool interruptible);
extern void mddev_resume(struct mddev *mddev);
+extern void md_idle_sync_thread(struct mddev *mddev);
+extern void md_frozen_sync_thread(struct mddev *mddev);
+extern void md_unfrozen_sync_thread(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
@@ -821,7 +868,7 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio
{
if (bio_op(bio) == REQ_OP_WRITE_ZEROES &&
!bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors)
- mddev->queue->limits.max_write_zeroes_sectors = 0;
+ mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0;
}
static inline int mddev_suspend_and_lock(struct mddev *mddev)
@@ -860,7 +907,31 @@ void md_autostart_arrays(int part);
int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info);
int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info);
int do_md_run(struct mddev *mddev);
+void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim);
+int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev);
+void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes);
extern const struct block_device_operations md_fops;
+/*
+ * MD devices can be used underneath by DM, in which case ->gendisk is NULL.
+ */
+static inline bool mddev_is_dm(struct mddev *mddev)
+{
+ return !mddev->gendisk;
+}
+
+static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio,
+ sector_t sector)
+{
+ if (!mddev_is_dm(mddev))
+ trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector);
+}
+
+#define mddev_add_trace_msg(mddev, fmt, args...) \
+do { \
+ if (!mddev_is_dm(mddev)) \
+ blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \
+} while (0)
+
#endif /* _MD_MD_H */
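The three helpers added above encode a single invariant: when md runs underneath dm-raid there is no gendisk, so anything touching ->gendisk->queue or block tracing must be guarded. An illustrative caller (the function name is hypothetical):

/* sketch: how a personality would use the helpers when remapping a bio */
static void example_remap_and_submit(struct mddev *mddev, struct bio *bio,
				     sector_t orig_sector)
{
	/* both trace helpers degrade to no-ops when mddev_is_dm() is true */
	mddev_trace_remap(mddev, bio, orig_sector);
	mddev_add_trace_msg(mddev, "example: remap to %llu",
			    (unsigned long long)bio->bi_iter.bi_sector);
	submit_bio_noacct(bio);
}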
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index c50a7abda744..c5d4aeb68404 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -379,6 +379,19 @@ static void raid0_free(struct mddev *mddev, void *priv)
free_conf(mddev, conf);
}
+static int raid0_set_limits(struct mddev *mddev)
+{
+ struct queue_limits lim;
+
+ blk_set_stacking_limits(&lim);
+ lim.max_hw_sectors = mddev->chunk_sectors;
+ lim.max_write_zeroes_sectors = mddev->chunk_sectors;
+ lim.io_min = mddev->chunk_sectors << 9;
+ lim.io_opt = lim.io_min * mddev->raid_disks;
+ mddev_stack_rdev_limits(mddev, &lim);
+ return queue_limits_set(mddev->gendisk->queue, &lim);
+}
+
static int raid0_run(struct mddev *mddev)
{
struct r0conf *conf;
@@ -399,20 +412,10 @@ static int raid0_run(struct mddev *mddev)
mddev->private = conf;
}
conf = mddev->private;
- if (mddev->queue) {
- struct md_rdev *rdev;
-
- blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
-
- blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
- blk_queue_io_opt(mddev->queue,
- (mddev->chunk_sectors << 9) * mddev->raid_disks);
-
- rdev_for_each(rdev, mddev) {
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
- }
+ if (!mddev_is_dm(mddev)) {
+ ret = raid0_set_limits(mddev);
+ if (ret)
+ goto out_free_conf;
}
/* calculate array device size */
@@ -426,8 +429,10 @@ static int raid0_run(struct mddev *mddev)
ret = md_integrity_register(mddev);
if (ret)
- free_conf(mddev, conf);
-
+ goto out_free_conf;
+ return 0;
+out_free_conf:
+ free_conf(mddev, conf);
return ret;
}
@@ -578,10 +583,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio)
bio_set_dev(bio, tmp_dev->bdev);
bio->bi_iter.bi_sector = sector + zone->dev_start +
tmp_dev->data_offset;
-
- if (mddev->gendisk)
- trace_block_bio_remap(bio, disk_devt(mddev->gendisk),
- bio_sector);
+ mddev_trace_remap(mddev, bio, bio_sector);
mddev_check_write_zeroes(mddev, bio);
submit_bio_noacct(bio);
}
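raid0_set_limits() above is the per-driver face of the new atomic queue-limits API: instead of a sequence of blk_queue_* calls mutating the live queue, the limits are assembled in a local struct and published in one validated step. A generic sketch with hypothetical parameters:

static int example_set_limits(struct gendisk *disk,
			      unsigned int chunk_sectors, unsigned int disks)
{
	struct queue_limits lim;

	blk_set_stacking_limits(&lim);		/* stacking-driver defaults */
	lim.io_min = chunk_sectors << 9;	/* sectors to bytes */
	lim.io_opt = lim.io_min * disks;
	/* validate and publish the whole set atomically */
	return queue_limits_set(disk->queue, &lim);
}

For instance, a 1024-sector (512 KiB) chunk across 4 disks yields io_min = 512 KiB and io_opt = 2 MiB.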
diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c
index 512746551f36..2ea1710a3b70 100644
--- a/drivers/md/raid1-10.c
+++ b/drivers/md/raid1-10.c
@@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev)
return false;
}
+
+/**
+ * raid1_check_read_range() - check a given read range for bad blocks and
+ * return the readable length;
+ * @rdev: the rdev to read;
+ * @this_sector: read position;
+ * @len: read length;
+ *
+ * helper function for read_balance()
+ *
+ * 1) If there are no bad blocks in the range, @len is returned;
+ * 2) If the whole range consists of bad blocks, 0 is returned;
+ * 3) If there are partial bad blocks:
+ * - If the bad block range starts after @this_sector, the length of the
+ * first good region is returned;
+ * - If the bad block range starts before @this_sector, 0 is returned and
+ * @len is updated to the number of sectors before the good blocks start;
+ */
+static inline int raid1_check_read_range(struct md_rdev *rdev,
+ sector_t this_sector, int *len)
+{
+ sector_t first_bad;
+ int bad_sectors;
+
+ /* no bad block overlap */
+ if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors))
+ return *len;
+
+ /*
+ * bad block range starts offset into our range so we can return the
+ * number of sectors before the bad blocks start.
+ */
+ if (first_bad > this_sector)
+ return first_bad - this_sector;
+
+ /* read range is fully consumed by bad blocks. */
+ if (this_sector + *len <= first_bad + bad_sectors)
+ return 0;
+
+ /*
+ * final case, bad block range starts before or at the start of our
+ * range but does not cover our entire range so we still return 0 but
+ * update the length with the number of sectors before we get to the
+ * good ones.
+ */
+ *len = first_bad + bad_sectors - this_sector;
+ return 0;
+}
+
+/*
+ * Check if read should choose the first rdev.
+ *
+ * Balance on the whole device if no resync is going on (recovery is ok) or
+ * below the resync window. Otherwise, take the first readable disk.
+ */
+static inline bool raid1_should_read_first(struct mddev *mddev,
+ sector_t this_sector, int len)
+{
+	if (mddev->recovery_cp < this_sector + len)
+ return true;
+
+ if (mddev_is_clustered(mddev) &&
+ md_cluster_ops->area_resyncing(mddev, READ, this_sector,
+ this_sector + len))
+ return true;
+
+ return false;
+}
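A worked example of raid1_check_read_range() may help; the sector numbers are invented, with bad blocks covering [100, 120):

/*
 *   read [40, 100),  len=60 -> no overlap:        returns 60, len unchanged
 *   read [80, 140),  len=60 -> bad starts inside: returns 20 (100 - 80)
 *   read [100, 110), len=10 -> fully bad:         returns 0,  len unchanged
 *   read [110, 140), len=30 -> bad covers start:  returns 0,  len becomes 10
 */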
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 286f8b16c7bd..be8ac24f50b6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,9 +46,6 @@
static void allow_barrier(struct r1conf *conf, sector_t sector_nr);
static void lower_barrier(struct r1conf *conf, sector_t sector_nr);
-#define raid1_log(md, fmt, args...) \
- do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0)
-
#define RAID_1_10_NAME "raid1"
#include "raid1-10.c"
@@ -498,9 +495,6 @@ static void raid1_end_write_request(struct bio *bio)
* to user-side. So if something waits for IO, then it
* will wait for the 'master' bio.
*/
- sector_t first_bad;
- int bad_sectors;
-
r1_bio->bios[mirror] = NULL;
to_put = bio;
/*
@@ -516,8 +510,8 @@ static void raid1_end_write_request(struct bio *bio)
set_bit(R1BIO_Uptodate, &r1_bio->state);
/* Maybe we can clear some bad blocks. */
- if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
- &first_bad, &bad_sectors) && !discard_error) {
+ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+ !discard_error) {
r1_bio->bios[mirror] = IO_MADE_GOOD;
set_bit(R1BIO_MadeGood, &r1_bio->state);
}
@@ -582,211 +576,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector,
return len;
}
-/*
- * This routine returns the disk from which the requested read should
- * be done. There is a per-array 'next expected sequential IO' sector
- * number - if this matches on the next IO then we use the last disk.
- * There is also a per-disk 'last know head position' sector that is
- * maintained from IRQ contexts, both the normal and the resync IO
- * completion handlers update this position correctly. If there is no
- * perfect sequential match then we pick the disk whose head is closest.
- *
- * If there are 2 mirrors in the same 2 devices, performance degrades
- * because position is mirror, not device based.
- *
- * The rdev for the device selected will have nr_pending incremented.
- */
-static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors)
+static void update_read_sectors(struct r1conf *conf, int disk,
+ sector_t this_sector, int len)
{
- const sector_t this_sector = r1_bio->sector;
- int sectors;
- int best_good_sectors;
- int best_disk, best_dist_disk, best_pending_disk;
- int has_nonrot_disk;
+ struct raid1_info *info = &conf->mirrors[disk];
+
+ atomic_inc(&info->rdev->nr_pending);
+ if (info->next_seq_sect != this_sector)
+ info->seq_start = this_sector;
+ info->next_seq_sect = this_sector + len;
+}
+
+static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+ int *max_sectors)
+{
+ sector_t this_sector = r1_bio->sector;
+ int len = r1_bio->sectors;
int disk;
- sector_t best_dist;
- unsigned int min_pending;
- struct md_rdev *rdev;
- int choose_first;
- int choose_next_idle;
- /*
- * Check if we can balance. We can balance on the whole
- * device if no resync is going on, or below the resync window.
- * We take the first readable disk when above the resync window.
- */
- retry:
- sectors = r1_bio->sectors;
- best_disk = -1;
- best_dist_disk = -1;
- best_dist = MaxSector;
- best_pending_disk = -1;
- min_pending = UINT_MAX;
- best_good_sectors = 0;
- has_nonrot_disk = 0;
- choose_next_idle = 0;
- clear_bit(R1BIO_FailFast, &r1_bio->state);
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+ struct md_rdev *rdev;
+ int read_len;
- if ((conf->mddev->recovery_cp < this_sector + sectors) ||
- (mddev_is_clustered(conf->mddev) &&
- md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
- this_sector + sectors)))
- choose_first = 1;
- else
- choose_first = 0;
+ if (r1_bio->bios[disk] == IO_BLOCKED)
+ continue;
+
+ rdev = conf->mirrors[disk].rdev;
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ continue;
+
+ /* choose the first disk even if it has some bad blocks. */
+ read_len = raid1_check_read_range(rdev, this_sector, &len);
+ if (read_len > 0) {
+ update_read_sectors(conf, disk, this_sector, read_len);
+ *max_sectors = read_len;
+ return disk;
+ }
+ }
+
+ return -1;
+}
+
+static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+ int *max_sectors)
+{
+ sector_t this_sector = r1_bio->sector;
+ int best_disk = -1;
+ int best_len = 0;
+ int disk;
for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
- sector_t dist;
- sector_t first_bad;
- int bad_sectors;
- unsigned int pending;
- bool nonrot;
+ struct md_rdev *rdev;
+ int len;
+ int read_len;
+
+ if (r1_bio->bios[disk] == IO_BLOCKED)
+ continue;
rdev = conf->mirrors[disk].rdev;
- if (r1_bio->bios[disk] == IO_BLOCKED
- || rdev == NULL
- || test_bit(Faulty, &rdev->flags))
+ if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ test_bit(WriteMostly, &rdev->flags))
continue;
- if (!test_bit(In_sync, &rdev->flags) &&
- rdev->recovery_offset < this_sector + sectors)
+
+ /* keep track of the disk with the most readable sectors. */
+ len = r1_bio->sectors;
+ read_len = raid1_check_read_range(rdev, this_sector, &len);
+ if (read_len > best_len) {
+ best_disk = disk;
+ best_len = read_len;
+ }
+ }
+
+ if (best_disk != -1) {
+ *max_sectors = best_len;
+ update_read_sectors(conf, best_disk, this_sector, best_len);
+ }
+
+ return best_disk;
+}
+
+static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio,
+ int *max_sectors)
+{
+ sector_t this_sector = r1_bio->sector;
+ int bb_disk = -1;
+ int bb_read_len = 0;
+ int disk;
+
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+ struct md_rdev *rdev;
+ int len;
+ int read_len;
+
+ if (r1_bio->bios[disk] == IO_BLOCKED)
continue;
- if (test_bit(WriteMostly, &rdev->flags)) {
- /* Don't balance among write-mostly, just
- * use the first as a last resort */
- if (best_dist_disk < 0) {
- if (is_badblock(rdev, this_sector, sectors,
- &first_bad, &bad_sectors)) {
- if (first_bad <= this_sector)
- /* Cannot use this */
- continue;
- best_good_sectors = first_bad - this_sector;
- } else
- best_good_sectors = sectors;
- best_dist_disk = disk;
- best_pending_disk = disk;
- }
+
+ rdev = conf->mirrors[disk].rdev;
+ if (!rdev || test_bit(Faulty, &rdev->flags) ||
+ !test_bit(WriteMostly, &rdev->flags))
continue;
+
+		/* use this disk if it has no bad blocks in the range */
+ len = r1_bio->sectors;
+ read_len = raid1_check_read_range(rdev, this_sector, &len);
+ if (read_len == r1_bio->sectors) {
+ update_read_sectors(conf, disk, this_sector, read_len);
+ return disk;
}
- /* This is a reasonable device to use. It might
- * even be best.
+
+ /*
+	 * there are partial bad blocks, choose the rdev with the largest
+	 * read length.
*/
- if (is_badblock(rdev, this_sector, sectors,
- &first_bad, &bad_sectors)) {
- if (best_dist < MaxSector)
- /* already have a better device */
- continue;
- if (first_bad <= this_sector) {
- /* cannot read here. If this is the 'primary'
- * device, then we must not read beyond
- * bad_sectors from another device..
- */
- bad_sectors -= (this_sector - first_bad);
- if (choose_first && sectors > bad_sectors)
- sectors = bad_sectors;
- if (best_good_sectors > sectors)
- best_good_sectors = sectors;
-
- } else {
- sector_t good_sectors = first_bad - this_sector;
- if (good_sectors > best_good_sectors) {
- best_good_sectors = good_sectors;
- best_disk = disk;
- }
- if (choose_first)
- break;
- }
- continue;
- } else {
- if ((sectors > best_good_sectors) && (best_disk >= 0))
- best_disk = -1;
- best_good_sectors = sectors;
+ if (read_len > bb_read_len) {
+ bb_disk = disk;
+ bb_read_len = read_len;
}
+ }
+
+ if (bb_disk != -1) {
+ *max_sectors = bb_read_len;
+ update_read_sectors(conf, bb_disk, this_sector, bb_read_len);
+ }
+
+ return bb_disk;
+}
+
+static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio)
+{
+ /* TODO: address issues with this check and concurrency. */
+ return conf->mirrors[disk].next_seq_sect == r1_bio->sector ||
+ conf->mirrors[disk].head_position == r1_bio->sector;
+}
+
+/*
+ * If buffered sequential IO size exceeds the optimal iosize, check if there
+ * is an idle disk. If yes, choose the idle disk.
+ */
+static bool should_choose_next(struct r1conf *conf, int disk)
+{
+ struct raid1_info *mirror = &conf->mirrors[disk];
+ int opt_iosize;
+
+ if (!test_bit(Nonrot, &mirror->rdev->flags))
+ return false;
+
+ opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9;
+ return opt_iosize > 0 && mirror->seq_start != MaxSector &&
+ mirror->next_seq_sect > opt_iosize &&
+ mirror->next_seq_sect - opt_iosize >= mirror->seq_start;
+}
+
+static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio)
+{
+ if (!rdev || test_bit(Faulty, &rdev->flags))
+ return false;
+
+ /* still in recovery */
+ if (!test_bit(In_sync, &rdev->flags) &&
+ rdev->recovery_offset < r1_bio->sector + r1_bio->sectors)
+ return false;
+
+	/* don't read from slow disk unless we have to */
+ if (test_bit(WriteMostly, &rdev->flags))
+ return false;
+
+	/* don't split IO for bad blocks unless we have to */
+ if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors))
+ return false;
+
+ return true;
+}
+
+struct read_balance_ctl {
+ sector_t closest_dist;
+ int closest_dist_disk;
+ int min_pending;
+ int min_pending_disk;
+ int sequential_disk;
+ int readable_disks;
+};
+
+static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio)
+{
+ int disk;
+ struct read_balance_ctl ctl = {
+ .closest_dist_disk = -1,
+ .closest_dist = MaxSector,
+ .min_pending_disk = -1,
+ .min_pending = UINT_MAX,
+ .sequential_disk = -1,
+ };
+
+ for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
+ struct md_rdev *rdev;
+ sector_t dist;
+ unsigned int pending;
- if (best_disk >= 0)
- /* At least two disks to choose from so failfast is OK */
+ if (r1_bio->bios[disk] == IO_BLOCKED)
+ continue;
+
+ rdev = conf->mirrors[disk].rdev;
+ if (!rdev_readable(rdev, r1_bio))
+ continue;
+
+ /* At least two disks to choose from so failfast is OK */
+ if (ctl.readable_disks++ == 1)
set_bit(R1BIO_FailFast, &r1_bio->state);
- nonrot = bdev_nonrot(rdev->bdev);
- has_nonrot_disk |= nonrot;
pending = atomic_read(&rdev->nr_pending);
- dist = abs(this_sector - conf->mirrors[disk].head_position);
- if (choose_first) {
- best_disk = disk;
- break;
- }
+ dist = abs(r1_bio->sector - conf->mirrors[disk].head_position);
+
/* Don't change to another disk for sequential reads */
- if (conf->mirrors[disk].next_seq_sect == this_sector
- || dist == 0) {
- int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
- struct raid1_info *mirror = &conf->mirrors[disk];
+ if (is_sequential(conf, disk, r1_bio)) {
+ if (!should_choose_next(conf, disk))
+ return disk;
- best_disk = disk;
/*
- * If buffered sequential IO size exceeds optimal
- * iosize, check if there is idle disk. If yes, choose
- * the idle disk. read_balance could already choose an
- * idle disk before noticing it's a sequential IO in
- * this disk. This doesn't matter because this disk
- * will idle, next time it will be utilized after the
- * first disk has IO size exceeds optimal iosize. In
- * this way, iosize of the first disk will be optimal
- * iosize at least. iosize of the second disk might be
- * small, but not a big deal since when the second disk
- * starts IO, the first disk is likely still busy.
+			 * Add 'pending' to avoid choosing this disk if
+			 * there is another idle disk.
*/
- if (nonrot && opt_iosize > 0 &&
- mirror->seq_start != MaxSector &&
- mirror->next_seq_sect > opt_iosize &&
- mirror->next_seq_sect - opt_iosize >=
- mirror->seq_start) {
- choose_next_idle = 1;
- continue;
- }
- break;
+ pending++;
+ /*
+ * If there is no other idle disk, this disk
+ * will be chosen.
+ */
+ ctl.sequential_disk = disk;
}
- if (choose_next_idle)
- continue;
-
- if (min_pending > pending) {
- min_pending = pending;
- best_pending_disk = disk;
+ if (ctl.min_pending > pending) {
+ ctl.min_pending = pending;
+ ctl.min_pending_disk = disk;
}
- if (dist < best_dist) {
- best_dist = dist;
- best_dist_disk = disk;
+ if (ctl.closest_dist > dist) {
+ ctl.closest_dist = dist;
+ ctl.closest_dist_disk = disk;
}
}
/*
+	 * Sequential IO size exceeds the optimal iosize; however, there is no
+	 * other idle disk, so choose the sequential disk.
+ */
+ if (ctl.sequential_disk != -1 && ctl.min_pending != 0)
+ return ctl.sequential_disk;
+
+ /*
	 * If all disks are rotational, choose the closest disk. If any disk is
	 * non-rotational, choose the disk with fewer pending requests even if
	 * the disk is rotational, which might/might not be optimal for raids
	 * with mixed rotational/non-rotational disks depending on workload.
*/
- if (best_disk == -1) {
- if (has_nonrot_disk || min_pending == 0)
- best_disk = best_pending_disk;
- else
- best_disk = best_dist_disk;
- }
+ if (ctl.min_pending_disk != -1 &&
+ (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0))
+ return ctl.min_pending_disk;
+ else
+ return ctl.closest_dist_disk;
+}
- if (best_disk >= 0) {
- rdev = conf->mirrors[best_disk].rdev;
- if (!rdev)
- goto retry;
- atomic_inc(&rdev->nr_pending);
- sectors = best_good_sectors;
+/*
+ * This routine returns the disk from which the requested read should be done.
+ *
+ * 1) If resync is in progress, find the first usable disk and use it even if it
+ * has some bad blocks.
+ *
+ * 2) Now that there is no resync, loop through all disks, skipping slow
+ * disks and disks with bad blocks for now; only the choice of the best
+ * disk matters here.
+ *
+ * 3) If we've made it this far, look for disks with bad blocks and choose
+ * the one with the most readable sectors.
+ *
+ * 4) If we are all the way at the end, we have no choice but to use a disk
+ * even if it is write-mostly.
+ *
+ * The rdev for the device selected will have nr_pending incremented.
+ */
+static int read_balance(struct r1conf *conf, struct r1bio *r1_bio,
+ int *max_sectors)
+{
+ int disk;
- if (conf->mirrors[best_disk].next_seq_sect != this_sector)
- conf->mirrors[best_disk].seq_start = this_sector;
+ clear_bit(R1BIO_FailFast, &r1_bio->state);
+
+ if (raid1_should_read_first(conf->mddev, r1_bio->sector,
+ r1_bio->sectors))
+ return choose_first_rdev(conf, r1_bio, max_sectors);
- conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
+ disk = choose_best_rdev(conf, r1_bio);
+ if (disk >= 0) {
+ *max_sectors = r1_bio->sectors;
+ update_read_sectors(conf, disk, r1_bio->sector,
+ r1_bio->sectors);
+ return disk;
}
- *max_sectors = sectors;
- return best_disk;
+ /*
+	 * If we are here, it means we didn't find a perfectly good disk, so
+	 * now spend a bit more time trying to find one with the most good
+	 * sectors.
+ */
+ disk = choose_bb_rdev(conf, r1_bio, max_sectors);
+ if (disk >= 0)
+ return disk;
+
+ return choose_slow_rdev(conf, r1_bio, max_sectors);
}
static void wake_up_barrier(struct r1conf *conf)
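Taken together, the rewritten read_balance() is now a four-stage cascade; a paraphrase of the order (not literal patch code):

/*
 *   1. raid1_should_read_first() -> choose_first_rdev(): inside the resync
 *      window, take the first readable disk, bad blocks tolerated.
 *   2. choose_best_rdev(): distance / pending-IO balancing over disks with
 *      no bad blocks in range, write-mostly disks skipped.
 *   3. choose_bb_rdev(): fall back to disks with bad blocks, picking the
 *      one that offers the longest readable stretch.
 *   4. choose_slow_rdev(): last resort, read from a write-mostly disk.
 */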
@@ -1098,7 +1193,7 @@ static void freeze_array(struct r1conf *conf, int extra)
*/
spin_lock_irq(&conf->resync_lock);
conf->array_frozen = 1;
- raid1_log(conf->mddev, "wait freeze");
+ mddev_add_trace_msg(conf->mddev, "raid1 wait freeze");
wait_event_lock_irq_cmd(
conf->wait_barrier,
get_unqueued_pending(conf) == extra,
@@ -1287,7 +1382,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
*/
- raid1_log(mddev, "wait behind writes");
+ mddev_add_trace_msg(mddev, "raid1 wait behind writes");
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
}
@@ -1320,11 +1415,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r1_bio;
-
- if (mddev->gendisk)
- trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
- r1_bio->sector);
-
+ mddev_trace_remap(mddev, read_bio, r1_bio->sector);
submit_bio_noacct(read_bio);
}
@@ -1474,7 +1565,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
bio_wouldblock_error(bio);
return;
}
- raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
+ mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked",
+ blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, bio->bi_iter.bi_sector, false);
goto retry_write;
@@ -1557,10 +1649,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio,
mbio->bi_private = r1_bio;
atomic_inc(&r1_bio->remaining);
-
- if (mddev->gendisk)
- trace_block_bio_remap(mbio, disk_devt(mddev->gendisk),
- r1_bio->sector);
+ mddev_trace_remap(mddev, mbio, r1_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev;
if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) {
@@ -1760,6 +1849,52 @@ static int raid1_spare_active(struct mddev *mddev)
return count;
}
+static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk,
+ bool replacement)
+{
+ struct raid1_info *info = conf->mirrors + disk;
+
+ if (replacement)
+ info += conf->raid_disks;
+
+ if (info->rdev)
+ return false;
+
+ if (bdev_nonrot(rdev->bdev)) {
+ set_bit(Nonrot, &rdev->flags);
+ WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
+ }
+
+ rdev->raid_disk = disk;
+ info->head_position = 0;
+ info->seq_start = MaxSector;
+ WRITE_ONCE(info->rdev, rdev);
+
+ return true;
+}
+
+static bool raid1_remove_conf(struct r1conf *conf, int disk)
+{
+ struct raid1_info *info = conf->mirrors + disk;
+ struct md_rdev *rdev = info->rdev;
+
+ if (!rdev || test_bit(In_sync, &rdev->flags) ||
+ atomic_read(&rdev->nr_pending))
+ return false;
+
+ /* Only remove non-faulty devices if recovery is not possible. */
+ if (!test_bit(Faulty, &rdev->flags) &&
+ rdev->mddev->recovery_disabled != conf->recovery_disabled &&
+ rdev->mddev->degraded < conf->raid_disks)
+ return false;
+
+ if (test_and_clear_bit(Nonrot, &rdev->flags))
+ WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1);
+
+ WRITE_ONCE(info->rdev, NULL);
+ return true;
+}
+
static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
{
struct r1conf *conf = mddev->private;
@@ -1791,19 +1926,16 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
for (mirror = first; mirror <= last; mirror++) {
p = conf->mirrors + mirror;
if (!p->rdev) {
- if (mddev->gendisk)
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
+ err = mddev_stack_new_rdev(mddev, rdev);
+ if (err)
+ return err;
- p->head_position = 0;
- rdev->raid_disk = mirror;
- err = 0;
+ raid1_add_conf(conf, rdev, mirror, false);
			/* As all devices are equivalent, we don't need a full recovery
			 * if this device was recently a member of the array
			 */
if (rdev->saved_raid_disk < 0)
conf->fullsync = 1;
- WRITE_ONCE(p->rdev, rdev);
break;
}
if (test_bit(WantReplacement, &p->rdev->flags) &&
@@ -1813,13 +1945,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
if (err && repl_slot >= 0) {
/* Add this device as a replacement */
- p = conf->mirrors + repl_slot;
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
- rdev->raid_disk = repl_slot;
+ raid1_add_conf(conf, rdev, repl_slot, true);
err = 0;
conf->fullsync = 1;
- WRITE_ONCE(p[conf->raid_disks].rdev, rdev);
}
print_conf(conf);
@@ -1836,27 +1966,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
if (unlikely(number >= conf->raid_disks))
goto abort;
- if (rdev != p->rdev)
- p = conf->mirrors + conf->raid_disks + number;
+ if (rdev != p->rdev) {
+ number += conf->raid_disks;
+ p = conf->mirrors + number;
+ }
print_conf(conf);
if (rdev == p->rdev) {
- if (test_bit(In_sync, &rdev->flags) ||
- atomic_read(&rdev->nr_pending)) {
+ if (!raid1_remove_conf(conf, number)) {
err = -EBUSY;
goto abort;
}
- /* Only remove non-faulty devices if recovery
- * is not possible.
- */
- if (!test_bit(Faulty, &rdev->flags) &&
- mddev->recovery_disabled != conf->recovery_disabled &&
- mddev->degraded < conf->raid_disks) {
- err = -EBUSY;
- goto abort;
- }
- WRITE_ONCE(p->rdev, NULL);
- if (conf->mirrors[conf->raid_disks + number].rdev) {
+
+ if (number < conf->raid_disks &&
+ conf->mirrors[conf->raid_disks + number].rdev) {
/* We just removed a device that is being replaced.
* Move down the replacement. We drain all IO before
* doing this to avoid confusion.
@@ -1944,8 +2067,6 @@ static void end_sync_write(struct bio *bio)
struct r1bio *r1_bio = get_resync_r1bio(bio);
struct mddev *mddev = r1_bio->mddev;
struct r1conf *conf = mddev->private;
- sector_t first_bad;
- int bad_sectors;
struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev;
if (!uptodate) {
@@ -1955,14 +2076,11 @@ static void end_sync_write(struct bio *bio)
set_bit(MD_RECOVERY_NEEDED, &
mddev->recovery);
set_bit(R1BIO_WriteError, &r1_bio->state);
- } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors,
- &first_bad, &bad_sectors) &&
- !is_badblock(conf->mirrors[r1_bio->read_disk].rdev,
- r1_bio->sector,
- r1_bio->sectors,
- &first_bad, &bad_sectors)
- )
+ } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) &&
+ !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev,
+ r1_bio->sector, r1_bio->sectors)) {
set_bit(R1BIO_MadeGood, &r1_bio->state);
+ }
put_sync_write_buf(r1_bio, uptodate);
}
@@ -2279,16 +2397,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio)
s = PAGE_SIZE >> 9;
do {
- sector_t first_bad;
- int bad_sectors;
-
rdev = conf->mirrors[d].rdev;
if (rdev &&
(test_bit(In_sync, &rdev->flags) ||
(!test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sect + s)) &&
- is_badblock(rdev, sect, s,
- &first_bad, &bad_sectors) == 0) {
+ rdev_has_badblock(rdev, sect, s) == 0) {
atomic_inc(&rdev->nr_pending);
if (sync_page_io(rdev, sect, s<<9,
conf->tmppage, REQ_OP_READ, false))
@@ -3006,23 +3120,17 @@ static struct r1conf *setup_conf(struct mddev *mddev)
err = -EINVAL;
spin_lock_init(&conf->device_lock);
+ conf->raid_disks = mddev->raid_disks;
rdev_for_each(rdev, mddev) {
int disk_idx = rdev->raid_disk;
- if (disk_idx >= mddev->raid_disks
- || disk_idx < 0)
+
+ if (disk_idx >= conf->raid_disks || disk_idx < 0)
continue;
- if (test_bit(Replacement, &rdev->flags))
- disk = conf->mirrors + mddev->raid_disks + disk_idx;
- else
- disk = conf->mirrors + disk_idx;
- if (disk->rdev)
+ if (!raid1_add_conf(conf, rdev, disk_idx,
+ test_bit(Replacement, &rdev->flags)))
goto abort;
- disk->rdev = rdev;
- disk->head_position = 0;
- disk->seq_start = MaxSector;
}
- conf->raid_disks = mddev->raid_disks;
conf->mddev = mddev;
INIT_LIST_HEAD(&conf->retry_list);
INIT_LIST_HEAD(&conf->bio_end_io_list);
@@ -3086,12 +3194,21 @@ static struct r1conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
+static int raid1_set_limits(struct mddev *mddev)
+{
+ struct queue_limits lim;
+
+ blk_set_stacking_limits(&lim);
+ lim.max_write_zeroes_sectors = 0;
+ mddev_stack_rdev_limits(mddev, &lim);
+ return queue_limits_set(mddev->gendisk->queue, &lim);
+}
+
static void raid1_free(struct mddev *mddev, void *priv);
static int raid1_run(struct mddev *mddev)
{
struct r1conf *conf;
int i;
- struct md_rdev *rdev;
int ret;
if (mddev->level != 1) {
@@ -3118,14 +3235,10 @@ static int raid1_run(struct mddev *mddev)
if (IS_ERR(conf))
return PTR_ERR(conf);
- if (mddev->queue)
- blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-
- rdev_for_each(rdev, mddev) {
- if (!mddev->gendisk)
- continue;
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
+ if (!mddev_is_dm(mddev)) {
+ ret = raid1_set_limits(mddev);
+ if (ret)
+ goto abort;
}
mddev->degraded = 0;
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 14d4211a123a..5300cbaa58a4 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -71,6 +71,7 @@ struct r1conf {
* allow for replacements.
*/
int raid_disks;
+ int nonrot_disks;
spinlock_t device_lock;
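nonrot_disks replaces the per-call bdev_nonrot() scan in read_balance(); it is maintained with WRITE_ONCE() in raid1_add_conf()/raid1_remove_conf() and read with READ_ONCE() in choose_best_rdev(). The pairing, restated (illustrative, not literal patch code):

/*
 *   writer (array reconfiguration paths):
 *       WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1);
 *   lockless reader (hot read path):
 *       if (READ_ONCE(conf->nonrot_disks)) ...
 */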
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index a5f8419e2df1..a4556d2e46bf 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -76,9 +76,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
static void end_reshape_write(struct bio *bio);
static void end_reshape(struct r10conf *conf);
-#define raid10_log(md, fmt, args...) \
- do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0)
-
#include "raid1-10.c"
#define NULL_CMD
@@ -518,11 +515,7 @@ static void raid10_end_write_request(struct bio *bio)
* The 'master' represents the composite IO operation to
* user-side. So if something waits for IO, then it will
* wait for the 'master' bio.
- */
- sector_t first_bad;
- int bad_sectors;
-
- /*
+ *
* Do not set R10BIO_Uptodate if the current device is
* rebuilding or Faulty. This is because we cannot use
* such device for properly reading the data back (we could
@@ -535,10 +528,9 @@ static void raid10_end_write_request(struct bio *bio)
set_bit(R10BIO_Uptodate, &r10_bio->state);
/* Maybe we can clear some bad blocks. */
- if (is_badblock(rdev,
- r10_bio->devs[slot].addr,
- r10_bio->sectors,
- &first_bad, &bad_sectors) && !discard_error) {
+ if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+ r10_bio->sectors) &&
+ !discard_error) {
bio_put(bio);
if (repl)
r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
@@ -753,17 +745,8 @@ static struct md_rdev *read_balance(struct r10conf *conf,
best_good_sectors = 0;
do_balance = 1;
clear_bit(R10BIO_FailFast, &r10_bio->state);
- /*
- * Check if we can balance. We can balance on the whole
- * device if no resync is going on (recovery is ok), or below
- * the resync window. We take the first readable disk when
- * above the resync window.
- */
- if ((conf->mddev->recovery_cp < MaxSector
- && (this_sector + sectors >= conf->next_resync)) ||
- (mddev_is_clustered(conf->mddev) &&
- md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector,
- this_sector + sectors)))
+
+ if (raid1_should_read_first(conf->mddev, this_sector, sectors))
do_balance = 0;
for (slot = 0; slot < conf->copies ; slot++) {
@@ -1033,7 +1016,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait)
ret = false;
} else {
conf->nr_waiting++;
- raid10_log(conf->mddev, "wait barrier");
+ mddev_add_trace_msg(conf->mddev, "raid10 wait barrier");
wait_event_barrier(conf, stop_waiting_barrier(conf));
conf->nr_waiting--;
}
@@ -1152,7 +1135,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf,
bio_wouldblock_error(bio);
return false;
}
- raid10_log(conf->mddev, "wait reshape");
+ mddev_add_trace_msg(conf->mddev, "raid10 wait reshape");
wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_iter.bi_sector ||
conf->reshape_progress >= bio->bi_iter.bi_sector +
@@ -1249,10 +1232,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r10_bio;
-
- if (mddev->gendisk)
- trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk),
- r10_bio->sector);
+ mddev_trace_remap(mddev, read_bio, r10_bio->sector);
submit_bio_noacct(read_bio);
return;
}
@@ -1288,10 +1268,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
&& enough(conf, devnum))
mbio->bi_opf |= MD_FAILFAST;
mbio->bi_private = r10_bio;
-
- if (conf->mddev->gendisk)
- trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk),
- r10_bio->sector);
+ mddev_trace_remap(mddev, mbio, r10_bio->sector);
/* flush_pending_writes() needs access to the rdev so...*/
mbio->bi_bdev = (void *)rdev;
@@ -1330,10 +1307,7 @@ retry_wait:
}
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
- sector_t first_bad;
sector_t dev_sector = r10_bio->devs[i].addr;
- int bad_sectors;
- int is_bad;
/*
* Discard request doesn't care the write result
@@ -1342,9 +1316,8 @@ retry_wait:
if (!r10_bio->sectors)
continue;
- is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
- &first_bad, &bad_sectors);
- if (is_bad < 0) {
+ if (rdev_has_badblock(rdev, dev_sector,
+ r10_bio->sectors) < 0) {
/*
* Mustn't write here until the bad block
* is acknowledged
@@ -1360,8 +1333,9 @@ retry_wait:
if (unlikely(blocked_rdev)) {
/* Have to wait for this device to get unblocked, then retry */
allow_barrier(conf);
- raid10_log(conf->mddev, "%s wait rdev %d blocked",
- __func__, blocked_rdev->raid_disk);
+ mddev_add_trace_msg(conf->mddev,
+ "raid10 %s wait rdev %d blocked",
+ __func__, blocked_rdev->raid_disk);
md_wait_for_blocked_rdev(blocked_rdev, mddev);
wait_barrier(conf, false);
goto retry_wait;
@@ -1416,7 +1390,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
bio_wouldblock_error(bio);
return;
}
- raid10_log(conf->mddev, "wait reshape metadata");
+ mddev_add_trace_msg(conf->mddev,
+ "raid10 wait reshape metadata");
wait_event(mddev->sb_wait,
!test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
@@ -2131,10 +2106,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
continue;
}
- if (mddev->gendisk)
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
-
+ err = mddev_stack_new_rdev(mddev, rdev);
+ if (err)
+ return err;
p->head_position = 0;
p->recovery_disabled = mddev->recovery_disabled - 1;
rdev->raid_disk = mirror;
@@ -2150,10 +2124,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
clear_bit(In_sync, &rdev->flags);
set_bit(Replacement, &rdev->flags);
rdev->raid_disk = repl_slot;
- err = 0;
- if (mddev->gendisk)
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
+ err = mddev_stack_new_rdev(mddev, rdev);
+ if (err)
+ return err;
conf->fullsync = 1;
WRITE_ONCE(p->replacement, rdev);
}
@@ -2290,8 +2263,6 @@ static void end_sync_write(struct bio *bio)
struct mddev *mddev = r10_bio->mddev;
struct r10conf *conf = mddev->private;
int d;
- sector_t first_bad;
- int bad_sectors;
int slot;
int repl;
struct md_rdev *rdev = NULL;
@@ -2312,11 +2283,10 @@ static void end_sync_write(struct bio *bio)
&rdev->mddev->recovery);
set_bit(R10BIO_WriteError, &r10_bio->state);
}
- } else if (is_badblock(rdev,
- r10_bio->devs[slot].addr,
- r10_bio->sectors,
- &first_bad, &bad_sectors))
+ } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr,
+ r10_bio->sectors)) {
set_bit(R10BIO_MadeGood, &r10_bio->state);
+ }
rdev_dec_pending(rdev, mddev);
@@ -2597,11 +2567,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
int sectors, struct page *page, enum req_op op)
{
- sector_t first_bad;
- int bad_sectors;
-
- if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
- && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
+ if (rdev_has_badblock(rdev, sector, sectors) &&
+ (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags)))
return -1;
if (sync_page_io(rdev, sector, sectors << 9, page, op, false))
/* success */
@@ -2658,16 +2625,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
s = PAGE_SIZE >> 9;
do {
- sector_t first_bad;
- int bad_sectors;
-
d = r10_bio->devs[sl].devnum;
rdev = conf->mirrors[d].rdev;
if (rdev &&
test_bit(In_sync, &rdev->flags) &&
!test_bit(Faulty, &rdev->flags) &&
- is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
- &first_bad, &bad_sectors) == 0) {
+ rdev_has_badblock(rdev,
+ r10_bio->devs[sl].addr + sect,
+ s) == 0) {
atomic_inc(&rdev->nr_pending);
success = sync_page_io(rdev,
r10_bio->devs[sl].addr +
@@ -4002,14 +3967,26 @@ static struct r10conf *setup_conf(struct mddev *mddev)
return ERR_PTR(err);
}
-static void raid10_set_io_opt(struct r10conf *conf)
+static unsigned int raid10_nr_stripes(struct r10conf *conf)
{
- int raid_disks = conf->geo.raid_disks;
+ unsigned int raid_disks = conf->geo.raid_disks;
- if (!(conf->geo.raid_disks % conf->geo.near_copies))
- raid_disks /= conf->geo.near_copies;
- blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) *
- raid_disks);
+ if (conf->geo.raid_disks % conf->geo.near_copies)
+ return raid_disks;
+ return raid_disks / conf->geo.near_copies;
+}
+
+static int raid10_set_queue_limits(struct mddev *mddev)
+{
+ struct r10conf *conf = mddev->private;
+ struct queue_limits lim;
+
+ blk_set_stacking_limits(&lim);
+ lim.max_write_zeroes_sectors = 0;
+ lim.io_min = mddev->chunk_sectors << 9;
+ lim.io_opt = lim.io_min * raid10_nr_stripes(conf);
+ mddev_stack_rdev_limits(mddev, &lim);
+ return queue_limits_set(mddev->gendisk->queue, &lim);
}
static int raid10_run(struct mddev *mddev)
@@ -4021,6 +3998,7 @@ static int raid10_run(struct mddev *mddev)
sector_t size;
sector_t min_offset_diff = 0;
int first = 1;
+ int ret = -EIO;
if (mddev->private == NULL) {
conf = setup_conf(mddev);
@@ -4047,12 +4025,6 @@ static int raid10_run(struct mddev *mddev)
}
}
- if (mddev->queue) {
- blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
- blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
- raid10_set_io_opt(conf);
- }
-
rdev_for_each(rdev, mddev) {
long long diff;
@@ -4081,14 +4053,16 @@ static int raid10_run(struct mddev *mddev)
if (first || diff < min_offset_diff)
min_offset_diff = diff;
- if (mddev->gendisk)
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
-
disk->head_position = 0;
first = 0;
}
+ if (!mddev_is_dm(conf->mddev)) {
+ ret = raid10_set_queue_limits(mddev);
+ if (ret)
+ goto out_free_conf;
+ }
+
/* need to check that every block has at least one working mirror */
if (!enough(conf, -1)) {
pr_err("md/raid10:%s: not enough operational mirrors.\n",
@@ -4185,7 +4159,7 @@ out_free_conf:
raid10_free_conf(conf);
mddev->private = NULL;
out:
- return -EIO;
+ return ret;
}
static void raid10_free(struct mddev *mddev, void *priv)
@@ -4954,8 +4928,7 @@ static void end_reshape(struct r10conf *conf)
conf->reshape_safe = MaxSector;
spin_unlock_irq(&conf->device_lock);
- if (conf->mddev->queue)
- raid10_set_io_opt(conf);
+ mddev_update_io_opt(conf->mddev, raid10_nr_stripes(conf));
conf->fullsync = 0;
}
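raid10_nr_stripes() only divides out near_copies when they evenly divide the disk count, so io_opt stays a whole multiple of io_min. A worked example with invented numbers:

/*
 *   6 disks, near_copies = 2, chunk = 1024 sectors (512 KiB):
 *       raid10_nr_stripes() = 6 / 2 = 3
 *       lim.io_min = 1024 << 9 = 512 KiB
 *       lim.io_opt = 512 KiB * 3 = 1.5 MiB
 *   7 disks, near_copies = 2: 7 % 2 != 0, so all 7 disks are counted.
 */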
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index da4ba736c4f0..a70cbec12ed0 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1393,7 +1393,8 @@ int ppl_init_log(struct r5conf *conf)
ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid));
ppl_conf->block_size = 512;
} else {
- ppl_conf->block_size = queue_logical_block_size(mddev->queue);
+ ppl_conf->block_size =
+ queue_logical_block_size(mddev->gendisk->queue);
}
for (i = 0; i < ppl_conf->count; i++) {
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 6a7a32f7fb91..d874abfc1836 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -36,6 +36,7 @@
*/
#include <linux/blkdev.h>
+#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/raid/pq.h>
#include <linux/async_tx.h>
@@ -760,6 +761,7 @@ enum stripe_result {
STRIPE_RETRY,
STRIPE_SCHEDULE_AND_RETRY,
STRIPE_FAIL,
+ STRIPE_WAIT_RESHAPE,
};
struct stripe_request_ctx {
@@ -1210,10 +1212,8 @@ again:
*/
while (op_is_write(op) && rdev &&
test_bit(WriteErrorSeen, &rdev->flags)) {
- sector_t first_bad;
- int bad_sectors;
- int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors);
+ int bad = rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf));
if (!bad)
break;
@@ -1295,10 +1295,7 @@ again:
if (rrdev)
set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags);
- if (conf->mddev->gendisk)
- trace_block_bio_remap(bi,
- disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+ mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector);
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, bi);
else
@@ -1342,10 +1339,7 @@ again:
*/
if (op == REQ_OP_DISCARD)
rbi->bi_vcnt = 0;
- if (conf->mddev->gendisk)
- trace_block_bio_remap(rbi,
- disk_devt(conf->mddev->gendisk),
- sh->dev[i].sector);
+ mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector);
if (should_defer && op_is_write(op))
bio_list_add(&pending_bios, rbi);
else
@@ -2412,7 +2406,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp)
atomic_inc(&conf->active_stripes);
raid5_release_stripe(sh);
- conf->max_nr_stripes++;
+ WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1);
return 1;
}
@@ -2422,12 +2416,12 @@ static int grow_stripes(struct r5conf *conf, int num)
size_t namelen = sizeof(conf->cache_name[0]);
int devs = max(conf->raid_disks, conf->previous_raid_disks);
- if (conf->mddev->gendisk)
+ if (mddev_is_dm(conf->mddev))
snprintf(conf->cache_name[0], namelen,
- "raid%d-%s", conf->level, mdname(conf->mddev));
+ "raid%d-%p", conf->level, conf->mddev);
else
snprintf(conf->cache_name[0], namelen,
- "raid%d-%p", conf->level, conf->mddev);
+ "raid%d-%s", conf->level, mdname(conf->mddev));
snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]);
conf->active_name = 0;
@@ -2707,7 +2701,7 @@ static int drop_one_stripe(struct r5conf *conf)
shrink_buffers(sh);
free_stripe(conf->slab_cache, sh);
atomic_dec(&conf->active_stripes);
- conf->max_nr_stripes--;
+ WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1);
return 1;
}
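The WRITE_ONCE() conversions for max_nr_stripes imply a lockless reader somewhere; presumably the stripe-cache shrinker's count path reads the value without the cache lock (an assumption on my part, since the reader is not shown in these hunks). The intended pairing would look like:

/*
 *   writer (stripe cache grows/shrinks, under the appropriate mutex):
 *       WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1);
 *   lockless reader:
 *       int max = READ_ONCE(conf->max_nr_stripes);
 */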
@@ -2855,8 +2849,6 @@ static void raid5_end_write_request(struct bio *bi)
struct r5conf *conf = sh->raid_conf;
int disks = sh->disks, i;
struct md_rdev *rdev;
- sector_t first_bad;
- int bad_sectors;
int replacement = 0;
for (i = 0 ; i < disks; i++) {
@@ -2888,9 +2880,8 @@ static void raid5_end_write_request(struct bio *bi)
if (replacement) {
if (bi->bi_status)
md_error(conf->mddev, rdev);
- else if (is_badblock(rdev, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors))
+ else if (rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_MadeGoodRepl, &sh->dev[i].flags);
} else {
if (bi->bi_status) {
@@ -2900,9 +2891,8 @@ static void raid5_end_write_request(struct bio *bi)
if (!test_and_set_bit(WantReplacement, &rdev->flags))
set_bit(MD_RECOVERY_NEEDED,
&rdev->mddev->recovery);
- } else if (is_badblock(rdev, sh->sector,
- RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors)) {
+ } else if (rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf))) {
set_bit(R5_MadeGood, &sh->dev[i].flags);
if (test_bit(R5_ReadError, &sh->dev[i].flags))
/* That was a successful write so make
@@ -4205,10 +4195,9 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_HANDLE, &sh->state);
if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) {
/* prefer read-modify-write, but need to get some data */
- if (conf->mddev->queue)
- blk_add_trace_msg(conf->mddev->queue,
- "raid5 rmw %llu %d",
- (unsigned long long)sh->sector, rmw);
+ mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d",
+ sh->sector, rmw);
+
for (i = disks; i--; ) {
struct r5dev *dev = &sh->dev[i];
if (test_bit(R5_InJournal, &dev->flags) &&
@@ -4285,10 +4274,11 @@ static int handle_stripe_dirtying(struct r5conf *conf,
set_bit(STRIPE_DELAYED, &sh->state);
}
}
- if (rcw && conf->mddev->queue)
- blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d",
- (unsigned long long)sh->sector,
- rcw, qread, test_bit(STRIPE_DELAYED, &sh->state));
+ if (rcw && !mddev_is_dm(conf->mddev))
+ blk_add_trace_msg(conf->mddev->gendisk->queue,
+ "raid5 rcw %llu %d %d %d",
+ (unsigned long long)sh->sector, rcw, qread,
+ test_bit(STRIPE_DELAYED, &sh->state));
}
if (rcw > disks && rmw > disks &&
@@ -4674,8 +4664,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
/* Now to look around and see what can be done */
for (i=disks; i--; ) {
struct md_rdev *rdev;
- sector_t first_bad;
- int bad_sectors;
int is_bad = 0;
dev = &sh->dev[i];
@@ -4719,8 +4707,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
rdev = conf->disks[i].replacement;
if (rdev && !test_bit(Faulty, &rdev->flags) &&
rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) &&
- !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors))
+ !rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf)))
set_bit(R5_ReadRepl, &dev->flags);
else {
if (rdev && !test_bit(Faulty, &rdev->flags))
@@ -4733,8 +4721,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s)
if (rdev && test_bit(Faulty, &rdev->flags))
rdev = NULL;
if (rdev) {
- is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf),
- &first_bad, &bad_sectors);
+ is_bad = rdev_has_badblock(rdev, sh->sector,
+ RAID5_STRIPE_SECTORS(conf));
if (s->blocked_rdev == NULL
&& (test_bit(Blocked, &rdev->flags)
|| is_bad < 0)) {
@@ -5463,8 +5451,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
struct r5conf *conf = mddev->private;
struct bio *align_bio;
struct md_rdev *rdev;
- sector_t sector, end_sector, first_bad;
- int bad_sectors, dd_idx;
+ sector_t sector, end_sector;
+ int dd_idx;
bool did_inc;
if (!in_chunk_boundary(mddev, raid_bio)) {
@@ -5493,8 +5481,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
atomic_inc(&rdev->nr_pending);
- if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad,
- &bad_sectors)) {
+ if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) {
rdev_dec_pending(rdev, mddev);
return 0;
}
@@ -5530,9 +5517,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio)
spin_unlock_irq(&conf->device_lock);
}
- if (mddev->gendisk)
- trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk),
- raid_bio->bi_iter.bi_sector);
+ mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector);
submit_bio_noacct(align_bio);
return 1;
}
@@ -5701,8 +5686,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule)
}
release_inactive_stripe_list(conf, cb->temp_inactive_list,
NR_STRIPE_HASH_LOCKS);
- if (mddev->queue)
- trace_block_unplug(mddev->queue, cnt, !from_schedule);
+ if (!mddev_is_dm(mddev))
+ trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule);
kfree(cb);
}
@@ -5946,7 +5931,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
if (ahead_of_reshape(mddev, logical_sector,
conf->reshape_safe)) {
spin_unlock_irq(&conf->device_lock);
- return STRIPE_SCHEDULE_AND_RETRY;
+ ret = STRIPE_SCHEDULE_AND_RETRY;
+ goto out;
}
}
spin_unlock_irq(&conf->device_lock);
@@ -6025,6 +6011,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev,
out_release:
raid5_release_stripe(sh);
+out:
+ if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) {
+ bi->bi_status = BLK_STS_RESOURCE;
+ ret = STRIPE_WAIT_RESHAPE;
+ pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress");
+ }
return ret;
}
@@ -6146,7 +6138,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
while (1) {
res = make_stripe_request(mddev, conf, &ctx, logical_sector,
bi);
- if (res == STRIPE_FAIL)
+ if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE)
break;
if (res == STRIPE_RETRY)
@@ -6184,6 +6176,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi)
if (rw == WRITE)
md_write_end(mddev);
+ if (res == STRIPE_WAIT_RESHAPE) {
+ md_free_cloned_bio(bi);
+ return false;
+ }
+
bio_endio(bi);
return true;
}
@@ -6773,7 +6770,18 @@ static void raid5d(struct md_thread *thread)
spin_unlock_irq(&conf->device_lock);
md_check_recovery(mddev);
spin_lock_irq(&conf->device_lock);
+
+ /*
+ * Waiting on MD_SB_CHANGE_PENDING below may deadlock, because
+ * md_check_recovery() is needed to clear the flag when mdmon
+ * is in use.
+ */
+ continue;
}
+
+ wait_event_lock_irq(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags),
+ conf->device_lock);
}
pr_debug("%d stripes handled\n", handled);
@@ -6820,7 +6828,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
if (size <= 16 || size > 32768)
return -EINVAL;
- conf->min_nr_stripes = size;
+ WRITE_ONCE(conf->min_nr_stripes, size);
mutex_lock(&conf->cache_size_mutex);
while (size < conf->max_nr_stripes &&
drop_one_stripe(conf))
@@ -6832,7 +6840,7 @@ raid5_set_cache_size(struct mddev *mddev, int size)
mutex_lock(&conf->cache_size_mutex);
while (size > conf->max_nr_stripes)
if (!grow_one_stripe(conf, GFP_KERNEL)) {
- conf->min_nr_stripes = conf->max_nr_stripes;
+ WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes);
result = -ENOMEM;
break;
}
@@ -6967,10 +6975,8 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len)
pr_debug("md/raid: change stripe_size from %lu to %lu\n",
conf->stripe_size, new);
- if (mddev->sync_thread ||
- test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
- mddev->reshape_position != MaxSector ||
- mddev->sysfs_active) {
+ if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
+ mddev->reshape_position != MaxSector || mddev->sysfs_active) {
err = -EBUSY;
goto out_unlock;
}
@@ -7084,7 +7090,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len)
if (!conf)
err = -ENODEV;
else if (new != conf->skip_copy) {
- struct request_queue *q = mddev->queue;
+ struct request_queue *q = mddev->gendisk->queue;
conf->skip_copy = new;
if (new)
@@ -7390,11 +7396,13 @@ static unsigned long raid5_cache_count(struct shrinker *shrink,
struct shrink_control *sc)
{
struct r5conf *conf = shrink->private_data;
+ int max_stripes = READ_ONCE(conf->max_nr_stripes);
+ int min_stripes = READ_ONCE(conf->min_nr_stripes);
- if (conf->max_nr_stripes < conf->min_nr_stripes)
+ if (max_stripes < min_stripes)
/* unlikely, but not impossible */
return 0;
- return conf->max_nr_stripes - conf->min_nr_stripes;
+ return max_stripes - min_stripes;
}
static struct r5conf *setup_conf(struct mddev *mddev)
@@ -7684,10 +7692,65 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded
return 0;
}
-static void raid5_set_io_opt(struct r5conf *conf)
+static int raid5_set_limits(struct mddev *mddev)
{
- blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) *
- (conf->raid_disks - conf->max_degraded));
+ struct r5conf *conf = mddev->private;
+ struct queue_limits lim;
+ int data_disks, stripe;
+ struct md_rdev *rdev;
+
+ /*
+ * The read-ahead size must cover two whole stripes, which is
+ * 2 * (number of data disks) * chunksize.
+ */
+ data_disks = conf->previous_raid_disks - conf->max_degraded;
+
+ /*
+ * We can only discard a whole stripe. It doesn't make sense to
+ * discard data disk but write parity disk
+ */
+ stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9));
+
+ blk_set_stacking_limits(&lim);
+ lim.io_min = mddev->chunk_sectors << 9;
+ lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded);
+ lim.raid_partial_stripes_expensive = 1;
+ lim.discard_granularity = stripe;
+ lim.max_write_zeroes_sectors = 0;
+ mddev_stack_rdev_limits(mddev, &lim);
+ rdev_for_each(rdev, mddev)
+ queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset,
+ mddev->gendisk->disk_name);
+
+ /*
+ * Zeroing is required for discard, otherwise data could be lost.
+ *
+ * Consider a scenario: discard a stripe (the stripe could be
+ * inconsistent if discard_zeroes_data is 0); write one disk of the
+ * stripe (the stripe could be inconsistent again depending on which
+ * disks are used to calculate parity); the disk breaks; the stripe
+ * data of this disk is lost.
+ *
+ * We only allow DISCARD if the sysadmin has confirmed that only safe
+ * devices are in use by setting a module parameter. A better idea
+ * might be to turn DISCARD into WRITE_ZEROES requests, as that is
+ * required to be safe.
+ */
+ if (!devices_handle_discard_safely ||
+ lim.max_discard_sectors < (stripe >> 9) ||
+ lim.discard_granularity < stripe)
+ lim.max_hw_discard_sectors = 0;
+
+ /*
+ * Requests require having a bitmap for each stripe.
+ * Limit the max sectors based on this.
+ */
+ lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf);
+
+ /* No restrictions on the number of segments in the request */
+ lim.max_segments = USHRT_MAX;
+
+ return queue_limits_set(mddev->gendisk->queue, &lim);
}
static int raid5_run(struct mddev *mddev)
@@ -7700,6 +7763,7 @@ static int raid5_run(struct mddev *mddev)
int i;
long long min_offset_diff = 0;
int first = 1;
+ int ret = -EIO;
if (mddev->recovery_cp != MaxSector)
pr_notice("md/raid:%s: not clean -- starting background reconstruction\n",
@@ -7948,66 +8012,10 @@ static int raid5_run(struct mddev *mddev)
mdname(mddev));
md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
- if (mddev->queue) {
- int chunk_size;
- /* read-ahead size must cover two whole stripes, which
- * is 2 * (datadisks) * chunksize where 'n' is the
- * number of raid devices
- */
- int data_disks = conf->previous_raid_disks - conf->max_degraded;
- int stripe = data_disks *
- ((mddev->chunk_sectors << 9) / PAGE_SIZE);
-
- chunk_size = mddev->chunk_sectors << 9;
- blk_queue_io_min(mddev->queue, chunk_size);
- raid5_set_io_opt(conf);
- mddev->queue->limits.raid_partial_stripes_expensive = 1;
- /*
- * We can only discard a whole stripe. It doesn't make sense to
- * discard data disk but write parity disk
- */
- stripe = stripe * PAGE_SIZE;
- stripe = roundup_pow_of_two(stripe);
- mddev->queue->limits.discard_granularity = stripe;
-
- blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
-
- rdev_for_each(rdev, mddev) {
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->data_offset << 9);
- disk_stack_limits(mddev->gendisk, rdev->bdev,
- rdev->new_data_offset << 9);
- }
-
- /*
- * zeroing is required, otherwise data
- * could be lost. Consider a scenario: discard a stripe
- * (the stripe could be inconsistent if
- * discard_zeroes_data is 0); write one disk of the
- * stripe (the stripe could be inconsistent again
- * depending on which disks are used to calculate
- * parity); the disk is broken; The stripe data of this
- * disk is lost.
- *
- * We only allow DISCARD if the sysadmin has confirmed that
- * only safe devices are in use by setting a module parameter.
- * A better idea might be to turn DISCARD into WRITE_ZEROES
- * requests, as that is required to be safe.
- */
- if (!devices_handle_discard_safely ||
- mddev->queue->limits.max_discard_sectors < (stripe >> 9) ||
- mddev->queue->limits.discard_granularity < stripe)
- blk_queue_max_discard_sectors(mddev->queue, 0);
-
- /*
- * Requests require having a bitmap for each stripe.
- * Limit the max sectors based on this.
- */
- blk_queue_max_hw_sectors(mddev->queue,
- RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf));
-
- /* No restrictions on the number of segments in the request */
- blk_queue_max_segments(mddev->queue, USHRT_MAX);
+ if (!mddev_is_dm(mddev)) {
+ ret = raid5_set_limits(mddev);
+ if (ret)
+ goto abort;
}
if (log_init(conf, journal_dev, raid5_has_ppl(conf)))
@@ -8020,7 +8028,7 @@ abort:
free_conf(conf);
mddev->private = NULL;
pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev));
- return -EIO;
+ return ret;
}
static void raid5_free(struct mddev *mddev, void *priv)
@@ -8531,8 +8539,8 @@ static void end_reshape(struct r5conf *conf)
spin_unlock_irq(&conf->device_lock);
wake_up(&conf->wait_for_overlap);
- if (conf->mddev->queue)
- raid5_set_io_opt(conf);
+ mddev_update_io_opt(conf->mddev,
+ conf->raid_disks - conf->max_degraded);
}
}
@@ -8909,6 +8917,18 @@ static int raid5_start(struct mddev *mddev)
return r5l_start(conf->log);
}
+/*
+ * This is only used for dm-raid456, where the caller has already frozen
+ * sync_thread. Hence, if reshape is still in progress, IO waiting on the
+ * reshape can never complete now, so wake up and handle that IO.
+ */
+static void raid5_prepare_suspend(struct mddev *mddev)
+{
+ struct r5conf *conf = mddev->private;
+
+ wake_up(&conf->wait_for_overlap);
+}
+
static struct md_personality raid6_personality =
{
.name = "raid6",
@@ -8932,6 +8952,7 @@ static struct md_personality raid6_personality =
.quiesce = raid5_quiesce,
.takeover = raid6_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
+ .prepare_suspend = raid5_prepare_suspend,
};
static struct md_personality raid5_personality =
{
@@ -8956,6 +8977,7 @@ static struct md_personality raid5_personality =
.quiesce = raid5_quiesce,
.takeover = raid5_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
+ .prepare_suspend = raid5_prepare_suspend,
};
static struct md_personality raid4_personality =
@@ -8981,6 +9003,7 @@ static struct md_personality raid4_personality =
.quiesce = raid5_quiesce,
.takeover = raid4_takeover,
.change_consistency_policy = raid5_change_consistency_policy,
+ .prepare_suspend = raid5_prepare_suspend,
};
static int __init raid5_init(void)
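Among the raid5.c changes, the WRITE_ONCE()/READ_ONCE() conversions deserve a note: they let raid5_cache_count() run without taking conf->cache_size_mutex, while writers stay serialized by that mutex. A reduced sketch of the pairing, using the same field names as the hunks above:

/* Writer side (grow_one_stripe()/drop_one_stripe()), called with
 * cache_size_mutex held: */
WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1);

/* Lockless reader (raid5_cache_count()): snapshot each bound once so
 * the subtraction can't see two different values of the same field,
 * and tolerate a transient max < min instead of returning garbage. */
int max_stripes = READ_ONCE(conf->max_nr_stripes);
int min_stripes = READ_ONCE(conf->min_nr_stripes);

return max_stripes < min_stripes ? 0 : max_stripes - min_stripes;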
diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c
index 04115cd92433..47a314a4eb6f 100644
--- a/drivers/memstick/core/ms_block.c
+++ b/drivers/memstick/core/ms_block.c
@@ -2078,6 +2078,12 @@ static const struct blk_mq_ops msb_mq_ops = {
static int msb_init_disk(struct memstick_dev *card)
{
struct msb_data *msb = memstick_get_drvdata(card);
+ struct queue_limits lim = {
+ .logical_block_size = msb->page_size,
+ .max_hw_sectors = MS_BLOCK_MAX_PAGES,
+ .max_segments = MS_BLOCK_MAX_SEGS,
+ .max_segment_size = MS_BLOCK_MAX_PAGES * msb->page_size,
+ };
int rc;
unsigned long capacity;
@@ -2093,19 +2099,13 @@ static int msb_init_disk(struct memstick_dev *card)
if (rc)
goto out_release_id;
- msb->disk = blk_mq_alloc_disk(&msb->tag_set, card);
+ msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card);
if (IS_ERR(msb->disk)) {
rc = PTR_ERR(msb->disk);
goto out_free_tag_set;
}
msb->queue = msb->disk->queue;
- blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES);
- blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS);
- blk_queue_max_segment_size(msb->queue,
- MS_BLOCK_MAX_PAGES * msb->page_size);
- blk_queue_logical_block_size(msb->queue, msb->page_size);
-
sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id);
msb->disk->fops = &msb_bdops;
msb->disk->private_data = msb;
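The memstick conversion shows the shape of the new allocation-time limits API that recurs through the rest of this diff: the driver fills a struct queue_limits on the stack and hands it to blk_mq_alloc_disk() instead of calling blk_queue_* setters on the live queue afterwards. A minimal sketch for a hypothetical driver (foo_dev, FOO_MAX_SECTORS and FOO_MAX_SEGS are illustrative names):

static int foo_init_disk(struct foo_dev *foo)
{
	struct queue_limits lim = {
		.logical_block_size	= foo->sector_size,
		.max_hw_sectors		= FOO_MAX_SECTORS,
		.max_segments		= FOO_MAX_SEGS,
	};
	struct gendisk *disk;

	/* Limits are validated and applied before the disk goes live. */
	disk = blk_mq_alloc_disk(&foo->tag_set, &lim, foo);
	if (IS_ERR(disk))
		return PTR_ERR(disk);
	foo->disk = disk;
	return 0;
}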
diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c
index 5a69ed33999b..49accfdc89d6 100644
--- a/drivers/memstick/core/mspro_block.c
+++ b/drivers/memstick/core/mspro_block.c
@@ -1103,6 +1103,12 @@ static const struct blk_mq_ops mspro_mq_ops = {
static int mspro_block_init_disk(struct memstick_dev *card)
{
struct mspro_block_data *msb = memstick_get_drvdata(card);
+ struct queue_limits lim = {
+ .logical_block_size = msb->page_size,
+ .max_hw_sectors = MSPRO_BLOCK_MAX_PAGES,
+ .max_segments = MSPRO_BLOCK_MAX_SEGS,
+ .max_segment_size = MSPRO_BLOCK_MAX_PAGES * msb->page_size,
+ };
struct mspro_devinfo *dev_info = NULL;
struct mspro_sys_info *sys_info = NULL;
struct mspro_sys_attr *s_attr = NULL;
@@ -1138,18 +1144,13 @@ static int mspro_block_init_disk(struct memstick_dev *card)
if (rc)
goto out_release_id;
- msb->disk = blk_mq_alloc_disk(&msb->tag_set, card);
+ msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card);
if (IS_ERR(msb->disk)) {
rc = PTR_ERR(msb->disk);
goto out_free_tag_set;
}
msb->queue = msb->disk->queue;
- blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES);
- blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS);
- blk_queue_max_segment_size(msb->queue,
- MSPRO_BLOCK_MAX_PAGES * msb->page_size);
-
msb->disk->major = major;
msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT;
msb->disk->minors = 1 << MSPRO_BLOCK_PART_SHIFT;
@@ -1158,8 +1159,6 @@ static int mspro_block_init_disk(struct memstick_dev *card)
sprintf(msb->disk->disk_name, "mspblk%d", disk_id);
- blk_queue_logical_block_size(msb->queue, msb->page_size);
-
capacity = be16_to_cpu(sys_info->user_block_count);
capacity *= be16_to_cpu(sys_info->block_size);
capacity *= msb->page_size >> 9;
diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c
index a0a2412f62a7..2ae60d208cdf 100644
--- a/drivers/mmc/core/queue.c
+++ b/drivers/mmc/core/queue.c
@@ -174,8 +174,8 @@ static struct scatterlist *mmc_alloc_sg(unsigned short sg_len, gfp_t gfp)
return sg;
}
-static void mmc_queue_setup_discard(struct request_queue *q,
- struct mmc_card *card)
+static void mmc_queue_setup_discard(struct mmc_card *card,
+ struct queue_limits *lim)
{
unsigned max_discard;
@@ -183,15 +183,17 @@ static void mmc_queue_setup_discard(struct request_queue *q,
if (!max_discard)
return;
- blk_queue_max_discard_sectors(q, max_discard);
- q->limits.discard_granularity = card->pref_erase << 9;
- /* granularity must not be greater than max. discard */
- if (card->pref_erase > max_discard)
- q->limits.discard_granularity = SECTOR_SIZE;
+ lim->max_hw_discard_sectors = max_discard;
if (mmc_can_secure_erase_trim(card))
- blk_queue_max_secure_erase_sectors(q, max_discard);
+ lim->max_secure_erase_sectors = max_discard;
if (mmc_can_trim(card) && card->erased_byte == 0)
- blk_queue_max_write_zeroes_sectors(q, max_discard);
+ lim->max_write_zeroes_sectors = max_discard;
+
+ /* granularity must not be greater than max. discard */
+ if (card->pref_erase > max_discard)
+ lim->discard_granularity = SECTOR_SIZE;
+ else
+ lim->discard_granularity = card->pref_erase << 9;
}
static unsigned short mmc_get_max_segments(struct mmc_host *host)
@@ -341,40 +343,53 @@ static const struct blk_mq_ops mmc_mq_ops = {
.timeout = mmc_mq_timed_out,
};
-static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
+static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq,
+ struct mmc_card *card)
{
struct mmc_host *host = card->host;
- unsigned block_size = 512;
+ struct queue_limits lim = { };
+ struct gendisk *disk;
- blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue);
- blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue);
if (mmc_can_erase(card))
- mmc_queue_setup_discard(mq->queue, card);
+ mmc_queue_setup_discard(card, &lim);
if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask)
- blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_HIGH);
- blk_queue_max_hw_sectors(mq->queue,
- min(host->max_blk_count, host->max_req_size / 512));
- if (host->can_dma_map_merge)
- WARN(!blk_queue_can_use_dma_map_merging(mq->queue,
- mmc_dev(host)),
- "merging was advertised but not possible");
- blk_queue_max_segments(mq->queue, mmc_get_max_segments(host));
-
- if (mmc_card_mmc(card) && card->ext_csd.data_sector_size) {
- block_size = card->ext_csd.data_sector_size;
- WARN_ON(block_size != 512 && block_size != 4096);
- }
+ lim.bounce = BLK_BOUNCE_HIGH;
+
+ lim.max_hw_sectors = min(host->max_blk_count, host->max_req_size / 512);
+
+ if (mmc_card_mmc(card) && card->ext_csd.data_sector_size)
+ lim.logical_block_size = card->ext_csd.data_sector_size;
+ else
+ lim.logical_block_size = 512;
+
+ WARN_ON_ONCE(lim.logical_block_size != 512 &&
+ lim.logical_block_size != 4096);
- blk_queue_logical_block_size(mq->queue, block_size);
/*
- * After blk_queue_can_use_dma_map_merging() was called with succeed,
- * since it calls blk_queue_virt_boundary(), the mmc should not call
- * both blk_queue_max_segment_size().
+ * Setting a virt_boundary implicitly sets a max_segment_size, so try
+ * to set the hardware one here.
*/
- if (!host->can_dma_map_merge)
- blk_queue_max_segment_size(mq->queue,
- round_down(host->max_seg_size, block_size));
+ if (host->can_dma_map_merge) {
+ lim.virt_boundary_mask = dma_get_merge_boundary(mmc_dev(host));
+ lim.max_segments = MMC_DMA_MAP_MERGE_SEGMENTS;
+ } else {
+ lim.max_segment_size =
+ round_down(host->max_seg_size, lim.logical_block_size);
+ lim.max_segments = host->max_segs;
+ }
+
+ disk = blk_mq_alloc_disk(&mq->tag_set, &lim, mq);
+ if (IS_ERR(disk))
+ return disk;
+ mq->queue = disk->queue;
+
+ if (mmc_host_is_spi(host) && host->use_spi_crc)
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue);
+ blk_queue_rq_timeout(mq->queue, 60 * HZ);
+
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue);
dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue));
@@ -386,6 +401,7 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card)
init_waitqueue_head(&mq->wait);
mmc_crypto_setup_queue(mq->queue, host);
+ return disk;
}
static inline bool mmc_merge_capable(struct mmc_host *host)
@@ -447,18 +463,9 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card)
return ERR_PTR(ret);
- disk = blk_mq_alloc_disk(&mq->tag_set, mq);
- if (IS_ERR(disk)) {
+ disk = mmc_alloc_disk(mq, card);
+ if (IS_ERR(disk))
blk_mq_free_tag_set(&mq->tag_set);
- return disk;
- }
- mq->queue = disk->queue;
-
- if (mmc_host_is_spi(host) && host->use_spi_crc)
- blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue);
- blk_queue_rq_timeout(mq->queue, 60 * HZ);
-
- mmc_setup_queue(mq, card);
return disk;
}
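The mmc rework also encodes the rule stated in the comment above: a virt_boundary_mask implies an effectively unlimited max_segment_size, so a driver fills one or the other, never both. Condensed from the hunk, with the same host fields:

if (host->can_dma_map_merge) {
	/* IOMMU merging: the boundary dictates segmenting, so no
	 * explicit max_segment_size is set. */
	lim.virt_boundary_mask = dma_get_merge_boundary(mmc_dev(host));
	lim.max_segments = MMC_DMA_MAP_MERGE_SEGMENTS;
} else {
	/* No boundary: cap each segment at the controller limit,
	 * rounded down to a whole logical block. */
	lim.max_segment_size =
		round_down(host->max_seg_size, lim.logical_block_size);
	lim.max_segments = host->max_segs;
}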
diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c
index f0526dcc2162..3caa0717d46c 100644
--- a/drivers/mtd/mtd_blkdevs.c
+++ b/drivers/mtd/mtd_blkdevs.c
@@ -277,6 +277,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
{
struct mtd_blktrans_ops *tr = new->tr;
struct mtd_blktrans_dev *d;
+ struct queue_limits lim = { };
int last_devnum = -1;
struct gendisk *gd;
int ret;
@@ -331,9 +332,13 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING);
if (ret)
goto out_kfree_tag_set;
+
+ lim.logical_block_size = tr->blksize;
+ if (tr->discard)
+ lim.max_hw_discard_sectors = UINT_MAX;
/* Create gendisk */
- gd = blk_mq_alloc_disk(new->tag_set, new);
+ gd = blk_mq_alloc_disk(new->tag_set, &lim, new);
if (IS_ERR(gd)) {
ret = PTR_ERR(gd);
goto out_free_tag_set;
@@ -371,14 +376,9 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new)
if (tr->flush)
blk_queue_write_cache(new->rq, true, false);
- blk_queue_logical_block_size(new->rq, tr->blksize);
-
blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq);
- if (tr->discard)
- blk_queue_max_discard_sectors(new->rq, UINT_MAX);
-
gd->queue = new->rq;
if (new->readonly)
diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c
index 654bd7372cd8..5c8fdcc088a0 100644
--- a/drivers/mtd/ubi/block.c
+++ b/drivers/mtd/ubi/block.c
@@ -348,6 +348,9 @@ static int calc_disk_capacity(struct ubi_volume_info *vi, u64 *disk_capacity)
int ubiblock_create(struct ubi_volume_info *vi)
{
+ struct queue_limits lim = {
+ .max_segments = UBI_MAX_SG_COUNT,
+ };
struct ubiblock *dev;
struct gendisk *gd;
u64 disk_capacity;
@@ -393,7 +396,7 @@ int ubiblock_create(struct ubi_volume_info *vi)
/* Initialize the gendisk of this ubiblock device */
- gd = blk_mq_alloc_disk(&dev->tag_set, dev);
+ gd = blk_mq_alloc_disk(&dev->tag_set, &lim, dev);
if (IS_ERR(gd)) {
ret = PTR_ERR(gd);
goto out_free_tags;
@@ -416,7 +419,6 @@ int ubiblock_create(struct ubi_volume_info *vi)
dev->gd = gd;
dev->rq = gd->queue;
- blk_queue_max_segments(dev->rq, UBI_MAX_SG_COUNT);
list_add_tail(&dev->list, &ubiblock_devices);
diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c
index bb3726b622ad..4d0c527e8576 100644
--- a/drivers/nvdimm/btt.c
+++ b/drivers/nvdimm/btt.c
@@ -1496,19 +1496,21 @@ static int btt_blk_init(struct btt *btt)
{
struct nd_btt *nd_btt = btt->nd_btt;
struct nd_namespace_common *ndns = nd_btt->ndns;
- int rc = -ENOMEM;
+ struct queue_limits lim = {
+ .logical_block_size = btt->sector_size,
+ .max_hw_sectors = UINT_MAX,
+ };
+ int rc;
- btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!btt->btt_disk)
- return -ENOMEM;
+ btt->btt_disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(btt->btt_disk))
+ return PTR_ERR(btt->btt_disk);
nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name);
btt->btt_disk->first_minor = 0;
btt->btt_disk->fops = &btt_fops;
btt->btt_disk->private_data = btt;
- blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size);
- blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue);
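Note the calling-convention change that rides along with the limits argument: blk_alloc_disk() now returns an ERR_PTR() rather than NULL on failure, so callers such as btt_blk_init() switch from a null check to IS_ERR()/PTR_ERR():

disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
if (IS_ERR(disk))		/* was: if (!disk) return -ENOMEM; */
	return PTR_ERR(disk);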
diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c
index 4e8fdcb3f1c8..8dcc10b6db5b 100644
--- a/drivers/nvdimm/pmem.c
+++ b/drivers/nvdimm/pmem.c
@@ -451,6 +451,11 @@ static int pmem_attach_disk(struct device *dev,
{
struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev);
struct nd_region *nd_region = to_nd_region(dev->parent);
+ struct queue_limits lim = {
+ .logical_block_size = pmem_sector_size(ndns),
+ .physical_block_size = PAGE_SIZE,
+ .max_hw_sectors = UINT_MAX,
+ };
int nid = dev_to_node(dev), fua;
struct resource *res = &nsio->res;
struct range bb_range;
@@ -497,9 +502,9 @@ static int pmem_attach_disk(struct device *dev,
return -EBUSY;
}
- disk = blk_alloc_disk(nid);
- if (!disk)
- return -ENOMEM;
+ disk = blk_alloc_disk(&lim, nid);
+ if (IS_ERR(disk))
+ return PTR_ERR(disk);
q = disk->queue;
pmem->disk = disk;
@@ -539,9 +544,6 @@ static int pmem_attach_disk(struct device *dev,
pmem->virt_addr = addr;
blk_queue_write_cache(q, true, fua);
- blk_queue_physical_block_size(q, PAGE_SIZE);
- blk_queue_logical_block_size(q, pmem_sector_size(ndns));
- blk_queue_max_hw_sectors(q, UINT_MAX);
blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q);
if (pmem->pfn_flags & PFN_MAP)
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index c727cd1f264b..a480cdeac288 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -1516,7 +1516,7 @@ static int apple_nvme_probe(struct platform_device *pdev)
goto put_dev;
}
- anv->ctrl.admin_q = blk_mq_init_queue(&anv->admin_tagset);
+ anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL);
if (IS_ERR(anv->ctrl.admin_q)) {
ret = -ENOMEM;
goto put_dev;
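blk_mq_init_queue() is gone; disk-less queues are now created with blk_mq_alloc_queue(set, lim, queuedata), where a NULL lim means default limits and a NULL queuedata is allowed, as in the apple.c hunk above. A sketch of the two call shapes used in this series:

/* Default limits, no queuedata (admin/fabrics/connect queues): */
q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(q))
	return PTR_ERR(q);

/* Explicit limits, filled in by the caller beforehand: */
struct queue_limits lim = { };
q = blk_mq_alloc_queue(set, &lim, NULL);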
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 0a96362912ce..00864a634470 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -114,12 +114,21 @@ static DEFINE_MUTEX(nvme_subsystems_lock);
static DEFINE_IDA(nvme_instance_ida);
static dev_t nvme_ctrl_base_chr_devt;
-static struct class *nvme_class;
-static struct class *nvme_subsys_class;
+static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env);
+static const struct class nvme_class = {
+ .name = "nvme",
+ .dev_uevent = nvme_class_uevent,
+};
+
+static const struct class nvme_subsys_class = {
+ .name = "nvme-subsystem",
+};
static DEFINE_IDA(nvme_ns_chr_minor_ida);
static dev_t nvme_ns_chr_devt;
-static struct class *nvme_ns_chr_class;
+static const struct class nvme_ns_chr_class = {
+ .name = "nvme-generic",
+};
static void nvme_put_subsystem(struct nvme_subsystem *subsys);
static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
@@ -1398,8 +1407,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
error = nvme_submit_sync_cmd(dev->admin_q, &c, *id,
sizeof(struct nvme_id_ctrl));
- if (error)
+ if (error) {
kfree(*id);
+ *id = NULL;
+ }
return error;
}
@@ -1528,6 +1539,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid,
if (error) {
dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error);
kfree(*id);
+ *id = NULL;
}
return error;
}
@@ -1727,12 +1739,23 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return 0;
}
-#ifdef CONFIG_BLK_DEV_INTEGRITY
-static void nvme_init_integrity(struct gendisk *disk,
- struct nvme_ns_head *head, u32 max_integrity_segments)
+static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head)
{
struct blk_integrity integrity = { };
+ blk_integrity_unregister(disk);
+
+ if (!head->ms)
+ return true;
+
+ /*
+ * PI can always be supported as we can ask the controller to simply
+ * insert/strip it, which is not possible for other kinds of metadata.
+ */
+ if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) ||
+ !(head->features & NVME_NS_METADATA_SUPPORTED))
+ return nvme_ns_has_pi(head);
+
switch (head->pi_type) {
case NVME_NS_DPS_PI_TYPE3:
switch (head->guard_type) {
@@ -1775,53 +1798,32 @@ static void nvme_init_integrity(struct gendisk *disk,
}
integrity.tuple_size = head->ms;
+ integrity.pi_offset = head->pi_offset;
blk_integrity_register(disk, &integrity);
- blk_queue_max_integrity_segments(disk->queue, max_integrity_segments);
-}
-#else
-static void nvme_init_integrity(struct gendisk *disk,
- struct nvme_ns_head *head, u32 max_integrity_segments)
-{
+ return true;
}
-#endif /* CONFIG_BLK_DEV_INTEGRITY */
-static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk,
- struct nvme_ns_head *head)
+static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim)
{
- struct request_queue *queue = disk->queue;
- u32 max_discard_sectors;
-
- if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) {
- max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl);
- } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) {
- max_discard_sectors = UINT_MAX;
- } else {
- blk_queue_max_discard_sectors(queue, 0);
- return;
- }
+ struct nvme_ctrl *ctrl = ns->ctrl;
BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) <
NVME_DSM_MAX_RANGES);
- /*
- * If discard is already enabled, don't reset queue limits.
- *
- * This works around the fact that the block layer can't cope well with
- * updating the hardware limits when overridden through sysfs. This is
- * harmless because discard limits in NVMe are purely advisory.
- */
- if (queue->limits.max_discard_sectors)
- return;
+ if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX))
+ lim->max_hw_discard_sectors =
+ nvme_lba_to_sect(ns->head, ctrl->dmrsl);
+ else if (ctrl->oncs & NVME_CTRL_ONCS_DSM)
+ lim->max_hw_discard_sectors = UINT_MAX;
+ else
+ lim->max_hw_discard_sectors = 0;
+
+ lim->discard_granularity = lim->logical_block_size;
- blk_queue_max_discard_sectors(queue, max_discard_sectors);
if (ctrl->dmrl)
- blk_queue_max_discard_segments(queue, ctrl->dmrl);
+ lim->max_discard_segments = ctrl->dmrl;
else
- blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES);
- queue->limits.discard_granularity = queue_logical_block_size(queue);
-
- if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
- blk_queue_max_write_zeroes_sectors(queue, UINT_MAX);
+ lim->max_discard_segments = NVME_DSM_MAX_RANGES;
}
static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
@@ -1832,42 +1834,38 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b)
a->csi == b->csi;
}
-static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
- struct nvme_id_ns *id)
+static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid,
+ struct nvme_id_ns_nvm **nvmp)
{
- bool first = id->dps & NVME_NS_DPS_PI_FIRST;
- unsigned lbaf = nvme_lbaf_index(id->flbas);
- struct nvme_command c = { };
+ struct nvme_command c = {
+ .identify.opcode = nvme_admin_identify,
+ .identify.nsid = cpu_to_le32(nsid),
+ .identify.cns = NVME_ID_CNS_CS_NS,
+ .identify.csi = NVME_CSI_NVM,
+ };
struct nvme_id_ns_nvm *nvm;
- int ret = 0;
- u32 elbaf;
-
- head->pi_size = 0;
- head->ms = le16_to_cpu(id->lbaf[lbaf].ms);
- if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
- head->pi_size = sizeof(struct t10_pi_tuple);
- head->guard_type = NVME_NVM_NS_16B_GUARD;
- goto set_pi;
- }
+ int ret;
nvm = kzalloc(sizeof(*nvm), GFP_KERNEL);
if (!nvm)
return -ENOMEM;
- c.identify.opcode = nvme_admin_identify;
- c.identify.nsid = cpu_to_le32(head->ns_id);
- c.identify.cns = NVME_ID_CNS_CS_NS;
- c.identify.csi = NVME_CSI_NVM;
-
ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm));
if (ret)
- goto free_data;
+ kfree(nvm);
+ else
+ *nvmp = nvm;
+ return ret;
+}
- elbaf = le32_to_cpu(nvm->elbaf[lbaf]);
+static void nvme_configure_pi_elbas(struct nvme_ns_head *head,
+ struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm)
+{
+ u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]);
/* no support for storage tag formats right now */
if (nvme_elbaf_sts(elbaf))
- goto free_data;
+ return;
head->guard_type = nvme_elbaf_guard_type(elbaf);
switch (head->guard_type) {
@@ -1880,30 +1878,31 @@ static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head,
default:
break;
}
-
-free_data:
- kfree(nvm);
-set_pi:
- if (head->pi_size && (first || head->ms == head->pi_size))
- head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
- else
- head->pi_type = 0;
-
- return ret;
}
-static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
- struct nvme_ns_head *head, struct nvme_id_ns *id)
+static void nvme_configure_metadata(struct nvme_ctrl *ctrl,
+ struct nvme_ns_head *head, struct nvme_id_ns *id,
+ struct nvme_id_ns_nvm *nvm)
{
- int ret;
-
- ret = nvme_init_ms(ctrl, head, id);
- if (ret)
- return ret;
-
head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS);
+ head->pi_type = 0;
+ head->pi_size = 0;
+ head->pi_offset = 0;
+ head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms);
if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED))
- return 0;
+ return;
+
+ if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) {
+ nvme_configure_pi_elbas(head, id, nvm);
+ } else {
+ head->pi_size = sizeof(struct t10_pi_tuple);
+ head->guard_type = NVME_NVM_NS_16B_GUARD;
+ }
+
+ if (head->pi_size && head->ms >= head->pi_size)
+ head->pi_type = id->dps & NVME_NS_DPS_PI_MASK;
+ if (!(id->dps & NVME_NS_DPS_PI_FIRST))
+ head->pi_offset = head->ms - head->pi_size;
if (ctrl->ops->flags & NVME_F_FABRICS) {
/*
@@ -1912,7 +1911,7 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
* remap the separate metadata buffer from the block layer.
*/
if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT)))
- return 0;
+ return;
head->features |= NVME_NS_EXT_LBAS;
@@ -1939,33 +1938,32 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl,
else
head->features |= NVME_NS_METADATA_SUPPORTED;
}
- return 0;
}
-static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
- struct request_queue *q)
+static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl)
{
- bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT;
-
- if (ctrl->max_hw_sectors) {
- u32 max_segments =
- (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1;
+ return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1;
+}
- max_segments = min_not_zero(max_segments, ctrl->max_segments);
- blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
- blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
- }
- blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1);
- blk_queue_dma_alignment(q, 3);
- blk_queue_write_cache(q, vwc, vwc);
+static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
+ struct queue_limits *lim)
+{
+ lim->max_hw_sectors = ctrl->max_hw_sectors;
+ lim->max_segments = min_t(u32, USHRT_MAX,
+ min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
+ lim->max_integrity_segments = ctrl->max_integrity_segments;
+ lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
+ lim->max_segment_size = UINT_MAX;
+ lim->dma_alignment = 3;
}
-static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
- struct nvme_ns_head *head, struct nvme_id_ns *id)
+static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id,
+ struct queue_limits *lim)
{
- sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze));
+ struct nvme_ns_head *head = ns->head;
u32 bs = 1U << head->lba_shift;
u32 atomic_bs, phys_bs, io_opt = 0;
+ bool valid = true;
/*
* The block layer can't support LBA sizes larger than the page size
@@ -1973,12 +1971,10 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
* allow block I/O.
*/
if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) {
- capacity = 0;
bs = (1 << 9);
+ valid = false;
}
- blk_integrity_unregister(disk);
-
atomic_bs = phys_bs = bs;
if (id->nabo == 0) {
/*
@@ -1989,7 +1985,7 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf)
atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs;
else
- atomic_bs = (1 + ctrl->subsys->awupf) * bs;
+ atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs;
}
if (id->nsfeat & NVME_NS_FEAT_IO_OPT) {
@@ -1999,36 +1995,20 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk,
io_opt = bs * (1 + le16_to_cpu(id->nows));
}
- blk_queue_logical_block_size(disk->queue, bs);
/*
* Linux filesystems assume writing a single physical block is
* an atomic operation. Hence limit the physical block size to the
* value of the Atomic Write Unit Power Fail parameter.
*/
- blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs));
- blk_queue_io_min(disk->queue, phys_bs);
- blk_queue_io_opt(disk->queue, io_opt);
-
- /*
- * Register a metadata profile for PI, or the plain non-integrity NVMe
- * metadata masquerading as Type 0 if supported, otherwise reject block
- * I/O to namespaces with metadata except when the namespace supports
- * PI, as it can strip/insert in that case.
- */
- if (head->ms) {
- if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
- (head->features & NVME_NS_METADATA_SUPPORTED))
- nvme_init_integrity(disk, head,
- ctrl->max_integrity_segments);
- else if (!nvme_ns_has_pi(head))
- capacity = 0;
- }
-
- set_capacity_and_notify(disk, capacity);
-
- nvme_config_discard(ctrl, disk, head);
- blk_queue_max_write_zeroes_sectors(disk->queue,
- ctrl->max_zeroes_sectors);
+ lim->logical_block_size = bs;
+ lim->physical_block_size = min(phys_bs, atomic_bs);
+ lim->io_min = phys_bs;
+ lim->io_opt = io_opt;
+ if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES)
+ lim->max_write_zeroes_sectors = UINT_MAX;
+ else
+ lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors;
+ return valid;
}
static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info)
@@ -2042,7 +2022,8 @@ static inline bool nvme_first_scan(struct gendisk *disk)
return !disk_live(disk);
}
-static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
+static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id,
+ struct queue_limits *lim)
{
struct nvme_ctrl *ctrl = ns->ctrl;
u32 iob;
@@ -2070,38 +2051,36 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id)
return;
}
- blk_queue_chunk_sectors(ns->queue, iob);
+ lim->chunk_sectors = iob;
}
static int nvme_update_ns_info_generic(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
+ struct queue_limits lim;
+ int ret;
+
blk_mq_freeze_queue(ns->disk->queue);
- nvme_set_queue_limits(ns->ctrl, ns->queue);
+ lim = queue_limits_start_update(ns->disk->queue);
+ nvme_set_ctrl_limits(ns->ctrl, &lim);
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
blk_mq_unfreeze_queue(ns->disk->queue);
- if (nvme_ns_head_multipath(ns->head)) {
- blk_mq_freeze_queue(ns->head->disk->queue);
- set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
- blk_stack_limits(&ns->head->disk->queue->limits,
- &ns->queue->limits, 0);
- ns->head->disk->flags |= GENHD_FL_HIDDEN;
- blk_mq_unfreeze_queue(ns->head->disk->queue);
- }
-
/* Hide the block-interface for these devices */
- ns->disk->flags |= GENHD_FL_HIDDEN;
- set_bit(NVME_NS_READY, &ns->flags);
-
- return 0;
+ if (!ret)
+ ret = -ENODEV;
+ return ret;
}
static int nvme_update_ns_info_block(struct nvme_ns *ns,
struct nvme_ns_info *info)
{
+ bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT;
+ struct queue_limits lim;
+ struct nvme_id_ns_nvm *nvm = NULL;
struct nvme_id_ns *id;
+ sector_t capacity;
unsigned lbaf;
int ret;
@@ -2113,30 +2092,52 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
/* namespace not allocated or attached */
info->is_removed = true;
ret = -ENODEV;
- goto error;
+ goto out;
+ }
+
+ if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) {
+ ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm);
+ if (ret < 0)
+ goto out;
}
blk_mq_freeze_queue(ns->disk->queue);
lbaf = nvme_lbaf_index(id->flbas);
ns->head->lba_shift = id->lbaf[lbaf].ds;
ns->head->nuse = le64_to_cpu(id->nuse);
- nvme_set_queue_limits(ns->ctrl, ns->queue);
-
- ret = nvme_configure_metadata(ns->ctrl, ns->head, id);
- if (ret < 0) {
- blk_mq_unfreeze_queue(ns->disk->queue);
- goto out;
- }
- nvme_set_chunk_sectors(ns, id);
- nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id);
+ capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze));
- if (ns->head->ids.csi == NVME_CSI_ZNS) {
- ret = nvme_update_zone_info(ns, lbaf);
+ lim = queue_limits_start_update(ns->disk->queue);
+ nvme_set_ctrl_limits(ns->ctrl, &lim);
+ nvme_configure_metadata(ns->ctrl, ns->head, id, nvm);
+ nvme_set_chunk_sectors(ns, id, &lim);
+ if (!nvme_update_disk_info(ns, id, &lim))
+ capacity = 0;
+ nvme_config_discard(ns, &lim);
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ ns->head->ids.csi == NVME_CSI_ZNS) {
+ ret = nvme_update_zone_info(ns, lbaf, &lim);
if (ret) {
blk_mq_unfreeze_queue(ns->disk->queue);
goto out;
}
}
+ ret = queue_limits_commit_update(ns->disk->queue, &lim);
+ if (ret) {
+ blk_mq_unfreeze_queue(ns->disk->queue);
+ goto out;
+ }
+
+ /*
+ * Register a metadata profile for PI, or the plain non-integrity NVMe
+ * metadata masquerading as Type 0 if supported, otherwise reject block
+ * I/O to namespaces with metadata except when the namespace supports
+ * PI, as it can strip/insert in that case.
+ */
+ if (!nvme_init_integrity(ns->disk, ns->head))
+ capacity = 0;
+
+ set_capacity_and_notify(ns->disk, capacity);
/*
* Only set the DEAC bit if the device guarantees that reads from
@@ -2147,62 +2148,81 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
ns->head->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
+ blk_queue_write_cache(ns->disk->queue, vwc, vwc);
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
if (blk_queue_is_zoned(ns->queue)) {
- ret = nvme_revalidate_zones(ns);
+ ret = blk_revalidate_disk_zones(ns->disk, NULL);
if (ret && !nvme_first_scan(ns->disk))
goto out;
}
- if (nvme_ns_head_multipath(ns->head)) {
- blk_mq_freeze_queue(ns->head->disk->queue);
- nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id);
- set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
- nvme_mpath_revalidate_paths(ns);
- blk_stack_limits(&ns->head->disk->queue->limits,
- &ns->queue->limits, 0);
- disk_update_readahead(ns->head->disk);
- blk_mq_unfreeze_queue(ns->head->disk->queue);
- }
-
ret = 0;
out:
- /*
- * If probing fails due an unsupported feature, hide the block device,
- * but still allow other access.
- */
- if (ret == -ENODEV) {
- ns->disk->flags |= GENHD_FL_HIDDEN;
- set_bit(NVME_NS_READY, &ns->flags);
- ret = 0;
- }
-
-error:
+ kfree(nvm);
kfree(id);
return ret;
}
static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info)
{
+ bool unsupported = false;
+ int ret;
+
switch (info->ids.csi) {
case NVME_CSI_ZNS:
if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) {
dev_info(ns->ctrl->device,
"block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n",
info->nsid);
- return nvme_update_ns_info_generic(ns, info);
+ ret = nvme_update_ns_info_generic(ns, info);
+ break;
}
- return nvme_update_ns_info_block(ns, info);
+ ret = nvme_update_ns_info_block(ns, info);
+ break;
case NVME_CSI_NVM:
- return nvme_update_ns_info_block(ns, info);
+ ret = nvme_update_ns_info_block(ns, info);
+ break;
default:
dev_info(ns->ctrl->device,
"block device for nsid %u not supported (csi %u)\n",
info->nsid, info->ids.csi);
- return nvme_update_ns_info_generic(ns, info);
+ ret = nvme_update_ns_info_generic(ns, info);
+ break;
+ }
+
+ /*
+ * If probing fails due to an unsupported feature, hide the block device,
+ * but still allow other access.
+ */
+ if (ret == -ENODEV) {
+ ns->disk->flags |= GENHD_FL_HIDDEN;
+ set_bit(NVME_NS_READY, &ns->flags);
+ unsupported = true;
+ ret = 0;
}
+
+ if (!ret && nvme_ns_head_multipath(ns->head)) {
+ struct queue_limits lim;
+
+ blk_mq_freeze_queue(ns->head->disk->queue);
+ if (unsupported)
+ ns->head->disk->flags |= GENHD_FL_HIDDEN;
+ else
+ nvme_init_integrity(ns->head->disk, ns->head);
+ set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk));
+ set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info));
+ nvme_mpath_revalidate_paths(ns);
+
+ lim = queue_limits_start_update(ns->head->disk->queue);
+ queue_limits_stack_bdev(&lim, ns->disk->part0, 0,
+ ns->head->disk->disk_name);
+ ret = queue_limits_commit_update(ns->head->disk->queue, &lim);
+ blk_mq_unfreeze_queue(ns->head->disk->queue);
+ }
+
+ return ret;
}
#ifdef CONFIG_BLK_SED_OPAL
@@ -2877,7 +2897,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
subsys->awupf = le16_to_cpu(id->awupf);
nvme_mpath_default_iopolicy(subsys);
- subsys->dev.class = nvme_subsys_class;
+ subsys->dev.class = &nvme_subsys_class;
subsys->dev.release = nvme_release_subsystem;
subsys->dev.groups = nvme_subsys_attrs_groups;
dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance);
@@ -3117,11 +3137,17 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct
return -EINVAL;
}
+ if (!ctrl->maxcmd) {
+ dev_err(ctrl->device, "Maximum outstanding commands is 0\n");
+ return -EINVAL;
+ }
+
return 0;
}
static int nvme_init_identify(struct nvme_ctrl *ctrl)
{
+ struct queue_limits lim;
struct nvme_id_ctrl *id;
u32 max_hw_sectors;
bool prev_apst_enabled;
@@ -3188,7 +3214,12 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl)
ctrl->max_hw_sectors =
min_not_zero(ctrl->max_hw_sectors, max_hw_sectors);
- nvme_set_queue_limits(ctrl, ctrl->admin_q);
+ lim = queue_limits_start_update(ctrl->admin_q);
+ nvme_set_ctrl_limits(ctrl, &lim);
+ ret = queue_limits_commit_update(ctrl->admin_q, &lim);
+ if (ret)
+ goto out_free;
+
ctrl->sgls = le32_to_cpu(id->sgls);
ctrl->kas = le16_to_cpu(id->kas);
ctrl->max_namespaces = le32_to_cpu(id->mnan);
@@ -3420,7 +3451,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device,
if (minor < 0)
return minor;
cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor);
- cdev_device->class = nvme_ns_chr_class;
+ cdev_device->class = &nvme_ns_chr_class;
cdev_device->release = nvme_cdev_rel;
device_initialize(cdev_device);
cdev_init(cdev, fops);
@@ -3692,7 +3723,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
if (!ns)
return;
- disk = blk_mq_alloc_disk(ctrl->tagset, ns);
+ disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns);
if (IS_ERR(disk))
goto out_free_ns;
disk->fops = &nvme_bdev_ops;
@@ -4353,6 +4384,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
const struct blk_mq_ops *ops, unsigned int cmd_size)
{
+ struct queue_limits lim = {};
int ret;
memset(set, 0, sizeof(*set));
@@ -4372,14 +4404,14 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ret)
return ret;
- ctrl->admin_q = blk_mq_init_queue(set);
+ ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL);
if (IS_ERR(ctrl->admin_q)) {
ret = PTR_ERR(ctrl->admin_q);
goto out_free_tagset;
}
if (ctrl->ops->flags & NVME_F_FABRICS) {
- ctrl->fabrics_q = blk_mq_init_queue(set);
+ ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->fabrics_q)) {
ret = PTR_ERR(ctrl->fabrics_q);
goto out_cleanup_admin_q;
@@ -4443,7 +4475,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
return ret;
if (ctrl->ops->flags & NVME_F_FABRICS) {
- ctrl->connect_q = blk_mq_init_queue(set);
+ ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL);
if (IS_ERR(ctrl->connect_q)) {
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
@@ -4613,7 +4645,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->device = &ctrl->ctrl_device;
ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt),
ctrl->instance);
- ctrl->device->class = nvme_class;
+ ctrl->device->class = &nvme_class;
ctrl->device->parent = ctrl->dev;
if (ops->dev_attr_groups)
ctrl->device->groups = ops->dev_attr_groups;
@@ -4846,42 +4878,36 @@ static int __init nvme_core_init(void)
if (result < 0)
goto destroy_delete_wq;
- nvme_class = class_create("nvme");
- if (IS_ERR(nvme_class)) {
- result = PTR_ERR(nvme_class);
+ result = class_register(&nvme_class);
+ if (result)
goto unregister_chrdev;
- }
- nvme_class->dev_uevent = nvme_class_uevent;
- nvme_subsys_class = class_create("nvme-subsystem");
- if (IS_ERR(nvme_subsys_class)) {
- result = PTR_ERR(nvme_subsys_class);
+ result = class_register(&nvme_subsys_class);
+ if (result)
goto destroy_class;
- }
result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS,
"nvme-generic");
if (result < 0)
goto destroy_subsys_class;
- nvme_ns_chr_class = class_create("nvme-generic");
- if (IS_ERR(nvme_ns_chr_class)) {
- result = PTR_ERR(nvme_ns_chr_class);
+ result = class_register(&nvme_ns_chr_class);
+ if (result)
goto unregister_generic_ns;
- }
+
result = nvme_init_auth();
if (result)
goto destroy_ns_chr;
return 0;
destroy_ns_chr:
- class_destroy(nvme_ns_chr_class);
+ class_unregister(&nvme_ns_chr_class);
unregister_generic_ns:
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
- class_destroy(nvme_subsys_class);
+ class_unregister(&nvme_subsys_class);
destroy_class:
- class_destroy(nvme_class);
+ class_unregister(&nvme_class);
unregister_chrdev:
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_delete_wq:
@@ -4897,9 +4923,9 @@ out:
static void __exit nvme_core_exit(void)
{
nvme_exit_auth();
- class_destroy(nvme_ns_chr_class);
- class_destroy(nvme_subsys_class);
- class_destroy(nvme_class);
+ class_unregister(&nvme_ns_chr_class);
+ class_unregister(&nvme_subsys_class);
+ class_unregister(&nvme_class);
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS);
destroy_workqueue(nvme_delete_wq);
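The core.c conversion above is the canonical consumer of the atomic-limits API on a live queue: freeze the queue, take a snapshot with queue_limits_start_update(), mutate the snapshot, then let queue_limits_commit_update() validate and apply everything in one step. Condensed from nvme_update_ns_info_block():

blk_mq_freeze_queue(ns->disk->queue);
lim = queue_limits_start_update(ns->disk->queue);
nvme_set_ctrl_limits(ns->ctrl, &lim);	/* transport limits */
nvme_set_chunk_sectors(ns, id, &lim);	/* striping */
nvme_config_discard(ns, &lim);		/* discard/zeroing */
ret = queue_limits_commit_update(ns->disk->queue, &lim);
blk_mq_unfreeze_queue(ns->disk->queue);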
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 495c171daead..1f0ea1f32d22 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -638,7 +638,7 @@ static struct key *nvmf_parse_key(int key_id)
}
key = key_lookup(key_id);
- if (!IS_ERR(key))
+ if (IS_ERR(key))
pr_err("key id %08x not found\n", key_id);
else
pr_debug("Using key id %08x\n", key_id);
@@ -1319,7 +1319,10 @@ out_free_opts:
return ERR_PTR(ret);
}
-static struct class *nvmf_class;
+static const struct class nvmf_class = {
+ .name = "nvme-fabrics",
+};
+
static struct device *nvmf_device;
static DEFINE_MUTEX(nvmf_dev_mutex);
@@ -1439,15 +1442,14 @@ static int __init nvmf_init(void)
if (!nvmf_default_host)
return -ENOMEM;
- nvmf_class = class_create("nvme-fabrics");
- if (IS_ERR(nvmf_class)) {
+ ret = class_register(&nvmf_class);
+ if (ret) {
pr_err("couldn't register class nvme-fabrics\n");
- ret = PTR_ERR(nvmf_class);
goto out_free_host;
}
nvmf_device =
- device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
+ device_create(&nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl");
if (IS_ERR(nvmf_device)) {
pr_err("couldn't create nvme-fabrics device!\n");
ret = PTR_ERR(nvmf_device);
@@ -1463,9 +1465,9 @@ static int __init nvmf_init(void)
return 0;
out_destroy_device:
- device_destroy(nvmf_class, MKDEV(0, 0));
+ device_destroy(&nvmf_class, MKDEV(0, 0));
out_destroy_class:
- class_destroy(nvmf_class);
+ class_unregister(&nvmf_class);
out_free_host:
nvmf_host_put(nvmf_default_host);
return ret;
@@ -1474,8 +1476,8 @@ out_free_host:
static void __exit nvmf_exit(void)
{
misc_deregister(&nvmf_misc);
- device_destroy(nvmf_class, MKDEV(0, 0));
- class_destroy(nvmf_class);
+ device_destroy(&nvmf_class, MKDEV(0, 0));
+ class_unregister(&nvmf_class);
nvmf_host_put(nvmf_default_host);
BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64);
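The fabrics change follows the same class conversion as core.c: the heap object from class_create() becomes a file-scope const struct class paired with class_register()/class_unregister(), which also turns the IS_ERR() check into a plain error code. Condensed from the hunks above:

static const struct class nvmf_class = {
	.name = "nvme-fabrics",
};

ret = class_register(&nvmf_class);	/* was: nvmf_class = class_create(...) */
if (ret)
	goto out_free_host;
/* ... */
class_unregister(&nvmf_class);		/* was: class_destroy(nvmf_class) */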
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 74de1e64aeea..5397fb428b24 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -516,6 +516,7 @@ static void nvme_requeue_work(struct work_struct *work)
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
{
+ struct queue_limits lim;
bool vwc = false;
mutex_init(&head->lock);
@@ -532,9 +533,14 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
!nvme_is_unique_nsid(ctrl, head) || !multipath)
return 0;
- head->disk = blk_alloc_disk(ctrl->numa_node);
- if (!head->disk)
- return -ENOMEM;
+ blk_set_stacking_limits(&lim);
+ lim.dma_alignment = 3;
+ if (head->ids.csi != NVME_CSI_ZNS)
+ lim.max_zone_append_sectors = 0;
+
+ head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
+ if (IS_ERR(head->disk))
+ return PTR_ERR(head->disk);
head->disk->fops = &nvme_ns_head_ops;
head->disk->private_data = head;
sprintf(head->disk->disk_name, "nvme%dn%d",
@@ -553,11 +559,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues)
blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue);
- /* set to a default value of 512 until the disk is validated */
- blk_queue_logical_block_size(head->disk->queue, 512);
- blk_set_stacking_limits(&head->disk->queue->limits);
- blk_queue_dma_alignment(head->disk->queue, 3);
-
/* we need to propagate up the VWC settings */
if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
vwc = true;
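For the multipath head disk the limits now start from blk_set_stacking_limits(), which initializes the structure to the permissive "inherit from the bottom devices" defaults; the per-head tweaks here and the queue_limits_stack_bdev() call in nvme_update_ns_info() then narrow them. Condensed from the hunk above:

struct queue_limits lim;

blk_set_stacking_limits(&lim);			/* stacking defaults */
lim.dma_alignment = 3;
if (head->ids.csi != NVME_CSI_ZNS)
	lim.max_zone_append_sectors = 0;	/* no zone-append path */

head->disk = blk_alloc_disk(&lim, ctrl->numa_node);
if (IS_ERR(head->disk))
	return PTR_ERR(head->disk);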
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 7b87763e2f8a..24193fcb8bd5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -464,6 +464,7 @@ struct nvme_ns_head {
u16 ms;
u16 pi_size;
u8 pi_type;
+ u8 pi_offset;
u8 guard_type;
u16 sgs;
u32 sws;
@@ -1035,11 +1036,11 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk)
}
#endif /* CONFIG_NVME_MULTIPATH */
-int nvme_revalidate_zones(struct nvme_ns *ns);
int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
+int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf,
+ struct queue_limits *lim);
#ifdef CONFIG_BLK_DEV_ZONED
-int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf);
blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req,
struct nvme_command *cmnd,
enum nvme_zone_mgmt_action action);
@@ -1050,13 +1051,6 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns,
{
return BLK_STS_NOTSUPP;
}
-
-static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
-{
- dev_warn(ns->ctrl->device,
- "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n");
- return -EPROTONOSUPPORT;
-}
#endif
static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 20fdd40b1879..366f0bb4ebfc 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1006,6 +1006,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
{
int ret;
bool changed;
+ u16 max_queue_size;
ret = nvme_rdma_configure_admin_queue(ctrl, new);
if (ret)
@@ -1030,11 +1031,16 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1);
}
- if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) {
+ if (ctrl->ctrl.max_integrity_segments)
+ max_queue_size = NVME_RDMA_MAX_METADATA_QUEUE_SIZE;
+ else
+ max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE;
+
+ if (ctrl->ctrl.sqsize + 1 > max_queue_size) {
dev_warn(ctrl->ctrl.device,
- "ctrl sqsize %u > max queue size %u, clamping down\n",
- ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE);
- ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1;
+ "ctrl sqsize %u > max queue size %u, clamping down\n",
+ ctrl->ctrl.sqsize + 1, max_queue_size);
+ ctrl->ctrl.sqsize = max_queue_size - 1;
}
if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) {
diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c
index f2832f70e7e0..09fcaa519e5b 100644
--- a/drivers/nvme/host/sysfs.c
+++ b/drivers/nvme/host/sysfs.c
@@ -221,14 +221,11 @@ static int ns_update_nuse(struct nvme_ns *ns)
ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id);
if (ret)
- goto out_free_id;
+ return ret;
ns->head->nuse = le64_to_cpu(id->nuse);
-
-out_free_id:
kfree(id);
-
- return ret;
+ return 0;
}
static ssize_t nuse_show(struct device *dev, struct device_attribute *attr,
diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c
index 499bbb0eee8d..722384bcc765 100644
--- a/drivers/nvme/host/zns.c
+++ b/drivers/nvme/host/zns.c
@@ -7,16 +7,6 @@
#include <linux/vmalloc.h>
#include "nvme.h"
-int nvme_revalidate_zones(struct nvme_ns *ns)
-{
- struct request_queue *q = ns->queue;
-
- blk_queue_chunk_sectors(q, ns->head->zsze);
- blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append);
-
- return blk_revalidate_disk_zones(ns->disk, NULL);
-}
-
static int nvme_set_max_append(struct nvme_ctrl *ctrl)
{
struct nvme_command c = { };
@@ -45,10 +35,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl)
return 0;
}
-int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
+int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf,
+ struct queue_limits *lim)
{
struct nvme_effects_log *log = ns->head->effects;
- struct request_queue *q = ns->queue;
struct nvme_command c = { };
struct nvme_id_ns_zns *id;
int status;
@@ -109,10 +99,12 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf)
goto free_data;
}
- disk_set_zoned(ns->disk);
- blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
- disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1);
- disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1);
+ blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue);
+ lim->zoned = 1;
+ lim->max_open_zones = le32_to_cpu(id->mor) + 1;
+ lim->max_active_zones = le32_to_cpu(id->mar) + 1;
+ lim->chunk_sectors = ns->head->zsze;
+ lim->max_zone_append_sectors = ns->ctrl->max_zone_append;
free_data:
kfree(id);
return status;
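
nvme_update_zone_info() now fills the caller-supplied struct queue_limits instead of writing to the queue itself, leaving a single commit point to the caller. A hypothetical revalidation flow built on the atomic-update helpers (the function name is made up; queue_limits_cancel_update() drops a started update):

/* Hypothetical flow: collect zone limits, then commit them atomically. */
static int example_revalidate_zns(struct nvme_ns *ns, unsigned int lbaf)
{
	struct queue_limits lim = queue_limits_start_update(ns->queue);
	int ret;

	ret = nvme_update_zone_info(ns, lbaf, &lim);	/* fills lim, no queue writes */
	if (ret) {
		queue_limits_cancel_update(ns->queue);
		return ret;
	}
	return queue_limits_commit_update(ns->queue, &lim);
}
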
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index 39cb570f833d..f5b7054a4a05 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -428,7 +428,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
id->cqes = (0x4 << 4) | 0x4;
/* no enforcement soft-limit for maxcmd - pick arbitrary high value */
- id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
+ id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl));
id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES);
id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES);
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 2482a0db2504..77a6e817b315 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -273,6 +273,32 @@ static ssize_t nvmet_param_inline_data_size_store(struct config_item *item,
CONFIGFS_ATTR(nvmet_, param_inline_data_size);
+static ssize_t nvmet_param_max_queue_size_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_port *port = to_nvmet_port(item);
+
+ return snprintf(page, PAGE_SIZE, "%d\n", port->max_queue_size);
+}
+
+static ssize_t nvmet_param_max_queue_size_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_port *port = to_nvmet_port(item);
+ int ret;
+
+ if (nvmet_is_port_enabled(port, __func__))
+ return -EACCES;
+ ret = kstrtoint(page, 0, &port->max_queue_size);
+ if (ret) {
+ pr_err("Invalid value '%s' for max_queue_size\n", page);
+ return -EINVAL;
+ }
+ return count;
+}
+
+CONFIGFS_ATTR(nvmet_, param_max_queue_size);
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
static ssize_t nvmet_param_pi_enable_show(struct config_item *item,
char *page)
@@ -1859,6 +1885,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = {
&nvmet_attr_addr_trtype,
&nvmet_attr_addr_tsas,
&nvmet_attr_param_inline_data_size,
+ &nvmet_attr_param_max_queue_size,
#ifdef CONFIG_BLK_DEV_INTEGRITY
&nvmet_attr_param_pi_enable,
#endif
@@ -1917,6 +1944,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group,
INIT_LIST_HEAD(&port->subsystems);
INIT_LIST_HEAD(&port->referrals);
port->inline_data_size = -1; /* < 0 == let the transport choose */
+ port->max_queue_size = -1; /* < 0 == let the transport choose */
port->disc_addr.portid = cpu_to_le16(portid);
port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX;
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 8658e9c08534..6bbe4df0166c 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -358,6 +358,18 @@ int nvmet_enable_port(struct nvmet_port *port)
if (port->inline_data_size < 0)
port->inline_data_size = 0;
+ /*
+	 * If the transport didn't set max_queue_size at all, fall back to
+	 * the target default. Otherwise clamp the transport-provided value
+	 * to the target limits.
+ */
+ if (port->max_queue_size < 0)
+ port->max_queue_size = NVMET_MAX_QUEUE_SIZE;
+ else
+ port->max_queue_size = clamp_t(int, port->max_queue_size,
+ NVMET_MIN_QUEUE_SIZE,
+ NVMET_MAX_QUEUE_SIZE);
+
port->enabled = true;
port->tr_ops = ops;
return 0;
@@ -1223,9 +1235,10 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl)
ctrl->cap |= (15ULL << 24);
/* maximum queue entries supported: */
if (ctrl->ops->get_max_queue_size)
- ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1;
+ ctrl->cap |= min_t(u16, ctrl->ops->get_max_queue_size(ctrl),
+ ctrl->port->max_queue_size) - 1;
else
- ctrl->cap |= NVMET_QUEUE_SIZE - 1;
+ ctrl->cap |= ctrl->port->max_queue_size - 1;
if (nvmet_is_passthru_subsys(ctrl->subsys))
nvmet_passthrough_override_cap(ctrl);
@@ -1411,6 +1424,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
kref_init(&ctrl->ref);
ctrl->subsys = subsys;
+ ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support;
nvmet_init_cap(ctrl);
WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL);
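
CAP.MQES is now the smaller of the transport ceiling and the per-port value, minus one because the field is zero-based. A worked example with assumed numbers:

/* Worked example (values assumed): the transport allows 128 entries,
 * the port was configured (and clamped) to 1024. */
u16 transport_max = 128;	/* ctrl->ops->get_max_queue_size(ctrl) */
u16 port_max = 1024;		/* ctrl->port->max_queue_size */
u64 cap = 0;

cap |= min_t(u16, transport_max, port_max) - 1;	/* CAP.MQES = 127 */
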
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c
index 68e82ccc0e4e..ce54da8c6b36 100644
--- a/drivers/nvme/target/discovery.c
+++ b/drivers/nvme/target/discovery.c
@@ -282,7 +282,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req)
id->lpa = (1 << 2);
/* no enforcement soft-limit for maxcmd - pick arbitrary high value */
- id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
+ id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl));
id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */
if (ctrl->ops->flags & NVMF_KEYED_SGLS)
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 9964ffe347d2..b23f4cf840bd 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -157,7 +157,8 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR;
}
- if (sqsize > mqes) {
+	/* for fabrics, this value applies only to the I/O Submission Queues */
+ if (qid && sqsize > mqes) {
pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n",
sqsize, mqes, ctrl->cntlid);
req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
@@ -251,8 +252,6 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req)
if (status)
goto out;
- ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support;
-
uuid_copy(&ctrl->hostid, &d->hostid);
ret = nvmet_setup_auth(ctrl);
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 1471af250ea6..913cd2ec7a6f 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -1556,7 +1556,9 @@ static const struct attribute_group *fcloop_dev_attr_groups[] = {
NULL,
};
-static struct class *fcloop_class;
+static const struct class fcloop_class = {
+ .name = "fcloop",
+};
static struct device *fcloop_device;
@@ -1564,15 +1566,14 @@ static int __init fcloop_init(void)
{
int ret;
- fcloop_class = class_create("fcloop");
- if (IS_ERR(fcloop_class)) {
+ ret = class_register(&fcloop_class);
+ if (ret) {
pr_err("couldn't register class fcloop\n");
- ret = PTR_ERR(fcloop_class);
return ret;
}
fcloop_device = device_create_with_groups(
- fcloop_class, NULL, MKDEV(0, 0), NULL,
+ &fcloop_class, NULL, MKDEV(0, 0), NULL,
fcloop_dev_attr_groups, "ctl");
if (IS_ERR(fcloop_device)) {
pr_err("couldn't create ctl device!\n");
@@ -1585,7 +1586,7 @@ static int __init fcloop_init(void)
return 0;
out_destroy_class:
- class_destroy(fcloop_class);
+ class_unregister(&fcloop_class);
return ret;
}
@@ -1643,8 +1644,8 @@ static void __exit fcloop_exit(void)
put_device(fcloop_device);
- device_destroy(fcloop_class, MKDEV(0, 0));
- class_destroy(fcloop_class);
+ device_destroy(&fcloop_class, MKDEV(0, 0));
+ class_unregister(&fcloop_class);
}
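
The fcloop conversion shows the static-class pattern used throughout this series: the struct class is a const object in the module, registered and unregistered by address, and passed by address to device_create_with_groups()/device_destroy(). A condensed lifecycle sketch with illustrative names:

static const struct class example_class = {
	.name = "example",
};

static int __init example_init(void)
{
	int ret = class_register(&example_class);	/* replaces class_create() */

	if (ret)
		return ret;
	/* devices are created against &example_class from here on */
	return 0;
}

static void __exit example_exit(void)
{
	device_destroy(&example_class, MKDEV(0, 0));
	class_unregister(&example_class);		/* replaces class_destroy() */
}
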
module_init(fcloop_init);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 33e61b4f478b..f460728e1df1 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -163,6 +163,7 @@ struct nvmet_port {
void *priv;
bool enabled;
int inline_data_size;
+ int max_queue_size;
const struct nvmet_fabrics_ops *tr_ops;
bool pi_enable;
};
@@ -543,9 +544,10 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys,
void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type,
u8 event_info, u8 log_page);
-#define NVMET_QUEUE_SIZE 1024
+#define NVMET_MIN_QUEUE_SIZE 16
+#define NVMET_MAX_QUEUE_SIZE 1024
#define NVMET_NR_QUEUES 128
-#define NVMET_MAX_CMD NVMET_QUEUE_SIZE
+#define NVMET_MAX_CMD(ctrl) (NVME_CAP_MQES(ctrl->cap) + 1)
/*
* Nice round number that makes a list of nsids fit into a page.
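
NVMET_MAX_CMD(ctrl) decodes MQES back out of the controller's CAP value, so the advertised MAXCMD follows the negotiated queue size rather than a fixed constant. MQES is zero-based, hence the +1; a one-line sanity check with an assumed CAP value:

/* MQES is zero-based: a CAP with MQES == 1023 means 1024 queue entries. */
u64 cap = 1023;				/* only MQES bits set, for illustration */
u16 maxcmd = NVME_CAP_MQES(cap) + 1;	/* 1024, i.e. NVMET_MAX_CMD(ctrl) */
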
diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c
index f2d963e1fe94..bb4a69d538fd 100644
--- a/drivers/nvme/target/passthru.c
+++ b/drivers/nvme/target/passthru.c
@@ -132,7 +132,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req)
id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes);
id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes);
- id->maxcmd = cpu_to_le16(NVMET_MAX_CMD);
+ id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl));
/* don't support fuse commands */
id->fuses = 0;
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 3a0f2c170f4c..f2bb9d95ecf4 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -1956,6 +1956,14 @@ static int nvmet_rdma_add_port(struct nvmet_port *nport)
nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
}
+ if (nport->max_queue_size < 0) {
+ nport->max_queue_size = NVME_RDMA_DEFAULT_QUEUE_SIZE;
+ } else if (nport->max_queue_size > NVME_RDMA_MAX_QUEUE_SIZE) {
+ pr_warn("max_queue_size %u is too large, reducing to %u\n",
+ nport->max_queue_size, NVME_RDMA_MAX_QUEUE_SIZE);
+ nport->max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE;
+ }
+
ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr,
nport->disc_addr.trsvcid, &port->addr);
if (ret) {
@@ -2015,6 +2023,8 @@ static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl)
static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl)
{
+ if (ctrl->pi_support)
+ return NVME_RDMA_MAX_METADATA_QUEUE_SIZE;
return NVME_RDMA_MAX_QUEUE_SIZE;
}
diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c
index 5b5c1e481722..3148d9f1bde6 100644
--- a/drivers/nvme/target/zns.c
+++ b/drivers/nvme/target/zns.c
@@ -456,8 +456,7 @@ static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req)
switch (zsa_req_op(req->cmd->zms.zsa)) {
case REQ_OP_ZONE_RESET:
ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0,
- get_capacity(req->ns->bdev->bd_disk),
- GFP_KERNEL);
+ get_capacity(req->ns->bdev->bd_disk));
if (ret < 0)
return blkdev_zone_mgmt_errno_to_nvme_status(ret);
break;
@@ -508,7 +507,7 @@ static void nvmet_bdev_zmgmt_send_work(struct work_struct *w)
goto out;
}
- ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors, GFP_KERNEL);
+ ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors);
if (ret < 0)
status = blkdev_zone_mgmt_errno_to_nvme_status(ret);
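
blkdev_zone_mgmt() has dropped its gfp_t argument; per the "noio scope guarding for zoned IO" item in this pull, callers that must avoid I/O recursion wrap the call in a noio allocation scope instead of passing a GFP mask. A sketch of that pattern (the helper name is assumed):

#include <linux/sched/mm.h>

/* Sketch: GFP constraints move from an argument to a noio scope. */
static int example_reset_all_zones(struct block_device *bdev)
{
	unsigned int flags = memalloc_noio_save();
	int ret;

	ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 0,
			       get_capacity(bdev->bd_disk));
	memalloc_noio_restore(flags);
	return ret;
}
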
diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c
index c833a7c7d7b2..cead018c3f06 100644
--- a/drivers/s390/block/dasd.c
+++ b/drivers/s390/block/dasd.c
@@ -8,9 +8,6 @@
* Copyright IBM Corp. 1999, 2009
*/
-#define KMSG_COMPONENT "dasd"
-#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
-
#include <linux/kmod.h>
#include <linux/init.h>
#include <linux/interrupt.h>
@@ -30,9 +27,6 @@
#include <asm/itcw.h>
#include <asm/diag.h>
-/* This is ugly... */
-#define PRINTK_HEADER "dasd:"
-
#include "dasd_int.h"
/*
* SECTION: Constant definitions to be used within this file
@@ -313,39 +307,57 @@ static int dasd_state_basic_to_known(struct dasd_device *device)
*/
static int dasd_state_basic_to_ready(struct dasd_device *device)
{
- int rc;
- struct dasd_block *block;
- struct gendisk *disk;
+ struct dasd_block *block = device->block;
+ struct queue_limits lim;
+ int rc = 0;
- rc = 0;
- block = device->block;
/* make disk known with correct capacity */
- if (block) {
- if (block->base->discipline->do_analysis != NULL)
- rc = block->base->discipline->do_analysis(block);
- if (rc) {
- if (rc != -EAGAIN) {
- device->state = DASD_STATE_UNFMT;
- disk = device->block->gdp;
- kobject_uevent(&disk_to_dev(disk)->kobj,
- KOBJ_CHANGE);
- goto out;
- }
- return rc;
- }
- if (device->discipline->setup_blk_queue)
- device->discipline->setup_blk_queue(block);
- set_capacity(block->gdp,
- block->blocks << block->s2b_shift);
+ if (!block) {
device->state = DASD_STATE_READY;
- rc = dasd_scan_partitions(block);
- if (rc) {
- device->state = DASD_STATE_BASIC;
+ goto out;
+ }
+
+ if (block->base->discipline->do_analysis != NULL)
+ rc = block->base->discipline->do_analysis(block);
+ if (rc) {
+ if (rc == -EAGAIN)
return rc;
- }
- } else {
- device->state = DASD_STATE_READY;
+ device->state = DASD_STATE_UNFMT;
+ kobject_uevent(&disk_to_dev(device->block->gdp)->kobj,
+ KOBJ_CHANGE);
+ goto out;
+ }
+
+ lim = queue_limits_start_update(block->gdp->queue);
+ lim.max_dev_sectors = device->discipline->max_sectors(block);
+ lim.max_hw_sectors = lim.max_dev_sectors;
+ lim.logical_block_size = block->bp_block;
+
+ if (device->discipline->has_discard) {
+ unsigned int max_bytes;
+
+ lim.discard_granularity = block->bp_block;
+
+ /* Calculate max_discard_sectors and make it PAGE aligned */
+ max_bytes = USHRT_MAX * block->bp_block;
+ max_bytes = ALIGN_DOWN(max_bytes, PAGE_SIZE);
+
+ lim.max_hw_discard_sectors = max_bytes / block->bp_block;
+ lim.max_write_zeroes_sectors = lim.max_hw_discard_sectors;
}
+ rc = queue_limits_commit_update(block->gdp->queue, &lim);
+ if (rc)
+ return rc;
+
+ set_capacity(block->gdp, block->blocks << block->s2b_shift);
+ device->state = DASD_STATE_READY;
+
+ rc = dasd_scan_partitions(block);
+ if (rc) {
+ device->state = DASD_STATE_BASIC;
+ return rc;
+ }
+
out:
if (device->discipline->basic_to_ready)
rc = device->discipline->basic_to_ready(device);
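
The rewritten basic-to-ready path above is the dasd instance of the atomic limits update: queue_limits_start_update() snapshots the limits and takes the queue's limits lock, the driver edits the copy, and queue_limits_commit_update() validates, applies, and unlocks. The general shape, independent of dasd (field values assumed):

/* General shape of an atomic queue-limits update (driver-side sketch). */
static int example_update_limits(struct request_queue *q,
				 unsigned int new_lbs, unsigned int new_max)
{
	struct queue_limits lim;

	lim = queue_limits_start_update(q);		/* locked snapshot */
	lim.logical_block_size = new_lbs;
	lim.max_hw_sectors = new_max;
	return queue_limits_commit_update(q, &lim);	/* validate, apply, unlock */
}
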
@@ -1301,7 +1313,6 @@ int dasd_term_IO(struct dasd_ccw_req *cqr)
{
struct dasd_device *device;
int retries, rc;
- char errorstring[ERRORLENGTH];
/* Check the cqr */
rc = dasd_check_cqr(cqr);
@@ -1340,10 +1351,8 @@ int dasd_term_IO(struct dasd_ccw_req *cqr)
rc = 0;
break;
default:
- /* internal error 10 - unknown rc*/
- snprintf(errorstring, ERRORLENGTH, "10 %d", rc);
- dev_err(&device->cdev->dev, "An error occurred in the "
- "DASD device driver, reason=%s\n", errorstring);
+ dev_err(&device->cdev->dev,
+ "Unexpected error during request termination %d\n", rc);
BUG();
break;
}
@@ -1362,7 +1371,6 @@ int dasd_start_IO(struct dasd_ccw_req *cqr)
{
struct dasd_device *device;
int rc;
- char errorstring[ERRORLENGTH];
/* Check the cqr */
rc = dasd_check_cqr(cqr);
@@ -1382,10 +1390,8 @@ int dasd_start_IO(struct dasd_ccw_req *cqr)
return -EPERM;
}
if (cqr->retries < 0) {
- /* internal error 14 - start_IO run out of retries */
- sprintf(errorstring, "14 %p", cqr);
- dev_err(&device->cdev->dev, "An error occurred in the DASD "
- "device driver, reason=%s\n", errorstring);
+ dev_err(&device->cdev->dev,
+ "Start I/O ran out of retries\n");
cqr->status = DASD_CQR_ERROR;
return -EIO;
}
@@ -1463,11 +1469,8 @@ int dasd_start_IO(struct dasd_ccw_req *cqr)
"not accessible");
break;
default:
- /* internal error 11 - unknown rc */
- snprintf(errorstring, ERRORLENGTH, "11 %d", rc);
dev_err(&device->cdev->dev,
- "An error occurred in the DASD device driver, "
- "reason=%s\n", errorstring);
+			"Unexpected error during request start %d\n", rc);
BUG();
break;
}
@@ -1904,8 +1907,6 @@ static void __dasd_device_process_ccw_queue(struct dasd_device *device,
static void __dasd_process_cqr(struct dasd_device *device,
struct dasd_ccw_req *cqr)
{
- char errorstring[ERRORLENGTH];
-
switch (cqr->status) {
case DASD_CQR_SUCCESS:
cqr->status = DASD_CQR_DONE;
@@ -1917,11 +1918,8 @@ static void __dasd_process_cqr(struct dasd_device *device,
cqr->status = DASD_CQR_TERMINATED;
break;
default:
- /* internal error 12 - wrong cqr status*/
- snprintf(errorstring, ERRORLENGTH, "12 %p %x02", cqr, cqr->status);
dev_err(&device->cdev->dev,
- "An error occurred in the DASD device driver, "
- "reason=%s\n", errorstring);
+			"Unexpected CQR status %02x\n", cqr->status);
BUG();
}
if (cqr->callback)
@@ -1986,16 +1984,14 @@ static void __dasd_device_check_expire(struct dasd_device *device)
if (device->discipline->term_IO(cqr) != 0) {
/* Hmpf, try again in 5 sec */
dev_err(&device->cdev->dev,
- "cqr %p timed out (%lus) but cannot be "
- "ended, retrying in 5 s\n",
- cqr, (cqr->expires/HZ));
+ "CQR timed out (%lus) but cannot be ended, retrying in 5s\n",
+ (cqr->expires / HZ));
cqr->expires += 5*HZ;
dasd_device_set_timer(device, 5*HZ);
} else {
dev_err(&device->cdev->dev,
- "cqr %p timed out (%lus), %i retries "
- "remaining\n", cqr, (cqr->expires/HZ),
- cqr->retries);
+ "CQR timed out (%lus), %i retries remaining\n",
+ (cqr->expires / HZ), cqr->retries);
}
__dasd_device_check_autoquiesce_timeout(device, cqr);
}
@@ -2116,8 +2112,7 @@ int dasd_flush_device_queue(struct dasd_device *device)
if (rc) {
/* unable to terminate request */
dev_err(&device->cdev->dev,
- "Flushing the DASD request queue "
- "failed for request %p\n", cqr);
+ "Flushing the DASD request queue failed\n");
/* stop flush processing */
goto finished;
}
@@ -2633,8 +2628,7 @@ static int __dasd_cancel_req(struct dasd_ccw_req *cqr)
rc = device->discipline->term_IO(cqr);
if (rc) {
dev_err(&device->cdev->dev,
- "Cancelling request %p failed with rc=%d\n",
- cqr, rc);
+ "Cancelling request failed with rc=%d\n", rc);
} else {
cqr->stopclk = get_tod_clock();
}
@@ -3402,8 +3396,7 @@ static void dasd_generic_auto_online(void *data, async_cookie_t cookie)
ret = ccw_device_set_online(cdev);
if (ret)
- pr_warn("%s: Setting the DASD online failed with rc=%d\n",
- dev_name(&cdev->dev), ret);
+ dev_warn(&cdev->dev, "Setting the DASD online failed with rc=%d\n", ret);
}
/*
@@ -3490,8 +3483,11 @@ int dasd_generic_set_online(struct ccw_device *cdev,
{
struct dasd_discipline *discipline;
struct dasd_device *device;
+ struct device *dev;
int rc;
+ dev = &cdev->dev;
+
/* first online clears initial online feature flag */
dasd_set_feature(cdev, DASD_FEATURE_INITIAL_ONLINE, 0);
device = dasd_create_device(cdev);
@@ -3504,11 +3500,10 @@ int dasd_generic_set_online(struct ccw_device *cdev,
/* Try to load the required module. */
rc = request_module(DASD_DIAG_MOD);
if (rc) {
- pr_warn("%s Setting the DASD online failed "
- "because the required module %s "
- "could not be loaded (rc=%d)\n",
- dev_name(&cdev->dev), DASD_DIAG_MOD,
- rc);
+ dev_warn(dev, "Setting the DASD online failed "
+ "because the required module %s "
+ "could not be loaded (rc=%d)\n",
+ DASD_DIAG_MOD, rc);
dasd_delete_device(device);
return -ENODEV;
}
@@ -3516,8 +3511,7 @@ int dasd_generic_set_online(struct ccw_device *cdev,
/* Module init could have failed, so check again here after
* request_module(). */
if (!dasd_diag_discipline_pointer) {
- pr_warn("%s Setting the DASD online failed because of missing DIAG discipline\n",
- dev_name(&cdev->dev));
+ dev_warn(dev, "Setting the DASD online failed because of missing DIAG discipline\n");
dasd_delete_device(device);
return -ENODEV;
}
@@ -3527,37 +3521,33 @@ int dasd_generic_set_online(struct ccw_device *cdev,
dasd_delete_device(device);
return -EINVAL;
}
+ device->base_discipline = base_discipline;
if (!try_module_get(discipline->owner)) {
- module_put(base_discipline->owner);
dasd_delete_device(device);
return -EINVAL;
}
- device->base_discipline = base_discipline;
device->discipline = discipline;
/* check_device will allocate block device if necessary */
rc = discipline->check_device(device);
if (rc) {
- pr_warn("%s Setting the DASD online with discipline %s failed with rc=%i\n",
- dev_name(&cdev->dev), discipline->name, rc);
- module_put(discipline->owner);
- module_put(base_discipline->owner);
+ dev_warn(dev, "Setting the DASD online with discipline %s failed with rc=%i\n",
+ discipline->name, rc);
dasd_delete_device(device);
return rc;
}
dasd_set_target_state(device, DASD_STATE_ONLINE);
if (device->state <= DASD_STATE_KNOWN) {
- pr_warn("%s Setting the DASD online failed because of a missing discipline\n",
- dev_name(&cdev->dev));
+ dev_warn(dev, "Setting the DASD online failed because of a missing discipline\n");
rc = -ENODEV;
dasd_set_target_state(device, DASD_STATE_NEW);
if (device->block)
dasd_free_block(device->block);
dasd_delete_device(device);
- } else
- pr_debug("dasd_generic device %s found\n",
- dev_name(&cdev->dev));
+ } else {
+ dev_dbg(dev, "dasd_generic device found\n");
+ }
wait_event(dasd_init_waitq, _wait_for_device(device));
@@ -3568,10 +3558,13 @@ EXPORT_SYMBOL_GPL(dasd_generic_set_online);
int dasd_generic_set_offline(struct ccw_device *cdev)
{
+ int max_count, open_count, rc;
struct dasd_device *device;
struct dasd_block *block;
- int max_count, open_count, rc;
unsigned long flags;
+ struct device *dev;
+
+ dev = &cdev->dev;
rc = 0;
spin_lock_irqsave(get_ccwdev_lock(cdev), flags);
@@ -3592,11 +3585,10 @@ int dasd_generic_set_offline(struct ccw_device *cdev)
open_count = atomic_read(&device->block->open_count);
if (open_count > max_count) {
if (open_count > 0)
- pr_warn("%s: The DASD cannot be set offline with open count %i\n",
- dev_name(&cdev->dev), open_count);
+ dev_warn(dev, "The DASD cannot be set offline with open count %i\n",
+ open_count);
else
- pr_warn("%s: The DASD cannot be set offline while it is in use\n",
- dev_name(&cdev->dev));
+ dev_warn(dev, "The DASD cannot be set offline while it is in use\n");
rc = -EBUSY;
goto out_err;
}
@@ -3956,8 +3948,8 @@ static int dasd_handle_autoquiesce(struct dasd_device *device,
if (dasd_eer_enabled(device))
dasd_eer_write(device, NULL, DASD_EER_AUTOQUIESCE);
- pr_info("%s: The DASD has been put in the quiesce state\n",
- dev_name(&device->cdev->dev));
+ dev_info(&device->cdev->dev,
+ "The DASD has been put in the quiesce state\n");
dasd_device_set_stop_bits(device, DASD_STOPPED_QUIESCE);
if (device->features & DASD_FEATURE_REQUEUEQUIESCE)
@@ -3977,10 +3969,8 @@ static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device,
NULL);
if (IS_ERR(cqr)) {
- /* internal error 13 - Allocating the RDC request failed*/
- dev_err(&device->cdev->dev,
- "An error occurred in the DASD device driver, "
- "reason=%s\n", "13");
+ DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s",
+ "Could not allocate RDC request");
return cqr;
}
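
Most message changes in this file replace pr_* calls that hand-build a dev_name() prefix with the dev_* helpers, which emit the driver and device name automatically, and drop the opaque "internal error NN" codes in favor of messages that name the failure. For example (the emitted prefix shown is illustrative):

/* Before: caller assembles the device prefix by hand. */
pr_warn("%s: Setting the DASD online failed with rc=%d\n",
	dev_name(&cdev->dev), ret);

/* After: dev_warn() prefixes "<driver> <device>: " itself. */
dev_warn(&cdev->dev, "Setting the DASD online failed with rc=%d\n", ret);
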
diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c
index 89957bb7244d..459b7f8ac883 100644
--- a/drivers/s390/block/dasd_3990_erp.c
+++ b/drivers/s390/block/dasd_3990_erp.c
@@ -7,13 +7,9 @@
*
*/
-#define KMSG_COMPONENT "dasd-eckd"
-
#include <linux/timer.h>
#include <asm/idals.h>
-#define PRINTK_HEADER "dasd_erp(3990): "
-
#include "dasd_int.h"
#include "dasd_eckd.h"
@@ -398,7 +394,6 @@ dasd_3990_handle_env_data(struct dasd_ccw_req * erp, char *sense)
struct dasd_device *device = erp->startdev;
char msg_format = (sense[7] & 0xF0);
char msg_no = (sense[7] & 0x0F);
- char errorstring[ERRORLENGTH];
switch (msg_format) {
case 0x00: /* Format 0 - Program or System Checks */
@@ -1004,12 +999,9 @@ dasd_3990_handle_env_data(struct dasd_ccw_req * erp, char *sense)
}
break;
- default: /* unknown message format - should not happen
- internal error 03 - unknown message format */
- snprintf(errorstring, ERRORLENGTH, "03 %x02", msg_format);
+ default:
dev_err(&device->cdev->dev,
- "An error occurred in the DASD device driver, "
- "reason=%s\n", errorstring);
+			"Unknown message format %02x\n", msg_format);
break;
} /* end switch message format */
@@ -1056,11 +1048,9 @@ dasd_3990_erp_com_rej(struct dasd_ccw_req * erp, char *sense)
set_bit(DASD_CQR_SUPPRESS_CR, &erp->refers->flags);
erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED);
} else {
- /* fatal error - set status to FAILED
- internal error 09 - Command Reject */
if (!test_bit(DASD_CQR_SUPPRESS_CR, &erp->flags))
dev_err(&device->cdev->dev,
- "An error occurred in the DASD device driver, reason=09\n");
+ "An I/O command request was rejected\n");
erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED);
}
@@ -1128,13 +1118,7 @@ dasd_3990_erp_equip_check(struct dasd_ccw_req * erp, char *sense)
erp->function = dasd_3990_erp_equip_check;
if (sense[1] & SNS1_WRITE_INHIBITED) {
- dev_info(&device->cdev->dev,
- "Write inhibited path encountered\n");
-
- /* vary path offline
- internal error 04 - Path should be varied off-line.*/
- dev_err(&device->cdev->dev, "An error occurred in the DASD "
- "device driver, reason=%s\n", "04");
+ dev_err(&device->cdev->dev, "Write inhibited path encountered\n");
erp = dasd_3990_erp_action_1(erp);
@@ -1285,11 +1269,7 @@ dasd_3990_erp_inv_format(struct dasd_ccw_req * erp, char *sense)
erp = dasd_3990_erp_action_4(erp, sense);
} else {
- /* internal error 06 - The track format is not valid*/
- dev_err(&device->cdev->dev,
- "An error occurred in the DASD device driver, "
- "reason=%s\n", "06");
-
+ dev_err(&device->cdev->dev, "Track format is not valid\n");
erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED);
}
@@ -1663,9 +1643,8 @@ dasd_3990_erp_action_1B_32(struct dasd_ccw_req * default_erp, char *sense)
sizeof(struct LO_eckd_data), device);
if (IS_ERR(erp)) {
- /* internal error 01 - Unable to allocate ERP */
- dev_err(&device->cdev->dev, "An error occurred in the DASD "
- "device driver, reason=%s\n", "01");
+ DBF_DEV_EVENT(DBF_ERR, device, "%s",
+ "Unable to allocate ERP request (1B 32)");
return dasd_3990_erp_cleanup(default_erp, DASD_CQR_FAILED);
}
@@ -1807,10 +1786,8 @@ dasd_3990_update_1B(struct dasd_ccw_req * previous_erp, char *sense)
cpa = previous_erp->irb.scsw.cmd.cpa;
if (cpa == 0) {
- /* internal error 02 -
- Unable to determine address of the CCW to be restarted */
- dev_err(&device->cdev->dev, "An error occurred in the DASD "
- "device driver, reason=%s\n", "02");
+ dev_err(&device->cdev->dev,
+			"Unable to determine address of the CCW to be restarted\n");
previous_erp->status = DASD_CQR_FAILED;
@@ -2009,15 +1986,9 @@ dasd_3990_erp_compound_config(struct dasd_ccw_req * erp, char *sense)
{
if ((sense[25] & DASD_SENSE_BIT_1) && (sense[26] & DASD_SENSE_BIT_2)) {
-
- /* set to suspended duplex state then restart
- internal error 05 - Set device to suspended duplex state
- should be done */
struct dasd_device *device = erp->startdev;
dev_err(&device->cdev->dev,
- "An error occurred in the DASD device driver, "
- "reason=%s\n", "05");
-
+ "Compound configuration error occurred\n");
}
erp->function = dasd_3990_erp_compound_config;
@@ -2153,10 +2124,9 @@ dasd_3990_erp_inspect_32(struct dasd_ccw_req * erp, char *sense)
erp = dasd_3990_erp_int_req(erp);
break;
- case 0x0F: /* length mismatch during update write command
- internal error 08 - update write command error*/
- dev_err(&device->cdev->dev, "An error occurred in the "
- "DASD device driver, reason=%s\n", "08");
+ case 0x0F:
+ dev_err(&device->cdev->dev,
+ "Update write command error occurred\n");
erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED);
break;
@@ -2165,12 +2135,9 @@ dasd_3990_erp_inspect_32(struct dasd_ccw_req * erp, char *sense)
erp = dasd_3990_erp_action_10_32(erp, sense);
break;
- case 0x15: /* next track outside defined extend
- internal error 07 - The next track is not
- within the defined storage extent */
+ case 0x15:
dev_err(&device->cdev->dev,
- "An error occurred in the DASD device driver, "
- "reason=%s\n", "07");
+ "Track outside defined extent error occurred\n");
erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED);
break;
@@ -2663,7 +2630,7 @@ dasd_3990_erp_further_erp(struct dasd_ccw_req *erp)
* necessary
*/
dev_err(&device->cdev->dev,
- "ERP %p has run out of retries and failed\n", erp);
+ "ERP %px has run out of retries and failed\n", erp);
erp->status = DASD_CQR_FAILED;
}
@@ -2704,8 +2671,7 @@ dasd_3990_erp_handle_match_erp(struct dasd_ccw_req *erp_head,
while (erp_done != erp) {
if (erp_done == NULL) /* end of chain reached */
- panic(PRINTK_HEADER "Programming error in ERP! The "
- "original request was lost\n");
+ panic("Programming error in ERP! The original request was lost\n");
/* remove the request from the device queue */
list_del(&erp_done->blocklist);
@@ -2786,11 +2752,9 @@ dasd_3990_erp_action(struct dasd_ccw_req * cqr)
"ERP chain at BEGINNING of ERP-ACTION\n");
for (temp_erp = cqr;
temp_erp != NULL; temp_erp = temp_erp->refers) {
-
dev_err(&device->cdev->dev,
- "ERP %p (%02x) refers to %p\n",
- temp_erp, temp_erp->status,
- temp_erp->refers);
+ "ERP %px (%02x) refers to %px\n",
+ temp_erp, temp_erp->status, temp_erp->refers);
}
}
@@ -2837,11 +2801,9 @@ dasd_3990_erp_action(struct dasd_ccw_req * cqr)
"ERP chain at END of ERP-ACTION\n");
for (temp_erp = erp;
temp_erp != NULL; temp_erp = temp_erp->refers) {
-
dev_err(&device->cdev->dev,
- "ERP %p (%02x) refers to %p\n",
- temp_erp, temp_erp->status,
- temp_erp->refers);
+ "ERP %px (%02x) refers to %px\n",
+ temp_erp, temp_erp->status, temp_erp->refers);
}
}
diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c
index c9740ae88d1a..e84cd5436556 100644
--- a/drivers/s390/block/dasd_alias.c
+++ b/drivers/s390/block/dasd_alias.c
@@ -6,20 +6,12 @@
* Author(s): Stefan Weinhuber <wein@de.ibm.com>
*/
-#define KMSG_COMPONENT "dasd-eckd"
-
#include <linux/list.h>
#include <linux/slab.h>
#include <asm/ebcdic.h>
#include "dasd_int.h"
#include "dasd_eckd.h"
-#ifdef PRINTK_HEADER
-#undef PRINTK_HEADER
-#endif /* PRINTK_HEADER */
-#define PRINTK_HEADER "dasd(eckd):"
-
-
/*
* General concept of alias management:
* - PAV and DASD alias management is specific to the eckd discipline.
diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c
index c4e36650c426..0316c20823ee 100644
--- a/drivers/s390/block/dasd_devmap.c
+++ b/drivers/s390/block/dasd_devmap.c
@@ -13,8 +13,6 @@
*
*/
-#define KMSG_COMPONENT "dasd"
-
#include <linux/ctype.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -24,8 +22,6 @@
#include <linux/uaccess.h>
#include <asm/ipl.h>
-/* This is ugly... */
-#define PRINTK_HEADER "dasd_devmap:"
#define DASD_MAX_PARAMS 256
#include "dasd_int.h"
@@ -1114,7 +1110,7 @@ dasd_use_diag_show(struct device *dev, struct device_attribute *attr, char *buf)
use_diag = (devmap->features & DASD_FEATURE_USEDIAG) != 0;
else
use_diag = (DASD_FEATURE_DEFAULT & DASD_FEATURE_USEDIAG) != 0;
- return sprintf(buf, use_diag ? "1\n" : "0\n");
+ return sysfs_emit(buf, use_diag ? "1\n" : "0\n");
}
static ssize_t
@@ -1163,7 +1159,7 @@ dasd_use_raw_show(struct device *dev, struct device_attribute *attr, char *buf)
use_raw = (devmap->features & DASD_FEATURE_USERAW) != 0;
else
use_raw = (DASD_FEATURE_DEFAULT & DASD_FEATURE_USERAW) != 0;
- return sprintf(buf, use_raw ? "1\n" : "0\n");
+ return sysfs_emit(buf, use_raw ? "1\n" : "0\n");
}
static ssize_t
@@ -1259,7 +1255,7 @@ dasd_access_show(struct device *dev, struct device_attribute *attr,
if (count < 0)
return count;
- return sprintf(buf, "%d\n", count);
+ return sysfs_emit(buf, "%d\n", count);
}
static DEVICE_ATTR(host_access_count, 0444, dasd_access_show, NULL);
@@ -1338,19 +1334,19 @@ static ssize_t dasd_alias_show(struct device *dev,
device = dasd_device_from_cdev(to_ccwdev(dev));
if (IS_ERR(device))
- return sprintf(buf, "0\n");
+ return sysfs_emit(buf, "0\n");
if (device->discipline && device->discipline->get_uid &&
!device->discipline->get_uid(device, &uid)) {
if (uid.type == UA_BASE_PAV_ALIAS ||
uid.type == UA_HYPER_PAV_ALIAS) {
dasd_put_device(device);
- return sprintf(buf, "1\n");
+ return sysfs_emit(buf, "1\n");
}
}
dasd_put_device(device);
- return sprintf(buf, "0\n");
+ return sysfs_emit(buf, "0\n");
}
static DEVICE_ATTR(alias, 0444, dasd_alias_show, NULL);
@@ -1412,15 +1408,9 @@ dasd_uid_show(struct device *dev, struct device_attribute *attr, char *buf)
break;
}
- if (strlen(uid.vduit) > 0)
- snprintf(uid_string, sizeof(uid_string),
- "%s.%s.%04x.%s.%s",
- uid.vendor, uid.serial, uid.ssid, ua_string,
- uid.vduit);
- else
- snprintf(uid_string, sizeof(uid_string),
- "%s.%s.%04x.%s",
- uid.vendor, uid.serial, uid.ssid, ua_string);
+ snprintf(uid_string, sizeof(uid_string), "%s.%s.%04x.%s%s%s",
+ uid.vendor, uid.serial, uid.ssid, ua_string,
+ uid.vduit[0] ? "." : "", uid.vduit);
}
dasd_put_device(device);
@@ -1862,7 +1852,7 @@ static ssize_t dasd_pm_show(struct device *dev,
device = dasd_device_from_cdev(to_ccwdev(dev));
if (IS_ERR(device))
- return sprintf(buf, "0\n");
+ return sysfs_emit(buf, "0\n");
opm = dasd_path_get_opm(device);
nppm = dasd_path_get_nppm(device);
@@ -1872,8 +1862,8 @@ static ssize_t dasd_pm_show(struct device *dev,
ifccpm = dasd_path_get_ifccpm(device);
dasd_put_device(device);
- return sprintf(buf, "%02x %02x %02x %02x %02x %02x\n", opm, nppm,
- cablepm, cuirpm, hpfpm, ifccpm);
+ return sysfs_emit(buf, "%02x %02x %02x %02x %02x %02x\n", opm, nppm,
+ cablepm, cuirpm, hpfpm, ifccpm);
}
static DEVICE_ATTR(path_masks, 0444, dasd_pm_show, NULL);
diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c
index 2e4e555b37c3..ea4b1d01bb76 100644
--- a/drivers/s390/block/dasd_diag.c
+++ b/drivers/s390/block/dasd_diag.c
@@ -8,8 +8,6 @@
*
*/
-#define KMSG_COMPONENT "dasd"
-
#include <linux/kernel_stat.h>
#include <linux/stddef.h>
#include <linux/kernel.h>
@@ -31,8 +29,6 @@
#include "dasd_int.h"
#include "dasd_diag.h"
-#define PRINTK_HEADER "dasd(diag):"
-
MODULE_LICENSE("GPL");
/* The maximum number of blocks per request (max_blocks) is dependent on the
@@ -621,25 +617,9 @@ dasd_diag_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req,
"dump sense not available for DIAG data");
}
-/*
- * Initialize block layer request queue.
- */
-static void dasd_diag_setup_blk_queue(struct dasd_block *block)
+static unsigned int dasd_diag_max_sectors(struct dasd_block *block)
{
- unsigned int logical_block_size = block->bp_block;
- struct request_queue *q = block->gdp->queue;
- int max;
-
- max = DIAG_MAX_BLOCKS << block->s2b_shift;
- blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
- q->limits.max_dev_sectors = max;
- blk_queue_logical_block_size(q, logical_block_size);
- blk_queue_max_hw_sectors(q, max);
- blk_queue_max_segments(q, USHRT_MAX);
- /* With page sized segments each segment can be translated into one idaw/tidaw */
- blk_queue_max_segment_size(q, PAGE_SIZE);
- blk_queue_segment_boundary(q, PAGE_SIZE - 1);
- blk_queue_dma_alignment(q, PAGE_SIZE - 1);
+ return DIAG_MAX_BLOCKS << block->s2b_shift;
}
static int dasd_diag_pe_handler(struct dasd_device *device,
@@ -652,10 +632,10 @@ static struct dasd_discipline dasd_diag_discipline = {
.owner = THIS_MODULE,
.name = "DIAG",
.ebcname = "DIAG",
+ .max_sectors = dasd_diag_max_sectors,
.check_device = dasd_diag_check_device,
.pe_handler = dasd_diag_pe_handler,
.fill_geometry = dasd_diag_fill_geometry,
- .setup_blk_queue = dasd_diag_setup_blk_queue,
.start_IO = dasd_start_diag,
.term_IO = dasd_diag_term_IO,
.handle_terminated_request = dasd_diag_handle_terminated_request,
diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c
index bd89b032968a..373c1a86c33e 100644
--- a/drivers/s390/block/dasd_eckd.c
+++ b/drivers/s390/block/dasd_eckd.c
@@ -10,8 +10,6 @@
* Author.........: Nigel Hislop <hislop_nigel@emc.com>
*/
-#define KMSG_COMPONENT "dasd-eckd"
-
#include <linux/stddef.h>
#include <linux/kernel.h>
#include <linux/slab.h>
@@ -37,11 +35,6 @@
#include "dasd_int.h"
#include "dasd_eckd.h"
-#ifdef PRINTK_HEADER
-#undef PRINTK_HEADER
-#endif /* PRINTK_HEADER */
-#define PRINTK_HEADER "dasd(eckd):"
-
/*
* raw track access always map to 64k in memory
* so it maps to 16 blocks of 4k per track
@@ -1072,22 +1065,14 @@ static void dasd_eckd_read_fc_security(struct dasd_device *device)
}
}
-static void dasd_eckd_get_uid_string(struct dasd_conf *conf,
- char *print_uid)
+static void dasd_eckd_get_uid_string(struct dasd_conf *conf, char *print_uid)
{
struct dasd_uid uid;
create_uid(conf, &uid);
- if (strlen(uid.vduit) > 0)
- snprintf(print_uid, DASD_UID_STRLEN,
- "%s.%s.%04x.%02x.%s",
- uid.vendor, uid.serial, uid.ssid,
- uid.real_unit_addr, uid.vduit);
- else
- snprintf(print_uid, DASD_UID_STRLEN,
- "%s.%s.%04x.%02x",
- uid.vendor, uid.serial, uid.ssid,
- uid.real_unit_addr);
+ snprintf(print_uid, DASD_UID_STRLEN, "%s.%s.%04x.%02x%s%s",
+ uid.vendor, uid.serial, uid.ssid, uid.real_unit_addr,
+ uid.vduit[0] ? "." : "", uid.vduit);
}
static int dasd_eckd_check_cabling(struct dasd_device *device,
@@ -5529,15 +5514,15 @@ dasd_eckd_ioctl(struct dasd_block *block, unsigned int cmd, void __user *argp)
* and return number of printed chars.
*/
static void
-dasd_eckd_dump_ccw_range(struct ccw1 *from, struct ccw1 *to, char *page)
+dasd_eckd_dump_ccw_range(struct dasd_device *device, struct ccw1 *from,
+ struct ccw1 *to, char *page)
{
int len, count;
char *datap;
len = 0;
while (from <= to) {
- len += sprintf(page + len, PRINTK_HEADER
- " CCW %p: %08X %08X DAT:",
+ len += sprintf(page + len, "CCW %px: %08X %08X DAT:",
from, ((int *) from)[0], ((int *) from)[1]);
/* get pointer to data (consider IDALs) */
@@ -5560,7 +5545,7 @@ dasd_eckd_dump_ccw_range(struct ccw1 *from, struct ccw1 *to, char *page)
from++;
}
if (len > 0)
- printk(KERN_ERR "%s", page);
+ dev_err(&device->cdev->dev, "%s", page);
}
static void
@@ -5591,9 +5576,12 @@ dasd_eckd_dump_sense_dbf(struct dasd_device *device, struct irb *irb,
static void dasd_eckd_dump_sense_ccw(struct dasd_device *device,
struct dasd_ccw_req *req, struct irb *irb)
{
- char *page;
struct ccw1 *first, *last, *fail, *from, *to;
+ struct device *dev;
int len, sl, sct;
+ char *page;
+
+ dev = &device->cdev->dev;
page = (char *) get_zeroed_page(GFP_ATOMIC);
if (page == NULL) {
@@ -5602,24 +5590,18 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device,
return;
}
/* dump the sense data */
- len = sprintf(page, PRINTK_HEADER
- " I/O status report for device %s:\n",
- dev_name(&device->cdev->dev));
- len += sprintf(page + len, PRINTK_HEADER
- " in req: %p CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X "
- "CS:%02X RC:%d\n",
+ len = sprintf(page, "I/O status report:\n");
+ len += sprintf(page + len,
+ "in req: %px CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X CS:%02X RC:%d\n",
req, scsw_cc(&irb->scsw), scsw_fctl(&irb->scsw),
scsw_actl(&irb->scsw), scsw_stctl(&irb->scsw),
scsw_dstat(&irb->scsw), scsw_cstat(&irb->scsw),
req ? req->intrc : 0);
- len += sprintf(page + len, PRINTK_HEADER
- " device %s: Failing CCW: %p\n",
- dev_name(&device->cdev->dev),
+ len += sprintf(page + len, "Failing CCW: %px\n",
phys_to_virt(irb->scsw.cmd.cpa));
if (irb->esw.esw0.erw.cons) {
for (sl = 0; sl < 4; sl++) {
- len += sprintf(page + len, PRINTK_HEADER
- " Sense(hex) %2d-%2d:",
+ len += sprintf(page + len, "Sense(hex) %2d-%2d:",
(8 * sl), ((8 * sl) + 7));
for (sct = 0; sct < 8; sct++) {
@@ -5631,23 +5613,20 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device,
if (irb->ecw[27] & DASD_SENSE_BIT_0) {
/* 24 Byte Sense Data */
- sprintf(page + len, PRINTK_HEADER
- " 24 Byte: %x MSG %x, "
- "%s MSGb to SYSOP\n",
+ sprintf(page + len,
+ "24 Byte: %x MSG %x, %s MSGb to SYSOP\n",
irb->ecw[7] >> 4, irb->ecw[7] & 0x0f,
irb->ecw[1] & 0x10 ? "" : "no");
} else {
/* 32 Byte Sense Data */
- sprintf(page + len, PRINTK_HEADER
- " 32 Byte: Format: %x "
- "Exception class %x\n",
+ sprintf(page + len,
+ "32 Byte: Format: %x Exception class %x\n",
irb->ecw[6] & 0x0f, irb->ecw[22] >> 4);
}
} else {
- sprintf(page + len, PRINTK_HEADER
- " SORRY - NO VALID SENSE AVAILABLE\n");
+ sprintf(page + len, "SORRY - NO VALID SENSE AVAILABLE\n");
}
- printk(KERN_ERR "%s", page);
+ dev_err(dev, "%s", page);
if (req) {
/* req == NULL for unsolicited interrupts */
@@ -5656,8 +5635,8 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device,
first = req->cpaddr;
for (last = first; last->flags & (CCW_FLAG_CC | CCW_FLAG_DC); last++);
to = min(first + 6, last);
- printk(KERN_ERR PRINTK_HEADER " Related CP in req: %p\n", req);
- dasd_eckd_dump_ccw_range(first, to, page);
+ dev_err(dev, "Related CP in req: %px\n", req);
+ dasd_eckd_dump_ccw_range(device, first, to, page);
/* print failing CCW area (maximum 4) */
/* scsw->cda is either valid or zero */
@@ -5665,19 +5644,19 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device,
fail = phys_to_virt(irb->scsw.cmd.cpa); /* failing CCW */
if (from < fail - 2) {
from = fail - 2; /* there is a gap - print header */
- printk(KERN_ERR PRINTK_HEADER "......\n");
+ dev_err(dev, "......\n");
}
to = min(fail + 1, last);
- dasd_eckd_dump_ccw_range(from, to, page + len);
+ dasd_eckd_dump_ccw_range(device, from, to, page + len);
/* print last CCWs (maximum 2) */
len = 0;
from = max(from, ++to);
if (from < last - 1) {
from = last - 1; /* there is a gap - print header */
- printk(KERN_ERR PRINTK_HEADER "......\n");
+ dev_err(dev, "......\n");
}
- dasd_eckd_dump_ccw_range(from, last, page + len);
+ dasd_eckd_dump_ccw_range(device, from, last, page + len);
}
free_page((unsigned long) page);
}
@@ -5701,11 +5680,9 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
return;
}
/* dump the sense data */
- len = sprintf(page, PRINTK_HEADER
- " I/O status report for device %s:\n",
- dev_name(&device->cdev->dev));
- len += sprintf(page + len, PRINTK_HEADER
- " in req: %p CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X "
+ len = sprintf(page, "I/O status report:\n");
+ len += sprintf(page + len,
+ "in req: %px CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X "
"CS:%02X fcxs:%02X schxs:%02X RC:%d\n",
req, scsw_cc(&irb->scsw), scsw_fctl(&irb->scsw),
scsw_actl(&irb->scsw), scsw_stctl(&irb->scsw),
@@ -5713,9 +5690,7 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
irb->scsw.tm.fcxs,
(irb->scsw.tm.ifob << 7) | irb->scsw.tm.sesq,
req ? req->intrc : 0);
- len += sprintf(page + len, PRINTK_HEADER
- " device %s: Failing TCW: %p\n",
- dev_name(&device->cdev->dev),
+ len += sprintf(page + len, "Failing TCW: %px\n",
phys_to_virt(irb->scsw.tm.tcw));
tsb = NULL;
@@ -5724,47 +5699,37 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
tsb = tcw_get_tsb(phys_to_virt(irb->scsw.tm.tcw));
if (tsb) {
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->length %d\n", tsb->length);
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->flags %x\n", tsb->flags);
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->dcw_offset %d\n", tsb->dcw_offset);
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->count %d\n", tsb->count);
+ len += sprintf(page + len, "tsb->length %d\n", tsb->length);
+ len += sprintf(page + len, "tsb->flags %x\n", tsb->flags);
+ len += sprintf(page + len, "tsb->dcw_offset %d\n", tsb->dcw_offset);
+ len += sprintf(page + len, "tsb->count %d\n", tsb->count);
residual = tsb->count - 28;
- len += sprintf(page + len, PRINTK_HEADER
- " residual %d\n", residual);
+ len += sprintf(page + len, "residual %d\n", residual);
switch (tsb->flags & 0x07) {
case 1: /* tsa_iostat */
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->tsa.iostat.dev_time %d\n",
+ len += sprintf(page + len, "tsb->tsa.iostat.dev_time %d\n",
tsb->tsa.iostat.dev_time);
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->tsa.iostat.def_time %d\n",
+ len += sprintf(page + len, "tsb->tsa.iostat.def_time %d\n",
tsb->tsa.iostat.def_time);
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->tsa.iostat.queue_time %d\n",
+ len += sprintf(page + len, "tsb->tsa.iostat.queue_time %d\n",
tsb->tsa.iostat.queue_time);
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->tsa.iostat.dev_busy_time %d\n",
+ len += sprintf(page + len, "tsb->tsa.iostat.dev_busy_time %d\n",
tsb->tsa.iostat.dev_busy_time);
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->tsa.iostat.dev_act_time %d\n",
+ len += sprintf(page + len, "tsb->tsa.iostat.dev_act_time %d\n",
tsb->tsa.iostat.dev_act_time);
sense = tsb->tsa.iostat.sense;
break;
case 2: /* ts_ddpc */
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->tsa.ddpc.rc %d\n", tsb->tsa.ddpc.rc);
+ len += sprintf(page + len, "tsb->tsa.ddpc.rc %d\n",
+ tsb->tsa.ddpc.rc);
for (sl = 0; sl < 2; sl++) {
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->tsa.ddpc.rcq %2d-%2d: ",
+ len += sprintf(page + len,
+ "tsb->tsa.ddpc.rcq %2d-%2d: ",
(8 * sl), ((8 * sl) + 7));
rcq = tsb->tsa.ddpc.rcq;
for (sct = 0; sct < 8; sct++) {
- len += sprintf(page + len, " %02x",
+ len += sprintf(page + len, "%02x",
rcq[8 * sl + sct]);
}
len += sprintf(page + len, "\n");
@@ -5772,15 +5737,15 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
sense = tsb->tsa.ddpc.sense;
break;
case 3: /* tsa_intrg */
- len += sprintf(page + len, PRINTK_HEADER
- " tsb->tsa.intrg.: not supported yet\n");
+ len += sprintf(page + len,
+ "tsb->tsa.intrg.: not supported yet\n");
break;
}
if (sense) {
for (sl = 0; sl < 4; sl++) {
- len += sprintf(page + len, PRINTK_HEADER
- " Sense(hex) %2d-%2d:",
+ len += sprintf(page + len,
+ "Sense(hex) %2d-%2d:",
(8 * sl), ((8 * sl) + 7));
for (sct = 0; sct < 8; sct++) {
len += sprintf(page + len, " %02x",
@@ -5791,27 +5756,23 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device,
if (sense[27] & DASD_SENSE_BIT_0) {
/* 24 Byte Sense Data */
- sprintf(page + len, PRINTK_HEADER
- " 24 Byte: %x MSG %x, "
- "%s MSGb to SYSOP\n",
+ sprintf(page + len,
+ "24 Byte: %x MSG %x, %s MSGb to SYSOP\n",
sense[7] >> 4, sense[7] & 0x0f,
sense[1] & 0x10 ? "" : "no");
} else {
/* 32 Byte Sense Data */
- sprintf(page + len, PRINTK_HEADER
- " 32 Byte: Format: %x "
- "Exception class %x\n",
+ sprintf(page + len,
+ "32 Byte: Format: %x Exception class %x\n",
sense[6] & 0x0f, sense[22] >> 4);
}
} else {
- sprintf(page + len, PRINTK_HEADER
- " SORRY - NO VALID SENSE AVAILABLE\n");
+ sprintf(page + len, "SORRY - NO VALID SENSE AVAILABLE\n");
}
} else {
- sprintf(page + len, PRINTK_HEADER
- " SORRY - NO TSB DATA AVAILABLE\n");
+ sprintf(page + len, "SORRY - NO TSB DATA AVAILABLE\n");
}
- printk(KERN_ERR "%s", page);
+ dev_err(&device->cdev->dev, "%s", page);
free_page((unsigned long) page);
}
@@ -6865,17 +6826,9 @@ static void dasd_eckd_handle_hpf_error(struct dasd_device *device,
dasd_schedule_requeue(device);
}
-/*
- * Initialize block layer request queue.
- */
-static void dasd_eckd_setup_blk_queue(struct dasd_block *block)
+static unsigned int dasd_eckd_max_sectors(struct dasd_block *block)
{
- unsigned int logical_block_size = block->bp_block;
- struct request_queue *q = block->gdp->queue;
- struct dasd_device *device = block->base;
- int max;
-
- if (device->features & DASD_FEATURE_USERAW) {
+ if (block->base->features & DASD_FEATURE_USERAW) {
/*
* the max_blocks value for raw_track access is 256
* it is higher than the native ECKD value because we
@@ -6883,19 +6836,10 @@ static void dasd_eckd_setup_blk_queue(struct dasd_block *block)
* so the max_hw_sectors are
* 2048 x 512B = 1024kB = 16 tracks
*/
- max = DASD_ECKD_MAX_BLOCKS_RAW << block->s2b_shift;
- } else {
- max = DASD_ECKD_MAX_BLOCKS << block->s2b_shift;
+ return DASD_ECKD_MAX_BLOCKS_RAW << block->s2b_shift;
}
- blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
- q->limits.max_dev_sectors = max;
- blk_queue_logical_block_size(q, logical_block_size);
- blk_queue_max_hw_sectors(q, max);
- blk_queue_max_segments(q, USHRT_MAX);
- /* With page sized segments each segment can be translated into one idaw/tidaw */
- blk_queue_max_segment_size(q, PAGE_SIZE);
- blk_queue_segment_boundary(q, PAGE_SIZE - 1);
- blk_queue_dma_alignment(q, PAGE_SIZE - 1);
+
+ return DASD_ECKD_MAX_BLOCKS << block->s2b_shift;
}
static struct ccw_driver dasd_eckd_driver = {
@@ -6927,7 +6871,7 @@ static struct dasd_discipline dasd_eckd_discipline = {
.basic_to_ready = dasd_eckd_basic_to_ready,
.online_to_ready = dasd_eckd_online_to_ready,
.basic_to_known = dasd_eckd_basic_to_known,
- .setup_blk_queue = dasd_eckd_setup_blk_queue,
+ .max_sectors = dasd_eckd_max_sectors,
.fill_geometry = dasd_eckd_fill_geometry,
.start_IO = dasd_start_IO,
.term_IO = dasd_term_IO,
diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c
index c956de711cf7..5064a616e041 100644
--- a/drivers/s390/block/dasd_eer.c
+++ b/drivers/s390/block/dasd_eer.c
@@ -7,8 +7,6 @@
* Author(s): Stefan Weinhuber <wein@de.ibm.com>
*/
-#define KMSG_COMPONENT "dasd-eckd"
-
#include <linux/init.h>
#include <linux/fs.h>
#include <linux/kernel.h>
@@ -28,11 +26,6 @@
#include "dasd_int.h"
#include "dasd_eckd.h"
-#ifdef PRINTK_HEADER
-#undef PRINTK_HEADER
-#endif /* PRINTK_HEADER */
-#define PRINTK_HEADER "dasd(eer):"
-
/*
* SECTION: the internal buffer
*/
diff --git a/drivers/s390/block/dasd_erp.c b/drivers/s390/block/dasd_erp.c
index c07e6e713518..4c0d3a704513 100644
--- a/drivers/s390/block/dasd_erp.c
+++ b/drivers/s390/block/dasd_erp.c
@@ -9,8 +9,6 @@
*
*/
-#define KMSG_COMPONENT "dasd"
-
#include <linux/ctype.h>
#include <linux/init.h>
@@ -18,9 +16,6 @@
#include <asm/ebcdic.h>
#include <linux/uaccess.h>
-/* This is ugly... */
-#define PRINTK_HEADER "dasd_erp:"
-
#include "dasd_int.h"
struct dasd_ccw_req *
@@ -170,12 +165,12 @@ dasd_log_sense(struct dasd_ccw_req *cqr, struct irb *irb)
device = cqr->startdev;
if (cqr->intrc == -ETIMEDOUT) {
dev_err(&device->cdev->dev,
- "A timeout error occurred for cqr %p\n", cqr);
+ "A timeout error occurred for cqr %px\n", cqr);
return;
}
if (cqr->intrc == -ENOLINK) {
dev_err(&device->cdev->dev,
- "A transport error occurred for cqr %p\n", cqr);
+ "A transport error occurred for cqr %px\n", cqr);
return;
}
/* dump sense data */
diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c
index c06fa2b27120..bcbb2f8e91fe 100644
--- a/drivers/s390/block/dasd_fba.c
+++ b/drivers/s390/block/dasd_fba.c
@@ -25,11 +25,6 @@
#include "dasd_int.h"
#include "dasd_fba.h"
-#ifdef PRINTK_HEADER
-#undef PRINTK_HEADER
-#endif /* PRINTK_HEADER */
-#define PRINTK_HEADER "dasd(fba):"
-
#define FBA_DEFAULT_RETRIES 32
#define DASD_FBA_CCW_WRITE 0x41
@@ -660,30 +655,27 @@ static void
dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req,
struct irb *irb)
{
- char *page;
struct ccw1 *act, *end, *last;
int len, sl, sct, count;
+ struct device *dev;
+ char *page;
+
+ dev = &device->cdev->dev;
page = (char *) get_zeroed_page(GFP_ATOMIC);
if (page == NULL) {
DBF_DEV_EVENT(DBF_WARNING, device, "%s",
- "No memory to dump sense data");
+ "No memory to dump sense data");
return;
}
- len = sprintf(page, PRINTK_HEADER
- " I/O status report for device %s:\n",
- dev_name(&device->cdev->dev));
- len += sprintf(page + len, PRINTK_HEADER
- " in req: %p CS: 0x%02X DS: 0x%02X\n", req,
- irb->scsw.cmd.cstat, irb->scsw.cmd.dstat);
- len += sprintf(page + len, PRINTK_HEADER
- " device %s: Failing CCW: %p\n",
- dev_name(&device->cdev->dev),
+ len = sprintf(page, "I/O status report:\n");
+ len += sprintf(page + len, "in req: %px CS: 0x%02X DS: 0x%02X\n",
+ req, irb->scsw.cmd.cstat, irb->scsw.cmd.dstat);
+ len += sprintf(page + len, "Failing CCW: %px\n",
(void *) (addr_t) irb->scsw.cmd.cpa);
if (irb->esw.esw0.erw.cons) {
for (sl = 0; sl < 4; sl++) {
- len += sprintf(page + len, PRINTK_HEADER
- " Sense(hex) %2d-%2d:",
+ len += sprintf(page + len, "Sense(hex) %2d-%2d:",
(8 * sl), ((8 * sl) + 7));
for (sct = 0; sct < 8; sct++) {
@@ -693,20 +685,18 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req,
len += sprintf(page + len, "\n");
}
} else {
- len += sprintf(page + len, PRINTK_HEADER
- " SORRY - NO VALID SENSE AVAILABLE\n");
+ len += sprintf(page + len, "SORRY - NO VALID SENSE AVAILABLE\n");
}
- printk(KERN_ERR "%s", page);
+ dev_err(dev, "%s", page);
/* dump the Channel Program */
/* print first CCWs (maximum 8) */
act = req->cpaddr;
- for (last = act; last->flags & (CCW_FLAG_CC | CCW_FLAG_DC); last++);
+ for (last = act; last->flags & (CCW_FLAG_CC | CCW_FLAG_DC); last++);
end = min(act + 8, last);
- len = sprintf(page, PRINTK_HEADER " Related CP in req: %p\n", req);
+ len = sprintf(page, "Related CP in req: %px\n", req);
while (act <= end) {
- len += sprintf(page + len, PRINTK_HEADER
- " CCW %p: %08X %08X DAT:",
+ len += sprintf(page + len, "CCW %px: %08X %08X DAT:",
act, ((int *) act)[0], ((int *) act)[1]);
for (count = 0; count < 32 && count < act->count;
count += sizeof(int))
@@ -716,19 +706,17 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req,
len += sprintf(page + len, "\n");
act++;
}
- printk(KERN_ERR "%s", page);
-
+ dev_err(dev, "%s", page);
/* print failing CCW area */
len = 0;
if (act < ((struct ccw1 *)(addr_t) irb->scsw.cmd.cpa) - 2) {
act = ((struct ccw1 *)(addr_t) irb->scsw.cmd.cpa) - 2;
- len += sprintf(page + len, PRINTK_HEADER "......\n");
+ len += sprintf(page + len, "......\n");
}
end = min((struct ccw1 *)(addr_t) irb->scsw.cmd.cpa + 2, last);
while (act <= end) {
- len += sprintf(page + len, PRINTK_HEADER
- " CCW %p: %08X %08X DAT:",
+ len += sprintf(page + len, "CCW %px: %08X %08X DAT:",
act, ((int *) act)[0], ((int *) act)[1]);
for (count = 0; count < 32 && count < act->count;
count += sizeof(int))
@@ -742,11 +730,10 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req,
/* print last CCWs */
if (act < last - 2) {
act = last - 2;
- len += sprintf(page + len, PRINTK_HEADER "......\n");
+ len += sprintf(page + len, "......\n");
}
while (act <= last) {
- len += sprintf(page + len, PRINTK_HEADER
- " CCW %p: %08X %08X DAT:",
+ len += sprintf(page + len, "CCW %px: %08X %08X DAT:",
act, ((int *) act)[0], ((int *) act)[1]);
for (count = 0; count < 32 && count < act->count;
count += sizeof(int))
@@ -757,39 +744,13 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req,
act++;
}
if (len > 0)
- printk(KERN_ERR "%s", page);
+ dev_err(dev, "%s", page);
free_page((unsigned long) page);
}
-/*
- * Initialize block layer request queue.
- */
-static void dasd_fba_setup_blk_queue(struct dasd_block *block)
+static unsigned int dasd_fba_max_sectors(struct dasd_block *block)
{
- unsigned int logical_block_size = block->bp_block;
- struct request_queue *q = block->gdp->queue;
- unsigned int max_bytes, max_discard_sectors;
- int max;
-
- max = DASD_FBA_MAX_BLOCKS << block->s2b_shift;
- blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
- q->limits.max_dev_sectors = max;
- blk_queue_logical_block_size(q, logical_block_size);
- blk_queue_max_hw_sectors(q, max);
- blk_queue_max_segments(q, USHRT_MAX);
- /* With page sized segments each segment can be translated into one idaw/tidaw */
- blk_queue_max_segment_size(q, PAGE_SIZE);
- blk_queue_segment_boundary(q, PAGE_SIZE - 1);
-
- q->limits.discard_granularity = logical_block_size;
-
- /* Calculate max_discard_sectors and make it PAGE aligned */
- max_bytes = USHRT_MAX * logical_block_size;
- max_bytes = ALIGN_DOWN(max_bytes, PAGE_SIZE);
- max_discard_sectors = max_bytes / logical_block_size;
-
- blk_queue_max_discard_sectors(q, max_discard_sectors);
- blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
+ return DASD_FBA_MAX_BLOCKS << block->s2b_shift;
}
static int dasd_fba_pe_handler(struct dasd_device *device,
@@ -802,10 +763,11 @@ static struct dasd_discipline dasd_fba_discipline = {
.owner = THIS_MODULE,
.name = "FBA ",
.ebcname = "FBA ",
+ .has_discard = true,
.check_device = dasd_fba_check_characteristics,
.do_analysis = dasd_fba_do_analysis,
.pe_handler = dasd_fba_pe_handler,
- .setup_blk_queue = dasd_fba_setup_blk_queue,
+ .max_sectors = dasd_fba_max_sectors,
.fill_geometry = dasd_fba_fill_geometry,
.start_IO = dasd_start_IO,
.term_IO = dasd_term_IO,
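With setup_blk_queue gone, the FBA discipline only reports max_sectors and has_discard; the dasd core (changed earlier in this series, not shown in this hunk) is then responsible for assembling the queue limits. A rough sketch of the core-side consumer, mirroring the computation of the deleted dasd_fba_setup_blk_queue(); the exact placement is an assumption:

	struct queue_limits lim = queue_limits_start_update(block->gdp->queue);

	lim.max_dev_sectors = device->discipline->max_sectors(block);
	lim.max_hw_sectors = lim.max_dev_sectors;
	lim.logical_block_size = block->bp_block;
	if (device->discipline->has_discard) {
		unsigned int max_bytes;

		/* Keep max_discard_sectors page aligned, as before. */
		lim.discard_granularity = block->bp_block;
		max_bytes = ALIGN_DOWN(USHRT_MAX * block->bp_block, PAGE_SIZE);
		lim.max_hw_discard_sectors = max_bytes / block->bp_block;
		lim.max_write_zeroes_sectors = lim.max_hw_discard_sectors;
	}
	rc = queue_limits_commit_update(block->gdp->queue, &lim);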
diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c
index 8bf2cf0ccc15..4533dd055ca8 100644
--- a/drivers/s390/block/dasd_genhd.c
+++ b/drivers/s390/block/dasd_genhd.c
@@ -11,8 +11,6 @@
*
*/
-#define KMSG_COMPONENT "dasd"
-
#include <linux/interrupt.h>
#include <linux/major.h>
#include <linux/fs.h>
@@ -20,9 +18,6 @@
#include <linux/uaccess.h>
-/* This is ugly... */
-#define PRINTK_HEADER "dasd_gendisk:"
-
#include "dasd_int.h"
static unsigned int queue_depth = 32;
@@ -39,6 +34,16 @@ MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD d
*/
int dasd_gendisk_alloc(struct dasd_block *block)
{
+ struct queue_limits lim = {
+ /*
+ * With page sized segments, each segment can be translated into
+ * one idaw/tidaw.
+ */
+ .max_segment_size = PAGE_SIZE,
+ .seg_boundary_mask = PAGE_SIZE - 1,
+ .dma_alignment = PAGE_SIZE - 1,
+ .max_segments = USHRT_MAX,
+ };
struct gendisk *gdp;
struct dasd_device *base;
int len, rc;
@@ -58,11 +63,12 @@ int dasd_gendisk_alloc(struct dasd_block *block)
if (rc)
return rc;
- gdp = blk_mq_alloc_disk(&block->tag_set, block);
+ gdp = blk_mq_alloc_disk(&block->tag_set, &lim, block);
if (IS_ERR(gdp)) {
blk_mq_free_tag_set(&block->tag_set);
return PTR_ERR(gdp);
}
+ blk_queue_flag_set(QUEUE_FLAG_NONROT, gdp->queue);
/* Initialize gendisk structure. */
gdp->major = DASD_MAJOR;
diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h
index aecd502aec51..e5f40536b425 100644
--- a/drivers/s390/block/dasd_int.h
+++ b/drivers/s390/block/dasd_int.h
@@ -113,9 +113,6 @@ do { \
__dev_id.ssid, __dev_id.devno, d_data); \
} while (0)
-/* limit size for an errorstring */
-#define ERRORLENGTH 30
-
/* definition of dbf debug levels */
#define DBF_EMERG 0 /* system is unusable */
#define DBF_ALERT 1 /* action must be taken immediately */
@@ -126,32 +123,6 @@ do { \
#define DBF_INFO 6 /* informational */
#define DBF_DEBUG 6 /* debug-level messages */
-/* messages to be written via klogd and dbf */
-#define DEV_MESSAGE(d_loglevel,d_device,d_string,d_args...)\
-do { \
- printk(d_loglevel PRINTK_HEADER " %s: " d_string "\n", \
- dev_name(&d_device->cdev->dev), d_args); \
- DBF_DEV_EVENT(DBF_ALERT, d_device, d_string, d_args); \
-} while(0)
-
-#define MESSAGE(d_loglevel,d_string,d_args...)\
-do { \
- printk(d_loglevel PRINTK_HEADER " " d_string "\n", d_args); \
- DBF_EVENT(DBF_ALERT, d_string, d_args); \
-} while(0)
-
-/* messages to be written via klogd only */
-#define DEV_MESSAGE_LOG(d_loglevel,d_device,d_string,d_args...)\
-do { \
- printk(d_loglevel PRINTK_HEADER " %s: " d_string "\n", \
- dev_name(&d_device->cdev->dev), d_args); \
-} while(0)
-
-#define MESSAGE_LOG(d_loglevel,d_string,d_args...)\
-do { \
- printk(d_loglevel PRINTK_HEADER " " d_string "\n", d_args); \
-} while(0)
-
/* Macro to calculate number of blocks per page */
#define BLOCKS_PER_PAGE(blksize) (PAGE_SIZE / blksize)
@@ -322,6 +293,7 @@ struct dasd_discipline {
struct module *owner;
char ebcname[8]; /* a name used for tagging and printks */
char name[8]; /* a name used for tagging and printks */
+ bool has_discard;
struct list_head list; /* used for list of disciplines */
@@ -360,10 +332,7 @@ struct dasd_discipline {
int (*online_to_ready) (struct dasd_device *);
int (*basic_to_known)(struct dasd_device *);
- /*
- * Initialize block layer request queue.
- */
- void (*setup_blk_queue)(struct dasd_block *);
+ unsigned int (*max_sectors)(struct dasd_block *);
/*
* Device operation functions. build_cp creates a ccw chain for
* a block device request, start_io starts the request and
diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c
index de85a5e4e21b..7e0ed7032f76 100644
--- a/drivers/s390/block/dasd_ioctl.c
+++ b/drivers/s390/block/dasd_ioctl.c
@@ -10,8 +10,6 @@
* i/o controls for the dasd driver.
*/
-#define KMSG_COMPONENT "dasd"
-
#include <linux/interrupt.h>
#include <linux/compat.h>
#include <linux/major.h>
@@ -24,12 +22,8 @@
#include <linux/uaccess.h>
#include <linux/dasd_mod.h>
-/* This is ugly... */
-#define PRINTK_HEADER "dasd_ioctl:"
-
#include "dasd_int.h"
-
static int
dasd_ioctl_api_version(void __user *argp)
{
diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c
index 62a859ea67f8..0faaa437d9be 100644
--- a/drivers/s390/block/dasd_proc.c
+++ b/drivers/s390/block/dasd_proc.c
@@ -11,8 +11,6 @@
*
*/
-#define KMSG_COMPONENT "dasd"
-
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/string.h>
@@ -23,9 +21,6 @@
#include <asm/debug.h>
#include <linux/uaccess.h>
-/* This is ugly... */
-#define PRINTK_HEADER "dasd_proc:"
-
#include "dasd_int.h"
static struct proc_dir_entry *dasd_proc_root_entry = NULL;
diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c
index 4b7ecd4fd431..9c8f529b827c 100644
--- a/drivers/s390/block/dcssblk.c
+++ b/drivers/s390/block/dcssblk.c
@@ -546,6 +546,9 @@ static const struct attribute_group *dcssblk_dev_attr_groups[] = {
static ssize_t
dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count)
{
+ struct queue_limits lim = {
+ .logical_block_size = 4096,
+ };
int rc, i, j, num_of_segments;
struct dcssblk_dev_info *dev_info;
struct segment_info *seg_info, *temp;
@@ -629,9 +632,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
dev_info->dev.release = dcssblk_release_segment;
dev_info->dev.groups = dcssblk_dev_attr_groups;
INIT_LIST_HEAD(&dev_info->lh);
- dev_info->gd = blk_alloc_disk(NUMA_NO_NODE);
- if (dev_info->gd == NULL) {
- rc = -ENOMEM;
+ dev_info->gd = blk_alloc_disk(&lim, NUMA_NO_NODE);
+ if (IS_ERR(dev_info->gd)) {
+ rc = PTR_ERR(dev_info->gd);
goto seg_list_del;
}
dev_info->gd->major = dcssblk_major;
@@ -639,7 +642,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char
dev_info->gd->fops = &dcssblk_devops;
dev_info->gd->private_data = dev_info;
dev_info->gd->flags |= GENHD_FL_NO_PART;
- blk_queue_logical_block_size(dev_info->gd->queue, 4096);
blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->gd->queue);
seg_byte_size = (dev_info->end - dev_info->start + 1);
diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c
index ade95e91b3c8..9f6fdd0daa74 100644
--- a/drivers/s390/block/scm_blk.c
+++ b/drivers/s390/block/scm_blk.c
@@ -435,10 +435,17 @@ static const struct blk_mq_ops scm_mq_ops = {
int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev)
{
- unsigned int devindex, nr_max_blk;
+ struct queue_limits lim = {
+ .logical_block_size = 1 << 12,
+ };
+ unsigned int devindex;
struct request_queue *rq;
int len, ret;
+ lim.max_segments = min(scmdev->nr_max_block,
+ (unsigned int) (PAGE_SIZE / sizeof(struct aidaw)));
+ lim.max_hw_sectors = lim.max_segments << 3; /* 8 * 512 = blk_size */
+
devindex = atomic_inc_return(&nr_devices) - 1;
/* scma..scmz + scmaa..scmzz */
if (devindex > 701) {
@@ -462,18 +469,12 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev)
if (ret)
goto out;
- bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, scmdev);
+ bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, &lim, scmdev);
if (IS_ERR(bdev->gendisk)) {
ret = PTR_ERR(bdev->gendisk);
goto out_tag;
}
rq = bdev->rq = bdev->gendisk->queue;
- nr_max_blk = min(scmdev->nr_max_block,
- (unsigned int) (PAGE_SIZE / sizeof(struct aidaw)));
-
- blk_queue_logical_block_size(rq, 1 << 12);
- blk_queue_max_hw_sectors(rq, nr_max_blk << 3); /* 8 * 512 = blk_size */
- blk_queue_max_segments(rq, nr_max_blk);
blk_queue_flag_set(QUEUE_FLAG_NONROT, rq);
blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, rq);
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 44680f65ea14..9969f4e2f1c3 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -332,7 +332,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
sdev->sg_reserved_size = INT_MAX;
- q = blk_mq_init_queue(&sdev->host->tag_set);
+ q = blk_mq_alloc_queue(&sdev->host->tag_set, NULL, NULL);
if (IS_ERR(q)) {
/* release fn is set up in scsi_sysfs_device_initialise, so
* have to free and put manually here */
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index 3b89c9d4aa40..eac7fff6992d 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -10593,7 +10593,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
err = blk_mq_alloc_tag_set(&hba->tmf_tag_set);
if (err < 0)
goto out_remove_scsi_host;
- hba->tmf_queue = blk_mq_init_queue(&hba->tmf_tag_set);
+ hba->tmf_queue = blk_mq_alloc_queue(&hba->tmf_tag_set, NULL, NULL);
if (IS_ERR(hba->tmf_queue)) {
err = PTR_ERR(hba->tmf_queue);
goto free_tmf_tag_set;
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 5f750fa53a2b..aea51fd850cd 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -824,11 +824,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones,
reset = &zones[1];
if (reset && reset->cond != BLK_ZONE_COND_EMPTY) {
+ unsigned int nofs_flags;
+
ASSERT(sb_zone_is_full(reset));
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- reset->start, reset->len,
- GFP_NOFS);
+ reset->start, reset->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -974,11 +977,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
* explicit ZONE_FINISH is not necessary.
*/
if (zone->wp != zone->start + zone->capacity) {
+ unsigned int nofs_flags;
int ret;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev,
REQ_OP_ZONE_FINISH, zone->start,
- zone->len, GFP_NOFS);
+ zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
}
@@ -996,11 +1002,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
+ unsigned int nofs_flags;
sector_t zone_sectors;
sector_t nr_sectors;
u8 zone_sectors_shift;
u32 sb_zone;
u32 nr_zones;
+ int ret;
zone_sectors = bdev_zone_sectors(bdev);
zone_sectors_shift = ilog2(zone_sectors);
@@ -1011,9 +1019,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
if (sb_zone + 1 >= nr_zones)
return -ENOENT;
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- zone_start_sector(sb_zone, bdev),
- zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ zone_start_sector(sb_zone, bdev),
+ zone_sectors * BTRFS_NR_SB_LOG_ZONES);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
/*
@@ -1124,12 +1135,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos)
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
u64 length, u64 *bytes)
{
+ unsigned int nofs_flags;
int ret;
*bytes = 0;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET,
- physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT,
- GFP_NOFS);
+ physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
@@ -2244,14 +2257,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
+ unsigned int nofs_flags;
if (zinfo->max_active_zones == 0)
continue;
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
- zinfo->zone_size >> SECTOR_SHIFT,
- GFP_NOFS);
+ zinfo->zone_size >> SECTOR_SHIFT);
+ memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
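blkdev_zone_mgmt() no longer takes a gfp_mask; callers that previously passed GFP_NOFS now express it through the scoped memalloc_nofs_save()/memalloc_nofs_restore() API, as the btrfs hunks above show. Since the same boilerplate recurs at every call site, it could plausibly be factored into a local helper along these lines (a sketch; the wrapper name is illustrative and not part of the patch):

	#include <linux/blkdev.h>
	#include <linux/sched/mm.h>

	static int blkdev_zone_mgmt_nofs(struct block_device *bdev, enum req_op op,
					 sector_t sector, sector_t nr_sectors)
	{
		unsigned int nofs_flags;
		int ret;

		/* Force GFP_NOFS semantics for any allocation below us. */
		nofs_flags = memalloc_nofs_save();
		ret = blkdev_zone_mgmt(bdev, op, sector, nr_sectors);
		memalloc_nofs_restore(nofs_flags);
		return ret;
	}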
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index 4c8836ded90f..e1065ba70207 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -1971,9 +1971,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi,
}
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) {
+ unsigned int nofs_flags;
+ int ret;
+
trace_f2fs_issue_reset_zone(bdev, blkstart);
- return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
- sector, nr_sects, GFP_NOFS);
+ nofs_flags = memalloc_nofs_save();
+ ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET,
+ sector, nr_sects);
+ memalloc_nofs_restore(nofs_flags);
+ return ret;
}
__queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen);
@@ -4865,6 +4871,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
block_t zone_block, valid_block_cnt;
unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT;
int ret;
+ unsigned int nofs_flags;
if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ)
return 0;
@@ -4912,8 +4919,10 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi,
"pointer: valid block[0x%x,0x%x] cond[0x%x]",
zone_segno, valid_block_cnt, zone->cond);
+ nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH,
- zone->start, zone->len, GFP_NOFS);
+ zone->start, zone->len);
+ memalloc_nofs_restore(nofs_flags);
if (ret == -EOPNOTSUPP) {
ret = blkdev_issue_zeroout(fdev->bdev, zone->wp,
zone->len - (zone->wp - zone->start),
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index b6e8e7c96251..aadad16738df 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb,
trace_zonefs_zone_mgmt(sb, z, op);
ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector,
- z->z_size >> SECTOR_SHIFT, GFP_NOFS);
+ z->z_size >> SECTOR_SHIFT);
if (ret) {
zonefs_err(sb,
"Zone management operation %s at %llu failed %d\n",
diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h
index 378b2459efe2..e253e7bd0d17 100644
--- a/include/linux/blk-integrity.h
+++ b/include/linux/blk-integrity.h
@@ -20,6 +20,7 @@ struct blk_integrity_iter {
unsigned int data_size;
unsigned short interval;
unsigned char tuple_size;
+ unsigned char pi_offset;
const char *disk_name;
};
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 492b0128b5d9..d3d8fd8e229b 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -684,17 +684,19 @@ enum {
#define BLK_MQ_NO_HCTX_IDX (-1U)
-struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
+struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set,
+ struct queue_limits *lim, void *queuedata,
struct lock_class_key *lkclass);
-#define blk_mq_alloc_disk(set, queuedata) \
+#define blk_mq_alloc_disk(set, lim, queuedata) \
({ \
static struct lock_class_key __key; \
\
- __blk_mq_alloc_disk(set, queuedata, &__key); \
+ __blk_mq_alloc_disk(set, lim, queuedata, &__key); \
})
struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
struct lock_class_key *lkclass);
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *);
+struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set,
+ struct queue_limits *lim, void *queuedata);
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q);
void blk_mq_destroy_queue(struct request_queue *);
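blk_mq_alloc_disk() thus gains a queue_limits pointer, and blk_mq_init_queue() becomes blk_mq_alloc_queue(); passing a NULL lim keeps the default limits. A minimal conversion in a blk-mq driver looks roughly like this (a sketch; the tag set, limit values, and queuedata are illustrative):

	struct queue_limits lim = {
		.logical_block_size	= 4096,
		.max_hw_sectors		= 255,
	};
	struct gendisk *disk;

	disk = blk_mq_alloc_disk(&tag_set, &lim, driver_data);
	if (IS_ERR(disk))
		return PTR_ERR(disk);

Standalone queues convert the same way, e.g. blk_mq_alloc_queue(&tag_set, NULL, NULL) in the scsi_scan.c and ufshcd.c hunks above.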
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 12d87cef2c03..cb1526ec44b5 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -207,52 +207,10 @@ static inline bool blk_path_error(blk_status_t error)
return true;
}
-/*
- * From most significant bit:
- * 1 bit: reserved for other usage, see below
- * 12 bits: original size of bio
- * 51 bits: issue time of bio
- */
-#define BIO_ISSUE_RES_BITS 1
-#define BIO_ISSUE_SIZE_BITS 12
-#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS)
-#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS)
-#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1)
-#define BIO_ISSUE_SIZE_MASK \
- (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT)
-#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1))
-
-/* Reserved bit for blk-throtl */
-#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63)
-
struct bio_issue {
u64 value;
};
-static inline u64 __bio_issue_time(u64 time)
-{
- return time & BIO_ISSUE_TIME_MASK;
-}
-
-static inline u64 bio_issue_time(struct bio_issue *issue)
-{
- return __bio_issue_time(issue->value);
-}
-
-static inline sector_t bio_issue_size(struct bio_issue *issue)
-{
- return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT);
-}
-
-static inline void bio_issue_init(struct bio_issue *issue,
- sector_t size)
-{
- size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1;
- issue->value = ((issue->value & BIO_ISSUE_RES_MASK) |
- (ktime_get_ns() & BIO_ISSUE_TIME_MASK) |
- ((u64)size << BIO_ISSUE_SIZE_SHIFT));
-}
-
typedef __u32 __bitwise blk_opf_t;
typedef unsigned int blk_qc_t;
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2f5dbde23094..f9b87c39cab0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -43,7 +43,7 @@ struct blk_crypto_profile;
extern const struct device_type disk_type;
extern const struct device_type part_type;
-extern struct class block_class;
+extern const struct class block_class;
/*
* Maximum number of blkcg policies allowed to be registered concurrently.
@@ -109,6 +109,7 @@ struct blk_integrity {
const struct blk_integrity_profile *profile;
unsigned char flags;
unsigned char tuple_size;
+ unsigned char pi_offset;
unsigned char interval_exp;
unsigned char tag_size;
};
@@ -190,8 +191,6 @@ struct gendisk {
* blk_mq_unfreeze_queue().
*/
unsigned int nr_zones;
- unsigned int max_open_zones;
- unsigned int max_active_zones;
unsigned long *conv_zones_bitmap;
unsigned long *seq_zones_wlock;
#endif /* CONFIG_BLK_DEV_ZONED */
@@ -293,6 +292,7 @@ struct queue_limits {
unsigned int io_opt;
unsigned int max_discard_sectors;
unsigned int max_hw_discard_sectors;
+ unsigned int max_user_discard_sectors;
unsigned int max_secure_erase_sectors;
unsigned int max_write_zeroes_sectors;
unsigned int max_zone_append_sectors;
@@ -308,6 +308,8 @@ struct queue_limits {
unsigned char discard_misaligned;
unsigned char raid_partial_stripes_expensive;
bool zoned;
+ unsigned int max_open_zones;
+ unsigned int max_active_zones;
/*
* Drivers that set dma_alignment to less than 511 must be prepared to
@@ -326,7 +328,7 @@ void disk_set_zoned(struct gendisk *disk);
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
- sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask);
+ sector_t sectors, sector_t nr_sectors);
int blk_revalidate_disk_zones(struct gendisk *disk,
void (*update_driver_data)(struct gendisk *disk));
@@ -474,6 +476,7 @@ struct request_queue {
struct mutex sysfs_lock;
struct mutex sysfs_dir_lock;
+ struct mutex limits_lock;
/*
* for reusing dead hctx instance in case of updating
@@ -640,23 +643,23 @@ static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector)
static inline void disk_set_max_open_zones(struct gendisk *disk,
unsigned int max_open_zones)
{
- disk->max_open_zones = max_open_zones;
+ disk->queue->limits.max_open_zones = max_open_zones;
}
static inline void disk_set_max_active_zones(struct gendisk *disk,
unsigned int max_active_zones)
{
- disk->max_active_zones = max_active_zones;
+ disk->queue->limits.max_active_zones = max_active_zones;
}
static inline unsigned int bdev_max_open_zones(struct block_device *bdev)
{
- return bdev->bd_disk->max_open_zones;
+ return bdev->bd_disk->queue->limits.max_open_zones;
}
static inline unsigned int bdev_max_active_zones(struct block_device *bdev)
{
- return bdev->bd_disk->max_active_zones;
+ return bdev->bd_disk->queue->limits.max_active_zones;
}
#else /* CONFIG_BLK_DEV_ZONED */
@@ -764,22 +767,26 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb)
int bdev_disk_changed(struct gendisk *disk, bool invalidate);
void put_disk(struct gendisk *disk);
-struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass);
+struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node,
+ struct lock_class_key *lkclass);
/**
* blk_alloc_disk - allocate a gendisk structure
+ * @lim: queue limits to be used for this disk.
* @node_id: numa node to allocate on
*
* Allocate and pre-initialize a gendisk structure for use with BIO based
* drivers.
*
+ * Returns an ERR_PTR on error, else the allocated disk.
+ *
* Context: can sleep
*/
-#define blk_alloc_disk(node_id) \
+#define blk_alloc_disk(lim, node_id) \
({ \
static struct lock_class_key __key; \
\
- __blk_alloc_disk(node_id, &__key); \
+ __blk_alloc_disk(lim, node_id, &__key); \
})
int __register_blkdev(unsigned int major, const char *name,
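BIO-based drivers pass their limits at allocation time the same way, and must also switch the error check from a NULL test to IS_ERR(), as the dcssblk conversion above does. A sketch, with the block size chosen arbitrarily:

	struct queue_limits lim = {
		.logical_block_size	= 512,
	};
	struct gendisk *disk;

	disk = blk_alloc_disk(&lim, NUMA_NO_NODE);
	if (IS_ERR(disk))
		return PTR_ERR(disk);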
@@ -862,6 +869,29 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset,
return chunk_sectors - (offset & (chunk_sectors - 1));
}
+/**
+ * queue_limits_start_update - start an atomic update of queue limits
+ * @q: queue to update
+ *
+ * This function starts an atomic update of the queue limits. It takes a lock
+ * to prevent other updates and returns a snapshot of the current limits that
+ * the caller can modify. The caller must call queue_limits_commit_update()
+ * to finish the update.
+ *
+ * Context: process context. The caller must have frozen the queue or ensured
+ * that there is no outstanding I/O by other means.
+ */
+static inline struct queue_limits
+queue_limits_start_update(struct request_queue *q)
+ __acquires(q->limits_lock)
+{
+ mutex_lock(&q->limits_lock);
+ return q->limits;
+}
+int queue_limits_commit_update(struct request_queue *q,
+ struct queue_limits *lim);
+int queue_limits_set(struct request_queue *q, struct queue_limits *lim);
+
/*
* Access functions for manipulating queue properties
*/
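Taken together, the pair behaves like a transaction on q->limits: snapshot under the lock, edit the copy, then validate and publish it atomically. Usage is roughly the following (a sketch; new_max_sectors is an illustrative value):

	struct queue_limits lim;
	int err;

	lim = queue_limits_start_update(q);
	lim.max_hw_sectors = new_max_sectors;
	err = queue_limits_commit_update(q, &lim);	/* drops limits_lock */
	if (err)
		return err;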
@@ -895,8 +925,8 @@ extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth);
extern void blk_set_stacking_limits(struct queue_limits *lim);
extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
sector_t offset);
-extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
- sector_t offset);
+void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev,
+ sector_t offset, const char *pfx);
extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
extern void blk_queue_virt_boundary(struct request_queue *, unsigned long);
@@ -943,6 +973,7 @@ struct blk_plug {
/* if ios_left is > 1, we can batch tag/rq allocations */
struct request *cached_rq;
+ u64 cur_ktime;
unsigned short nr_ios;
unsigned short rq_count;
@@ -973,6 +1004,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async)
__blk_flush_plug(plug, async);
}
+/*
+ * tsk == current here
+ */
+static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+{
+ struct blk_plug *plug = tsk->plug;
+
+ if (plug)
+ plug->cur_ktime = 0;
+ current->flags &= ~PF_BLOCK_TS;
+}
+
int blkdev_issue_flush(struct block_device *bdev);
long nr_blockdev_pages(void);
#else /* CONFIG_BLOCK */
@@ -996,6 +1039,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async)
{
}
+static inline void blk_plug_invalidate_ts(struct task_struct *tsk)
+{
+}
+
static inline int blkdev_issue_flush(struct block_device *bdev)
{
return 0;
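The cur_ktime field and the PF_BLOCK_TS flag implement the issue-timestamp caching mentioned in the summary: while a plug is active, the block layer can hand out one cached ktime_get_ns() value instead of re-reading the clock per request, and the scheduler (see the kernel/sched/core.c hunk below) invalidates the cache when the task blocks so the value cannot go stale across a sleep. The consumer side can be expected to look roughly like this (a sketch of the caching helper, not a line-for-line copy of the patch):

	static inline u64 blk_time_get_ns(void)
	{
		struct blk_plug *plug = current->plug;

		if (!plug)
			return ktime_get_ns();

		/*
		 * 0 doubles as "not cached yet"; the scheduler resets it via
		 * blk_plug_invalidate_ts() whenever the task schedules out.
		 */
		if (!plug->cur_ktime) {
			plug->cur_ktime = ktime_get_ns();
			current->flags |= PF_BLOCK_TS;
		}
		return plug->cur_ktime;
	}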
diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h
index 4dd7e6fe92fb..eb2f04d636c8 100644
--- a/include/linux/nvme-rdma.h
+++ b/include/linux/nvme-rdma.h
@@ -6,7 +6,11 @@
#ifndef _LINUX_NVME_RDMA_H
#define _LINUX_NVME_RDMA_H
-#define NVME_RDMA_MAX_QUEUE_SIZE 128
+#define NVME_RDMA_IP_PORT 4420
+
+#define NVME_RDMA_MAX_QUEUE_SIZE 256
+#define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128
+#define NVME_RDMA_DEFAULT_QUEUE_SIZE 128
enum nvme_rdma_cm_fmt {
NVME_RDMA_CM_FMT_1_0 = 0x0,
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 3ef4053ea950..425573202295 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -23,8 +23,6 @@
#define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery"
-#define NVME_RDMA_IP_PORT 4420
-
#define NVME_NSID_ALL 0xffffffff
enum nvme_subsys_type {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ffe8f618ab86..15b7cb478d16 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1642,7 +1642,7 @@ extern struct pid *cad_pid;
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */
-#define PF__HOLE__20000000 0x20000000
+#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */
#define PF__HOLE__40000000 0x40000000
#define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */
diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h
index a6e04b4a21d7..11e0e00e0bb9 100644
--- a/include/linux/sched/topology.h
+++ b/include/linux/sched/topology.h
@@ -176,6 +176,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
+bool cpus_equal_capacity(int this_cpu, int that_cpu);
bool cpus_share_cache(int this_cpu, int that_cpu);
bool cpus_share_resources(int this_cpu, int that_cpu);
@@ -226,6 +227,11 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
{
}
+static inline bool cpus_equal_capacity(int this_cpu, int that_cpu)
+{
+ return true;
+}
+
static inline bool cpus_share_cache(int this_cpu, int that_cpu)
{
return true;
diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h
index b9cfc5c96268..c8dc5f8ea699 100644
--- a/include/uapi/linux/ublk_cmd.h
+++ b/include/uapi/linux/ublk_cmd.h
@@ -49,6 +49,8 @@
_IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd)
#define UBLK_U_CMD_GET_FEATURES \
_IOR('u', 0x13, struct ublksrv_ctrl_cmd)
+#define UBLK_U_CMD_DEL_DEV_ASYNC \
+ _IOR('u', 0x14, struct ublksrv_ctrl_cmd)
/*
* 64bits are enough now, and it should be easy to extend in case of
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9116bcc90346..540f229700b6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3955,6 +3955,17 @@ void wake_up_if_idle(int cpu)
}
}
+bool cpus_equal_capacity(int this_cpu, int that_cpu)
+{
+ if (!sched_asym_cpucap_active())
+ return true;
+
+ if (this_cpu == that_cpu)
+ return true;
+
+ return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu);
+}
+
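This helper exists for the remote-vs-local completion decision in blk-mq: running a completion on the current CPU instead of IPI'ing the submitter is only a win if the two CPUs are of comparable capacity, so a big core is not left finishing a little core's work (or vice versa). The check in blk_mq_complete_need_ipi() plausibly gains a condition like the following (a sketch of the relevant test only, not the full function):

	/* Complete locally only on a cache-sharing CPU of equal capacity. */
	if (cpus_share_cache(cpu, rq->mq_ctx->cpu) &&
	    cpus_equal_capacity(cpu, rq->mq_ctx->cpu))
		return false;	/* run the completion right here */

	return true;		/* IPI the submitting CPU instead */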
bool cpus_share_cache(int this_cpu, int that_cpu)
{
if (this_cpu == that_cpu)
@@ -6787,10 +6798,12 @@ static inline void sched_submit_work(struct task_struct *tsk)
static void sched_update_worker(struct task_struct *tsk)
{
- if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) {
+ if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) {
+ if (tsk->flags & PF_BLOCK_TS)
+ blk_plug_invalidate_ts(tsk);
if (tsk->flags & PF_WQ_WORKER)
wq_worker_running(tsk);
- else
+ else if (tsk->flags & PF_IO_WORKER)
io_wq_worker_running(tsk);
}
}