diff options
138 files changed, 3444 insertions, 3171 deletions
diff --git a/arch/m68k/emu/nfblock.c b/arch/m68k/emu/nfblock.c index a708fbd5a844..642fb80c5c4e 100644 --- a/arch/m68k/emu/nfblock.c +++ b/arch/m68k/emu/nfblock.c @@ -96,6 +96,9 @@ static const struct block_device_operations nfhd_ops = { static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) { + struct queue_limits lim = { + .logical_block_size = bsize, + }; struct nfhd_device *dev; int dev_id = id - NFHD_DEV_OFFSET; int err = -ENOMEM; @@ -117,9 +120,11 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->bsize = bsize; dev->bshift = ffs(bsize) - 10; - dev->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!dev->disk) + dev->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(dev->disk)) { + err = PTR_ERR(dev->disk); goto free_dev; + } dev->disk->major = major_num; dev->disk->first_minor = dev_id * 16; @@ -128,7 +133,6 @@ static int __init nfhd_init_one(int id, u32 blocks, u32 bsize) dev->disk->private_data = dev; sprintf(dev->disk->disk_name, "nfhd%u", dev_id); set_capacity(dev->disk, (sector_t)blocks * (bsize / 512)); - blk_queue_logical_block_size(dev->disk->queue, bsize); err = add_disk(dev->disk); if (err) goto out_cleanup_disk; diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 92ee2697ff39..63fc062add70 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -108,8 +108,6 @@ static inline void ubd_set_bit(__u64 bit, unsigned char *data) static DEFINE_MUTEX(ubd_lock); static DEFINE_MUTEX(ubd_mutex); /* replaces BKL, might not be needed */ -static int ubd_open(struct gendisk *disk, blk_mode_t mode); -static void ubd_release(struct gendisk *disk); static int ubd_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); @@ -118,16 +116,11 @@ static int ubd_getgeo(struct block_device *bdev, struct hd_geometry *geo); static const struct block_device_operations ubd_blops = { .owner = THIS_MODULE, - .open = ubd_open, - .release = ubd_release, .ioctl = ubd_ioctl, .compat_ioctl = blkdev_compat_ptr_ioctl, .getgeo = ubd_getgeo, }; -/* Protected by ubd_lock */ -static struct gendisk *ubd_gendisk[MAX_DEV]; - #ifdef CONFIG_BLK_DEV_UBD_SYNC #define OPEN_FLAGS ((struct openflags) { .r = 1, .w = 1, .s = 1, .c = 0, \ .cl = 1 }) @@ -155,7 +148,6 @@ struct ubd { * backing or the cow file. */ char *file; char *serial; - int count; int fd; __u64 size; struct openflags boot_openflags; @@ -165,7 +157,7 @@ struct ubd { unsigned no_trim:1; struct cow cow; struct platform_device pdev; - struct request_queue *queue; + struct gendisk *disk; struct blk_mq_tag_set tag_set; spinlock_t lock; }; @@ -181,7 +173,6 @@ struct ubd { #define DEFAULT_UBD { \ .file = NULL, \ .serial = NULL, \ - .count = 0, \ .fd = -1, \ .size = -1, \ .boot_openflags = OPEN_FLAGS, \ @@ -774,8 +765,6 @@ static int ubd_open_dev(struct ubd *ubd_dev) ubd_dev->fd = fd; if(ubd_dev->cow.file != NULL){ - blk_queue_max_hw_sectors(ubd_dev->queue, 8 * sizeof(long)); - err = -ENOMEM; ubd_dev->cow.bitmap = vmalloc(ubd_dev->cow.bitmap_len); if(ubd_dev->cow.bitmap == NULL){ @@ -797,11 +786,6 @@ static int ubd_open_dev(struct ubd *ubd_dev) if(err < 0) goto error; ubd_dev->cow.fd = err; } - if (ubd_dev->no_trim == 0) { - blk_queue_max_discard_sectors(ubd_dev->queue, UBD_MAX_REQUEST); - blk_queue_max_write_zeroes_sectors(ubd_dev->queue, UBD_MAX_REQUEST); - } - blk_queue_flag_set(QUEUE_FLAG_NONROT, ubd_dev->queue); return 0; error: os_close_file(ubd_dev->fd); @@ -851,27 +835,6 @@ static const struct attribute_group *ubd_attr_groups[] = { NULL, }; -static int ubd_disk_register(int major, u64 size, int unit, - struct gendisk *disk) -{ - disk->major = major; - disk->first_minor = unit << UBD_SHIFT; - disk->minors = 1 << UBD_SHIFT; - disk->fops = &ubd_blops; - set_capacity(disk, size / 512); - sprintf(disk->disk_name, "ubd%c", 'a' + unit); - - ubd_devs[unit].pdev.id = unit; - ubd_devs[unit].pdev.name = DRIVER_NAME; - ubd_devs[unit].pdev.dev.release = ubd_device_release; - dev_set_drvdata(&ubd_devs[unit].pdev.dev, &ubd_devs[unit]); - platform_device_register(&ubd_devs[unit].pdev); - - disk->private_data = &ubd_devs[unit]; - disk->queue = ubd_devs[unit].queue; - return device_add_disk(&ubd_devs[unit].pdev.dev, disk, ubd_attr_groups); -} - #define ROUND_BLOCK(n) ((n + (SECTOR_SIZE - 1)) & (-SECTOR_SIZE)) static const struct blk_mq_ops ubd_mq_ops = { @@ -881,18 +844,36 @@ static const struct blk_mq_ops ubd_mq_ops = { static int ubd_add(int n, char **error_out) { struct ubd *ubd_dev = &ubd_devs[n]; + struct queue_limits lim = { + .max_segments = MAX_SG, + .seg_boundary_mask = PAGE_SIZE - 1, + }; struct gendisk *disk; int err = 0; if(ubd_dev->file == NULL) goto out; + if (ubd_dev->cow.file) + lim.max_hw_sectors = 8 * sizeof(long); + if (!ubd_dev->no_trim) { + lim.max_hw_discard_sectors = UBD_MAX_REQUEST; + lim.max_write_zeroes_sectors = UBD_MAX_REQUEST; + } + err = ubd_file_size(ubd_dev, &ubd_dev->size); if(err < 0){ *error_out = "Couldn't determine size of device's file"; goto out; } + err = ubd_open_dev(ubd_dev); + if (err) { + pr_err("ubd%c: Can't open \"%s\": errno = %d\n", + 'a' + n, ubd_dev->file, -err); + goto out; + } + ubd_dev->size = ROUND_BLOCK(ubd_dev->size); ubd_dev->tag_set.ops = &ubd_mq_ops; @@ -904,29 +885,43 @@ static int ubd_add(int n, char **error_out) err = blk_mq_alloc_tag_set(&ubd_dev->tag_set); if (err) - goto out; + goto out_close; - disk = blk_mq_alloc_disk(&ubd_dev->tag_set, ubd_dev); + disk = blk_mq_alloc_disk(&ubd_dev->tag_set, &lim, ubd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } - ubd_dev->queue = disk->queue; - blk_queue_write_cache(ubd_dev->queue, true, false); - blk_queue_max_segments(ubd_dev->queue, MAX_SG); - blk_queue_segment_boundary(ubd_dev->queue, PAGE_SIZE - 1); - err = ubd_disk_register(UBD_MAJOR, ubd_dev->size, n, disk); + blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); + blk_queue_write_cache(disk->queue, true, false); + disk->major = UBD_MAJOR; + disk->first_minor = n << UBD_SHIFT; + disk->minors = 1 << UBD_SHIFT; + disk->fops = &ubd_blops; + set_capacity(disk, ubd_dev->size / 512); + sprintf(disk->disk_name, "ubd%c", 'a' + n); + disk->private_data = ubd_dev; + set_disk_ro(disk, !ubd_dev->openflags.w); + + ubd_dev->pdev.id = n; + ubd_dev->pdev.name = DRIVER_NAME; + ubd_dev->pdev.dev.release = ubd_device_release; + dev_set_drvdata(&ubd_dev->pdev.dev, ubd_dev); + platform_device_register(&ubd_dev->pdev); + + err = device_add_disk(&ubd_dev->pdev.dev, disk, ubd_attr_groups); if (err) goto out_cleanup_disk; - ubd_gendisk[n] = disk; return 0; out_cleanup_disk: put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&ubd_dev->tag_set); +out_close: + ubd_close_dev(ubd_dev); out: return err; } @@ -1012,7 +1007,6 @@ static int ubd_id(char **str, int *start_out, int *end_out) static int ubd_remove(int n, char **error_out) { - struct gendisk *disk = ubd_gendisk[n]; struct ubd *ubd_dev; int err = -ENODEV; @@ -1023,15 +1017,15 @@ static int ubd_remove(int n, char **error_out) if(ubd_dev->file == NULL) goto out; - /* you cannot remove a open disk */ - err = -EBUSY; - if(ubd_dev->count > 0) - goto out; + if (ubd_dev->disk) { + /* you cannot remove a open disk */ + err = -EBUSY; + if (disk_openers(ubd_dev->disk)) + goto out; - ubd_gendisk[n] = NULL; - if(disk != NULL){ - del_gendisk(disk); - put_disk(disk); + del_gendisk(ubd_dev->disk); + ubd_close_dev(ubd_dev); + put_disk(ubd_dev->disk); } err = 0; @@ -1153,37 +1147,6 @@ static int __init ubd_driver_init(void){ device_initcall(ubd_driver_init); -static int ubd_open(struct gendisk *disk, blk_mode_t mode) -{ - struct ubd *ubd_dev = disk->private_data; - int err = 0; - - mutex_lock(&ubd_mutex); - if(ubd_dev->count == 0){ - err = ubd_open_dev(ubd_dev); - if(err){ - printk(KERN_ERR "%s: Can't open \"%s\": errno = %d\n", - disk->disk_name, ubd_dev->file, -err); - goto out; - } - } - ubd_dev->count++; - set_disk_ro(disk, !ubd_dev->openflags.w); -out: - mutex_unlock(&ubd_mutex); - return err; -} - -static void ubd_release(struct gendisk *disk) -{ - struct ubd *ubd_dev = disk->private_data; - - mutex_lock(&ubd_mutex); - if(--ubd_dev->count == 0) - ubd_close_dev(ubd_dev); - mutex_unlock(&ubd_mutex); -} - static void cowify_bitmap(__u64 io_offset, int length, unsigned long *cow_mask, __u64 *cow_offset, unsigned long *bitmap, __u64 bitmap_offset, unsigned long *bitmap_words, diff --git a/arch/xtensa/platforms/iss/simdisk.c b/arch/xtensa/platforms/iss/simdisk.c index 178cf96ca10a..defc67909a9c 100644 --- a/arch/xtensa/platforms/iss/simdisk.c +++ b/arch/xtensa/platforms/iss/simdisk.c @@ -264,16 +264,18 @@ static int __init simdisk_setup(struct simdisk *dev, int which, struct proc_dir_entry *procdir) { char tmp[2] = { '0' + which, 0 }; - int err = -ENOMEM; + int err; dev->fd = -1; dev->filename = NULL; spin_lock_init(&dev->lock); dev->users = 0; - dev->gd = blk_alloc_disk(NUMA_NO_NODE); - if (!dev->gd) + dev->gd = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(dev->gd)) { + err = PTR_ERR(dev->gd); goto out; + } dev->gd->major = simdisk_major; dev->gd->first_minor = which; dev->gd->minors = SIMDISK_MINORS; diff --git a/block/bdev.c b/block/bdev.c index 140093c99bdc..e7adaaf1c219 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -383,7 +383,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), + SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 2c90e5de0acd..d442ee358fc2 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -127,7 +127,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) if (!bfqg_stats_waiting(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_group_wait_time) bfq_stat_add(&stats->group_wait_time, now - stats->start_group_wait_time); @@ -144,7 +144,7 @@ static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, return; if (bfqg == curr_bfqg) return; - stats->start_group_wait_time = ktime_get_ns(); + stats->start_group_wait_time = blk_time_get_ns(); bfqg_stats_mark_waiting(stats); } @@ -156,7 +156,7 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) if (!bfqg_stats_empty(stats)) return; - now = ktime_get_ns(); + now = blk_time_get_ns(); if (now > stats->start_empty_time) bfq_stat_add(&stats->empty_time, now - stats->start_empty_time); @@ -183,7 +183,7 @@ void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) if (bfqg_stats_empty(stats)) return; - stats->start_empty_time = ktime_get_ns(); + stats->start_empty_time = blk_time_get_ns(); bfqg_stats_mark_empty(stats); } @@ -192,7 +192,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) struct bfqg_stats *stats = &bfqg->stats; if (bfqg_stats_idling(stats)) { - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); if (now > stats->start_idle_time) bfq_stat_add(&stats->idle_time, @@ -205,7 +205,7 @@ void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { struct bfqg_stats *stats = &bfqg->stats; - stats->start_idle_time = ktime_get_ns(); + stats->start_idle_time = blk_time_get_ns(); bfqg_stats_mark_idling(stats); } @@ -242,7 +242,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; - u64 now = ktime_get_ns(); + u64 now = blk_time_get_ns(); if (now > io_start_time_ns) blkg_rwstat_add(&stats->service_time, opf, diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3cce6de464a7..4b88a54a9b76 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -1005,7 +1005,7 @@ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, rq = rq_entry_fifo(bfqq->fifo.next); - if (rq == last || ktime_get_ns() < rq->fifo_time) + if (rq == last || blk_time_get_ns() < rq->fifo_time) return NULL; bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); @@ -1829,7 +1829,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * bfq_bfqq_update_budg_for_activation for * details on the usage of the next variable. */ - arrived_in_time = ktime_get_ns() <= + arrived_in_time = blk_time_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); @@ -2208,7 +2208,7 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; @@ -2262,7 +2262,7 @@ static void bfq_add_request(struct request *rq) bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + msecs_to_jiffies(10))) { - bfqd->last_empty_occupied_ns = ktime_get_ns(); + bfqd->last_empty_occupied_ns = blk_time_get_ns(); /* * Start the state machine for measuring the * total service time of rq: setting @@ -3294,7 +3294,7 @@ static void bfq_set_budget_timeout(struct bfq_data *bfqd, else timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; - bfqd->last_budget_start = ktime_get(); + bfqd->last_budget_start = blk_time_get(); bfqq->budget_timeout = jiffies + bfqd->bfq_timeout * timeout_coeff; @@ -3394,7 +3394,7 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) else if (bfqq->wr_coeff > 1) sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC); - bfqd->last_idling_start = ktime_get(); + bfqd->last_idling_start = blk_time_get(); bfqd->last_idling_start_jiffies = jiffies; hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), @@ -3433,7 +3433,7 @@ static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) { if (rq != NULL) { /* new rq dispatch now, reset accordingly */ - bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); + bfqd->last_dispatch = bfqd->first_dispatch = blk_time_get_ns(); bfqd->peak_rate_samples = 1; bfqd->sequential_samples = 0; bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = @@ -3590,7 +3590,7 @@ reset_computation: */ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); if (bfqd->peak_rate_samples == 0) { /* first dispatch */ bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", @@ -4162,7 +4162,7 @@ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (compensate) delta_ktime = bfqd->last_idling_start; else - delta_ktime = ktime_get(); + delta_ktime = blk_time_get(); delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); delta_usecs = ktime_to_us(delta_ktime); @@ -5591,7 +5591,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_io_cq *bic, pid_t pid, int is_sync, unsigned int act_idx) { - u64 now_ns = ktime_get_ns(); + u64 now_ns = blk_time_get_ns(); bfqq->actuator_idx = act_idx; RB_CLEAR_NODE(&bfqq->entity.rb_node); @@ -5903,7 +5903,7 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, */ if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) return; - elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + elapsed = blk_time_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; @@ -6194,7 +6194,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) bfq_add_request(rq); idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); - rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + rq->fifo_time = blk_time_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; list_add_tail(&rq->queuelist, &bfqq->fifo); bfq_rq_enqueued(bfqd, bfqq, rq); @@ -6370,7 +6370,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_weights_tree_remove(bfqq); } - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns(); bfqq->ttime.last_end_request = now_ns; @@ -6585,7 +6585,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) static void bfq_update_inject_limit(struct bfq_data *bfqd, struct bfq_queue *bfqq) { - u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; + u64 tot_time_ns = blk_time_get_ns() - bfqd->last_empty_occupied_ns; unsigned int old_limit = bfqq->inject_limit; if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { diff --git a/block/bio-integrity.c b/block/bio-integrity.c index c9a16fba58b9..2e3e8e04961e 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -395,6 +395,7 @@ static blk_status_t bio_integrity_process(struct bio *bio, iter.tuple_size = bi->tuple_size; iter.seed = proc_iter->bi_sector; iter.prot_buf = bvec_virt(bip->bip_vec); + iter.pi_offset = bi->pi_offset; __bio_for_each_segment(bv, bio, bviter, *proc_iter) { void *kaddr = bvec_kmap_local(&bv); diff --git a/block/bio.c b/block/bio.c index c9223e9d31da..d24420ed1c4c 100644 --- a/block/bio.c +++ b/block/bio.c @@ -16,7 +16,6 @@ #include <linux/workqueue.h> #include <linux/cgroup.h> #include <linux/highmem.h> -#include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> #include <linux/xarray.h> @@ -763,29 +762,31 @@ static inline void bio_put_percpu_cache(struct bio *bio) struct bio_alloc_cache *cache; cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); - if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) { - put_cpu(); - bio_free(bio); - return; - } + if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) + goto out_free; - bio_uninit(bio); - - if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) { + if (in_task()) { + bio_uninit(bio); bio->bi_next = cache->free_list; + /* Not necessary but helps not to iopoll already freed bios */ bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; - } else { - unsigned long flags; + } else if (in_hardirq()) { + lockdep_assert_irqs_disabled(); - local_irq_save(flags); + bio_uninit(bio); bio->bi_next = cache->free_list_irq; cache->free_list_irq = bio; cache->nr_irq++; - local_irq_restore(flags); + } else { + goto out_free; } put_cpu(); + return; +out_free: + put_cpu(); + bio_free(bio); } /** @@ -1154,7 +1155,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) bio_for_each_folio_all(fi, bio) { struct page *page; - size_t done = 0; + size_t nr_pages; if (mark_dirty) { folio_lock(fi.folio); @@ -1162,10 +1163,11 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) folio_unlock(fi.folio); } page = folio_page(fi.folio, fi.offset / PAGE_SIZE); + nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE - + fi.offset / PAGE_SIZE + 1; do { bio_release_page(bio, page++); - done += PAGE_SIZE; - } while (done < fi.length); + } while (--nr_pages != 0); } } EXPORT_SYMBOL_GPL(__bio_release_pages); @@ -1371,21 +1373,12 @@ int submit_bio_wait(struct bio *bio) { DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_bdev->bd_disk->lockdep_map); - unsigned long hang_check; bio->bi_private = &done; bio->bi_end_io = submit_bio_wait_endio; bio->bi_opf |= REQ_SYNC; submit_bio(bio); - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - if (hang_check) - while (!wait_for_completion_io_timeout(&done, - hang_check * (HZ/2))) - ; - else - wait_for_completion_io(&done); + blk_wait_io(&done); return blk_status_to_errno(bio->bi_status); } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index ff93c385ba5a..bdbb557feb5a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -1846,7 +1846,7 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; bool clamp; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); u64 exp; u64 delay_nsec = 0; int tok; diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index b927a4a0ad03..78b74106bf10 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -19,6 +19,7 @@ #include <linux/kthread.h> #include <linux/blk-mq.h> #include <linux/llist.h> +#include "blk.h" struct blkcg_gq; struct blkg_policy_data; diff --git a/block/blk-core.c b/block/blk-core.c index de771093b526..a16b5abdbbf5 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -394,24 +394,34 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id) { struct request_queue *q; + int error; q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, node_id); if (!q) - return NULL; + return ERR_PTR(-ENOMEM); q->last_merge = NULL; q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); - if (q->id < 0) + if (q->id < 0) { + error = q->id; goto fail_q; + } q->stats = blk_alloc_queue_stats(); - if (!q->stats) + if (!q->stats) { + error = -ENOMEM; goto fail_id; + } + + error = blk_set_default_limits(lim); + if (error) + goto fail_stats; + q->limits = *lim; q->node = node_id; @@ -425,6 +435,7 @@ struct request_queue *blk_alloc_queue(int node_id) mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); + mutex_init(&q->limits_lock); mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); @@ -435,12 +446,12 @@ struct request_queue *blk_alloc_queue(int node_id) * Init percpu_ref in atomic mode so that it's faster to shutdown. * See blk_register_queue() for details. */ - if (percpu_ref_init(&q->q_usage_counter, + error = percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, - PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL); + if (error) goto fail_stats; - blk_set_default_limits(&q->limits); q->nr_requests = BLKDEV_DEFAULT_RQ; return q; @@ -451,7 +462,7 @@ fail_id: ida_free(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); - return NULL; + return ERR_PTR(error); } /** @@ -1083,6 +1094,7 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) if (tsk->plug) return; + plug->cur_ktime = 0; plug->mq_list = NULL; plug->cached_rq = NULL; plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); @@ -1182,6 +1194,8 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) */ if (unlikely(!rq_list_empty(plug->cached_rq))) blk_mq_free_plug_rqs(plug); + + current->flags &= ~PF_BLOCK_TS; } /** @@ -1229,8 +1243,7 @@ int __init blk_dev_init(void) if (!kblockd_workqueue) panic("Failed to create kblockd\n"); - blk_requestq_cachep = kmem_cache_create("request_queue", - sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + blk_requestq_cachep = KMEM_CACHE(request_queue, SLAB_PANIC); blk_debugfs_root = debugfs_create_dir("block", NULL); diff --git a/block/blk-flush.c b/block/blk-flush.c index 3f4d41952ef2..b0f314f4bc14 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -143,7 +143,7 @@ static void blk_account_io_flush(struct request *rq) part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); part_stat_add(part, nsecs[STAT_FLUSH], - ktime_get_ns() - rq->start_time_ns); + blk_time_get_ns() - rq->start_time_ns); part_stat_unlock(); } diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d4e9b4556d14..ccbeb6dfa87a 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -370,6 +370,7 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->profile = template->profile ? template->profile : &nop_profile; bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; + bi->pi_offset = template->pi_offset; blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 04d44f0bcbc8..9a85bfbbc45a 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -829,7 +829,7 @@ static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) /* step up/down based on the vrate */ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); - now_ns = ktime_get_ns(); + now_ns = blk_time_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { if (!ioc->autop_too_fast_at) @@ -1044,7 +1044,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now) unsigned seq; u64 vrate; - now->now_ns = ktime_get(); + now->now_ns = blk_time_get_ns(); now->now = ktime_to_us(now->now_ns); vrate = atomic64_read(&ioc->vtime_rate); @@ -2817,7 +2817,7 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) return; } - on_q_ns = ktime_get_ns() - rq->alloc_time_ns; + on_q_ns = blk_time_get_ns() - rq->alloc_time_ns; rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); @@ -2900,7 +2900,7 @@ static int blk_iocost_init(struct gendisk *disk) ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); - ioc->period_at = ktime_to_us(ktime_get()); + ioc->period_at = ktime_to_us(blk_time_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c1a6aba1d59e..ebb522788d97 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -609,7 +609,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) if (!iolat->blkiolat->enabled) return; - now = ktime_to_ns(ktime_get()); + now = blk_time_get_ns(); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { @@ -661,7 +661,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); struct blkcg_gq *blkg; struct cgroup_subsys_state *pos_css; - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, @@ -985,7 +985,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) struct blkcg_gq *blkg = lat_to_blkg(iolat); struct rq_qos *rqos = iolat_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - u64 now = ktime_to_ns(ktime_get()); + u64 now = blk_time_get_ns(); int cpu; if (blk_queue_nonrot(blkg->q)) diff --git a/block/blk-lib.c b/block/blk-lib.c index e59c3069e835..dc8e35d0a51d 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -35,6 +35,26 @@ static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; } +static void await_bio_endio(struct bio *bio) +{ + complete(bio->bi_private); + bio_put(bio); +} + +/* + * await_bio_chain - ends @bio and waits for every chained bio to complete + */ +static void await_bio_chain(struct bio *bio) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); + + bio->bi_private = &done; + bio->bi_end_io = await_bio_endio; + bio_endio(bio); + blk_wait_io(&done); +} + int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { @@ -77,6 +97,10 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, * is disabled. */ cond_resched(); + if (fatal_signal_pending(current)) { + await_bio_chain(bio); + return -EINTR; + } } *biop = bio; @@ -120,32 +144,33 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, struct bio **biop, unsigned flags) { struct bio *bio = *biop; - unsigned int max_write_zeroes_sectors; + unsigned int max_sectors; if (bdev_read_only(bdev)) return -EPERM; - /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ - max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); + /* Ensure that max_sectors doesn't overflow bi_size */ + max_sectors = bdev_write_zeroes_sectors(bdev); - if (max_write_zeroes_sectors == 0) + if (max_sectors == 0) return -EOPNOTSUPP; while (nr_sects) { + unsigned int len = min_t(sector_t, nr_sects, max_sectors); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; - if (nr_sects > max_write_zeroes_sectors) { - bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; - nr_sects -= max_write_zeroes_sectors; - sector += max_write_zeroes_sectors; - } else { - bio->bi_iter.bi_size = nr_sects << 9; - nr_sects = 0; - } + bio->bi_iter.bi_size = len << SECTOR_SHIFT; + nr_sects -= len; + sector += len; cond_resched(); + if (fatal_signal_pending(current)) { + await_bio_chain(bio); + return -EINTR; + } } *biop = bio; @@ -190,6 +215,10 @@ static int __blkdev_issue_zero_pages(struct block_device *bdev, break; } cond_resched(); + if (fatal_signal_pending(current)) { + await_bio_chain(bio); + return -EINTR; + } } *biop = bio; @@ -280,7 +309,7 @@ retry: bio_put(bio); } blk_finish_plug(&plug); - if (ret && try_write_zeroes) { + if (ret && ret != -EINTR && try_write_zeroes) { if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { try_write_zeroes = false; goto retry; @@ -322,7 +351,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, return -EPERM; blk_start_plug(&plug); - for (;;) { + while (nr_sects) { unsigned int len = min_t(sector_t, nr_sects, max_sectors); bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); @@ -331,12 +360,17 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, sector += len; nr_sects -= len; - if (!nr_sects) { - ret = submit_bio_wait(bio); - bio_put(bio); + cond_resched(); + if (fatal_signal_pending(current)) { + await_bio_chain(bio); + ret = -EINTR; + bio = NULL; break; } - cond_resched(); + } + if (bio) { + ret = submit_bio_wait(bio); + bio_put(bio); } blk_finish_plug(&plug); diff --git a/block/blk-mq.c b/block/blk-mq.c index 9c475a89e3af..555ada922cf0 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -21,7 +21,6 @@ #include <linux/llist.h> #include <linux/cpu.h> #include <linux/cache.h> -#include <linux/sched/sysctl.h> #include <linux/sched/topology.h> #include <linux/sched/signal.h> #include <linux/delay.h> @@ -322,7 +321,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) RB_CLEAR_NODE(&rq->rb_node); rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = BLK_MQ_NO_TAG; - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); rq->part = NULL; blk_crypto_rq_set_defaults(rq); } @@ -332,7 +331,7 @@ EXPORT_SYMBOL(blk_rq_init); static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); + rq->start_time_ns = blk_time_get_ns(); else rq->start_time_ns = 0; @@ -443,7 +442,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns(); if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; @@ -628,7 +627,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, /* alloc_time includes depth and tag waits */ if (blk_queue_rq_alloc_time(q)) - alloc_time_ns = ktime_get_ns(); + alloc_time_ns = blk_time_get_ns(); /* * If the tag allocator sleeps we could get an allocation for a @@ -1041,7 +1040,7 @@ static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { if (blk_mq_need_time_stamp(rq)) - __blk_mq_end_request_acct(rq, ktime_get_ns()); + __blk_mq_end_request_acct(rq, blk_time_get_ns()); blk_mq_finish_request(rq); @@ -1084,7 +1083,7 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) u64 now = 0; if (iob->need_ts) - now = ktime_get_ns(); + now = blk_time_get_ns(); while ((rq = rq_list_pop(&iob->req_list)) != NULL) { prefetch(rq->bio); @@ -1167,10 +1166,11 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) if (force_irqthreads()) return false; - /* same CPU or cache domain? Complete locally */ + /* same CPU or cache domain and capacity? Complete locally */ if (cpu == rq->mq_ctx->cpu || (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && - cpus_share_cache(cpu, rq->mq_ctx->cpu))) + cpus_share_cache(cpu, rq->mq_ctx->cpu) && + cpus_equal_capacity(cpu, rq->mq_ctx->cpu))) return false; /* don't try to IPI to an offline CPU */ @@ -1254,7 +1254,7 @@ void blk_mq_start_request(struct request *rq) if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && !blk_rq_is_passthrough(rq)) { - rq->io_start_time_ns = ktime_get_ns(); + rq->io_start_time_ns = blk_time_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; rq_qos_issue(q, rq); @@ -1409,22 +1409,10 @@ blk_status_t blk_execute_rq(struct request *rq, bool at_head) blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); blk_mq_run_hw_queue(hctx, false); - if (blk_rq_is_poll(rq)) { + if (blk_rq_is_poll(rq)) blk_rq_poll_completion(rq, &wait.done); - } else { - /* - * Prevent hang_check timer from firing at us during very long - * I/O - */ - unsigned long hang_check = sysctl_hung_task_timeout_secs; - - if (hang_check) - while (!wait_for_completion_io_timeout(&wait.done, - hang_check * (HZ/2))) - ; - else - wait_for_completion_io(&wait.done); - } + else + blk_wait_io(&wait.done); return wait.ret; } @@ -2892,9 +2880,6 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, }; struct request *rq; - if (blk_mq_attempt_bio_merge(q, bio, nsegs)) - return NULL; - rq_qos_throttle(q, bio); if (plug) { @@ -2913,22 +2898,31 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, } /* - * Check if we can use the passed on request for submitting the passed in bio, - * and remove it from the request list if it can be used. + * Check if there is a suitable cached request and return it. */ -static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, - struct bio *bio) +static struct request *blk_mq_peek_cached_request(struct blk_plug *plug, + struct request_queue *q, blk_opf_t opf) { - enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); - enum hctx_type hctx_type = rq->mq_hctx->type; + enum hctx_type type = blk_mq_get_hctx_type(opf); + struct request *rq; - WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + if (!plug) + return NULL; + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; + if (type != rq->mq_hctx->type && + (type != HCTX_TYPE_READ || rq->mq_hctx->type != HCTX_TYPE_DEFAULT)) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) + return NULL; + return rq; +} - if (type != hctx_type && - !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) - return false; - if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) - return false; +static void blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, + struct bio *bio) +{ + WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); /* * If any qos ->throttle() end up blocking, we will have flushed the @@ -2941,7 +2935,6 @@ static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, blk_mq_rq_time_init(rq, 0); rq->cmd_flags = bio->bi_opf; INIT_LIST_HEAD(&rq->queuelist); - return true; } /** @@ -2963,50 +2956,43 @@ void blk_mq_submit_bio(struct bio *bio) struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); struct blk_mq_hw_ctx *hctx; - struct request *rq = NULL; unsigned int nr_segs = 1; + struct request *rq; blk_status_t ret; bio = blk_queue_bounce(bio, q); - if (plug) { - rq = rq_list_peek(&plug->cached_rq); - if (rq && rq->q != q) - rq = NULL; - } - if (rq) { - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - return; - } - if (!bio_integrity_prep(bio)) - return; - if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) - return; - if (blk_mq_use_cached_rq(rq, plug, bio)) - goto done; - percpu_ref_get(&q->q_usage_counter); - } else { + /* + * If the plug has a cached request for this queue, try use it. + * + * The cached request already holds a q_usage_counter reference and we + * don't have to acquire a new one if we use it. + */ + rq = blk_mq_peek_cached_request(plug, q, bio->bi_opf); + if (!rq) { if (unlikely(bio_queue_enter(bio))) return; - if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - goto fail; - } - if (!bio_integrity_prep(bio)) - goto fail; } - rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); - if (unlikely(!rq)) { -fail: - blk_queue_exit(q); - return; + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto queue_exit; + } + if (!bio_integrity_prep(bio)) + goto queue_exit; + + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) + goto queue_exit; + + if (!rq) { + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); + if (unlikely(!rq)) + goto queue_exit; + } else { + blk_mq_use_cached_rq(rq, plug, bio); } -done: trace_block_getrq(bio); rq_qos_track(q, rq, bio); @@ -3037,6 +3023,15 @@ done: } else { blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); } + return; + +queue_exit: + /* + * Don't drop the queue reference if we were trying to use a cached + * request and thus didn't acquire one. + */ + if (!rq) + blk_queue_exit(q); } #ifdef CONFIG_BLK_MQ_STACKING @@ -3098,7 +3093,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq) blk_mq_run_dispatch_ops(q, ret = blk_mq_request_issue_directly(rq, true)); if (ret) - blk_account_io_done(rq, ktime_get_ns()); + blk_account_io_done(rq, blk_time_get_ns()); return ret; } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); @@ -4078,15 +4073,16 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, - void *queuedata) +struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata) { + struct queue_limits default_lim = { }; struct request_queue *q; int ret; - q = blk_alloc_queue(set->numa_node); - if (!q) - return ERR_PTR(-ENOMEM); + q = blk_alloc_queue(lim ? lim : &default_lim, set->numa_node); + if (IS_ERR(q)) + return q; q->queuedata = queuedata; ret = blk_mq_init_allocated_queue(set, q); if (ret) { @@ -4095,20 +4091,15 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, } return q; } - -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) -{ - return blk_mq_init_queue_data(set, NULL); -} -EXPORT_SYMBOL(blk_mq_init_queue); +EXPORT_SYMBOL(blk_mq_alloc_queue); /** * blk_mq_destroy_queue - shutdown a request queue * @q: request queue to shutdown * - * This shuts down a request queue allocated by blk_mq_init_queue(). All future + * This shuts down a request queue allocated by blk_mq_alloc_queue(). All future * requests will be failed with -ENODEV. The caller is responsible for dropping - * the reference from blk_mq_init_queue() by calling blk_put_queue(). + * the reference from blk_mq_alloc_queue() by calling blk_put_queue(). * * Context: can sleep */ @@ -4129,13 +4120,14 @@ void blk_mq_destroy_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_mq_destroy_queue); -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass) { struct request_queue *q; struct gendisk *disk; - q = blk_mq_init_queue_data(set, queuedata); + q = blk_mq_alloc_queue(set, lim, queuedata); if (IS_ERR(q)) return ERR_CAST(q); @@ -4389,7 +4381,7 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) if (set->nr_maps == 1) set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; - if (set->ops->map_queues && !is_kdump_kernel()) { + if (set->ops->map_queues) { int i; /* @@ -4488,14 +4480,12 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) /* * If a crashdump is active, then we are potentially in a very - * memory constrained environment. Limit us to 1 queue and - * 64 tags to prevent using too much memory. + * memory constrained environment. Limit us to 64 tags to prevent + * using too much memory. */ - if (is_kdump_kernel()) { - set->nr_hw_queues = 1; - set->nr_maps = 1; + if (is_kdump_kernel()) set->queue_depth = min(64U, set->queue_depth); - } + /* * There is no use for more h/w queues than cpus if we just have * a single map @@ -4525,7 +4515,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) GFP_KERNEL, set->numa_node); if (!set->map[i].mq_map) goto out_free_mq_map; - set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; + set->map[i].nr_queues = set->nr_hw_queues; } blk_mq_update_queue_map(set); diff --git a/block/blk-settings.c b/block/blk-settings.c index 06ea91e51b8b..3c7d8d638ab5 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -26,52 +26,21 @@ void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); /** - * blk_set_default_limits - reset limits to default values - * @lim: the queue_limits structure to reset - * - * Description: - * Returns a queue_limit struct to its default state. - */ -void blk_set_default_limits(struct queue_limits *lim) -{ - lim->max_segments = BLK_MAX_SEGMENTS; - lim->max_discard_segments = 1; - lim->max_integrity_segments = 0; - lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; - lim->virt_boundary_mask = 0; - lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; - lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; - lim->max_user_sectors = lim->max_dev_sectors = 0; - lim->chunk_sectors = 0; - lim->max_write_zeroes_sectors = 0; - lim->max_zone_append_sectors = 0; - lim->max_discard_sectors = 0; - lim->max_hw_discard_sectors = 0; - lim->max_secure_erase_sectors = 0; - lim->discard_granularity = 512; - lim->discard_alignment = 0; - lim->discard_misaligned = 0; - lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; - lim->bounce = BLK_BOUNCE_NONE; - lim->alignment_offset = 0; - lim->io_opt = 0; - lim->misaligned = 0; - lim->zoned = false; - lim->zone_write_granularity = 0; - lim->dma_alignment = 511; -} - -/** * blk_set_stacking_limits - set default limits for stacking devices * @lim: the queue_limits structure to reset * - * Description: - * Returns a queue_limit struct to its default state. Should be used - * by stacking drivers like DM that have no internal limits. + * Prepare queue limits for applying limits from underlying devices using + * blk_stack_limits(). */ void blk_set_stacking_limits(struct queue_limits *lim) { - blk_set_default_limits(lim); + memset(lim, 0, sizeof(*lim)); + lim->logical_block_size = SECTOR_SIZE; + lim->physical_block_size = SECTOR_SIZE; + lim->io_min = SECTOR_SIZE; + lim->discard_granularity = SECTOR_SIZE; + lim->dma_alignment = SECTOR_SIZE - 1; + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; /* Inherit limits from component devices */ lim->max_segments = USHRT_MAX; @@ -82,9 +51,239 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_dev_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_zone_append_sectors = UINT_MAX; + lim->max_user_discard_sectors = UINT_MAX; } EXPORT_SYMBOL(blk_set_stacking_limits); +static void blk_apply_bdi_limits(struct backing_dev_info *bdi, + struct queue_limits *lim) +{ + /* + * For read-ahead of large files to be effective, we need to read ahead + * at least twice the optimal I/O size. + */ + bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; +} + +static int blk_validate_zoned_limits(struct queue_limits *lim) +{ + if (!lim->zoned) { + if (WARN_ON_ONCE(lim->max_open_zones) || + WARN_ON_ONCE(lim->max_active_zones) || + WARN_ON_ONCE(lim->zone_write_granularity) || + WARN_ON_ONCE(lim->max_zone_append_sectors)) + return -EINVAL; + return 0; + } + + if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED))) + return -EINVAL; + + if (lim->zone_write_granularity < lim->logical_block_size) + lim->zone_write_granularity = lim->logical_block_size; + + if (lim->max_zone_append_sectors) { + /* + * The Zone Append size is limited by the maximum I/O size + * and the zone size given that it can't span zones. + */ + lim->max_zone_append_sectors = + min3(lim->max_hw_sectors, + lim->max_zone_append_sectors, + lim->chunk_sectors); + } + + return 0; +} + +/* + * Check that the limits in lim are valid, initialize defaults for unset + * values, and cap values based on others where needed. + */ +static int blk_validate_limits(struct queue_limits *lim) +{ + unsigned int max_hw_sectors; + + /* + * Unless otherwise specified, default to 512 byte logical blocks and a + * physical block size equal to the logical block size. + */ + if (!lim->logical_block_size) + lim->logical_block_size = SECTOR_SIZE; + if (lim->physical_block_size < lim->logical_block_size) + lim->physical_block_size = lim->logical_block_size; + + /* + * The minimum I/O size defaults to the physical block size unless + * explicitly overridden. + */ + if (lim->io_min < lim->physical_block_size) + lim->io_min = lim->physical_block_size; + + /* + * max_hw_sectors has a somewhat weird default for historical reason, + * but driver really should set their own instead of relying on this + * value. + * + * The block layer relies on the fact that every driver can + * handle at lest a page worth of data per I/O, and needs the value + * aligned to the logical block size. + */ + if (!lim->max_hw_sectors) + lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + if (WARN_ON_ONCE(lim->max_hw_sectors < PAGE_SECTORS)) + return -EINVAL; + lim->max_hw_sectors = round_down(lim->max_hw_sectors, + lim->logical_block_size >> SECTOR_SHIFT); + + /* + * The actual max_sectors value is a complex beast and also takes the + * max_dev_sectors value (set by SCSI ULPs) and a user configurable + * value into account. The ->max_sectors value is always calculated + * from these, so directly setting it won't have any effect. + */ + max_hw_sectors = min_not_zero(lim->max_hw_sectors, + lim->max_dev_sectors); + if (lim->max_user_sectors) { + if (lim->max_user_sectors > max_hw_sectors || + lim->max_user_sectors < PAGE_SIZE / SECTOR_SIZE) + return -EINVAL; + lim->max_sectors = min(max_hw_sectors, lim->max_user_sectors); + } else { + lim->max_sectors = min(max_hw_sectors, BLK_DEF_MAX_SECTORS_CAP); + } + lim->max_sectors = round_down(lim->max_sectors, + lim->logical_block_size >> SECTOR_SHIFT); + + /* + * Random default for the maximum number of segments. Driver should not + * rely on this and set their own. + */ + if (!lim->max_segments) + lim->max_segments = BLK_MAX_SEGMENTS; + + lim->max_discard_sectors = + min(lim->max_hw_discard_sectors, lim->max_user_discard_sectors); + + if (!lim->max_discard_segments) + lim->max_discard_segments = 1; + + if (lim->discard_granularity < lim->physical_block_size) + lim->discard_granularity = lim->physical_block_size; + + /* + * By default there is no limit on the segment boundary alignment, + * but if there is one it can't be smaller than the page size as + * that would break all the normal I/O patterns. + */ + if (!lim->seg_boundary_mask) + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + if (WARN_ON_ONCE(lim->seg_boundary_mask < PAGE_SIZE - 1)) + return -EINVAL; + + /* + * Devices that require a virtual boundary do not support scatter/gather + * I/O natively, but instead require a descriptor list entry for each + * page (which might not be identical to the Linux PAGE_SIZE). Because + * of that they are not limited by our notion of "segment size". + */ + if (lim->virt_boundary_mask) { + if (WARN_ON_ONCE(lim->max_segment_size && + lim->max_segment_size != UINT_MAX)) + return -EINVAL; + lim->max_segment_size = UINT_MAX; + } else { + /* + * The maximum segment size has an odd historic 64k default that + * drivers probably should override. Just like the I/O size we + * require drivers to at least handle a full page per segment. + */ + if (!lim->max_segment_size) + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; + if (WARN_ON_ONCE(lim->max_segment_size < PAGE_SIZE)) + return -EINVAL; + } + + /* + * We require drivers to at least do logical block aligned I/O, but + * historically could not check for that due to the separate calls + * to set the limits. Once the transition is finished the check + * below should be narrowed down to check the logical block size. + */ + if (!lim->dma_alignment) + lim->dma_alignment = SECTOR_SIZE - 1; + if (WARN_ON_ONCE(lim->dma_alignment > PAGE_SIZE)) + return -EINVAL; + + if (lim->alignment_offset) { + lim->alignment_offset &= (lim->physical_block_size - 1); + lim->misaligned = 0; + } + + return blk_validate_zoned_limits(lim); +} + +/* + * Set the default limits for a newly allocated queue. @lim contains the + * initial limits set by the driver, which could be no limit in which case + * all fields are cleared to zero. + */ +int blk_set_default_limits(struct queue_limits *lim) +{ + /* + * Most defaults are set by capping the bounds in blk_validate_limits, + * but max_user_discard_sectors is special and needs an explicit + * initialization to the max value here. + */ + lim->max_user_discard_sectors = UINT_MAX; + return blk_validate_limits(lim); +} + +/** + * queue_limits_commit_update - commit an atomic update of queue limits + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were obtained from queue_limits_start_update() + * and updated by the caller to @q. + * + * Returns 0 if successful, else a negative error code. + */ +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim) + __releases(q->limits_lock) +{ + int error = blk_validate_limits(lim); + + if (!error) { + q->limits = *lim; + if (q->disk) + blk_apply_bdi_limits(q->disk->bdi, lim); + } + mutex_unlock(&q->limits_lock); + return error; +} +EXPORT_SYMBOL_GPL(queue_limits_commit_update); + +/** + * queue_limits_set - apply queue limits to queue + * @q: queue to update + * @lim: limits to apply + * + * Apply the limits in @lim that were freshly initialized to @q. + * To update existing limits use queue_limits_start_update() and + * queue_limits_commit_update() instead. + * + * Returns 0 if successful, else a negative error code. + */ +int queue_limits_set(struct request_queue *q, struct queue_limits *lim) +{ + mutex_lock(&q->limits_lock); + return queue_limits_commit_update(q, lim); +} +EXPORT_SYMBOL_GPL(queue_limits_set); + /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device @@ -177,8 +376,11 @@ EXPORT_SYMBOL(blk_queue_chunk_sectors); void blk_queue_max_discard_sectors(struct request_queue *q, unsigned int max_discard_sectors) { - q->limits.max_hw_discard_sectors = max_discard_sectors; - q->limits.max_discard_sectors = max_discard_sectors; + struct queue_limits *lim = &q->limits; + + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_discard_sectors = + min(max_discard_sectors, lim->max_user_discard_sectors); } EXPORT_SYMBOL(blk_queue_max_discard_sectors); @@ -393,15 +595,7 @@ EXPORT_SYMBOL(blk_queue_alignment_offset); void disk_update_readahead(struct gendisk *disk) { - struct request_queue *q = disk->queue; - - /* - * For read-ahead of large files to be effective, we need to read ahead - * at least twice the optimal I/O size. - */ - disk->bdi->ra_pages = - max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); - disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); + blk_apply_bdi_limits(disk->bdi, &disk->queue->limits); } EXPORT_SYMBOL_GPL(disk_update_readahead); @@ -689,33 +883,38 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->zone_write_granularity = max(t->zone_write_granularity, b->zone_write_granularity); t->zoned = max(t->zoned, b->zoned); + if (!t->zoned) { + t->zone_write_granularity = 0; + t->max_zone_append_sectors = 0; + } return ret; } EXPORT_SYMBOL(blk_stack_limits); /** - * disk_stack_limits - adjust queue limits for stacked drivers - * @disk: MD/DM gendisk (top) + * queue_limits_stack_bdev - adjust queue_limits for stacked devices + * @t: the stacking driver limits (top device) * @bdev: the underlying block device (bottom) * @offset: offset to beginning of data within component device + * @pfx: prefix to use for warnings logged * * Description: - * Merges the limits for a top level gendisk and a bottom level - * block_device. + * This function is used by stacking drivers like MD and DM to ensure + * that all component devices have compatible block sizes and + * alignments. The stacking driver must provide a queue_limits + * struct (top) and then iteratively call the stacking function for + * all component (bottom) devices. The stacking function will + * attempt to combine the values and ensure proper alignment. */ -void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset) +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx) { - struct request_queue *t = disk->queue; - - if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, - get_start_sect(bdev) + (offset >> 9)) < 0) + if (blk_stack_limits(t, &bdev_get_queue(bdev)->limits, + get_start_sect(bdev) + offset)) pr_notice("%s: Warning: Device %pg is misaligned\n", - disk->disk_name, bdev); - - disk_update_readahead(disk); + pfx, bdev); } -EXPORT_SYMBOL(disk_stack_limits); +EXPORT_SYMBOL_GPL(queue_limits_stack_bdev); /** * blk_queue_update_dma_pad - update pad mask diff --git a/block/blk-stat.c b/block/blk-stat.c index 7ff76ae6c76a..e42c263e53fb 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -27,7 +27,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat) /* src is a per-cpu stat, mean isn't initialized */ void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) { - if (!src->nr_samples) + if (dst->nr_samples + src->nr_samples <= dst->nr_samples) return; dst->min = min(dst->min, src->min); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 6b2429cad81a..8c8f69d8ba48 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -174,23 +174,29 @@ static ssize_t queue_discard_max_show(struct request_queue *q, char *page) static ssize_t queue_discard_max_store(struct request_queue *q, const char *page, size_t count) { - unsigned long max_discard; - ssize_t ret = queue_var_store(&max_discard, page, count); + unsigned long max_discard_bytes; + struct queue_limits lim; + ssize_t ret; + int err; + ret = queue_var_store(&max_discard_bytes, page, count); if (ret < 0) return ret; - if (max_discard & (q->limits.discard_granularity - 1)) + if (max_discard_bytes & (q->limits.discard_granularity - 1)) return -EINVAL; - max_discard >>= 9; - if (max_discard > UINT_MAX) + if ((max_discard_bytes >> SECTOR_SHIFT) > UINT_MAX) return -EINVAL; - if (max_discard > q->limits.max_hw_discard_sectors) - max_discard = q->limits.max_hw_discard_sectors; + blk_mq_freeze_queue(q); + lim = queue_limits_start_update(q); + lim.max_user_discard_sectors = max_discard_bytes >> SECTOR_SHIFT; + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); - q->limits.max_discard_sectors = max_discard; + if (err) + return err; return ret; } @@ -226,35 +232,22 @@ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { - unsigned long var; - unsigned int max_sectors_kb, - max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, - page_kb = 1 << (PAGE_SHIFT - 10); - ssize_t ret = queue_var_store(&var, page, count); + unsigned long max_sectors_kb; + struct queue_limits lim; + ssize_t ret; + int err; + ret = queue_var_store(&max_sectors_kb, page, count); if (ret < 0) return ret; - max_sectors_kb = (unsigned int)var; - max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, - q->limits.max_dev_sectors >> 1); - if (max_sectors_kb == 0) { - q->limits.max_user_sectors = 0; - max_sectors_kb = min(max_hw_sectors_kb, - BLK_DEF_MAX_SECTORS_CAP >> 1); - } else { - if (max_sectors_kb > max_hw_sectors_kb || - max_sectors_kb < page_kb) - return -EINVAL; - q->limits.max_user_sectors = max_sectors_kb << 1; - } - - spin_lock_irq(&q->queue_lock); - q->limits.max_sectors = max_sectors_kb << 1; - if (q->disk) - q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); - spin_unlock_irq(&q->queue_lock); - + blk_mq_freeze_queue(q); + lim = queue_limits_start_update(q); + lim.max_user_sectors = max_sectors_kb << 1; + err = queue_limits_commit_update(q, &lim); + blk_mq_unfreeze_queue(q); + if (err) + return err; return ret; } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 16f5766620a4..f4850a6f860b 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1098,7 +1098,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) while ((bio = throtl_peek_queued(&sq->queued[READ])) && tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(tg, bio_data_dir(bio)); + tg_dispatch_one_bio(tg, READ); nr_reads++; if (nr_reads >= max_nr_reads) @@ -1108,7 +1108,7 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && tg_may_dispatch(tg, bio, NULL)) { - tg_dispatch_one_bio(tg, bio_data_dir(bio)); + tg_dispatch_one_bio(tg, WRITE); nr_writes++; if (nr_writes >= max_nr_writes) @@ -1815,7 +1815,7 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); ret = tg->latency_target == DFL_LATENCY_TARGET || tg->idletime_threshold == DFL_IDLE_THRESHOLD || - (ktime_get_ns() >> 10) - tg->last_finish_time > time || + (blk_time_get_ns() >> 10) - tg->last_finish_time > time || tg->avg_idletime > tg->idletime_threshold || (tg->latency_target && tg->bio_cnt && tg->bad_bio_cnt * 5 < tg->bio_cnt); @@ -2060,7 +2060,7 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) if (last_finish_time == 0) return; - now = ktime_get_ns() >> 10; + now = blk_time_get_ns() >> 10; if (now <= last_finish_time || last_finish_time == tg->checked_last_finish_time) return; @@ -2327,7 +2327,7 @@ void blk_throtl_bio_endio(struct bio *bio) if (!tg->td->limit_valid[LIMIT_LOW]) return; - finish_time_ns = ktime_get_ns(); + finish_time_ns = blk_time_get_ns(); tg->last_finish_time = finish_time_ns >> 10; start_time = bio_issue_time(&bio->bi_issue) >> 10; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0c0e270a8265..64472134dd26 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -29,6 +29,7 @@ #include "blk-wbt.h" #include "blk-rq-qos.h" #include "elevator.h" +#include "blk.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> @@ -274,13 +275,12 @@ static inline bool stat_sample_valid(struct blk_rq_stat *stat) static u64 rwb_sync_issue_lat(struct rq_wb *rwb) { - u64 now, issue = READ_ONCE(rwb->sync_issue); + u64 issue = READ_ONCE(rwb->sync_issue); if (!issue || !rwb->sync_cookie) return 0; - now = ktime_to_ns(ktime_get()); - return now - issue; + return blk_time_get_ns() - issue; } static inline unsigned int wbt_inflight(struct rq_wb *rwb) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index d343e5756a9c..da0f4b2a8fa0 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -11,7 +11,6 @@ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/rbtree.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> #include <linux/mm.h> @@ -177,8 +176,7 @@ static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, } } -static int blkdev_zone_reset_all_emulated(struct block_device *bdev, - gfp_t gfp_mask) +static int blkdev_zone_reset_all_emulated(struct block_device *bdev) { struct gendisk *disk = bdev->bd_disk; sector_t capacity = bdev_nr_sectors(bdev); @@ -205,7 +203,7 @@ static int blkdev_zone_reset_all_emulated(struct block_device *bdev, } bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, - gfp_mask); + GFP_KERNEL); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -223,7 +221,7 @@ out_free_need_reset: return ret; } -static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) +static int blkdev_zone_reset_all(struct block_device *bdev) { struct bio bio; @@ -238,7 +236,6 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) * @sector: Start sector of the first zone to operate on * @nr_sectors: Number of sectors, should be at least the length of one zone and * must be zone size aligned. - * @gfp_mask: Memory allocation flags (for bio_alloc) * * Description: * Perform the specified operation on the range of zones specified by @@ -248,7 +245,7 @@ static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) * or finish request. */ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sector, sector_t nr_sectors, gfp_t gfp_mask) + sector_t sector, sector_t nr_sectors) { struct request_queue *q = bdev_get_queue(bdev); sector_t zone_sectors = bdev_zone_sectors(bdev); @@ -285,12 +282,12 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, */ if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { if (!blk_queue_zone_resetall(q)) - return blkdev_zone_reset_all_emulated(bdev, gfp_mask); - return blkdev_zone_reset_all(bdev, gfp_mask); + return blkdev_zone_reset_all_emulated(bdev); + return blkdev_zone_reset_all(bdev); } while (sector < end_sector) { - bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, GFP_KERNEL); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -419,8 +416,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, return -ENOTTY; } - ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); fail: if (cmd == BLKRESETZONE) diff --git a/block/blk.h b/block/blk.h index f02b25f22e8b..a19b7b42e650 100644 --- a/block/blk.h +++ b/block/blk.h @@ -4,6 +4,8 @@ #include <linux/blk-crypto.h> #include <linux/memblock.h> /* for max_pfn/max_low_pfn */ +#include <linux/sched/sysctl.h> +#include <linux/timekeeping.h> #include <xen/xen.h> #include "blk-crypto-internal.h" @@ -70,6 +72,18 @@ static inline int bio_queue_enter(struct bio *bio) return __bio_queue_enter(q, bio); } +static inline void blk_wait_io(struct completion *done) +{ + /* Prevent hang_check timer from firing at us during very long I/O */ + unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; + + if (timeout) + while (!wait_for_completion_io_timeout(done, timeout)) + ; + else + wait_for_completion_io(done); +} + #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); @@ -329,7 +343,7 @@ void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); -void blk_set_default_limits(struct queue_limits *lim); +int blk_set_default_limits(struct queue_limits *lim); int blk_dev_init(void); /* @@ -447,7 +461,7 @@ static inline void bio_release_page(struct bio *bio, struct page *page) unpin_user_page(page); } -struct request_queue *blk_alloc_queue(int node_id); +struct request_queue *blk_alloc_queue(struct queue_limits *lim, int node_id); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); @@ -516,8 +530,75 @@ static inline int req_ref_read(struct request *req) return atomic_read(&req->ref); } +static inline u64 blk_time_get_ns(void) +{ + struct blk_plug *plug = current->plug; + + if (!plug) + return ktime_get_ns(); + + /* + * 0 could very well be a valid time, but rather than flag "this is + * a valid timestamp" separately, just accept that we'll do an extra + * ktime_get_ns() if we just happen to get 0 as the current time. + */ + if (!plug->cur_ktime) { + plug->cur_ktime = ktime_get_ns(); + current->flags |= PF_BLOCK_TS; + } + return plug->cur_ktime; +} + +static inline ktime_t blk_time_get(void) +{ + return ns_to_ktime(blk_time_get_ns()); +} + +/* + * From most significant bit: + * 1 bit: reserved for other usage, see below + * 12 bits: original size of bio + * 51 bits: issue time of bio + */ +#define BIO_ISSUE_RES_BITS 1 +#define BIO_ISSUE_SIZE_BITS 12 +#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) +#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) +#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) +#define BIO_ISSUE_SIZE_MASK \ + (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) +#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) + +/* Reserved bit for blk-throtl */ +#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) + +static inline u64 __bio_issue_time(u64 time) +{ + return time & BIO_ISSUE_TIME_MASK; +} + +static inline u64 bio_issue_time(struct bio_issue *issue) +{ + return __bio_issue_time(issue->value); +} + +static inline sector_t bio_issue_size(struct bio_issue *issue) +{ + return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); +} + +static inline void bio_issue_init(struct bio_issue *issue, + sector_t size) +{ + size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; + issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | + (blk_time_get_ns() & BIO_ISSUE_TIME_MASK) | + ((u64)size << BIO_ISSUE_SIZE_SHIFT)); +} + void bdev_release(struct file *bdev_file); int bdev_open(struct block_device *bdev, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops, struct file *bdev_file); int bdev_permission(dev_t dev, blk_mode_t mode, void *holder); + #endif /* BLK_INTERNAL_H */ diff --git a/block/bsg-lib.c b/block/bsg-lib.c index b3acdbdb6e7e..bcc7dee6abce 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -383,7 +383,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, if (blk_mq_alloc_tag_set(set)) goto out_tag_set; - q = blk_mq_init_queue(set); + q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(q)) { ret = PTR_ERR(q); goto out_queue; diff --git a/block/genhd.c b/block/genhd.c index a911d2969c07..bb29a68e1d67 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1201,7 +1201,7 @@ static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } -struct class block_class = { +const struct class block_class = { .name = "block", .dev_uevent = block_uevent, }; @@ -1391,19 +1391,21 @@ out_free_disk: return NULL; } -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass) { + struct queue_limits default_lim = { }; struct request_queue *q; struct gendisk *disk; - q = blk_alloc_queue(node); - if (!q) - return NULL; + q = blk_alloc_queue(lim ? lim : &default_lim, node); + if (IS_ERR(q)) + return ERR_CAST(q); disk = __alloc_disk_node(q, node, lkclass); if (!disk) { blk_put_queue(q); - return NULL; + return ERR_PTR(-ENOMEM); } set_bit(GD_OWNS_QUEUE, &disk->state); return disk; diff --git a/block/holder.c b/block/holder.c index 37d18c13d958..791091a7eac2 100644 --- a/block/holder.c +++ b/block/holder.c @@ -8,6 +8,8 @@ struct bd_holder_disk { int refcnt; }; +static DEFINE_MUTEX(blk_holder_mutex); + static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, struct gendisk *disk) { @@ -80,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) kobject_get(bdev->bd_holder_dir); mutex_unlock(&bdev->bd_disk->open_mutex); - mutex_lock(&disk->open_mutex); + mutex_lock(&blk_holder_mutex); WARN_ON_ONCE(!bdev->bd_holder); holder = bd_find_holder_disk(bdev, disk); @@ -108,7 +110,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) goto out_del_symlink; list_add(&holder->list, &disk->slave_bdevs); - mutex_unlock(&disk->open_mutex); + mutex_unlock(&blk_holder_mutex); return 0; out_del_symlink: @@ -116,7 +118,7 @@ out_del_symlink: out_free_holder: kfree(holder); out_unlock: - mutex_unlock(&disk->open_mutex); + mutex_unlock(&blk_holder_mutex); if (ret) kobject_put(bdev->bd_holder_dir); return ret; @@ -140,7 +142,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) if (WARN_ON_ONCE(!disk->slave_dir)) return; - mutex_lock(&disk->open_mutex); + mutex_lock(&blk_holder_mutex); holder = bd_find_holder_disk(bdev, disk); if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { del_symlink(disk->slave_dir, bdev_kobj(bdev)); @@ -149,6 +151,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) list_del_init(&holder->list); kfree(holder); } - mutex_unlock(&disk->open_mutex); + mutex_unlock(&blk_holder_mutex); } EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); diff --git a/block/ioctl.c b/block/ioctl.c index 4c8aebee595f..0c76137adcaa 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -18,7 +18,7 @@ static int blkpg_do_ioctl(struct block_device *bdev, { struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; - sector_t start, length; + sector_t start, length, capacity, end; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -41,6 +41,13 @@ static int blkpg_do_ioctl(struct block_device *bdev, start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; + capacity = get_capacity(disk); + + if (check_add_overflow(start, length, &end)) + return -EINVAL; + + if (start >= capacity || end > capacity) + return -EINVAL; switch (op) { case BLKPG_ADD_PARTITION: diff --git a/block/partitions/core.c b/block/partitions/core.c index 5f5ed5c75f04..b11e88c82c8c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -419,21 +419,10 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { - sector_t capacity = get_capacity(disk), end; struct block_device *part; int ret; mutex_lock(&disk->open_mutex); - if (check_add_overflow(start, length, &end)) { - ret = -EINVAL; - goto out; - } - - if (start >= capacity || end > capacity) { - ret = -EINVAL; - goto out; - } - if (!disk_live(disk)) { ret = -ENXIO; goto out; diff --git a/block/partitions/mac.c b/block/partitions/mac.c index 7b521df00a39..c80183156d68 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -20,6 +20,7 @@ extern void note_bootable_part(dev_t dev, int part, int goodness); * Code to understand MacOS partition tables. */ +#ifdef CONFIG_PPC_PMAC static inline void mac_fix_string(char *stg, int len) { int i; @@ -27,6 +28,7 @@ static inline void mac_fix_string(char *stg, int len) for (i = len - 1; i >= 0 && stg[i] == ' '; i--) stg[i] = 0; } +#endif int mac_partition(struct parsed_partitions *state) { diff --git a/block/sed-opal.c b/block/sed-opal.c index fa4dba5d8531..14fe0fef811c 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1212,7 +1212,7 @@ static int cmd_start(struct opal_dev *dev, const u8 *uid, const u8 *method) static int start_opal_session_cont(struct opal_dev *dev) { u32 hsn, tsn; - int error = 0; + int error; error = parse_and_check_status(dev); if (error) @@ -1354,7 +1354,7 @@ static int get_active_key_cont(struct opal_dev *dev) { const char *activekey; size_t keylen; - int error = 0; + int error; error = parse_and_check_status(dev); if (error) @@ -2157,7 +2157,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data) u8 lr_buffer[OPAL_UID_LENGTH]; struct opal_lock_unlock *lkul = data; u8 read_locked = 1, write_locked = 1; - int err = 0; + int err; if (build_locking_range(lr_buffer, sizeof(lr_buffer), lkul->session.opal_key.lr) < 0) @@ -2580,7 +2580,7 @@ static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) const struct opal_step discovery0_step = { opal_discovery0, discv }; - int ret = 0; + int ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); @@ -3069,7 +3069,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) { struct opal_suspend_data *suspend; bool was_failure = false; - int ret = 0; + int ret; if (!dev) return false; @@ -3112,10 +3112,9 @@ static int opal_read_table(struct opal_dev *dev, { read_table_data, rw_tbl }, { end_opal_session, } }; - int ret = 0; if (!rw_tbl->size) - return ret; + return 0; return execute_steps(dev, read_table_steps, ARRAY_SIZE(read_table_steps)); @@ -3129,10 +3128,9 @@ static int opal_write_table(struct opal_dev *dev, { write_table_data, rw_tbl }, { end_opal_session, } }; - int ret = 0; if (!rw_tbl->size) - return ret; + return 0; return execute_steps(dev, write_table_steps, ARRAY_SIZE(write_table_steps)); diff --git a/block/t10-pi.c b/block/t10-pi.c index 914d8cddd43a..d90892fd6f2a 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -12,14 +12,14 @@ #include <net/checksum.h> #include <asm/unaligned.h> -typedef __be16 (csum_fn) (void *, unsigned int); +typedef __be16 (csum_fn) (__be16, void *, unsigned int); -static __be16 t10_pi_crc_fn(void *data, unsigned int len) +static __be16 t10_pi_crc_fn(__be16 crc, void *data, unsigned int len) { - return cpu_to_be16(crc_t10dif(data, len)); + return cpu_to_be16(crc_t10dif_update(be16_to_cpu(crc), data, len)); } -static __be16 t10_pi_ip_fn(void *data, unsigned int len) +static __be16 t10_pi_ip_fn(__be16 csum, void *data, unsigned int len) { return (__force __be16)ip_compute_csum(data, len); } @@ -32,12 +32,16 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len) static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, csum_fn *fn, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf; + struct t10_pi_tuple *pi = iter->prot_buf + offset; - pi->guard_tag = fn(iter->data_buf, iter->interval); + pi->guard_tag = fn(0, iter->data_buf, iter->interval); + if (offset) + pi->guard_tag = fn(pi->guard_tag, iter->prot_buf, + offset); pi->app_tag = 0; if (type == T10_PI_TYPE1_PROTECTION) @@ -56,12 +60,13 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, csum_fn *fn, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; BUG_ON(type == T10_PI_TYPE0_PROTECTION); for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct t10_pi_tuple *pi = iter->prot_buf; + struct t10_pi_tuple *pi = iter->prot_buf + offset; __be16 csum; if (type == T10_PI_TYPE1_PROTECTION || @@ -83,7 +88,9 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, goto next; } - csum = fn(iter->data_buf, iter->interval); + csum = fn(0, iter->data_buf, iter->interval); + if (offset) + csum = fn(csum, iter->prot_buf, offset); if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ @@ -134,8 +141,10 @@ static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) */ static void t10_pi_type1_prepare(struct request *rq) { - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -154,7 +163,7 @@ static void t10_pi_type1_prepare(struct request *rq) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct t10_pi_tuple *pi = p; + struct t10_pi_tuple *pi = p + offset; if (be32_to_cpu(pi->ref_tag) == virt) pi->ref_tag = cpu_to_be32(ref_tag); @@ -183,9 +192,11 @@ static void t10_pi_type1_prepare(struct request *rq) */ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + unsigned intervals = nr_bytes >> bi->interval_exp; + const int tuple_sz = bi->tuple_size; u32 ref_tag = t10_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -200,7 +211,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct t10_pi_tuple *pi = p; + struct t10_pi_tuple *pi = p + offset; if (be32_to_cpu(pi->ref_tag) == ref_tag) pi->ref_tag = cpu_to_be32(virt); @@ -280,20 +291,24 @@ const struct blk_integrity_profile t10_pi_type3_ip = { }; EXPORT_SYMBOL(t10_pi_type3_ip); -static __be64 ext_pi_crc64(void *data, unsigned int len) +static __be64 ext_pi_crc64(u64 crc, void *data, unsigned int len) { - return cpu_to_be64(crc64_rocksoft(data, len)); + return cpu_to_be64(crc64_rocksoft_update(crc, data, len)); } static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; for (i = 0 ; i < iter->data_size ; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf; + struct crc64_pi_tuple *pi = iter->prot_buf + offset; - pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval); + pi->guard_tag = ext_pi_crc64(0, iter->data_buf, iter->interval); + if (offset) + pi->guard_tag = ext_pi_crc64(be64_to_cpu(pi->guard_tag), + iter->prot_buf, offset); pi->app_tag = 0; if (type == T10_PI_TYPE1_PROTECTION) @@ -319,10 +334,11 @@ static bool ext_pi_ref_escape(u8 *ref_tag) static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, enum t10_dif_type type) { + u8 offset = iter->pi_offset; unsigned int i; for (i = 0; i < iter->data_size; i += iter->interval) { - struct crc64_pi_tuple *pi = iter->prot_buf; + struct crc64_pi_tuple *pi = iter->prot_buf + offset; u64 ref, seed; __be64 csum; @@ -343,7 +359,11 @@ static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, goto next; } - csum = ext_pi_crc64(iter->data_buf, iter->interval); + csum = ext_pi_crc64(0, iter->data_buf, iter->interval); + if (offset) + csum = ext_pi_crc64(be64_to_cpu(csum), iter->prot_buf, + offset); + if (pi->guard_tag != csum) { pr_err("%s: guard tag error at sector %llu " \ "(rcvd %016llx, want %016llx)\n", @@ -373,8 +393,10 @@ static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter) static void ext_pi_type1_prepare(struct request *rq) { - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -393,7 +415,7 @@ static void ext_pi_type1_prepare(struct request *rq) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { - struct crc64_pi_tuple *pi = p; + struct crc64_pi_tuple *pi = p + offset; u64 ref = get_unaligned_be48(pi->ref_tag); if (ref == virt) @@ -411,9 +433,11 @@ static void ext_pi_type1_prepare(struct request *rq) static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) { - unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; - const int tuple_sz = rq->q->integrity.tuple_size; + struct blk_integrity *bi = &rq->q->integrity; + unsigned intervals = nr_bytes >> bi->interval_exp; + const int tuple_sz = bi->tuple_size; u64 ref_tag = ext_pi_ref_tag(rq); + u8 offset = bi->pi_offset; struct bio *bio; __rq_for_each_bio(bio, rq) { @@ -428,7 +452,7 @@ static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { - struct crc64_pi_tuple *pi = p; + struct crc64_pi_tuple *pi = p + offset; u64 ref = get_unaligned_be48(pi->ref_tag); if (ref == ref_tag) diff --git a/drivers/base/base.h b/drivers/base/base.h index eb4c0ace9242..0738ccad08b2 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -207,7 +207,7 @@ static inline int devtmpfs_init(void) { return 0; } #endif #ifdef CONFIG_BLOCK -extern struct class block_class; +extern const struct class block_class; static inline bool is_blockdev(struct device *dev) { return dev->class == &block_class; diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 2b98114a9fe0..a25414228e47 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1779,7 +1779,7 @@ static int fd_alloc_disk(int drive, int system) struct gendisk *disk; int err; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index b1b47d88f5db..b6dac8cee70f 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -24,8 +24,8 @@ static DEFINE_MUTEX(aoeblk_mutex); static struct kmem_cache *buf_pool_cache; static struct dentry *aoe_debugfs_dir; -/* GPFS needs a larger value than the default. */ -static int aoe_maxsectors; +/* random default picked from the historic block max_sectors cap */ +static int aoe_maxsectors = 2560; module_param(aoe_maxsectors, int, 0644); MODULE_PARM_DESC(aoe_maxsectors, "When nonzero, set the maximum number of sectors per I/O request"); @@ -334,6 +334,10 @@ aoeblk_gdalloc(void *vp) mempool_t *mp; struct blk_mq_tag_set *set; sector_t ssize; + struct queue_limits lim = { + .max_hw_sectors = aoe_maxsectors, + .io_opt = SZ_2M, + }; ulong flags; int late = 0; int err; @@ -371,7 +375,7 @@ aoeblk_gdalloc(void *vp) goto err_mempool; } - gd = blk_mq_alloc_disk(set, d); + gd = blk_mq_alloc_disk(set, &lim, d); if (IS_ERR(gd)) { pr_err("aoe: cannot allocate block queue for %ld.%d\n", d->aoemajor, d->aoeminor); @@ -384,14 +388,9 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); - /* random number picked from the history block max_sectors cap */ - blk_queue_max_hw_sectors(gd->queue, 2560u); - blk_queue_io_opt(gd->queue, SZ_2M); d->bufpool = mp; d->blkq = gd->queue; d->gd = gd; - if (aoe_maxsectors) - blk_queue_max_hw_sectors(gd->queue, aoe_maxsectors); gd->major = AOE_MAJOR; gd->first_minor = d->sysminor; gd->minors = AOE_PARTITIONS; diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index d7317425be51..cc9077b588d7 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -419,13 +419,16 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu rcu_read_lock(); for_each_netdev_rcu(&init_net, ifp) { dev_hold(ifp); - if (!is_aoe_netif(ifp)) - goto cont; + if (!is_aoe_netif(ifp)) { + dev_put(ifp); + continue; + } skb = new_skb(sizeof *h + sizeof *ch); if (skb == NULL) { printk(KERN_INFO "aoe: skb alloc failure\n"); - goto cont; + dev_put(ifp); + continue; } skb_put(skb, sizeof *h + sizeof *ch); skb->dev = ifp; @@ -440,9 +443,6 @@ aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *qu h->major = cpu_to_be16(aoemajor); h->minor = aoeminor; h->cmd = AOECMD_CFG; - -cont: - dev_put(ifp); } rcu_read_unlock(); } diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c index c51ea95bc2ce..923a134fd766 100644 --- a/drivers/block/aoe/aoenet.c +++ b/drivers/block/aoe/aoenet.c @@ -63,6 +63,7 @@ tx(int id) __must_hold(&txlock) pr_warn("aoe: packet could not be sent on %s. %s\n", ifp ? ifp->name : "netif", "consider increasing tx_queue_len"); + dev_put(ifp); spin_lock_irq(&txlock); } return 0; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 50949207798d..cacc4ba942a8 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1994,7 +1994,7 @@ static int ataflop_alloc_disk(unsigned int drive, unsigned int type) { struct gendisk *disk; - disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL); + disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 970bd6ff38c4..e322cef6596b 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -318,6 +318,16 @@ static int brd_alloc(int i) struct gendisk *disk; char buf[DISK_NAME_LEN]; int err = -ENOMEM; + struct queue_limits lim = { + /* + * This is so fdisk will align partitions on 4k, because of + * direct_access API needing 4k alignment, returning a PFN + * (This is only a problem on very small devices <= 4M, + * otherwise fdisk will align on 1M. Regardless this call + * is harmless) + */ + .physical_block_size = PAGE_SIZE, + }; list_for_each_entry(brd, &brd_devices, brd_list) if (brd->brd_number == i) @@ -335,10 +345,11 @@ static int brd_alloc(int i) debugfs_create_u64(buf, 0444, brd_debugfs_dir, &brd->brd_nr_pages); - disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = brd->brd_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_free_dev; - + } disk->major = RAMDISK_MAJOR; disk->first_minor = i * max_part; disk->minors = max_part; @@ -347,15 +358,6 @@ static int brd_alloc(int i) strscpy(disk->disk_name, buf, DISK_NAME_LEN); set_capacity(disk, rd_size * 2); - /* - * This is so fdisk will align partitions on 4k, because of - * direct_access API needing 4k alignment, returning a PFN - * (This is only a problem on very small devices <= 4M, - * otherwise fdisk will align on 1M. Regardless this call - * is harmless) - */ - blk_queue_physical_block_size(disk->queue, PAGE_SIZE); - /* Tell the block layer that this is not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, disk->queue); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 6bc86106c7b2..113b441d4d36 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2690,6 +2690,14 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig int id; int vnr = adm_ctx->volume; enum drbd_ret_code err = ERR_NOMEM; + struct queue_limits lim = { + /* + * Setting the max_hw_sectors to an odd value of 8kibyte here. + * This triggers a max_bio_size message upon first attach or + * connect. + */ + .max_hw_sectors = DRBD_MAX_BIO_SIZE_SAFE >> 8, + }; device = minor_to_device(minor); if (device) @@ -2708,9 +2716,11 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig drbd_init_set_defaults(device); - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out_no_disk; + } device->vdisk = disk; device->rq_queue = disk->queue; @@ -2727,9 +2737,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); blk_queue_write_cache(disk->queue, true, true); - /* Setting the max_hw_sectors to an odd value of 8kibyte here - This triggers a max_bio_size message upon first attach or connect */ - blk_queue_max_hw_sectors(disk->queue, DRBD_MAX_BIO_SIZE_SAFE >> 8); device->md_io.page = alloc_page(GFP_KERNEL); if (!device->md_io.page) diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6aed67278e8b..5d65c9754d83 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1189,9 +1189,31 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) return 0; } -static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity) +static unsigned int drbd_max_peer_bio_size(struct drbd_device *device) { - q->limits.discard_granularity = granularity; + /* + * We may ignore peer limits if the peer is modern enough. From 8.3.8 + * onwards the peer can use multiple BIOs for a single peer_request. + */ + if (device->state.conn < C_WF_REPORT_PARAMS) + return device->peer_max_bio_size; + + if (first_peer_device(device)->connection->agreed_pro_version < 94) + return min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + + /* + * Correct old drbd (up to 8.3.7) if it believes it can do more than + * 32KiB. + */ + if (first_peer_device(device)->connection->agreed_pro_version == 94) + return DRBD_MAX_SIZE_H80_PACKET; + + /* + * drbd 8.3.8 onwards, before 8.4.0 + */ + if (first_peer_device(device)->connection->agreed_pro_version < 100) + return DRBD_MAX_BIO_SIZE_P95; + return DRBD_MAX_BIO_SIZE; } static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) @@ -1204,149 +1226,119 @@ static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) return AL_EXTENT_SIZE >> 9; } -static void decide_on_discard_support(struct drbd_device *device, +static bool drbd_discard_supported(struct drbd_connection *connection, struct drbd_backing_dev *bdev) { - struct drbd_connection *connection = - first_peer_device(device)->connection; - struct request_queue *q = device->rq_queue; - unsigned int max_discard_sectors; - if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev)) - goto not_supported; + return false; if (connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) { drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); - goto not_supported; + return false; } - /* - * We don't care for the granularity, really. - * - * Stacking limits below should fix it for the local device. Whether or - * not it is a suitable granularity on the remote device is not our - * problem, really. If you care, you need to use devices with similar - * topology on all peers. - */ - blk_queue_discard_granularity(q, 512); - max_discard_sectors = drbd_max_discard_sectors(connection); - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - return; - -not_supported: - blk_queue_discard_granularity(q, 0); - blk_queue_max_discard_sectors(q, 0); + return true; } -static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q) +/* This is the workaround for "bio would need to, but cannot, be split" */ +static unsigned int drbd_backing_dev_max_segments(struct drbd_device *device) { - /* Fixup max_write_zeroes_sectors after blk_stack_limits(): - * if we can handle "zeroes" efficiently on the protocol, - * we want to do that, even if our backend does not announce - * max_write_zeroes_sectors itself. */ - struct drbd_connection *connection = first_peer_device(device)->connection; - /* If the peer announces WZEROES support, use it. Otherwise, rather - * send explicit zeroes than rely on some discard-zeroes-data magic. */ - if (connection->agreed_features & DRBD_FF_WZEROES) - q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; - else - q->limits.max_write_zeroes_sectors = 0; -} + unsigned int max_segments; -static void fixup_discard_support(struct drbd_device *device, struct request_queue *q) -{ - unsigned int max_discard = device->rq_queue->limits.max_discard_sectors; - unsigned int discard_granularity = - device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT; + rcu_read_lock(); + max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; + rcu_read_unlock(); - if (discard_granularity > max_discard) { - blk_queue_discard_granularity(q, 0); - blk_queue_max_discard_sectors(q, 0); - } + if (!max_segments) + return BLK_MAX_SEGMENTS; + return max_segments; } -static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, - unsigned int max_bio_size, struct o_qlim *o) +void drbd_reconsider_queue_parameters(struct drbd_device *device, + struct drbd_backing_dev *bdev, struct o_qlim *o) { + struct drbd_connection *connection = + first_peer_device(device)->connection; struct request_queue * const q = device->rq_queue; - unsigned int max_hw_sectors = max_bio_size >> 9; - unsigned int max_segments = 0; + unsigned int now = queue_max_hw_sectors(q) << 9; + struct queue_limits lim; struct request_queue *b = NULL; - struct disk_conf *dc; + unsigned int new; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; - max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); - rcu_read_lock(); - dc = rcu_dereference(device->ldev->disk_conf); - max_segments = dc->max_bio_bvecs; - rcu_read_unlock(); - - blk_set_stacking_limits(&q->limits); + device->local_max_bio_size = + queue_max_hw_sectors(b) << SECTOR_SHIFT; } - blk_queue_max_hw_sectors(q, max_hw_sectors); - /* This is the workaround for "bio would need to, but cannot, be split" */ - blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); - blk_queue_segment_boundary(q, PAGE_SIZE-1); - decide_on_discard_support(device, bdev); - - if (b) { - blk_stack_limits(&q->limits, &b->limits, 0); - disk_update_readahead(device->vdisk); + /* + * We may later detach and re-attach on a disconnected Primary. Avoid + * decreasing the value in this case. + * + * We want to store what we know the peer DRBD can handle, not what the + * peer IO backend can handle. + */ + new = min3(DRBD_MAX_BIO_SIZE, device->local_max_bio_size, + max(drbd_max_peer_bio_size(device), device->peer_max_bio_size)); + if (new != now) { + if (device->state.role == R_PRIMARY && new < now) + drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", + new, now); + drbd_info(device, "max BIO size = %u\n", new); } - fixup_write_zeroes(device, q); - fixup_discard_support(device, q); -} - -void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o) -{ - unsigned int now, new, local, peer; - - now = queue_max_hw_sectors(device->rq_queue) << 9; - local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */ - peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */ + lim = queue_limits_start_update(q); if (bdev) { - local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9; - device->local_max_bio_size = local; + blk_set_stacking_limits(&lim); + lim.max_segments = drbd_backing_dev_max_segments(device); + } else { + lim.max_segments = BLK_MAX_SEGMENTS; } - local = min(local, DRBD_MAX_BIO_SIZE); - /* We may ignore peer limits if the peer is modern enough. - Because new from 8.3.8 onwards the peer can use multiple - BIOs for a single peer_request */ - if (device->state.conn >= C_WF_REPORT_PARAMS) { - if (first_peer_device(device)->connection->agreed_pro_version < 94) - peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); - /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ - else if (first_peer_device(device)->connection->agreed_pro_version == 94) - peer = DRBD_MAX_SIZE_H80_PACKET; - else if (first_peer_device(device)->connection->agreed_pro_version < 100) - peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */ - else - peer = DRBD_MAX_BIO_SIZE; + lim.max_hw_sectors = new >> SECTOR_SHIFT; + lim.seg_boundary_mask = PAGE_SIZE - 1; - /* We may later detach and re-attach on a disconnected Primary. - * Avoid this setting to jump back in that case. - * We want to store what we know the peer DRBD can handle, - * not what the peer IO backend can handle. */ - if (peer > device->peer_max_bio_size) - device->peer_max_bio_size = peer; + /* + * We don't care for the granularity, really. + * + * Stacking limits below should fix it for the local device. Whether or + * not it is a suitable granularity on the remote device is not our + * problem, really. If you care, you need to use devices with similar + * topology on all peers. + */ + if (drbd_discard_supported(connection, bdev)) { + lim.discard_granularity = 512; + lim.max_hw_discard_sectors = + drbd_max_discard_sectors(connection); + } else { + lim.discard_granularity = 0; + lim.max_hw_discard_sectors = 0; } - new = min(local, peer); - if (device->state.role == R_PRIMARY && new < now) - drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now); + if (bdev) + blk_stack_limits(&lim, &b->limits, 0); - if (new != now) - drbd_info(device, "max BIO size = %u\n", new); + /* + * If we can handle "zeroes" efficiently on the protocol, we want to do + * that, even if our backend does not announce max_write_zeroes_sectors + * itself. + */ + if (connection->agreed_features & DRBD_FF_WZEROES) + lim.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS; + else + lim.max_write_zeroes_sectors = 0; + + if ((lim.discard_granularity >> SECTOR_SHIFT) > + lim.max_hw_discard_sectors) { + lim.discard_granularity = 0; + lim.max_hw_discard_sectors = 0; + } - drbd_setup_queue_param(device, bdev, new, o); + if (queue_limits_commit_update(q, &lim)) + drbd_err(device, "setting new queue limits failed\n"); } /* Starts the worker thread */ diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 287a8d1d3f70..e858e7e0383f 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -1542,9 +1542,10 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, int notify_resource_state_change(struct sk_buff *skb, unsigned int seq, - struct drbd_resource_state_change *resource_state_change, + void *state_change, enum drbd_notification_type type) { + struct drbd_resource_state_change *resource_state_change = state_change; struct drbd_resource *resource = resource_state_change->resource; struct resource_info resource_info = { .res_role = resource_state_change->role[NEW], @@ -1558,13 +1559,14 @@ int notify_resource_state_change(struct sk_buff *skb, int notify_connection_state_change(struct sk_buff *skb, unsigned int seq, - struct drbd_connection_state_change *connection_state_change, + void *state_change, enum drbd_notification_type type) { - struct drbd_connection *connection = connection_state_change->connection; + struct drbd_connection_state_change *p = state_change; + struct drbd_connection *connection = p->connection; struct connection_info connection_info = { - .conn_connection_state = connection_state_change->cstate[NEW], - .conn_role = connection_state_change->peer_role[NEW], + .conn_connection_state = p->cstate[NEW], + .conn_role = p->peer_role[NEW], }; return notify_connection_state(skb, seq, connection, &connection_info, type); @@ -1572,9 +1574,10 @@ int notify_connection_state_change(struct sk_buff *skb, int notify_device_state_change(struct sk_buff *skb, unsigned int seq, - struct drbd_device_state_change *device_state_change, + void *state_change, enum drbd_notification_type type) { + struct drbd_device_state_change *device_state_change = state_change; struct drbd_device *device = device_state_change->device; struct device_info device_info = { .dev_disk_state = device_state_change->disk_state[NEW], @@ -1585,9 +1588,10 @@ int notify_device_state_change(struct sk_buff *skb, int notify_peer_device_state_change(struct sk_buff *skb, unsigned int seq, - struct drbd_peer_device_state_change *p, + void *state_change, enum drbd_notification_type type) { + struct drbd_peer_device_state_change *p = state_change; struct drbd_peer_device *peer_device = p->peer_device; struct peer_device_info peer_device_info = { .peer_repl_state = p->repl_state[NEW], @@ -1605,8 +1609,8 @@ static void broadcast_state_change(struct drbd_state_change *state_change) struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; bool resource_state_has_changed; unsigned int n_device, n_connection, n_peer_device, n_peer_devices; - int (*last_func)(struct sk_buff *, unsigned int, void *, - enum drbd_notification_type) = NULL; + int (*last_func)(struct sk_buff *, unsigned int, + void *, enum drbd_notification_type) = NULL; void *last_arg = NULL; #define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) @@ -1616,7 +1620,7 @@ static void broadcast_state_change(struct drbd_state_change *state_change) }) #define REMEMBER_STATE_CHANGE(func, arg, type) \ ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ - last_func = (typeof(last_func))func; \ + last_func = func; \ last_arg = arg; \ }) diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h index 9d78d8e3912e..a56a57d67686 100644 --- a/drivers/block/drbd/drbd_state_change.h +++ b/drivers/block/drbd/drbd_state_change.h @@ -46,19 +46,19 @@ extern void forget_state_change(struct drbd_state_change *); extern int notify_resource_state_change(struct sk_buff *, unsigned int, - struct drbd_resource_state_change *, + void *, enum drbd_notification_type type); extern int notify_connection_state_change(struct sk_buff *, unsigned int, - struct drbd_connection_state_change *, + void *, enum drbd_notification_type type); extern int notify_device_state_change(struct sk_buff *, unsigned int, - struct drbd_device_state_change *, + void *, enum drbd_notification_type type); extern int notify_peer_device_state_change(struct sk_buff *, unsigned int, - struct drbd_peer_device_state_change *, + void *, enum drbd_notification_type type); #endif /* DRBD_STATE_CHANGE_H */ diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index d0e41d52d6a9..1b399ec8c07d 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -530,14 +530,13 @@ static struct format_descr format_req; static char *floppy_track_buffer; static int max_buffer_sectors; -typedef void (*done_f)(int); static const struct cont_t { void (*interrupt)(void); /* this is called after the interrupt of the * main command */ void (*redo)(void); /* this is called to retry the operation */ void (*error)(void); /* this is called to tally an error */ - done_f done; /* this is called to say if the operation has + void (*done)(int); /* this is called to say if the operation has * succeeded/failed */ } *cont; @@ -985,6 +984,10 @@ static void empty(void) { } +static void empty_done(int result) +{ +} + static void (*floppy_work_fn)(void); static void floppy_work_workfn(struct work_struct *work) @@ -1998,14 +2001,14 @@ static const struct cont_t wakeup_cont = { .interrupt = empty, .redo = do_wakeup, .error = empty, - .done = (done_f)empty + .done = empty_done, }; static const struct cont_t intr_cont = { .interrupt = empty, .redo = process_fd_request, .error = empty, - .done = (done_f)empty + .done = empty_done, }; /* schedules handler, waiting for completion. May be interrupted, will then @@ -4513,13 +4516,15 @@ static bool floppy_available(int drive) static int floppy_alloc_disk(unsigned int drive, unsigned int type) { + struct queue_limits lim = { + .max_hw_sectors = 64, + }; struct gendisk *disk; - disk = blk_mq_alloc_disk(&tag_sets[drive], NULL); + disk = blk_mq_alloc_disk(&tag_sets[drive], &lim, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); - blk_queue_max_hw_sectors(disk->queue, 64); disk->major = FLOPPY_MAJOR; disk->first_minor = TOMINOR(drive) | (type << 2); disk->minors = 1; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index f8145499da38..28a95fd366fe 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -750,12 +750,13 @@ static void loop_sysfs_exit(struct loop_device *lo) &loop_attribute_group); } -static void loop_config_discard(struct loop_device *lo) +static void loop_config_discard(struct loop_device *lo, + struct queue_limits *lim) { struct file *file = lo->lo_backing_file; struct inode *inode = file->f_mapping->host; - struct request_queue *q = lo->lo_queue; - u32 granularity, max_discard_sectors; + u32 granularity = 0, max_discard_sectors = 0; + struct kstatfs sbuf; /* * If the backing device is a block device, mirror its zeroing @@ -775,29 +776,17 @@ static void loop_config_discard(struct loop_device *lo) * We use punch hole to reclaim the free space used by the * image a.k.a. discard. */ - } else if (!file->f_op->fallocate) { - max_discard_sectors = 0; - granularity = 0; - - } else { - struct kstatfs sbuf; - + } else if (file->f_op->fallocate && !vfs_statfs(&file->f_path, &sbuf)) { max_discard_sectors = UINT_MAX >> 9; - if (!vfs_statfs(&file->f_path, &sbuf)) - granularity = sbuf.f_bsize; - else - max_discard_sectors = 0; + granularity = sbuf.f_bsize; } - if (max_discard_sectors) { - q->limits.discard_granularity = granularity; - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); - } else { - q->limits.discard_granularity = 0; - blk_queue_max_discard_sectors(q, 0); - blk_queue_max_write_zeroes_sectors(q, 0); - } + lim->max_hw_discard_sectors = max_discard_sectors; + lim->max_write_zeroes_sectors = max_discard_sectors; + if (max_discard_sectors) + lim->discard_granularity = granularity; + else + lim->discard_granularity = 0; } struct loop_worker { @@ -986,6 +975,20 @@ loop_set_status_from_info(struct loop_device *lo, return 0; } +static int loop_reconfigure_limits(struct loop_device *lo, unsigned short bsize, + bool update_discard_settings) +{ + struct queue_limits lim; + + lim = queue_limits_start_update(lo->lo_queue); + lim.logical_block_size = bsize; + lim.physical_block_size = bsize; + lim.io_min = bsize; + if (update_discard_settings) + loop_config_discard(lo, &lim); + return queue_limits_commit_update(lo->lo_queue, &lim); +} + static int loop_configure(struct loop_device *lo, blk_mode_t mode, struct block_device *bdev, const struct loop_config *config) @@ -1083,11 +1086,10 @@ static int loop_configure(struct loop_device *lo, blk_mode_t mode, else bsize = 512; - blk_queue_logical_block_size(lo->lo_queue, bsize); - blk_queue_physical_block_size(lo->lo_queue, bsize); - blk_queue_io_min(lo->lo_queue, bsize); + error = loop_reconfigure_limits(lo, bsize, true); + if (WARN_ON_ONCE(error)) + goto out_unlock; - loop_config_discard(lo); loop_update_rotational(lo); loop_update_dio(lo); loop_sysfs_init(lo); @@ -1154,9 +1156,7 @@ static void __loop_clr_fd(struct loop_device *lo, bool release) lo->lo_offset = 0; lo->lo_sizelimit = 0; memset(lo->lo_file_name, 0, LO_NAME_SIZE); - blk_queue_logical_block_size(lo->lo_queue, 512); - blk_queue_physical_block_size(lo->lo_queue, 512); - blk_queue_io_min(lo->lo_queue, 512); + loop_reconfigure_limits(lo, 512, false); invalidate_disk(lo->lo_disk); loop_sysfs_exit(lo); /* let user-space know about this change */ @@ -1488,9 +1488,7 @@ static int loop_set_block_size(struct loop_device *lo, unsigned long arg) invalidate_bdev(lo->lo_device); blk_mq_freeze_queue(lo->lo_queue); - blk_queue_logical_block_size(lo->lo_queue, arg); - blk_queue_physical_block_size(lo->lo_queue, arg); - blk_queue_io_min(lo->lo_queue, arg); + err = loop_reconfigure_limits(lo, arg, false); loop_update_dio(lo); blk_mq_unfreeze_queue(lo->lo_queue); @@ -1982,6 +1980,12 @@ static const struct blk_mq_ops loop_mq_ops = { static int loop_add(int i) { + struct queue_limits lim = { + /* + * Random number picked from the historic block max_sectors cap. + */ + .max_hw_sectors = 2560u, + }; struct loop_device *lo; struct gendisk *disk; int err; @@ -2025,16 +2029,13 @@ static int loop_add(int i) if (err) goto out_free_idr; - disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo); + disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, &lim, lo); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_cleanup_tags; } lo->lo_queue = lo->lo_disk->queue; - /* random number picked from the history block max_sectors cap */ - blk_queue_max_hw_sectors(lo->lo_queue, 2560u); - /* * By default, we do buffer IO, so it doesn't make sense to enable * merge because the I/O submitted to backing file is handled page by diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b200950e8fb5..43a187609ef7 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3401,6 +3401,12 @@ static const struct blk_mq_ops mtip_mq_ops = { */ static int mtip_block_initialize(struct driver_data *dd) { + struct queue_limits lim = { + .physical_block_size = 4096, + .max_hw_sectors = 0xffff, + .max_segments = MTIP_MAX_SG, + .max_segment_size = 0x400000, + }; int rv = 0, wait_for_rebuild = 0; sector_t capacity; unsigned int index = 0; @@ -3431,7 +3437,7 @@ static int mtip_block_initialize(struct driver_data *dd) goto block_queue_alloc_tag_error; } - dd->disk = blk_mq_alloc_disk(&dd->tags, dd); + dd->disk = blk_mq_alloc_disk(&dd->tags, &lim, dd); if (IS_ERR(dd->disk)) { dev_err(&dd->pdev->dev, "Unable to allocate request queue\n"); @@ -3481,12 +3487,7 @@ skip_create_disk: /* Set device limits. */ blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue); - blk_queue_max_segments(dd->queue, MTIP_MAX_SG); - blk_queue_physical_block_size(dd->queue, 4096); - blk_queue_max_hw_sectors(dd->queue, 0xffff); - blk_queue_max_segment_size(dd->queue, 0x400000); dma_set_max_seg_size(&dd->pdev->dev, 0x400000); - blk_queue_io_min(dd->queue, 4096); /* Set the capacity of the device in 512 byte sectors. */ if (!(mtip_hw_get_capacity(dd, &capacity))) { diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index d914156db2d8..27b2187e7a6d 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -114,6 +114,10 @@ static const struct block_device_operations n64cart_fops = { */ static int __init n64cart_probe(struct platform_device *pdev) { + struct queue_limits lim = { + .physical_block_size = 4096, + .logical_block_size = 4096, + }; struct gendisk *disk; int err = -ENOMEM; @@ -131,9 +135,11 @@ static int __init n64cart_probe(struct platform_device *pdev) if (IS_ERR(reg_base)) return PTR_ERR(reg_base); - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(disk)) { + err = PTR_ERR(disk); goto out; + } disk->first_minor = 0; disk->flags = GENHD_FL_NO_PART; @@ -145,8 +151,6 @@ static int __init n64cart_probe(struct platform_device *pdev) set_disk_ro(disk, 1); blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_physical_block_size(disk->queue, 4096); - blk_queue_logical_block_size(disk->queue, 4096); err = add_disk(disk); if (err) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 33a8f37bb6a1..9d4ec9273bf9 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -316,9 +316,12 @@ static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock, nsock->sent = 0; } -static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, +static int __nbd_set_size(struct nbd_device *nbd, loff_t bytesize, loff_t blksize) { + struct queue_limits lim; + int error; + if (!blksize) blksize = 1u << NBD_DEF_BLKSIZE_BITS; @@ -334,10 +337,16 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, if (!nbd->pid) return 0; + lim = queue_limits_start_update(nbd->disk->queue); if (nbd->config->flags & NBD_FLAG_SEND_TRIM) - blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX); - blk_queue_logical_block_size(nbd->disk->queue, blksize); - blk_queue_physical_block_size(nbd->disk->queue, blksize); + lim.max_hw_discard_sectors = UINT_MAX; + else + lim.max_hw_discard_sectors = 0; + lim.logical_block_size = blksize; + lim.physical_block_size = blksize; + error = queue_limits_commit_update(nbd->disk->queue, &lim); + if (error) + return error; if (max_part) set_bit(GD_NEED_PART_SCAN, &nbd->disk->state); @@ -346,6 +355,18 @@ static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, return 0; } +static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize, + loff_t blksize) +{ + int error; + + blk_mq_freeze_queue(nbd->disk->queue); + error = __nbd_set_size(nbd, bytesize, blksize); + blk_mq_unfreeze_queue(nbd->disk->queue); + + return error; +} + static void nbd_complete_rq(struct request *req) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); @@ -1351,7 +1372,6 @@ static void nbd_config_put(struct nbd_device *nbd) nbd->config = NULL; nbd->tag_set.timeout = 0; - blk_queue_max_discard_sectors(nbd->disk->queue, 0); mutex_unlock(&nbd->config_lock); nbd_put(nbd); @@ -1783,6 +1803,12 @@ static const struct blk_mq_ops nbd_mq_ops = { static struct nbd_device *nbd_dev_add(int index, unsigned int refs) { + struct queue_limits lim = { + .max_hw_sectors = 65536, + .max_user_sectors = 256, + .max_segments = USHRT_MAX, + .max_segment_size = UINT_MAX, + }; struct nbd_device *nbd; struct gendisk *disk; int err = -ENOMEM; @@ -1823,7 +1849,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) if (err < 0) goto out_free_tags; - disk = blk_mq_alloc_disk(&nbd->tag_set, NULL); + disk = blk_mq_alloc_disk(&nbd->tag_set, &lim, NULL); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_free_idr; @@ -1843,11 +1869,6 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) * Tell the block layer that we are not a rotational device */ blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue); - blk_queue_max_discard_sectors(disk->queue, 0); - blk_queue_max_segment_size(disk->queue, UINT_MAX); - blk_queue_max_segments(disk->queue, USHRT_MAX); - blk_queue_max_hw_sectors(disk->queue, 65536); - disk->queue->limits.max_sectors = 256; mutex_init(&nbd->config_lock); refcount_set(&nbd->config_refs, 0); @@ -2433,6 +2454,12 @@ static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info) } dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST); + if (!dev_list) { + nlmsg_free(reply); + ret = -EMSGSIZE; + goto out; + } + if (index == -1) { ret = idr_for_each(&nbd_index_idr, &status_cb, reply); if (ret) { diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 36755f263e8e..71c39bcd872c 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -115,6 +115,18 @@ module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444); MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>"); #endif +/* + * Historic queue modes. + * + * These days nothing but NULL_Q_MQ is actually supported, but we keep it the + * enum for error reporting. + */ +enum { + NULL_Q_BIO = 0, + NULL_Q_RQ = 1, + NULL_Q_MQ = 2, +}; + static int g_queue_mode = NULL_Q_MQ; static int null_param_store_val(const char *str, int *val, int min, int max) @@ -165,8 +177,8 @@ static bool g_blocking; module_param_named(blocking, g_blocking, bool, 0444); MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device"); -static bool shared_tags; -module_param(shared_tags, bool, 0444); +static bool g_shared_tags; +module_param_named(shared_tags, g_shared_tags, bool, 0444); MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq"); static bool g_shared_tag_bitmap; @@ -426,6 +438,7 @@ NULLB_DEVICE_ATTR(zone_max_open, uint, NULL); NULLB_DEVICE_ATTR(zone_max_active, uint, NULL); NULLB_DEVICE_ATTR(virt_boundary, bool, NULL); NULLB_DEVICE_ATTR(no_sched, bool, NULL); +NULLB_DEVICE_ATTR(shared_tags, bool, NULL); NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL); static ssize_t nullb_device_power_show(struct config_item *item, char *page) @@ -571,6 +584,7 @@ static struct configfs_attribute *nullb_device_attrs[] = { &nullb_device_attr_zone_offline, &nullb_device_attr_virt_boundary, &nullb_device_attr_no_sched, + &nullb_device_attr_shared_tags, &nullb_device_attr_shared_tag_bitmap, NULL, }; @@ -653,10 +667,11 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page) "badblocks,blocking,blocksize,cache_size," "completion_nsec,discard,home_node,hw_queue_depth," "irqmode,max_sectors,mbps,memory_backed,no_sched," - "poll_queues,power,queue_mode,shared_tag_bitmap,size," - "submit_queues,use_per_node_hctx,virt_boundary,zoned," - "zone_capacity,zone_max_active,zone_max_open," - "zone_nr_conv,zone_offline,zone_readonly,zone_size\n"); + "poll_queues,power,queue_mode,shared_tag_bitmap," + "shared_tags,size,submit_queues,use_per_node_hctx," + "virt_boundary,zoned,zone_capacity,zone_max_active," + "zone_max_open,zone_nr_conv,zone_offline,zone_readonly," + "zone_size\n"); } CONFIGFS_ATTR_RO(memb_group_, features); @@ -738,6 +753,7 @@ static struct nullb_device *null_alloc_dev(void) dev->zone_max_active = g_zone_max_active; dev->virt_boundary = g_virt_boundary; dev->no_sched = g_no_sched; + dev->shared_tags = g_shared_tags; dev->shared_tag_bitmap = g_shared_tag_bitmap; return dev; } @@ -752,98 +768,11 @@ static void null_free_dev(struct nullb_device *dev) kfree(dev); } -static void put_tag(struct nullb_queue *nq, unsigned int tag) -{ - clear_bit_unlock(tag, nq->tag_map); - - if (waitqueue_active(&nq->wait)) - wake_up(&nq->wait); -} - -static unsigned int get_tag(struct nullb_queue *nq) -{ - unsigned int tag; - - do { - tag = find_first_zero_bit(nq->tag_map, nq->queue_depth); - if (tag >= nq->queue_depth) - return -1U; - } while (test_and_set_bit_lock(tag, nq->tag_map)); - - return tag; -} - -static void free_cmd(struct nullb_cmd *cmd) -{ - put_tag(cmd->nq, cmd->tag); -} - -static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer); - -static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - unsigned int tag; - - tag = get_tag(nq); - if (tag != -1U) { - cmd = &nq->cmds[tag]; - cmd->tag = tag; - cmd->error = BLK_STS_OK; - cmd->nq = nq; - if (nq->dev->irqmode == NULL_IRQ_TIMER) { - hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cmd->timer.function = null_cmd_timer_expired; - } - return cmd; - } - - return NULL; -} - -static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio) -{ - struct nullb_cmd *cmd; - DEFINE_WAIT(wait); - - do { - /* - * This avoids multiple return statements, multiple calls to - * __alloc_cmd() and a fast path call to prepare_to_wait(). - */ - cmd = __alloc_cmd(nq); - if (cmd) { - cmd->bio = bio; - return cmd; - } - prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE); - io_schedule(); - finish_wait(&nq->wait, &wait); - } while (1); -} - -static void end_cmd(struct nullb_cmd *cmd) -{ - int queue_mode = cmd->nq->dev->queue_mode; - - switch (queue_mode) { - case NULL_Q_MQ: - blk_mq_end_request(cmd->rq, cmd->error); - return; - case NULL_Q_BIO: - cmd->bio->bi_status = cmd->error; - bio_endio(cmd->bio); - break; - } - - free_cmd(cmd); -} - static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer) { - end_cmd(container_of(timer, struct nullb_cmd, timer)); + struct nullb_cmd *cmd = container_of(timer, struct nullb_cmd, timer); + blk_mq_end_request(blk_mq_rq_from_pdu(cmd), cmd->error); return HRTIMER_NORESTART; } @@ -856,7 +785,9 @@ static void null_cmd_end_timer(struct nullb_cmd *cmd) static void null_complete_rq(struct request *rq) { - end_cmd(blk_mq_rq_to_pdu(rq)); + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + + blk_mq_end_request(rq, cmd->error); } static struct nullb_page *null_alloc_page(void) @@ -1273,7 +1204,7 @@ static int null_transfer(struct nullb *nullb, struct page *page, static int null_handle_rq(struct nullb_cmd *cmd) { - struct request *rq = cmd->rq; + struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb *nullb = cmd->nq->dev->nullb; int err; unsigned int len; @@ -1298,63 +1229,21 @@ static int null_handle_rq(struct nullb_cmd *cmd) return 0; } -static int null_handle_bio(struct nullb_cmd *cmd) -{ - struct bio *bio = cmd->bio; - struct nullb *nullb = cmd->nq->dev->nullb; - int err; - unsigned int len; - sector_t sector = bio->bi_iter.bi_sector; - struct bio_vec bvec; - struct bvec_iter iter; - - spin_lock_irq(&nullb->lock); - bio_for_each_segment(bvec, bio, iter) { - len = bvec.bv_len; - err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset, - op_is_write(bio_op(bio)), sector, - bio->bi_opf & REQ_FUA); - if (err) { - spin_unlock_irq(&nullb->lock); - return err; - } - sector += len >> SECTOR_SHIFT; - } - spin_unlock_irq(&nullb->lock); - return 0; -} - -static void null_stop_queue(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_stop_hw_queues(q); -} - -static void null_restart_queue_async(struct nullb *nullb) -{ - struct request_queue *q = nullb->q; - - if (nullb->dev->queue_mode == NULL_Q_MQ) - blk_mq_start_stopped_hw_queues(q, true); -} - static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; blk_status_t sts = BLK_STS_OK; - struct request *rq = cmd->rq; + struct request *rq = blk_mq_rq_from_pdu(cmd); if (!hrtimer_active(&nullb->bw_timer)) hrtimer_restart(&nullb->bw_timer); if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) { - null_stop_queue(nullb); + blk_mq_stop_hw_queues(nullb->q); /* race with timer */ if (atomic_long_read(&nullb->cur_bytes) > 0) - null_restart_queue_async(nullb); + blk_mq_start_stopped_hw_queues(nullb->q, true); /* requeue request */ sts = BLK_STS_DEV_RESOURCE; } @@ -1381,37 +1270,29 @@ static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, sector_t nr_sectors) { struct nullb_device *dev = cmd->nq->dev; - int err; if (op == REQ_OP_DISCARD) return null_handle_discard(dev, sector, nr_sectors); + return errno_to_blk_status(null_handle_rq(cmd)); - if (dev->queue_mode == NULL_Q_BIO) - err = null_handle_bio(cmd); - else - err = null_handle_rq(cmd); - - return errno_to_blk_status(err); } static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd) { + struct request *rq = blk_mq_rq_from_pdu(cmd); struct nullb_device *dev = cmd->nq->dev; struct bio *bio; - if (dev->memory_backed) - return; - - if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) { - zero_fill_bio(cmd->bio); - } else if (req_op(cmd->rq) == REQ_OP_READ) { - __rq_for_each_bio(bio, cmd->rq) + if (!dev->memory_backed && req_op(rq) == REQ_OP_READ) { + __rq_for_each_bio(bio, rq) zero_fill_bio(bio); } } static inline void nullb_complete_cmd(struct nullb_cmd *cmd) { + struct request *rq = blk_mq_rq_from_pdu(cmd); + /* * Since root privileges are required to configure the null_blk * driver, it is fine that this driver does not initialize the @@ -1425,20 +1306,10 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd) /* Complete IO by inline, softirq or timer */ switch (cmd->nq->dev->irqmode) { case NULL_IRQ_SOFTIRQ: - switch (cmd->nq->dev->queue_mode) { - case NULL_Q_MQ: - blk_mq_complete_request(cmd->rq); - break; - case NULL_Q_BIO: - /* - * XXX: no proper submitting cpu information available. - */ - end_cmd(cmd); - break; - } + blk_mq_complete_request(rq); break; case NULL_IRQ_NONE: - end_cmd(cmd); + blk_mq_end_request(rq, cmd->error); break; case NULL_IRQ_TIMER: null_cmd_end_timer(cmd); @@ -1499,7 +1370,7 @@ static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer) return HRTIMER_NORESTART; atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps)); - null_restart_queue_async(nullb); + blk_mq_start_stopped_hw_queues(nullb->q, true); hrtimer_forward_now(&nullb->bw_timer, timer_interval); @@ -1516,26 +1387,6 @@ static void nullb_setup_bwtimer(struct nullb *nullb) hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL); } -static struct nullb_queue *nullb_to_queue(struct nullb *nullb) -{ - int index = 0; - - if (nullb->nr_queues != 1) - index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues); - - return &nullb->queues[index]; -} - -static void null_submit_bio(struct bio *bio) -{ - sector_t sector = bio->bi_iter.bi_sector; - sector_t nr_sectors = bio_sectors(bio); - struct nullb *nullb = bio->bi_bdev->bd_disk->private_data; - struct nullb_queue *nq = nullb_to_queue(nullb); - - null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio)); -} - #ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION static bool should_timeout_request(struct request *rq) @@ -1655,7 +1506,7 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) blk_rq_sectors(req)); if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error, blk_mq_end_request_batch)) - end_cmd(cmd); + blk_mq_end_request(req, cmd->error); nr++; } @@ -1711,7 +1562,6 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cmd->timer.function = null_cmd_timer_expired; } - cmd->rq = rq; cmd->error = BLK_STS_OK; cmd->nq = nq; cmd->fake_timeout = should_timeout_request(rq) || @@ -1770,34 +1620,8 @@ static void null_queue_rqs(struct request **rqlist) *rqlist = requeue_list; } -static void cleanup_queue(struct nullb_queue *nq) -{ - bitmap_free(nq->tag_map); - kfree(nq->cmds); -} - -static void cleanup_queues(struct nullb *nullb) -{ - int i; - - for (i = 0; i < nullb->nr_queues; i++) - cleanup_queue(&nullb->queues[i]); - - kfree(nullb->queues); -} - -static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - struct nullb_queue *nq = hctx->driver_data; - struct nullb *nullb = nq->dev->nullb; - - nullb->nr_queues--; -} - static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq) { - init_waitqueue_head(&nq->wait); - nq->queue_depth = nullb->queue_depth; nq->dev = nullb->dev; INIT_LIST_HEAD(&nq->poll_list); spin_lock_init(&nq->poll_lock); @@ -1815,7 +1639,6 @@ static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, nq = &nullb->queues[hctx_idx]; hctx->driver_data = nq; null_init_queue(nullb, nq); - nullb->nr_queues++; return 0; } @@ -1828,7 +1651,6 @@ static const struct blk_mq_ops null_mq_ops = { .poll = null_poll, .map_queues = null_map_queues, .init_hctx = null_init_hctx, - .exit_hctx = null_exit_hctx, }; static void null_del_dev(struct nullb *nullb) @@ -1849,21 +1671,20 @@ static void null_del_dev(struct nullb *nullb) if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) { hrtimer_cancel(&nullb->bw_timer); atomic_long_set(&nullb->cur_bytes, LONG_MAX); - null_restart_queue_async(nullb); + blk_mq_start_stopped_hw_queues(nullb->q, true); } put_disk(nullb->disk); - if (dev->queue_mode == NULL_Q_MQ && - nullb->tag_set == &nullb->__tag_set) + if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); - cleanup_queues(nullb); + kfree(nullb->queues); if (null_cache_active(nullb)) null_free_device_storage(nullb->dev, true); kfree(nullb); dev->nullb = NULL; } -static void null_config_discard(struct nullb *nullb) +static void null_config_discard(struct nullb *nullb, struct queue_limits *lim) { if (nullb->dev->discard == false) return; @@ -1880,43 +1701,14 @@ static void null_config_discard(struct nullb *nullb) return; } - blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9); + lim->max_hw_discard_sectors = UINT_MAX >> 9; } -static const struct block_device_operations null_bio_ops = { - .owner = THIS_MODULE, - .submit_bio = null_submit_bio, - .report_zones = null_report_zones, -}; - -static const struct block_device_operations null_rq_ops = { +static const struct block_device_operations null_ops = { .owner = THIS_MODULE, .report_zones = null_report_zones, }; -static int setup_commands(struct nullb_queue *nq) -{ - struct nullb_cmd *cmd; - int i; - - nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL); - if (!nq->cmds) - return -ENOMEM; - - nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL); - if (!nq->tag_map) { - kfree(nq->cmds); - return -ENOMEM; - } - - for (i = 0; i < nq->queue_depth; i++) { - cmd = &nq->cmds[i]; - cmd->tag = -1U; - } - - return 0; -} - static int setup_queues(struct nullb *nullb) { int nqueues = nr_cpu_ids; @@ -1929,101 +1721,66 @@ static int setup_queues(struct nullb *nullb) if (!nullb->queues) return -ENOMEM; - nullb->queue_depth = nullb->dev->hw_queue_depth; return 0; } -static int init_driver_queues(struct nullb *nullb) +static int null_init_tag_set(struct blk_mq_tag_set *set, int poll_queues) { - struct nullb_queue *nq; - int i, ret = 0; - - for (i = 0; i < nullb->dev->submit_queues; i++) { - nq = &nullb->queues[i]; - - null_init_queue(nullb, nq); - - ret = setup_commands(nq); - if (ret) - return ret; - nullb->nr_queues++; + set->ops = &null_mq_ops; + set->cmd_size = sizeof(struct nullb_cmd); + set->timeout = 5 * HZ; + set->nr_maps = 1; + if (poll_queues) { + set->nr_hw_queues += poll_queues; + set->nr_maps += 2; } - return 0; + return blk_mq_alloc_tag_set(set); } -static int null_gendisk_register(struct nullb *nullb) +static int null_init_global_tag_set(void) { - sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT; - struct gendisk *disk = nullb->disk; + int error; - set_capacity(disk, size); - - disk->major = null_major; - disk->first_minor = nullb->index; - disk->minors = 1; - if (queue_is_mq(nullb->q)) - disk->fops = &null_rq_ops; - else - disk->fops = &null_bio_ops; - disk->private_data = nullb; - strscpy_pad(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + if (tag_set.ops) + return 0; - if (nullb->dev->zoned) { - int ret = null_register_zoned_dev(nullb); + tag_set.nr_hw_queues = g_submit_queues; + tag_set.queue_depth = g_hw_queue_depth; + tag_set.numa_node = g_home_node; + tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + if (g_no_sched) + tag_set.flags |= BLK_MQ_F_NO_SCHED; + if (g_shared_tag_bitmap) + tag_set.flags |= BLK_MQ_F_TAG_HCTX_SHARED; + if (g_blocking) + tag_set.flags |= BLK_MQ_F_BLOCKING; - if (ret) - return ret; - } - - return add_disk(disk); + error = null_init_tag_set(&tag_set, g_poll_queues); + if (error) + tag_set.ops = NULL; + return error; } -static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set) +static int null_setup_tagset(struct nullb *nullb) { - unsigned int flags = BLK_MQ_F_SHOULD_MERGE; - int hw_queues, numa_node; - unsigned int queue_depth; - int poll_queues; - - if (nullb) { - hw_queues = nullb->dev->submit_queues; - poll_queues = nullb->dev->poll_queues; - queue_depth = nullb->dev->hw_queue_depth; - numa_node = nullb->dev->home_node; - if (nullb->dev->no_sched) - flags |= BLK_MQ_F_NO_SCHED; - if (nullb->dev->shared_tag_bitmap) - flags |= BLK_MQ_F_TAG_HCTX_SHARED; - if (nullb->dev->blocking) - flags |= BLK_MQ_F_BLOCKING; - } else { - hw_queues = g_submit_queues; - poll_queues = g_poll_queues; - queue_depth = g_hw_queue_depth; - numa_node = g_home_node; - if (g_no_sched) - flags |= BLK_MQ_F_NO_SCHED; - if (g_shared_tag_bitmap) - flags |= BLK_MQ_F_TAG_HCTX_SHARED; - if (g_blocking) - flags |= BLK_MQ_F_BLOCKING; - } - - set->ops = &null_mq_ops; - set->cmd_size = sizeof(struct nullb_cmd); - set->flags = flags; - set->driver_data = nullb; - set->nr_hw_queues = hw_queues; - set->queue_depth = queue_depth; - set->numa_node = numa_node; - if (poll_queues) { - set->nr_hw_queues += poll_queues; - set->nr_maps = 3; - } else { - set->nr_maps = 1; + if (nullb->dev->shared_tags) { + nullb->tag_set = &tag_set; + return null_init_global_tag_set(); } - return blk_mq_alloc_tag_set(set); + nullb->tag_set = &nullb->__tag_set; + nullb->tag_set->driver_data = nullb; + nullb->tag_set->nr_hw_queues = nullb->dev->submit_queues; + nullb->tag_set->queue_depth = nullb->dev->hw_queue_depth; + nullb->tag_set->numa_node = nullb->dev->home_node; + nullb->tag_set->flags = BLK_MQ_F_SHOULD_MERGE; + if (nullb->dev->no_sched) + nullb->tag_set->flags |= BLK_MQ_F_NO_SCHED; + if (nullb->dev->shared_tag_bitmap) + nullb->tag_set->flags |= BLK_MQ_F_TAG_HCTX_SHARED; + if (nullb->dev->blocking) + nullb->tag_set->flags |= BLK_MQ_F_BLOCKING; + return null_init_tag_set(nullb->tag_set, nullb->dev->poll_queues); } static int null_validate_conf(struct nullb_device *dev) @@ -2032,11 +1789,15 @@ static int null_validate_conf(struct nullb_device *dev) pr_err("legacy IO path is no longer available\n"); return -EINVAL; } + if (dev->queue_mode == NULL_Q_BIO) { + pr_err("BIO-based IO path is no longer available, using blk-mq instead.\n"); + dev->queue_mode = NULL_Q_MQ; + } dev->blocksize = round_down(dev->blocksize, 512); dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096); - if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) { + if (dev->use_per_node_hctx) { if (dev->submit_queues != nr_online_nodes) dev->submit_queues = nr_online_nodes; } else if (dev->submit_queues > nr_cpu_ids) @@ -2048,8 +1809,6 @@ static int null_validate_conf(struct nullb_device *dev) if (dev->poll_queues > g_poll_queues) dev->poll_queues = g_poll_queues; dev->prev_poll_queues = dev->poll_queues; - - dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ); dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER); /* Do memory allocation, so set blocking */ @@ -2060,9 +1819,6 @@ static int null_validate_conf(struct nullb_device *dev) dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024, dev->cache_size); dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps); - /* can not stop a queue */ - if (dev->queue_mode == NULL_Q_BIO) - dev->mbps = 0; if (dev->zoned && (!dev->zone_size || !is_power_of_2(dev->zone_size))) { @@ -2102,6 +1858,12 @@ static bool null_setup_fault(void) static int null_add_dev(struct nullb_device *dev) { + struct queue_limits lim = { + .logical_block_size = dev->blocksize, + .physical_block_size = dev->blocksize, + .max_hw_sectors = dev->max_sectors, + }; + struct nullb *nullb; int rv; @@ -2123,36 +1885,25 @@ static int null_add_dev(struct nullb_device *dev) if (rv) goto out_free_nullb; - if (dev->queue_mode == NULL_Q_MQ) { - if (shared_tags) { - nullb->tag_set = &tag_set; - rv = 0; - } else { - nullb->tag_set = &nullb->__tag_set; - rv = null_init_tag_set(nullb, nullb->tag_set); - } + rv = null_setup_tagset(nullb); + if (rv) + goto out_cleanup_queues; + if (dev->virt_boundary) + lim.virt_boundary_mask = PAGE_SIZE - 1; + null_config_discard(nullb, &lim); + if (dev->zoned) { + rv = null_init_zoned_dev(dev, &lim); if (rv) - goto out_cleanup_queues; - - nullb->tag_set->timeout = 5 * HZ; - nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb); - if (IS_ERR(nullb->disk)) { - rv = PTR_ERR(nullb->disk); goto out_cleanup_tags; - } - nullb->q = nullb->disk->queue; - } else if (dev->queue_mode == NULL_Q_BIO) { - rv = -ENOMEM; - nullb->disk = blk_alloc_disk(nullb->dev->home_node); - if (!nullb->disk) - goto out_cleanup_queues; + } - nullb->q = nullb->disk->queue; - rv = init_driver_queues(nullb); - if (rv) - goto out_cleanup_disk; + nullb->disk = blk_mq_alloc_disk(nullb->tag_set, &lim, nullb); + if (IS_ERR(nullb->disk)) { + rv = PTR_ERR(nullb->disk); + goto out_cleanup_zone; } + nullb->q = nullb->disk->queue; if (dev->mbps) { set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags); @@ -2164,12 +1915,6 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_write_cache(nullb->q, true, true); } - if (dev->zoned) { - rv = null_init_zoned_dev(dev, nullb->q); - if (rv) - goto out_cleanup_disk; - } - nullb->q->queuedata = nullb; blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); @@ -2177,22 +1922,12 @@ static int null_add_dev(struct nullb_device *dev) rv = ida_alloc(&nullb_indexes, GFP_KERNEL); if (rv < 0) { mutex_unlock(&lock); - goto out_cleanup_zone; + goto out_cleanup_disk; } nullb->index = rv; dev->index = rv; mutex_unlock(&lock); - blk_queue_logical_block_size(nullb->q, dev->blocksize); - blk_queue_physical_block_size(nullb->q, dev->blocksize); - if (dev->max_sectors) - blk_queue_max_hw_sectors(nullb->q, dev->max_sectors); - - if (dev->virt_boundary) - blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1); - - null_config_discard(nullb); - if (config_item_name(&dev->group.cg_item)) { /* Use configfs dir name as the device name */ snprintf(nullb->disk_name, sizeof(nullb->disk_name), @@ -2201,7 +1936,22 @@ static int null_add_dev(struct nullb_device *dev) sprintf(nullb->disk_name, "nullb%d", nullb->index); } - rv = null_gendisk_register(nullb); + set_capacity(nullb->disk, + ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT); + nullb->disk->major = null_major; + nullb->disk->first_minor = nullb->index; + nullb->disk->minors = 1; + nullb->disk->fops = &null_ops; + nullb->disk->private_data = nullb; + strscpy_pad(nullb->disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + + if (nullb->dev->zoned) { + rv = null_register_zoned_dev(nullb); + if (rv) + goto out_ida_free; + } + + rv = add_disk(nullb->disk); if (rv) goto out_ida_free; @@ -2220,10 +1970,10 @@ out_cleanup_zone: out_cleanup_disk: put_disk(nullb->disk); out_cleanup_tags: - if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) + if (nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); out_cleanup_queues: - cleanup_queues(nullb); + kfree(nullb->queues); out_free_nullb: kfree(nullb); dev->nullb = NULL; @@ -2299,7 +2049,7 @@ static int __init null_init(void) return -EINVAL; } - if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) { + if (g_use_per_node_hctx) { if (g_submit_queues != nr_online_nodes) { pr_warn("submit_queues param is set to %u.\n", nr_online_nodes); @@ -2311,18 +2061,12 @@ static int __init null_init(void) g_submit_queues = 1; } - if (g_queue_mode == NULL_Q_MQ && shared_tags) { - ret = null_init_tag_set(NULL, &tag_set); - if (ret) - return ret; - } - config_group_init(&nullb_subsys.su_group); mutex_init(&nullb_subsys.su_mutex); ret = configfs_register_subsystem(&nullb_subsys); if (ret) - goto err_tagset; + return ret; mutex_init(&lock); @@ -2349,9 +2093,6 @@ err_dev: unregister_blkdev(null_major, "nullb"); err_conf: configfs_unregister_subsystem(&nullb_subsys); -err_tagset: - if (g_queue_mode == NULL_Q_MQ && shared_tags) - blk_mq_free_tag_set(&tag_set); return ret; } @@ -2370,7 +2111,7 @@ static void __exit null_exit(void) } mutex_unlock(&lock); - if (g_queue_mode == NULL_Q_MQ && shared_tags) + if (tag_set.ops) blk_mq_free_tag_set(&tag_set); } diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 929f659dd255..477b97746823 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -16,11 +16,6 @@ #include <linux/mutex.h> struct nullb_cmd { - union { - struct request *rq; - struct bio *bio; - }; - unsigned int tag; blk_status_t error; bool fake_timeout; struct nullb_queue *nq; @@ -28,16 +23,11 @@ struct nullb_cmd { }; struct nullb_queue { - unsigned long *tag_map; - wait_queue_head_t wait; - unsigned int queue_depth; struct nullb_device *dev; unsigned int requeue_selection; struct list_head poll_list; spinlock_t poll_lock; - - struct nullb_cmd *cmds; }; struct nullb_zone { @@ -60,13 +50,6 @@ struct nullb_zone { unsigned int capacity; }; -/* Queue modes */ -enum { - NULL_Q_BIO = 0, - NULL_Q_RQ = 1, - NULL_Q_MQ = 2, -}; - struct nullb_device { struct nullb *nullb; struct config_group group; @@ -119,6 +102,7 @@ struct nullb_device { bool zoned; /* if device is zoned */ bool virt_boundary; /* virtual boundary on/off for the device */ bool no_sched; /* no IO scheduler for the device */ + bool shared_tags; /* share tag set between devices for blk-mq */ bool shared_tag_bitmap; /* use hostwide shared tags */ }; @@ -130,14 +114,12 @@ struct nullb { struct gendisk *disk; struct blk_mq_tag_set *tag_set; struct blk_mq_tag_set __tag_set; - unsigned int queue_depth; atomic_long_t cur_bytes; struct hrtimer bw_timer; unsigned long cache_flush_pos; spinlock_t lock; struct nullb_queue *queues; - unsigned int nr_queues; char disk_name[DISK_NAME_LEN]; }; @@ -147,7 +129,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, unsigned int nr_sectors); #ifdef CONFIG_BLK_DEV_ZONED -int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); +int null_init_zoned_dev(struct nullb_device *dev, struct queue_limits *lim); int null_register_zoned_dev(struct nullb *nullb); void null_free_zoned_dev(struct nullb_device *dev); int null_report_zones(struct gendisk *disk, sector_t sector, @@ -160,7 +142,7 @@ ssize_t zone_cond_store(struct nullb_device *dev, const char *page, size_t count, enum blk_zone_cond cond); #else static inline int null_init_zoned_dev(struct nullb_device *dev, - struct request_queue *q) + struct queue_limits *lim) { pr_err("CONFIG_BLK_DEV_ZONED not enabled\n"); return -EINVAL; diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h index 6b2b370e786f..ef2d05d5f0df 100644 --- a/drivers/block/null_blk/trace.h +++ b/drivers/block/null_blk/trace.h @@ -41,10 +41,11 @@ TRACE_EVENT(nullb_zone_op, __field(unsigned int, zone_cond) ), TP_fast_assign( - __entry->op = req_op(cmd->rq); + __entry->op = req_op(blk_mq_rq_from_pdu(cmd)); __entry->zone_no = zone_no; __entry->zone_cond = zone_cond; - __assign_disk_name(__entry->disk, cmd->rq->q->disk); + __assign_disk_name(__entry->disk, + blk_mq_rq_from_pdu(cmd)->q->disk); ), TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s", __print_disk_name(__entry->disk), diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 6f5e0994862e..1689e2584104 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -58,7 +58,8 @@ static inline void null_unlock_zone(struct nullb_device *dev, mutex_unlock(&zone->mutex); } -int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) +int null_init_zoned_dev(struct nullb_device *dev, + struct queue_limits *lim) { sector_t dev_capacity_sects, zone_capacity_sects; struct nullb_zone *zone; @@ -151,27 +152,22 @@ int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q) sector += dev->zone_size_sects; } + lim->zoned = true; + lim->chunk_sectors = dev->zone_size_sects; + lim->max_zone_append_sectors = dev->zone_size_sects; + lim->max_open_zones = dev->zone_max_open; + lim->max_active_zones = dev->zone_max_active; return 0; } int null_register_zoned_dev(struct nullb *nullb) { - struct nullb_device *dev = nullb->dev; struct request_queue *q = nullb->q; - disk_set_zoned(nullb->disk); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - blk_queue_chunk_sectors(q, dev->zone_size_sects); nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); - blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); - disk_set_max_open_zones(nullb->disk, dev->zone_max_open); - disk_set_max_active_zones(nullb->disk, dev->zone_max_active); - - if (queue_is_mq(q)) - return blk_revalidate_disk_zones(nullb->disk, NULL); - - return 0; + return blk_revalidate_disk_zones(nullb->disk, NULL); } void null_free_zoned_dev(struct nullb_device *dev) @@ -394,10 +390,7 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector, */ if (append) { sector = zone->wp; - if (dev->queue_mode == NULL_Q_MQ) - cmd->rq->__sector = sector; - else - cmd->bio->bi_iter.bi_sector = sector; + blk_mq_rq_from_pdu(cmd)->__sector = sector; } else if (sector != zone->wp) { ret = BLK_STS_IOERR; goto unlock; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index c21444716e43..21728e9ea5c3 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -828,6 +828,12 @@ static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd, */ static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio) { + /* + * Some CDRW drives can not handle writes larger than one packet, + * even if the size is a multiple of the packet size. + */ + bio->bi_opf |= REQ_NOMERGE; + spin_lock(&pd->iosched.lock); if (bio_data_dir(bio) == READ) bio_list_add(&pd->iosched.read_queue, bio); @@ -2191,11 +2197,6 @@ static int pkt_open_dev(struct pktcdvd_device *pd, bool write) ret = pkt_open_write(pd); if (ret) goto out_putdev; - /* - * Some CDRW drives can not handle writes larger than one packet, - * even if the size is a multiple of the packet size. - */ - blk_queue_max_hw_sectors(q, pd->settings.size); set_bit(PACKET_WRITABLE, &pd->flags); } else { pkt_set_speed(pd, MAX_SPEED, MAX_SPEED); @@ -2338,9 +2339,9 @@ static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio) pkt_queue_bio(pd, cloned_bio); } -static void pkt_make_request_write(struct request_queue *q, struct bio *bio) +static void pkt_make_request_write(struct bio *bio) { - struct pktcdvd_device *pd = q->queuedata; + struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; sector_t zone; struct packet_data *pkt; int was_empty, blocked_bio; @@ -2432,7 +2433,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) static void pkt_submit_bio(struct bio *bio) { - struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata; + struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->private_data; struct device *ddev = disk_to_dev(pd->disk); struct bio *split; @@ -2476,7 +2477,7 @@ static void pkt_submit_bio(struct bio *bio) split = bio; } - pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split); + pkt_make_request_write(split); } while (split != bio); return; @@ -2484,15 +2485,6 @@ end_io: bio_io_error(bio); } -static void pkt_init_queue(struct pktcdvd_device *pd) -{ - struct request_queue *q = pd->disk->queue; - - blk_queue_logical_block_size(q, CD_FRAMESIZE); - blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS); - q->queuedata = pd; -} - static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) { struct device *ddev = disk_to_dev(pd->disk); @@ -2536,8 +2528,6 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) pd->bdev_file = bdev_file; set_blocksize(file_bdev(bdev_file), CD_FRAMESIZE); - pkt_init_queue(pd); - atomic_set(&pd->cdrw.pending_bios, 0); pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->disk->disk_name); if (IS_ERR(pd->cdrw.thread)) { @@ -2634,6 +2624,10 @@ static const struct block_device_operations pktcdvd_ops = { */ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) { + struct queue_limits lim = { + .max_hw_sectors = PACKET_MAX_SECTORS, + .logical_block_size = CD_FRAMESIZE, + }; int idx; int ret = -ENOMEM; struct pktcdvd_device *pd; @@ -2673,10 +2667,11 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) pd->write_congestion_on = write_congestion_on; pd->write_congestion_off = write_congestion_off; - ret = -ENOMEM; - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(disk)) { + ret = PTR_ERR(disk); goto out_mem; + } pd->disk = disk; disk->major = pktdev_major; disk->first_minor = idx; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 36d7b36c60c7..b810ac0a5c4b 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -382,6 +382,14 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) struct ps3disk_private *priv; int error; unsigned int devidx; + struct queue_limits lim = { + .logical_block_size = dev->blk_size, + .max_hw_sectors = dev->bounce_size >> 9, + .max_segments = -1, + .max_segment_size = dev->bounce_size, + .dma_alignment = dev->blk_size - 1, + }; + struct request_queue *queue; struct gendisk *gendisk; @@ -431,7 +439,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) if (error) goto fail_teardown; - gendisk = blk_mq_alloc_disk(&priv->tag_set, dev); + gendisk = blk_mq_alloc_disk(&priv->tag_set, &lim, dev); if (IS_ERR(gendisk)) { dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n", __func__, __LINE__); @@ -441,15 +449,8 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) queue = gendisk->queue; - blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9); - blk_queue_dma_alignment(queue, dev->blk_size-1); - blk_queue_logical_block_size(queue, dev->blk_size); - blk_queue_write_cache(queue, true, false); - blk_queue_max_segments(queue, -1); - blk_queue_max_segment_size(queue, dev->bounce_size); - priv->gendisk = gendisk; gendisk->major = ps3disk_major; gendisk->first_minor = devidx * PS3DISK_MINORS; diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 38d42af01b25..bdcf083b45e2 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -730,10 +730,10 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) ps3vram_proc_init(dev); - gendisk = blk_alloc_disk(NUMA_NO_NODE); - if (!gendisk) { + gendisk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(gendisk)) { dev_err(&dev->core, "blk_alloc_disk failed\n"); - error = -ENOMEM; + error = PTR_ERR(gendisk); goto out_cache_cleanup; } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 12b5d53ec856..26ff5cd2bf0a 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -575,7 +575,7 @@ static const struct attribute_group rbd_bus_group = { }; __ATTRIBUTE_GROUPS(rbd_bus); -static struct bus_type rbd_bus_type = { +static const struct bus_type rbd_bus_type = { .name = "rbd", .bus_groups = rbd_bus_groups, }; @@ -4952,6 +4952,14 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) struct request_queue *q; unsigned int objset_bytes = rbd_dev->layout.object_size * rbd_dev->layout.stripe_count; + struct queue_limits lim = { + .max_hw_sectors = objset_bytes >> SECTOR_SHIFT, + .max_user_sectors = objset_bytes >> SECTOR_SHIFT, + .io_min = rbd_dev->opts->alloc_size, + .io_opt = rbd_dev->opts->alloc_size, + .max_segments = USHRT_MAX, + .max_segment_size = UINT_MAX, + }; int err; memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set)); @@ -4966,7 +4974,13 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (err) return err; - disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev); + if (rbd_dev->opts->trim) { + lim.discard_granularity = rbd_dev->opts->alloc_size; + lim.max_hw_discard_sectors = objset_bytes >> SECTOR_SHIFT; + lim.max_write_zeroes_sectors = objset_bytes >> SECTOR_SHIFT; + } + + disk = blk_mq_alloc_disk(&rbd_dev->tag_set, &lim, rbd_dev); if (IS_ERR(disk)) { err = PTR_ERR(disk); goto out_tag_set; @@ -4987,19 +5001,6 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) blk_queue_flag_set(QUEUE_FLAG_NONROT, q); /* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */ - blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT); - q->limits.max_sectors = queue_max_hw_sectors(q); - blk_queue_max_segments(q, USHRT_MAX); - blk_queue_max_segment_size(q, UINT_MAX); - blk_queue_io_min(q, rbd_dev->opts->alloc_size); - blk_queue_io_opt(q, rbd_dev->opts->alloc_size); - - if (rbd_dev->opts->trim) { - q->limits.discard_granularity = rbd_dev->opts->alloc_size; - blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT); - blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT); - } - if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q); diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 4044c369d22a..b7ffe03c6160 100644 --- a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1329,43 +1329,6 @@ static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev) } } -static void setup_request_queue(struct rnbd_clt_dev *dev, - struct rnbd_msg_open_rsp *rsp) -{ - blk_queue_logical_block_size(dev->queue, - le16_to_cpu(rsp->logical_block_size)); - blk_queue_physical_block_size(dev->queue, - le16_to_cpu(rsp->physical_block_size)); - blk_queue_max_hw_sectors(dev->queue, - dev->sess->max_io_size / SECTOR_SIZE); - - /* - * we don't support discards to "discontiguous" segments - * in on request - */ - blk_queue_max_discard_segments(dev->queue, 1); - - blk_queue_max_discard_sectors(dev->queue, - le32_to_cpu(rsp->max_discard_sectors)); - dev->queue->limits.discard_granularity = - le32_to_cpu(rsp->discard_granularity); - dev->queue->limits.discard_alignment = - le32_to_cpu(rsp->discard_alignment); - if (le16_to_cpu(rsp->secure_discard)) - blk_queue_max_secure_erase_sectors(dev->queue, - le32_to_cpu(rsp->max_discard_sectors)); - blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); - blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); - blk_queue_max_segments(dev->queue, dev->sess->max_segments); - blk_queue_io_opt(dev->queue, dev->sess->max_io_size); - blk_queue_virt_boundary(dev->queue, SZ_4K - 1); - blk_queue_write_cache(dev->queue, - !!(rsp->cache_policy & RNBD_WRITEBACK), - !!(rsp->cache_policy & RNBD_FUA)); - blk_queue_max_write_zeroes_sectors(dev->queue, - le32_to_cpu(rsp->max_write_zeroes_sectors)); -} - static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, struct rnbd_msg_open_rsp *rsp, int idx) { @@ -1403,18 +1366,41 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, static int rnbd_client_setup_device(struct rnbd_clt_dev *dev, struct rnbd_msg_open_rsp *rsp) { + struct queue_limits lim = { + .logical_block_size = le16_to_cpu(rsp->logical_block_size), + .physical_block_size = le16_to_cpu(rsp->physical_block_size), + .io_opt = dev->sess->max_io_size, + .max_hw_sectors = dev->sess->max_io_size / SECTOR_SIZE, + .max_hw_discard_sectors = le32_to_cpu(rsp->max_discard_sectors), + .discard_granularity = le32_to_cpu(rsp->discard_granularity), + .discard_alignment = le32_to_cpu(rsp->discard_alignment), + .max_segments = dev->sess->max_segments, + .virt_boundary_mask = SZ_4K - 1, + .max_write_zeroes_sectors = + le32_to_cpu(rsp->max_write_zeroes_sectors), + }; int idx = dev->clt_device_id; dev->size = le64_to_cpu(rsp->nsectors) * le16_to_cpu(rsp->logical_block_size); - dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev); + if (rsp->secure_discard) { + lim.max_secure_erase_sectors = + le32_to_cpu(rsp->max_discard_sectors); + } + + dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, &lim, dev); if (IS_ERR(dev->gd)) return PTR_ERR(dev->gd); dev->queue = dev->gd->queue; rnbd_init_mq_hw_queues(dev); - setup_request_queue(dev, rsp); + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue); + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue); + blk_queue_write_cache(dev->queue, + !!(rsp->cache_policy & RNBD_WRITEBACK), + !!(rsp->cache_policy & RNBD_FUA)); + return rnbd_clt_setup_gen_disk(dev, rsp, idx); } diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 7bf4b48e2282..c99dd6698977 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -784,6 +784,14 @@ static const struct blk_mq_ops vdc_mq_ops = { static int probe_disk(struct vdc_port *port) { + struct queue_limits lim = { + .physical_block_size = port->vdisk_phys_blksz, + .max_hw_sectors = port->max_xfer_size, + /* Each segment in a request is up to an aligned page in size. */ + .seg_boundary_mask = PAGE_SIZE - 1, + .max_segment_size = PAGE_SIZE, + .max_segments = port->ring_cookies, + }; struct request_queue *q; struct gendisk *g; int err; @@ -824,7 +832,7 @@ static int probe_disk(struct vdc_port *port) if (err) return err; - g = blk_mq_alloc_disk(&port->tag_set, port); + g = blk_mq_alloc_disk(&port->tag_set, &lim, port); if (IS_ERR(g)) { printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n", port->vio.name); @@ -835,12 +843,6 @@ static int probe_disk(struct vdc_port *port) port->disk = g; q = g->queue; - /* Each segment in a request is up to an aligned page in size. */ - blk_queue_segment_boundary(q, PAGE_SIZE - 1); - blk_queue_max_segment_size(q, PAGE_SIZE); - - blk_queue_max_segments(q, port->ring_cookies); - blk_queue_max_hw_sectors(q, port->max_xfer_size); g->major = vdc_major; g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT; g->minors = 1 << PARTITION_SHIFT; @@ -872,8 +874,6 @@ static int probe_disk(struct vdc_port *port) } } - blk_queue_physical_block_size(q, port->vdisk_phys_blksz); - pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n", g->disk_name, port->vdisk_size, (port->vdisk_size >> (20 - 9)), diff --git a/drivers/block/swim.c b/drivers/block/swim.c index f85b6af414b4..6731678f3a41 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -820,7 +820,7 @@ static int swim_floppy_init(struct swim_priv *swd) goto exit_put_disks; swd->unit[drive].disk = - blk_mq_alloc_disk(&swd->unit[drive].tag_set, + blk_mq_alloc_disk(&swd->unit[drive].tag_set, NULL, &swd->unit[drive]); if (IS_ERR(swd->unit[drive].disk)) { blk_mq_free_tag_set(&swd->unit[drive].tag_set); @@ -916,7 +916,7 @@ out: return ret; } -static int swim_remove(struct platform_device *dev) +static void swim_remove(struct platform_device *dev) { struct swim_priv *swd = platform_get_drvdata(dev); int drive; @@ -937,13 +937,11 @@ static int swim_remove(struct platform_device *dev) release_mem_region(res->start, resource_size(res)); kfree(swd); - - return 0; } static struct platform_driver swim_driver = { .probe = swim_probe, - .remove = swim_remove, + .remove_new = swim_remove, .driver = { .name = CARDNAME, }, diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c2bc85826358..a04756ac778e 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1210,7 +1210,7 @@ static int swim3_attach(struct macio_dev *mdev, if (rc) goto out_unregister; - disk = blk_mq_alloc_disk(&fs->tag_set, fs); + disk = blk_mq_alloc_disk(&fs->tag_set, NULL, fs); if (IS_ERR(disk)) { rc = PTR_ERR(disk); goto out_free_tag_set; diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 1dfb2e77898b..bea3d5cf8a83 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -246,21 +246,12 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) return 0; } -static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +static void ublk_dev_param_zoned_apply(struct ublk_device *ub) { - const struct ublk_param_zoned *p = &ub->params.zoned; - - disk_set_zoned(ub->ub_disk); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ub->ub_disk->queue); blk_queue_required_elevator_features(ub->ub_disk->queue, ELEVATOR_F_ZBD_SEQ_WRITE); - disk_set_max_active_zones(ub->ub_disk, p->max_active_zones); - disk_set_max_open_zones(ub->ub_disk, p->max_open_zones); - blk_queue_max_zone_append_sectors(ub->ub_disk->queue, p->max_zone_append_sectors); - ub->ub_disk->nr_zones = ublk_get_nr_zones(ub); - - return 0; } /* Based on virtblk_alloc_report_buffer */ @@ -432,9 +423,8 @@ static int ublk_dev_param_zoned_validate(const struct ublk_device *ub) return -EOPNOTSUPP; } -static int ublk_dev_param_zoned_apply(struct ublk_device *ub) +static void ublk_dev_param_zoned_apply(struct ublk_device *ub) { - return -EOPNOTSUPP; } static int ublk_revalidate_disk_zones(struct ublk_device *ub) @@ -498,11 +488,6 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) struct request_queue *q = ub->ub_disk->queue; const struct ublk_param_basic *p = &ub->params.basic; - blk_queue_logical_block_size(q, 1 << p->logical_bs_shift); - blk_queue_physical_block_size(q, 1 << p->physical_bs_shift); - blk_queue_io_min(q, 1 << p->io_min_shift); - blk_queue_io_opt(q, 1 << p->io_opt_shift); - blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE, p->attrs & UBLK_ATTR_FUA); if (p->attrs & UBLK_ATTR_ROTATIONAL) @@ -510,29 +495,12 @@ static void ublk_dev_param_basic_apply(struct ublk_device *ub) else blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - blk_queue_max_hw_sectors(q, p->max_sectors); - blk_queue_chunk_sectors(q, p->chunk_sectors); - blk_queue_virt_boundary(q, p->virt_boundary_mask); - if (p->attrs & UBLK_ATTR_READ_ONLY) set_disk_ro(ub->ub_disk, true); set_capacity(ub->ub_disk, p->dev_sectors); } -static void ublk_dev_param_discard_apply(struct ublk_device *ub) -{ - struct request_queue *q = ub->ub_disk->queue; - const struct ublk_param_discard *p = &ub->params.discard; - - q->limits.discard_alignment = p->discard_alignment; - q->limits.discard_granularity = p->discard_granularity; - blk_queue_max_discard_sectors(q, p->max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, - p->max_write_zeroes_sectors); - blk_queue_max_discard_segments(q, p->max_discard_segments); -} - static int ublk_validate_params(const struct ublk_device *ub) { /* basic param is the only one which must be set */ @@ -576,20 +544,12 @@ static int ublk_validate_params(const struct ublk_device *ub) return 0; } -static int ublk_apply_params(struct ublk_device *ub) +static void ublk_apply_params(struct ublk_device *ub) { - if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) - return -EINVAL; - ublk_dev_param_basic_apply(ub); - if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) - ublk_dev_param_discard_apply(ub); - if (ub->params.types & UBLK_PARAM_TYPE_ZONED) - return ublk_dev_param_zoned_apply(ub); - - return 0; + ublk_dev_param_zoned_apply(ub); } static inline bool ublk_support_user_copy(const struct ublk_queue *ubq) @@ -645,14 +605,16 @@ static inline bool ublk_need_get_data(const struct ublk_queue *ubq) return ubq->flags & UBLK_F_NEED_GET_DATA; } -static struct ublk_device *ublk_get_device(struct ublk_device *ub) +/* Called in slow path only, keep it noinline for trace purpose */ +static noinline struct ublk_device *ublk_get_device(struct ublk_device *ub) { if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) return ub; return NULL; } -static void ublk_put_device(struct ublk_device *ub) +/* Called in slow path only, keep it noinline for trace purpose */ +static noinline void ublk_put_device(struct ublk_device *ub) { put_device(&ub->cdev_dev); } @@ -711,7 +673,7 @@ static void ublk_free_disk(struct gendisk *disk) struct ublk_device *ub = disk->private_data; clear_bit(UB_STATE_USED, &ub->state); - put_device(&ub->cdev_dev); + ublk_put_device(ub); } static void ublk_store_owner_uid_gid(unsigned int *owner_uid, @@ -2182,7 +2144,7 @@ static void ublk_remove(struct ublk_device *ub) cancel_work_sync(&ub->stop_work); cancel_work_sync(&ub->quiesce_work); cdev_device_del(&ub->cdev, &ub->cdev_dev); - put_device(&ub->cdev_dev); + ublk_put_device(ub); ublks_added--; } @@ -2205,12 +2167,47 @@ static struct ublk_device *ublk_get_device_from_id(int idx) static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) { const struct ublksrv_ctrl_cmd *header = io_uring_sqe_cmd(cmd->sqe); + const struct ublk_param_basic *p = &ub->params.basic; int ublksrv_pid = (int)header->data[0]; + struct queue_limits lim = { + .logical_block_size = 1 << p->logical_bs_shift, + .physical_block_size = 1 << p->physical_bs_shift, + .io_min = 1 << p->io_min_shift, + .io_opt = 1 << p->io_opt_shift, + .max_hw_sectors = p->max_sectors, + .chunk_sectors = p->chunk_sectors, + .virt_boundary_mask = p->virt_boundary_mask, + + }; struct gendisk *disk; int ret = -EINVAL; if (ublksrv_pid <= 0) return -EINVAL; + if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC)) + return -EINVAL; + + if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) { + const struct ublk_param_discard *pd = &ub->params.discard; + + lim.discard_alignment = pd->discard_alignment; + lim.discard_granularity = pd->discard_granularity; + lim.max_hw_discard_sectors = pd->max_discard_sectors; + lim.max_write_zeroes_sectors = pd->max_write_zeroes_sectors; + lim.max_discard_segments = pd->max_discard_segments; + } + + if (ub->params.types & UBLK_PARAM_TYPE_ZONED) { + const struct ublk_param_zoned *p = &ub->params.zoned; + + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) + return -EOPNOTSUPP; + + lim.zoned = true; + lim.max_active_zones = p->max_active_zones; + lim.max_open_zones = p->max_open_zones; + lim.max_zone_append_sectors = p->max_zone_append_sectors; + } if (wait_for_completion_interruptible(&ub->completion) != 0) return -EINTR; @@ -2222,7 +2219,7 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) goto out_unlock; } - disk = blk_mq_alloc_disk(&ub->tag_set, NULL); + disk = blk_mq_alloc_disk(&ub->tag_set, &lim, NULL); if (IS_ERR(disk)) { ret = PTR_ERR(disk); goto out_unlock; @@ -2234,15 +2231,13 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd) ub->dev_info.ublksrv_pid = ublksrv_pid; ub->ub_disk = disk; - ret = ublk_apply_params(ub); - if (ret) - goto out_put_disk; + ublk_apply_params(ub); /* don't probe partitions if any one ubq daemon is un-trusted */ if (ub->nr_privileged_daemon != ub->nr_queues_ready) set_bit(GD_SUPPRESS_PART_SCAN, &disk->state); - get_device(&ub->cdev_dev); + ublk_get_device(ub); ub->dev_info.state = UBLK_S_DEV_LIVE; if (ublk_dev_is_zoned(ub)) { @@ -2262,7 +2257,6 @@ out_put_cdev: ub->dev_info.state = UBLK_S_DEV_DEAD; ublk_put_device(ub); } -out_put_disk: if (ret) put_disk(disk); out_unlock: @@ -2474,7 +2468,7 @@ static inline bool ublk_idr_freed(int id) return ptr == NULL; } -static int ublk_ctrl_del_dev(struct ublk_device **p_ub) +static int ublk_ctrl_del_dev(struct ublk_device **p_ub, bool wait) { struct ublk_device *ub = *p_ub; int idx = ub->ub_number; @@ -2508,7 +2502,7 @@ static int ublk_ctrl_del_dev(struct ublk_device **p_ub) * - the device number is freed already, we will not find this * device via ublk_get_device_from_id() */ - if (wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) + if (wait && wait_event_interruptible(ublk_idr_wq, ublk_idr_freed(idx))) return -EINTR; return 0; } @@ -2907,7 +2901,10 @@ static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, ret = ublk_ctrl_add_dev(cmd); break; case UBLK_CMD_DEL_DEV: - ret = ublk_ctrl_del_dev(&ub); + ret = ublk_ctrl_del_dev(&ub, true); + break; + case UBLK_U_CMD_DEL_DEV_ASYNC: + ret = ublk_ctrl_del_dev(&ub, false); break; case UBLK_CMD_GET_QUEUE_AFFINITY: ret = ublk_ctrl_get_queue_affinity(ub, cmd); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2bf14a0e2815..42dea7601d87 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -720,25 +720,24 @@ fail_report: return ret; } -static int virtblk_probe_zoned_device(struct virtio_device *vdev, - struct virtio_blk *vblk, - struct request_queue *q) +static int virtblk_read_zoned_limits(struct virtio_blk *vblk, + struct queue_limits *lim) { + struct virtio_device *vdev = vblk->vdev; u32 v, wg; dev_dbg(&vdev->dev, "probing host-managed zoned device\n"); - disk_set_zoned(vblk->disk); - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); + lim->zoned = true; virtio_cread(vdev, struct virtio_blk_config, zoned.max_open_zones, &v); - disk_set_max_open_zones(vblk->disk, v); + lim->max_open_zones = v; dev_dbg(&vdev->dev, "max open zones = %u\n", v); virtio_cread(vdev, struct virtio_blk_config, zoned.max_active_zones, &v); - disk_set_max_active_zones(vblk->disk, v); + lim->max_active_zones = v; dev_dbg(&vdev->dev, "max active zones = %u\n", v); virtio_cread(vdev, struct virtio_blk_config, @@ -747,8 +746,8 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, dev_warn(&vdev->dev, "zero write granularity reported\n"); return -ENODEV; } - blk_queue_physical_block_size(q, wg); - blk_queue_io_min(q, wg); + lim->physical_block_size = wg; + lim->io_min = wg; dev_dbg(&vdev->dev, "write granularity = %u\n", wg); @@ -764,13 +763,13 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, vblk->zone_sectors); return -ENODEV; } - blk_queue_chunk_sectors(q, vblk->zone_sectors); + lim->chunk_sectors = vblk->zone_sectors; dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors); if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { dev_warn(&vblk->vdev->dev, "ignoring negotiated F_DISCARD for zoned device\n"); - blk_queue_max_discard_sectors(q, 0); + lim->max_hw_discard_sectors = 0; } virtio_cread(vdev, struct virtio_blk_config, @@ -785,25 +784,21 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, wg, v); return -ENODEV; } - blk_queue_max_zone_append_sectors(q, v); + lim->max_zone_append_sectors = v; dev_dbg(&vdev->dev, "max append sectors = %u\n", v); - return blk_revalidate_disk_zones(vblk->disk, NULL); + return 0; } - #else - /* - * Zoned block device support is not configured in this kernel. - * Host-managed zoned devices can't be supported, but others are - * good to go as regular block devices. + * Zoned block device support is not configured in this kernel, host-managed + * zoned devices can't be supported. */ #define virtblk_report_zones NULL - -static inline int virtblk_probe_zoned_device(struct virtio_device *vdev, - struct virtio_blk *vblk, struct request_queue *q) +static inline int virtblk_read_zoned_limits(struct virtio_blk *vblk, + struct queue_limits *lim) { - dev_err(&vdev->dev, + dev_err(&vblk->vdev->dev, "virtio_blk: zoned devices are not supported"); return -EOPNOTSUPP; } @@ -1248,31 +1243,17 @@ static const struct blk_mq_ops virtio_mq_ops = { static unsigned int virtblk_queue_depth; module_param_named(queue_depth, virtblk_queue_depth, uint, 0444); -static int virtblk_probe(struct virtio_device *vdev) +static int virtblk_read_limits(struct virtio_blk *vblk, + struct queue_limits *lim) { - struct virtio_blk *vblk; - struct request_queue *q; - int err, index; - + struct virtio_device *vdev = vblk->vdev; u32 v, blk_size, max_size, sg_elems, opt_io_size; u32 max_discard_segs = 0; u32 discard_granularity = 0; u16 min_io_size; u8 physical_block_exp, alignment_offset; - unsigned int queue_depth; size_t max_dma_size; - - if (!vdev->config->get) { - dev_err(&vdev->dev, "%s failure: config access disabled\n", - __func__); - return -EINVAL; - } - - err = ida_alloc_range(&vd_index_ida, 0, - minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL); - if (err < 0) - goto out; - index = err; + int err; /* We need to know how many segments before we allocate. */ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX, @@ -1286,78 +1267,11 @@ static int virtblk_probe(struct virtio_device *vdev) /* Prevent integer overflows and honor max vq size */ sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2); - vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); - if (!vblk) { - err = -ENOMEM; - goto out_free_index; - } - - mutex_init(&vblk->vdev_mutex); - - vblk->vdev = vdev; - - INIT_WORK(&vblk->config_work, virtblk_config_changed_work); - - err = init_vq(vblk); - if (err) - goto out_free_vblk; - - /* Default queue sizing is to fill the ring. */ - if (!virtblk_queue_depth) { - queue_depth = vblk->vqs[0].vq->num_free; - /* ... but without indirect descs, we use 2 descs per req */ - if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) - queue_depth /= 2; - } else { - queue_depth = virtblk_queue_depth; - } - - memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); - vblk->tag_set.ops = &virtio_mq_ops; - vblk->tag_set.queue_depth = queue_depth; - vblk->tag_set.numa_node = NUMA_NO_NODE; - vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; - vblk->tag_set.cmd_size = - sizeof(struct virtblk_req) + - sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; - vblk->tag_set.driver_data = vblk; - vblk->tag_set.nr_hw_queues = vblk->num_vqs; - vblk->tag_set.nr_maps = 1; - if (vblk->io_queues[HCTX_TYPE_POLL]) - vblk->tag_set.nr_maps = 3; - - err = blk_mq_alloc_tag_set(&vblk->tag_set); - if (err) - goto out_free_vq; - - vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk); - if (IS_ERR(vblk->disk)) { - err = PTR_ERR(vblk->disk); - goto out_free_tags; - } - q = vblk->disk->queue; - - virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); - - vblk->disk->major = major; - vblk->disk->first_minor = index_to_minor(index); - vblk->disk->minors = 1 << PART_BITS; - vblk->disk->private_data = vblk; - vblk->disk->fops = &virtblk_fops; - vblk->index = index; - - /* configure queue flush support */ - virtblk_update_cache_mode(vdev); - - /* If disk is read-only in the host, the guest should obey */ - if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) - set_disk_ro(vblk->disk, 1); - /* We can handle whatever the host told us to handle. */ - blk_queue_max_segments(q, sg_elems); + lim->max_segments = sg_elems; /* No real sector limit. */ - blk_queue_max_hw_sectors(q, UINT_MAX); + lim->max_hw_sectors = UINT_MAX; max_dma_size = virtio_max_dma_size(vdev); max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size; @@ -1369,7 +1283,7 @@ static int virtblk_probe(struct virtio_device *vdev) if (!err) max_size = min(max_size, v); - blk_queue_max_segment_size(q, max_size); + lim->max_segment_size = max_size; /* Host can optionally specify the block size of the device */ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE, @@ -1381,38 +1295,37 @@ static int virtblk_probe(struct virtio_device *vdev) dev_err(&vdev->dev, "virtio_blk: invalid block size: 0x%x\n", blk_size); - goto out_cleanup_disk; + return err; } - blk_queue_logical_block_size(q, blk_size); + lim->logical_block_size = blk_size; } else - blk_size = queue_logical_block_size(q); + blk_size = lim->logical_block_size; /* Use topology information if available */ err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, physical_block_exp, &physical_block_exp); if (!err && physical_block_exp) - blk_queue_physical_block_size(q, - blk_size * (1 << physical_block_exp)); + lim->physical_block_size = blk_size * (1 << physical_block_exp); err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, alignment_offset, &alignment_offset); if (!err && alignment_offset) - blk_queue_alignment_offset(q, blk_size * alignment_offset); + lim->alignment_offset = blk_size * alignment_offset; err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, min_io_size, &min_io_size); if (!err && min_io_size) - blk_queue_io_min(q, blk_size * min_io_size); + lim->io_min = blk_size * min_io_size; err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY, struct virtio_blk_config, opt_io_size, &opt_io_size); if (!err && opt_io_size) - blk_queue_io_opt(q, blk_size * opt_io_size); + lim->io_opt = blk_size * opt_io_size; if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { virtio_cread(vdev, struct virtio_blk_config, @@ -1420,7 +1333,7 @@ static int virtblk_probe(struct virtio_device *vdev) virtio_cread(vdev, struct virtio_blk_config, max_discard_sectors, &v); - blk_queue_max_discard_sectors(q, v ? v : UINT_MAX); + lim->max_hw_discard_sectors = v ? v : UINT_MAX; virtio_cread(vdev, struct virtio_blk_config, max_discard_seg, &max_discard_segs); @@ -1429,7 +1342,7 @@ static int virtblk_probe(struct virtio_device *vdev) if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) { virtio_cread(vdev, struct virtio_blk_config, max_write_zeroes_sectors, &v); - blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX); + lim->max_write_zeroes_sectors = v ? v : UINT_MAX; } /* The discard and secure erase limits are combined since the Linux @@ -1455,8 +1368,7 @@ static int virtblk_probe(struct virtio_device *vdev) if (!v) { dev_err(&vdev->dev, "virtio_blk: secure_erase_sector_alignment can't be 0\n"); - err = -EINVAL; - goto out_cleanup_disk; + return -EINVAL; } discard_granularity = min_not_zero(discard_granularity, v); @@ -1470,11 +1382,10 @@ static int virtblk_probe(struct virtio_device *vdev) if (!v) { dev_err(&vdev->dev, "virtio_blk: max_secure_erase_sectors can't be 0\n"); - err = -EINVAL; - goto out_cleanup_disk; + return -EINVAL; } - blk_queue_max_secure_erase_sectors(q, v); + lim->max_secure_erase_sectors = v; virtio_cread(vdev, struct virtio_blk_config, max_secure_erase_seg, &v); @@ -1485,8 +1396,7 @@ static int virtblk_probe(struct virtio_device *vdev) if (!v) { dev_err(&vdev->dev, "virtio_blk: max_secure_erase_seg can't be 0\n"); - err = -EINVAL; - goto out_cleanup_disk; + return -EINVAL; } max_discard_segs = min_not_zero(max_discard_segs, v); @@ -1502,45 +1412,142 @@ static int virtblk_probe(struct virtio_device *vdev) if (!max_discard_segs) max_discard_segs = sg_elems; - blk_queue_max_discard_segments(q, - min(max_discard_segs, MAX_DISCARD_SEGMENTS)); + lim->max_discard_segments = + min(max_discard_segs, MAX_DISCARD_SEGMENTS); if (discard_granularity) - q->limits.discard_granularity = discard_granularity << SECTOR_SHIFT; + lim->discard_granularity = + discard_granularity << SECTOR_SHIFT; else - q->limits.discard_granularity = blk_size; + lim->discard_granularity = blk_size; } - virtblk_update_capacity(vblk, false); - virtio_device_ready(vdev); - - /* - * All steps that follow use the VQs therefore they need to be - * placed after the virtio_device_ready() call above. - */ if (virtio_has_feature(vdev, VIRTIO_BLK_F_ZONED)) { u8 model; - virtio_cread(vdev, struct virtio_blk_config, zoned.model, - &model); + virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model); switch (model) { case VIRTIO_BLK_Z_NONE: case VIRTIO_BLK_Z_HA: - /* Present the host-aware device as non-zoned */ - break; + /* treat host-aware devices as non-zoned */ + return 0; case VIRTIO_BLK_Z_HM: - err = virtblk_probe_zoned_device(vdev, vblk, q); + err = virtblk_read_zoned_limits(vblk, lim); if (err) - goto out_cleanup_disk; + return err; break; default: - dev_err(&vdev->dev, "unsupported zone model %d\n", - model); - err = -EINVAL; - goto out_cleanup_disk; + dev_err(&vdev->dev, "unsupported zone model %d\n", model); + return -EINVAL; } } + return 0; +} + +static int virtblk_probe(struct virtio_device *vdev) +{ + struct virtio_blk *vblk; + struct queue_limits lim = { }; + int err, index; + unsigned int queue_depth; + + if (!vdev->config->get) { + dev_err(&vdev->dev, "%s failure: config access disabled\n", + __func__); + return -EINVAL; + } + + err = ida_alloc_range(&vd_index_ida, 0, + minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL); + if (err < 0) + goto out; + index = err; + + vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL); + if (!vblk) { + err = -ENOMEM; + goto out_free_index; + } + + mutex_init(&vblk->vdev_mutex); + + vblk->vdev = vdev; + + INIT_WORK(&vblk->config_work, virtblk_config_changed_work); + + err = init_vq(vblk); + if (err) + goto out_free_vblk; + + /* Default queue sizing is to fill the ring. */ + if (!virtblk_queue_depth) { + queue_depth = vblk->vqs[0].vq->num_free; + /* ... but without indirect descs, we use 2 descs per req */ + if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC)) + queue_depth /= 2; + } else { + queue_depth = virtblk_queue_depth; + } + + memset(&vblk->tag_set, 0, sizeof(vblk->tag_set)); + vblk->tag_set.ops = &virtio_mq_ops; + vblk->tag_set.queue_depth = queue_depth; + vblk->tag_set.numa_node = NUMA_NO_NODE; + vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + vblk->tag_set.cmd_size = + sizeof(struct virtblk_req) + + sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT; + vblk->tag_set.driver_data = vblk; + vblk->tag_set.nr_hw_queues = vblk->num_vqs; + vblk->tag_set.nr_maps = 1; + if (vblk->io_queues[HCTX_TYPE_POLL]) + vblk->tag_set.nr_maps = 3; + + err = blk_mq_alloc_tag_set(&vblk->tag_set); + if (err) + goto out_free_vq; + + err = virtblk_read_limits(vblk, &lim); + if (err) + goto out_free_tags; + + vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, &lim, vblk); + if (IS_ERR(vblk->disk)) { + err = PTR_ERR(vblk->disk); + goto out_free_tags; + } + + virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); + + vblk->disk->major = major; + vblk->disk->first_minor = index_to_minor(index); + vblk->disk->minors = 1 << PART_BITS; + vblk->disk->private_data = vblk; + vblk->disk->fops = &virtblk_fops; + vblk->index = index; + + /* configure queue flush support */ + virtblk_update_cache_mode(vdev); + + /* If disk is read-only in the host, the guest should obey */ + if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) + set_disk_ro(vblk->disk, 1); + + virtblk_update_capacity(vblk, false); + virtio_device_ready(vdev); + + /* + * All steps that follow use the VQs therefore they need to be + * placed after the virtio_device_ready() call above. + */ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && lim.zoned) { + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, vblk->disk->queue); + err = blk_revalidate_disk_zones(vblk->disk, NULL); + if (err) + goto out_cleanup_disk; + } + err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups); if (err) goto out_cleanup_disk; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 434fab306777..fd7c0ff2139c 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -941,39 +941,35 @@ static const struct blk_mq_ops blkfront_mq_ops = { .complete = blkif_complete_rq, }; -static void blkif_set_queue_limits(struct blkfront_info *info) +static void blkif_set_queue_limits(const struct blkfront_info *info, + struct queue_limits *lim) { - struct request_queue *rq = info->rq; - struct gendisk *gd = info->gd; unsigned int segments = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; - blk_queue_flag_set(QUEUE_FLAG_VIRT, rq); - if (info->feature_discard) { - blk_queue_max_discard_sectors(rq, get_capacity(gd)); - rq->limits.discard_granularity = info->discard_granularity ?: - info->physical_sector_size; - rq->limits.discard_alignment = info->discard_alignment; + lim->max_hw_discard_sectors = UINT_MAX; + if (info->discard_granularity) + lim->discard_granularity = info->discard_granularity; + lim->discard_alignment = info->discard_alignment; if (info->feature_secdiscard) - blk_queue_max_secure_erase_sectors(rq, - get_capacity(gd)); + lim->max_secure_erase_sectors = UINT_MAX; } /* Hard sector size and max sectors impersonate the equiv. hardware. */ - blk_queue_logical_block_size(rq, info->sector_size); - blk_queue_physical_block_size(rq, info->physical_sector_size); - blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512); + lim->logical_block_size = info->sector_size; + lim->physical_block_size = info->physical_sector_size; + lim->max_hw_sectors = (segments * XEN_PAGE_SIZE) / 512; /* Each segment in a request is up to an aligned page in size. */ - blk_queue_segment_boundary(rq, PAGE_SIZE - 1); - blk_queue_max_segment_size(rq, PAGE_SIZE); + lim->seg_boundary_mask = PAGE_SIZE - 1; + lim->max_segment_size = PAGE_SIZE; /* Ensure a merged request will fit in a single I/O ring slot. */ - blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG); + lim->max_segments = segments / GRANTS_PER_PSEG; /* Make sure buffer addresses are sector-aligned. */ - blk_queue_dma_alignment(rq, 511); + lim->dma_alignment = 511; } static const char *flush_info(struct blkfront_info *info) @@ -1070,6 +1066,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, u16 sector_size, unsigned int physical_sector_size) { + struct queue_limits lim = {}; struct gendisk *gd; int nr_minors = 1; int err; @@ -1136,11 +1133,13 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (err) goto out_release_minors; - gd = blk_mq_alloc_disk(&info->tag_set, info); + blkif_set_queue_limits(info, &lim); + gd = blk_mq_alloc_disk(&info->tag_set, &lim, info); if (IS_ERR(gd)) { err = PTR_ERR(gd); goto out_free_tag_set; } + blk_queue_flag_set(QUEUE_FLAG_VIRT, gd->queue); strcpy(gd->disk_name, DEV_NAME); ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); @@ -1162,7 +1161,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, info->gd = gd; info->sector_size = sector_size; info->physical_sector_size = physical_sector_size; - blkif_set_queue_limits(info); xlvbd_flush(info); @@ -2006,18 +2004,19 @@ static int blkfront_probe(struct xenbus_device *dev, static int blkif_recover(struct blkfront_info *info) { + struct queue_limits lim; unsigned int r_index; struct request *req, *n; int rc; struct bio *bio; - unsigned int segs; struct blkfront_ring_info *rinfo; + lim = queue_limits_start_update(info->rq); blkfront_gather_backend_features(info); - /* Reset limits changed by blk_mq_update_nr_hw_queues(). */ - blkif_set_queue_limits(info); - segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; - blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG); + blkif_set_queue_limits(info, &lim); + rc = queue_limits_commit_update(info->rq, &lim); + if (rc) + return rc; for_each_rinfo(info, rinfo, r_index) { rc = blkfront_setup_indirect(rinfo); @@ -2037,7 +2036,9 @@ static int blkif_recover(struct blkfront_info *info) list_for_each_entry_safe(req, n, &info->requests, queuelist) { /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); - BUG_ON(req->nr_phys_segments > segs); + BUG_ON(req->nr_phys_segments > + (info->max_indirect_segments ? : + BLKIF_MAX_SEGMENTS_PER_REQUEST)); blk_mq_requeue_request(req, false); } blk_mq_start_stopped_hw_queues(info->rq, true); diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 11493167b0a8..7c5f4e4d9b50 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -318,7 +318,7 @@ static int z2ram_register_disk(int minor) struct gendisk *disk; int err; - disk = blk_mq_alloc_disk(&tag_set, NULL); + disk = blk_mq_alloc_disk(&tag_set, NULL, NULL); if (IS_ERR(disk)) return PTR_ERR(disk); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d96b3851b5d3..da7a20fa6152 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2177,6 +2177,28 @@ ATTRIBUTE_GROUPS(zram_disk); */ static int zram_add(void) { + struct queue_limits lim = { + .logical_block_size = ZRAM_LOGICAL_BLOCK_SIZE, + /* + * To ensure that we always get PAGE_SIZE aligned and + * n*PAGE_SIZED sized I/O requests. + */ + .physical_block_size = PAGE_SIZE, + .io_min = PAGE_SIZE, + .io_opt = PAGE_SIZE, + .max_hw_discard_sectors = UINT_MAX, + /* + * zram_bio_discard() will clear all logical blocks if logical + * block size is identical with physical block size(PAGE_SIZE). + * But if it is different, we will skip discarding some parts of + * logical blocks in the part of the request range which isn't + * aligned to physical block size. So we can't ensure that all + * discarded logical blocks are zeroed. + */ +#if ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE + .max_write_zeroes_sectors = UINT_MAX, +#endif + }; struct zram *zram; int ret, device_id; @@ -2195,11 +2217,11 @@ static int zram_add(void) #endif /* gendisk structure */ - zram->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!zram->disk) { + zram->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(zram->disk)) { pr_err("Error allocating disk structure for device %d\n", device_id); - ret = -ENOMEM; + ret = PTR_ERR(zram->disk); goto out_free_idr; } @@ -2216,29 +2238,6 @@ static int zram_add(void) /* zram devices sort of resembles non-rotational disks */ blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, zram->disk->queue); - - /* - * To ensure that we always get PAGE_SIZE aligned - * and n*PAGE_SIZED sized I/O requests. - */ - blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE); - blk_queue_logical_block_size(zram->disk->queue, - ZRAM_LOGICAL_BLOCK_SIZE); - blk_queue_io_min(zram->disk->queue, PAGE_SIZE); - blk_queue_io_opt(zram->disk->queue, PAGE_SIZE); - blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX); - - /* - * zram_bio_discard() will clear all logical blocks if logical block - * size is identical with physical block size(PAGE_SIZE). But if it is - * different, we will skip discarding some parts of logical blocks in - * the part of the request range which isn't aligned to physical block - * size. So we can't ensure that all discarded logical blocks are - * zeroed. - */ - if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) - blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index d668b174ace9..eefdd422ad8e 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -724,11 +724,6 @@ static void probe_gdrom_setupdisk(void) static int probe_gdrom_setupqueue(void) { - blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR); - /* using DMA so memory will need to be contiguous */ - blk_queue_max_segments(gd.gdrom_rq, 1); - /* set a large max size to get most from DMA */ - blk_queue_max_segment_size(gd.gdrom_rq, 0x40000); gd.disk->queue = gd.gdrom_rq; return gdrom_init_dma_mode(); } @@ -743,6 +738,13 @@ static const struct blk_mq_ops gdrom_mq_ops = { */ static int probe_gdrom(struct platform_device *devptr) { + struct queue_limits lim = { + .logical_block_size = GDROM_HARD_SECTOR, + /* using DMA so memory will need to be contiguous */ + .max_segments = 1, + /* set a large max size to get most from DMA */ + .max_segment_size = 0x40000, + }; int err; /* @@ -778,7 +780,7 @@ static int probe_gdrom(struct platform_device *devptr) if (err) goto probe_fail_free_cd_info; - gd.disk = blk_mq_alloc_disk(&gd.tag_set, NULL); + gd.disk = blk_mq_alloc_disk(&gd.tag_set, &lim, NULL); if (IS_ERR(gd.disk)) { err = PTR_ERR(gd.disk); goto probe_fail_free_tag_set; @@ -829,7 +831,7 @@ probe_fail_no_mem: return err; } -static int remove_gdrom(struct platform_device *devptr) +static void remove_gdrom(struct platform_device *devptr) { blk_mq_free_tag_set(&gd.tag_set); free_irq(HW_EVENT_GDROM_CMD, &gd); @@ -840,13 +842,11 @@ static int remove_gdrom(struct platform_device *devptr) unregister_cdrom(gd.cd_info); kfree(gd.cd_info); kfree(gd.toc); - - return 0; } static struct platform_driver gdrom_driver = { .probe = probe_gdrom, - .remove = remove_gdrom, + .remove_new = remove_gdrom, .driver = { .name = GDROM_DEV_NAME, }, diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index d00b3abab133..330bcd9ea4a9 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -900,9 +900,23 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, struct request_queue *q; const size_t max_stripes = min_t(size_t, INT_MAX, SIZE_MAX / sizeof(atomic_t)); + struct queue_limits lim = { + .max_hw_sectors = UINT_MAX, + .max_sectors = UINT_MAX, + .max_segment_size = UINT_MAX, + .max_segments = BIO_MAX_VECS, + .max_hw_discard_sectors = UINT_MAX, + .io_min = block_size, + .logical_block_size = block_size, + .physical_block_size = block_size, + }; uint64_t n; int idx; + if (cached_bdev) { + d->stripe_size = bdev_io_opt(cached_bdev) >> SECTOR_SHIFT; + lim.io_opt = umax(block_size, bdev_io_opt(cached_bdev)); + } if (!d->stripe_size) d->stripe_size = 1 << 31; else if (d->stripe_size < BCH_MIN_STRIPE_SZ) @@ -935,8 +949,21 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, BIOSET_NEED_BVECS|BIOSET_NEED_RESCUER)) goto out_ida_remove; - d->disk = blk_alloc_disk(NUMA_NO_NODE); - if (!d->disk) + if (lim.logical_block_size > PAGE_SIZE && cached_bdev) { + /* + * This should only happen with BCACHE_SB_VERSION_BDEV. + * Block/page size is checked for BCACHE_SB_VERSION_CDEV. + */ + pr_info("bcache%i: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n", + idx, lim.logical_block_size, + PAGE_SIZE, bdev_logical_block_size(cached_bdev)); + + /* This also adjusts physical block size/min io size if needed */ + lim.logical_block_size = bdev_logical_block_size(cached_bdev); + } + + d->disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(d->disk)) goto out_bioset_exit; set_capacity(d->disk, sectors); @@ -949,27 +976,6 @@ static int bcache_device_init(struct bcache_device *d, unsigned int block_size, d->disk->private_data = d; q = d->disk->queue; - q->limits.max_hw_sectors = UINT_MAX; - q->limits.max_sectors = UINT_MAX; - q->limits.max_segment_size = UINT_MAX; - q->limits.max_segments = BIO_MAX_VECS; - blk_queue_max_discard_sectors(q, UINT_MAX); - q->limits.io_min = block_size; - q->limits.logical_block_size = block_size; - q->limits.physical_block_size = block_size; - - if (q->limits.logical_block_size > PAGE_SIZE && cached_bdev) { - /* - * This should only happen with BCACHE_SB_VERSION_BDEV. - * Block/page size is checked for BCACHE_SB_VERSION_CDEV. - */ - pr_info("%s: sb/logical block size (%u) greater than page size (%lu) falling back to device logical block size (%u)\n", - d->disk->disk_name, q->limits.logical_block_size, - PAGE_SIZE, bdev_logical_block_size(cached_bdev)); - - /* This also adjusts physical block size/min io size if needed */ - blk_queue_logical_block_size(q, bdev_logical_block_size(cached_bdev)); - } blk_queue_flag_set(QUEUE_FLAG_NONROT, d->disk->queue); @@ -1416,9 +1422,7 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) hlist_add_head(&io->hash, dc->io_hash + RECENT_IO); } - dc->disk.stripe_size = q->limits.io_opt >> 9; - - if (dc->disk.stripe_size) + if (bdev_io_opt(dc->bdev)) dc->partial_stripes_expensive = q->limits.raid_partial_stripes_expensive; @@ -1428,9 +1432,6 @@ static int cached_dev_init(struct cached_dev *dc, unsigned int block_size) if (ret) return ret; - blk_queue_io_opt(dc->disk.disk->queue, - max(queue_io_opt(dc->disk.disk->queue), queue_io_opt(q))); - atomic_set(&dc->io_errors, 0); dc->io_disable = false; dc->error_limit = DEFAULT_CACHED_DEV_ERROR_LIMIT; diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index eb009d6bb03a..17e9af60bbf7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -213,6 +213,7 @@ struct raid_dev { #define RT_FLAG_RS_IN_SYNC 6 #define RT_FLAG_RS_RESYNCING 7 #define RT_FLAG_RS_GROW 8 +#define RT_FLAG_RS_FROZEN 9 /* Array elements of 64 bit needed for rebuild/failed disk bits */ #define DISKS_ARRAY_ELEMS ((MAX_RAID_DEVICES + (sizeof(uint64_t) * 8 - 1)) / sizeof(uint64_t) / 8) @@ -3240,11 +3241,12 @@ size_check: rs->md.ro = 1; rs->md.in_sync = 1; - /* Keep array frozen until resume. */ - set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); - /* Has to be held on running the array */ mddev_suspend_and_lock_nointr(&rs->md); + + /* Keep array frozen until resume. */ + md_frozen_sync_thread(&rs->md); + r = md_run(&rs->md); rs->md.in_sync = 0; /* Assume already marked dirty */ if (r) { @@ -3339,7 +3341,8 @@ static int raid_map(struct dm_target *ti, struct bio *bio) if (unlikely(bio_end_sector(bio) > mddev->array_sectors)) return DM_MAPIO_REQUEUE; - md_handle_request(mddev, bio); + if (unlikely(!md_handle_request(mddev, bio))) + return DM_MAPIO_REQUEUE; return DM_MAPIO_SUBMITTED; } @@ -3718,21 +3721,33 @@ static int raid_message(struct dm_target *ti, unsigned int argc, char **argv, { struct raid_set *rs = ti->private; struct mddev *mddev = &rs->md; + int ret = 0; if (!mddev->pers || !mddev->pers->sync_request) return -EINVAL; - if (!strcasecmp(argv[0], "frozen")) - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - else - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (test_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags) || + test_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags)) + return -EBUSY; - if (!strcasecmp(argv[0], "idle") || !strcasecmp(argv[0], "frozen")) { - if (mddev->sync_thread) { - set_bit(MD_RECOVERY_INTR, &mddev->recovery); - md_reap_sync_thread(mddev); - } - } else if (decipher_sync_action(mddev, mddev->recovery) != st_idle) + if (!strcasecmp(argv[0], "frozen")) { + ret = mddev_lock(mddev); + if (ret) + return ret; + + md_frozen_sync_thread(mddev); + mddev_unlock(mddev); + } else if (!strcasecmp(argv[0], "idle")) { + ret = mddev_lock(mddev); + if (ret) + return ret; + + md_idle_sync_thread(mddev); + mddev_unlock(mddev); + } + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + if (decipher_sync_action(mddev, mddev->recovery) != st_idle) return -EBUSY; else if (!strcasecmp(argv[0], "resync")) ; /* MD_RECOVERY_NEEDED set below */ @@ -3791,15 +3806,46 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); } +static void raid_presuspend(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + struct mddev *mddev = &rs->md; + + /* + * From now on, disallow raid_message() to change sync_thread until + * resume, raid_postsuspend() is too late. + */ + set_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); + + if (!reshape_interrupted(mddev)) + return; + + /* + * For raid456, if reshape is interrupted, IO across reshape position + * will never make progress, while caller will wait for IO to be done. + * Inform raid456 to handle those IO to prevent deadlock. + */ + if (mddev->pers && mddev->pers->prepare_suspend) + mddev->pers->prepare_suspend(mddev); +} + +static void raid_presuspend_undo(struct dm_target *ti) +{ + struct raid_set *rs = ti->private; + + clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); +} + static void raid_postsuspend(struct dm_target *ti) { struct raid_set *rs = ti->private; if (!test_and_set_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { - /* Writes have to be stopped before suspending to avoid deadlocks. */ - if (!test_bit(MD_RECOVERY_FROZEN, &rs->md.recovery)) - md_stop_writes(&rs->md); - + /* + * sync_thread must be stopped during suspend, and writes have + * to be stopped before suspending to avoid deadlocks. + */ + md_stop_writes(&rs->md); mddev_suspend(&rs->md, false); } } @@ -4012,8 +4058,6 @@ static int raid_preresume(struct dm_target *ti) } /* Check for any resize/reshape on @rs and adjust/initiate */ - /* Be prepared for mddev_resume() in raid_resume() */ - set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); if (mddev->recovery_cp && mddev->recovery_cp < MaxSector) { set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); mddev->resync_min = mddev->recovery_cp; @@ -4047,7 +4091,9 @@ static void raid_resume(struct dm_target *ti) * Take this opportunity to check whether any failed * devices are reachable again. */ + mddev_lock_nointr(mddev); attempt_restore_of_faulty_devices(rs); + mddev_unlock(mddev); } if (test_and_clear_bit(RT_FLAG_RS_SUSPENDED, &rs->runtime_flags)) { @@ -4055,10 +4101,13 @@ static void raid_resume(struct dm_target *ti) if (mddev->delta_disks < 0) rs_set_capacity(rs); + WARN_ON_ONCE(!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)); + WARN_ON_ONCE(test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)); + clear_bit(RT_FLAG_RS_FROZEN, &rs->runtime_flags); mddev_lock_nointr(mddev); - clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); mddev->ro = 0; mddev->in_sync = 0; + md_unfrozen_sync_thread(mddev); mddev_unlock_and_resume(mddev); } } @@ -4074,6 +4123,8 @@ static struct target_type raid_target = { .message = raid_message, .iterate_devices = raid_iterate_devices, .io_hints = raid_io_hints, + .presuspend = raid_presuspend, + .presuspend_undo = raid_presuspend_undo, .postsuspend = raid_postsuspend, .preresume = raid_preresume, .resume = raid_resume, diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 41f1d731ae5a..88114719fe18 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1963,26 +1963,27 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, bool wc = false, fua = false; int r; - /* - * Copy table's limits to the DM device's request_queue - */ - q->limits = *limits; - if (dm_table_supports_nowait(t)) blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); else blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, q); if (!dm_table_supports_discards(t)) { - q->limits.max_discard_sectors = 0; - q->limits.max_hw_discard_sectors = 0; - q->limits.discard_granularity = 0; - q->limits.discard_alignment = 0; - q->limits.discard_misaligned = 0; + limits->max_hw_discard_sectors = 0; + limits->discard_granularity = 0; + limits->discard_alignment = 0; + limits->discard_misaligned = 0; } + if (!dm_table_supports_write_zeroes(t)) + limits->max_write_zeroes_sectors = 0; + if (!dm_table_supports_secure_erase(t)) - q->limits.max_secure_erase_sectors = 0; + limits->max_secure_erase_sectors = 0; + + r = queue_limits_set(q, limits); + if (r) + return r; if (dm_table_supports_flush(t, (1UL << QUEUE_FLAG_WC))) { wc = true; @@ -2007,9 +2008,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, else blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - if (!dm_table_supports_write_zeroes(t)) - q->limits.max_write_zeroes_sectors = 0; - dm_table_verify_integrity(t); /* @@ -2047,7 +2045,6 @@ int dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, } dm_update_crypto_profile(q, t); - disk_update_readahead(t->md->disk); /* * Check for request-based device is left to diff --git a/drivers/md/dm-zoned-metadata.c b/drivers/md/dm-zoned-metadata.c index fdfe30f7b697..8156881a31de 100644 --- a/drivers/md/dm-zoned-metadata.c +++ b/drivers/md/dm-zoned-metadata.c @@ -1655,10 +1655,13 @@ static int dmz_reset_zone(struct dmz_metadata *zmd, struct dm_zone *zone) if (!dmz_is_empty(zone) || dmz_seq_write_err(zone)) { struct dmz_dev *dev = zone->dev; + unsigned int noio_flag; + noio_flag = memalloc_noio_save(); ret = blkdev_zone_mgmt(dev->bdev, REQ_OP_ZONE_RESET, dmz_start_sect(zmd, zone), - zmd->zone_nr_sectors, GFP_NOIO); + zmd->zone_nr_sectors); + memalloc_noio_restore(noio_flag); if (ret) { dmz_dev_err(dev, "Reset zone %u failed %d", zone->id, ret); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 87de5b5682ad..447e132d09b5 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2101,8 +2101,8 @@ static struct mapped_device *alloc_dev(int minor) * established. If request-based table is loaded: blk-mq will * override accordingly. */ - md->disk = blk_alloc_disk(md->numa_node_id); - if (!md->disk) + md->disk = blk_alloc_disk(NULL, md->numa_node_id); + if (IS_ERR(md->disk)) goto bad; md->queue = md->disk->queue; diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 9672f75c3050..059afc24c08b 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -234,7 +234,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap, sector_t doff; bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev; - if (pg_index == store->file_pages - 1) { + /* we compare length (page numbers), not page offset. */ + if ((pg_index - store->sb_index) == store->file_pages - 1) { unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1); if (last_page_size == 0) @@ -438,8 +439,8 @@ static void filemap_write_page(struct bitmap *bitmap, unsigned long pg_index, struct page *page = store->filemap[pg_index]; if (mddev_is_clustered(bitmap->mddev)) { - pg_index += bitmap->cluster_slot * - DIV_ROUND_UP(store->bytes, PAGE_SIZE); + /* go to node bitmap area starting point */ + pg_index += store->sb_index; } if (store->file) @@ -952,6 +953,7 @@ static void md_bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; + index += store->sb_index; if (mddev_is_clustered(bitmap->mddev)) node_offset = bitmap->cluster_slot * store->file_pages; @@ -982,6 +984,7 @@ static void md_bitmap_file_clear_bit(struct bitmap *bitmap, sector_t block) unsigned long index = file_page_index(store, chunk); unsigned long node_offset = 0; + index += store->sb_index; if (mddev_is_clustered(bitmap->mddev)) node_offset = bitmap->cluster_slot * store->file_pages; @@ -1043,9 +1046,8 @@ void md_bitmap_unplug(struct bitmap *bitmap) if (dirty || need_write) { if (!writing) { md_bitmap_wait_writes(bitmap); - if (bitmap->mddev->queue) - blk_add_trace_msg(bitmap->mddev->queue, - "md bitmap_unplug"); + mddev_add_trace_msg(bitmap->mddev, + "md bitmap_unplug"); } clear_page_attr(bitmap, i, BITMAP_PAGE_PENDING); filemap_write_page(bitmap, i, false); @@ -1316,9 +1318,7 @@ void md_bitmap_daemon_work(struct mddev *mddev) } bitmap->allclean = 1; - if (bitmap->mddev->queue) - blk_add_trace_msg(bitmap->mddev->queue, - "md bitmap_daemon_work"); + mddev_add_trace_msg(bitmap->mddev, "md bitmap_daemon_work"); /* Any file-page which is PENDING now needs to be written. * So set NEEDWRITE now, then after we make any last-minute changes diff --git a/drivers/md/md-linear.h b/drivers/md/md-linear.h deleted file mode 100644 index 5587eeedb882..000000000000 --- a/drivers/md/md-linear.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINEAR_H -#define _LINEAR_H - -struct dev_info { - struct md_rdev *rdev; - sector_t end_sector; -}; - -struct linear_conf -{ - struct rcu_head rcu; - sector_t array_sectors; - int raid_disks; /* a copy of mddev->raid_disks */ - struct dev_info disks[] __counted_by(raid_disks); -}; -#endif diff --git a/drivers/md/md-multipath.h b/drivers/md/md-multipath.h deleted file mode 100644 index b3099e5fc4d7..000000000000 --- a/drivers/md/md-multipath.h +++ /dev/null @@ -1,32 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _MULTIPATH_H -#define _MULTIPATH_H - -struct multipath_info { - struct md_rdev *rdev; -}; - -struct mpconf { - struct mddev *mddev; - struct multipath_info *multipaths; - int raid_disks; - spinlock_t device_lock; - struct list_head retry_list; - - mempool_t pool; -}; - -/* - * this is our 'private' 'collective' MULTIPATH buffer head. - * it contains information about what kind of IO operations were started - * for this MULTIPATH operation, and about their status: - */ - -struct multipath_bh { - struct mddev *mddev; - struct bio *master_bio; - struct bio bio; - int path; - struct list_head retry_list; -}; -#endif diff --git a/drivers/md/md.c b/drivers/md/md.c index 3bf7eab94efd..e575e74aabf5 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -65,7 +65,6 @@ #include <linux/percpu-refcount.h> #include <linux/part_stat.h> -#include <trace/events/block.h> #include "md.h" #include "md-bitmap.h" #include "md-cluster.h" @@ -99,18 +98,6 @@ static void mddev_detach(struct mddev *mddev); static void export_rdev(struct md_rdev *rdev, struct mddev *mddev); static void md_wakeup_thread_directly(struct md_thread __rcu *thread); -enum md_ro_state { - MD_RDWR, - MD_RDONLY, - MD_AUTO_READ, - MD_MAX_STATE -}; - -static bool md_is_rdwr(struct mddev *mddev) -{ - return (mddev->ro == MD_RDWR); -} - /* * Default number of read corrections we'll attempt on an rdev * before ejecting it from the array. We divide the read error @@ -378,7 +365,7 @@ static bool is_suspended(struct mddev *mddev, struct bio *bio) return true; } -void md_handle_request(struct mddev *mddev, struct bio *bio) +bool md_handle_request(struct mddev *mddev, struct bio *bio) { check_suspended: if (is_suspended(mddev, bio)) { @@ -386,7 +373,7 @@ check_suspended: /* Bail out if REQ_NOWAIT is set for the bio */ if (bio->bi_opf & REQ_NOWAIT) { bio_wouldblock_error(bio); - return; + return true; } for (;;) { prepare_to_wait(&mddev->sb_wait, &__wait, @@ -402,10 +389,13 @@ check_suspended: if (!mddev->pers->make_request(mddev, bio)) { percpu_ref_put(&mddev->active_io); + if (!mddev->gendisk && mddev->pers->prepare_suspend) + return false; goto check_suspended; } percpu_ref_put(&mddev->active_io); + return true; } EXPORT_SYMBOL(md_handle_request); @@ -529,6 +519,24 @@ void mddev_resume(struct mddev *mddev) } EXPORT_SYMBOL_GPL(mddev_resume); +/* sync bdev before setting device to readonly or stopping raid*/ +static int mddev_set_closing_and_sync_blockdev(struct mddev *mddev, int opener_num) +{ + mutex_lock(&mddev->open_mutex); + if (mddev->pers && atomic_read(&mddev->openers) > opener_num) { + mutex_unlock(&mddev->open_mutex); + return -EBUSY; + } + if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { + mutex_unlock(&mddev->open_mutex); + return -EBUSY; + } + mutex_unlock(&mddev->open_mutex); + + sync_blockdev(mddev->gendisk->part0); + return 0; +} + /* * Generic flush handling for md */ @@ -2406,7 +2414,7 @@ int md_integrity_register(struct mddev *mddev) if (list_empty(&mddev->disks)) return 0; /* nothing to do */ - if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) + if (mddev_is_dm(mddev) || blk_get_integrity(mddev->gendisk)) return 0; /* shouldn't register, or already is */ rdev_for_each(rdev, mddev) { /* skip spares and non-functional disks */ @@ -2459,7 +2467,7 @@ int md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { struct blk_integrity *bi_mddev; - if (!mddev->gendisk) + if (mddev_is_dm(mddev)) return 0; bi_mddev = blk_get_integrity(mddev->gendisk); @@ -2566,6 +2574,7 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) fail: pr_warn("md: failed to register dev-%s for %s\n", b, mdname(mddev)); + mddev_destroy_serial_pool(mddev, rdev); return err; } @@ -2595,7 +2604,7 @@ static void md_kick_rdev_from_array(struct md_rdev *rdev) list_del_rcu(&rdev->same_set); pr_debug("md: unbind<%pg>\n", rdev->bdev); mddev_destroy_serial_pool(rdev->mddev, rdev); - rdev->mddev = NULL; + WRITE_ONCE(rdev->mddev, NULL); sysfs_remove_link(&rdev->kobj, "block"); sysfs_put(rdev->sysfs_state); sysfs_put(rdev->sysfs_unack_badblocks); @@ -2851,8 +2860,7 @@ repeat: pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", mdname(mddev), mddev->in_sync); - if (mddev->queue) - blk_add_trace_msg(mddev->queue, "md md_update_sb"); + mddev_add_trace_msg(mddev, "md md_update_sb"); rewrite: md_bitmap_update_sb(mddev->bitmap); rdev_for_each(rdev, mddev) { @@ -2933,7 +2941,6 @@ static int add_bound_rdev(struct md_rdev *rdev) set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); md_new_event(); - md_wakeup_thread(mddev->thread); return 0; } @@ -3048,10 +3055,8 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) if (err == 0) { md_kick_rdev_from_array(rdev); - if (mddev->pers) { + if (mddev->pers) set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - md_wakeup_thread(mddev->thread); - } md_new_event(); } } @@ -3081,7 +3086,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) clear_bit(BlockedBadBlocks, &rdev->flags); wake_up(&rdev->blocked_wait); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); err = 0; } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { @@ -3119,7 +3123,6 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) !test_bit(Replacement, &rdev->flags)) set_bit(WantReplacement, &rdev->flags); set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); err = 0; } else if (cmd_match(buf, "-want_replacement")) { /* Clearing 'want_replacement' is always allowed. @@ -3249,7 +3252,6 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) if (rdev->raid_disk >= 0) return -EBUSY; set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - md_wakeup_thread(rdev->mddev->thread); } else if (rdev->mddev->pers) { /* Activating a spare .. or possibly reactivating * if we ever get bitmaps working here. @@ -3343,8 +3345,7 @@ static ssize_t new_offset_store(struct md_rdev *rdev, if (kstrtoull(buf, 10, &new_offset) < 0) return -EINVAL; - if (mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING,&mddev->recovery)) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (new_offset == rdev->data_offset) /* reset is always permitted */ @@ -3675,7 +3676,7 @@ rdev_attr_store(struct kobject *kobj, struct attribute *attr, struct kernfs_node *kn = NULL; bool suspend = false; ssize_t rv; - struct mddev *mddev = rdev->mddev; + struct mddev *mddev = READ_ONCE(rdev->mddev); if (!entry->store) return -EIO; @@ -4017,8 +4018,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) */ rv = -EBUSY; - if (mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || mddev->reshape_position != MaxSector || mddev->sysfs_active) goto out_unlock; @@ -4168,7 +4168,6 @@ level_store(struct mddev *mddev, const char *buf, size_t len) mddev->in_sync = 1; del_timer_sync(&mddev->safemode_timer); } - blk_set_stacking_limits(&mddev->queue->limits); pers->run(mddev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); if (!mddev->thread) @@ -4475,8 +4474,8 @@ array_state_show(struct mddev *mddev, char *page) return sprintf(page, "%s\n", array_states[st]); } -static int do_md_stop(struct mddev *mddev, int ro, struct block_device *bdev); -static int md_set_readonly(struct mddev *mddev, struct block_device *bdev); +static int do_md_stop(struct mddev *mddev, int ro); +static int md_set_readonly(struct mddev *mddev); static int restart_array(struct mddev *mddev); static ssize_t @@ -4493,6 +4492,17 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) case broken: /* cannot be set */ case bad_word: return -EINVAL; + case clear: + case readonly: + case inactive: + case read_auto: + if (!mddev->pers || !md_is_rdwr(mddev)) + break; + /* write sysfs will not open mddev and opener should be 0 */ + err = mddev_set_closing_and_sync_blockdev(mddev, 0); + if (err) + return err; + break; default: break; } @@ -4526,14 +4536,14 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) case inactive: /* stop an active array, return 0 otherwise */ if (mddev->pers) - err = do_md_stop(mddev, 2, NULL); + err = do_md_stop(mddev, 2); break; case clear: - err = do_md_stop(mddev, 0, NULL); + err = do_md_stop(mddev, 0); break; case readonly: if (mddev->pers) - err = md_set_readonly(mddev, NULL); + err = md_set_readonly(mddev); else { mddev->ro = MD_RDONLY; set_disk_ro(mddev->gendisk, 1); @@ -4543,7 +4553,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) case read_auto: if (mddev->pers) { if (md_is_rdwr(mddev)) - err = md_set_readonly(mddev, NULL); + err = md_set_readonly(mddev); else if (mddev->ro == MD_RDONLY) err = restart_array(mddev); if (err == 0) { @@ -4592,6 +4602,11 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) sysfs_notify_dirent_safe(mddev->sysfs_state); } mddev_unlock(mddev); + + if (st == readonly || st == read_auto || st == inactive || + (err && st == clear)) + clear_bit(MD_CLOSING, &mddev->flags); + return err ?: len; } static struct md_sysfs_entry md_array_state = @@ -4919,6 +4934,35 @@ static void stop_sync_thread(struct mddev *mddev, bool locked, bool check_seq) mddev_lock_nointr(mddev); } +void md_idle_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true, true); +} +EXPORT_SYMBOL_GPL(md_idle_sync_thread); + +void md_frozen_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true, false); +} +EXPORT_SYMBOL_GPL(md_frozen_sync_thread); + +void md_unfrozen_sync_thread(struct mddev *mddev) +{ + lockdep_assert_held(&mddev->reconfig_mutex); + + clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); + md_wakeup_thread(mddev->thread); + sysfs_notify_dirent_safe(mddev->sysfs_action); +} +EXPORT_SYMBOL_GPL(md_unfrozen_sync_thread); + static void idle_sync_thread(struct mddev *mddev) { mutex_lock(&mddev->sync_mutex); @@ -5710,6 +5754,51 @@ static const struct kobj_type md_ktype = { int mdp_major = 0; +/* stack the limit for all rdevs into lim */ +void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim) +{ + struct md_rdev *rdev; + + rdev_for_each(rdev, mddev) { + queue_limits_stack_bdev(lim, rdev->bdev, rdev->data_offset, + mddev->gendisk->disk_name); + } +} +EXPORT_SYMBOL_GPL(mddev_stack_rdev_limits); + +/* apply the extra stacking limits from a new rdev into mddev */ +int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev) +{ + struct queue_limits lim; + + if (mddev_is_dm(mddev)) + return 0; + + lim = queue_limits_start_update(mddev->gendisk->queue); + queue_limits_stack_bdev(&lim, rdev->bdev, rdev->data_offset, + mddev->gendisk->disk_name); + return queue_limits_commit_update(mddev->gendisk->queue, &lim); +} +EXPORT_SYMBOL_GPL(mddev_stack_new_rdev); + +/* update the optimal I/O size after a reshape */ +void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes) +{ + struct queue_limits lim; + + if (mddev_is_dm(mddev)) + return; + + /* don't bother updating io_opt if we can't suspend the array */ + if (mddev_suspend(mddev, false) < 0) + return; + lim = queue_limits_start_update(mddev->gendisk->queue); + lim.io_opt = lim.io_min * nr_stripes; + queue_limits_commit_update(mddev->gendisk->queue, &lim); + mddev_resume(mddev); +} +EXPORT_SYMBOL_GPL(mddev_update_io_opt); + static void mddev_delayed_delete(struct work_struct *ws) { struct mddev *mddev = container_of(ws, struct mddev, del_work); @@ -5774,10 +5863,11 @@ struct mddev *md_alloc(dev_t dev, char *name) */ mddev->hold_active = UNTIL_STOP; - error = -ENOMEM; - disk = blk_alloc_disk(NUMA_NO_NODE); - if (!disk) + disk = blk_alloc_disk(NULL, NUMA_NO_NODE); + if (IS_ERR(disk)) { + error = PTR_ERR(disk); goto out_free_mddev; + } disk->major = MAJOR(mddev->unit); disk->first_minor = unit << shift; @@ -5791,9 +5881,7 @@ struct mddev *md_alloc(dev_t dev, char *name) disk->fops = &md_fops; disk->private_data = mddev; - mddev->queue = disk->queue; - blk_set_stacking_limits(&mddev->queue->limits); - blk_queue_write_cache(mddev->queue, true, true); + blk_queue_write_cache(disk->queue, true, true); disk->events |= DISK_EVENT_MEDIA_CHANGE; mddev->gendisk = disk; error = add_disk(disk); @@ -5935,7 +6023,7 @@ int md_run(struct mddev *mddev) invalidate_bdev(rdev->bdev); if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) { mddev->ro = MD_RDONLY; - if (mddev->gendisk) + if (!mddev_is_dm(mddev)) set_disk_ro(mddev->gendisk, 1); } @@ -6038,7 +6126,10 @@ int md_run(struct mddev *mddev) pr_warn("True protection against single-disk failure might be compromised.\n"); } - mddev->recovery = 0; + /* dm-raid expect sync_thread to be frozen until resume */ + if (mddev->gendisk) + mddev->recovery = 0; + /* may be over-ridden by personality */ mddev->resync_max_sectors = mddev->dev_sectors; @@ -6094,7 +6185,8 @@ int md_run(struct mddev *mddev) } } - if (mddev->queue) { + if (!mddev_is_dm(mddev)) { + struct request_queue *q = mddev->gendisk->queue; bool nonrot = true; rdev_for_each(rdev, mddev) { @@ -6106,14 +6198,14 @@ int md_run(struct mddev *mddev) if (mddev->degraded) nonrot = false; if (nonrot) - blk_queue_flag_set(QUEUE_FLAG_NONROT, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_NONROT, q); else - blk_queue_flag_clear(QUEUE_FLAG_NONROT, mddev->queue); - blk_queue_flag_set(QUEUE_FLAG_IO_STAT, mddev->queue); + blk_queue_flag_clear(QUEUE_FLAG_NONROT, q); + blk_queue_flag_set(QUEUE_FLAG_IO_STAT, q); /* Set the NOWAIT flags if all underlying devices support it */ if (nowait) - blk_queue_flag_set(QUEUE_FLAG_NOWAIT, mddev->queue); + blk_queue_flag_set(QUEUE_FLAG_NOWAIT, q); } if (pers->sync_request) { if (mddev->kobj.sd && @@ -6192,7 +6284,6 @@ int do_md_run(struct mddev *mddev) /* run start up tasks that require md_thread */ md_start(mddev); - md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); @@ -6213,7 +6304,6 @@ int md_start(struct mddev *mddev) if (mddev->pers->start) { set_bit(MD_RECOVERY_WAIT, &mddev->recovery); - md_wakeup_thread(mddev->thread); ret = mddev->pers->start(mddev); clear_bit(MD_RECOVERY_WAIT, &mddev->recovery); md_wakeup_thread(mddev->sync_thread); @@ -6258,7 +6348,6 @@ static int restart_array(struct mddev *mddev) pr_debug("md: %s switched to read-write mode.\n", mdname(mddev)); /* Kick recovery or resync if necessary */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); sysfs_notify_dirent_safe(mddev->sysfs_state); return 0; @@ -6278,7 +6367,15 @@ static void md_clean(struct mddev *mddev) mddev->persistent = 0; mddev->level = LEVEL_NONE; mddev->clevel[0] = 0; - mddev->flags = 0; + /* + * Don't clear MD_CLOSING, or mddev can be opened again. + * 'hold_active != 0' means mddev is still in the creation + * process and will be used later. + */ + if (mddev->hold_active) + mddev->flags = 0; + else + mddev->flags &= BIT_ULL_MASK(MD_CLOSING); mddev->sb_flags = 0; mddev->ro = MD_RDWR; mddev->metadata_type[0] = 0; @@ -6315,7 +6412,6 @@ static void md_clean(struct mddev *mddev) static void __md_stop_writes(struct mddev *mddev) { - stop_sync_thread(mddev, true, false); del_timer_sync(&mddev->safemode_timer); if (mddev->pers && mddev->pers->quiesce) { @@ -6340,6 +6436,8 @@ static void __md_stop_writes(struct mddev *mddev) void md_stop_writes(struct mddev *mddev) { mddev_lock_nointr(mddev); + set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + stop_sync_thread(mddev, true, false); __md_stop_writes(mddev); mddev_unlock(mddev); } @@ -6353,8 +6451,10 @@ static void mddev_detach(struct mddev *mddev) mddev->pers->quiesce(mddev, 0); } md_unregister_thread(mddev, &mddev->thread); - if (mddev->queue) - blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ + + /* the unplug fn references 'conf' */ + if (!mddev_is_dm(mddev)) + blk_sync_queue(mddev->gendisk->queue); } static void __md_stop(struct mddev *mddev) @@ -6391,7 +6491,8 @@ void md_stop(struct mddev *mddev) EXPORT_SYMBOL_GPL(md_stop); -static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) +/* ensure 'mddev->pers' exist before calling md_set_readonly() */ +static int md_set_readonly(struct mddev *mddev) { int err = 0; int did_freeze = 0; @@ -6402,7 +6503,6 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { did_freeze = 1; set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - md_wakeup_thread(mddev->thread); } stop_sync_thread(mddev, false, false); @@ -6410,36 +6510,29 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); mddev_lock_nointr(mddev); - mutex_lock(&mddev->open_mutex); - if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || - mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { pr_warn("md: %s still in use.\n",mdname(mddev)); err = -EBUSY; goto out; } - if (mddev->pers) { - __md_stop_writes(mddev); - - if (mddev->ro == MD_RDONLY) { - err = -ENXIO; - goto out; - } + __md_stop_writes(mddev); - mddev->ro = MD_RDONLY; - set_disk_ro(mddev->gendisk, 1); + if (mddev->ro == MD_RDONLY) { + err = -ENXIO; + goto out; } + mddev->ro = MD_RDONLY; + set_disk_ro(mddev->gendisk, 1); + out: - if ((mddev->pers && !err) || did_freeze) { + if (!err || did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); sysfs_notify_dirent_safe(mddev->sysfs_state); } - mutex_unlock(&mddev->open_mutex); return err; } @@ -6447,8 +6540,7 @@ out: * 0 - completely stop and dis-assemble array * 2 - stop but do not disassemble array */ -static int do_md_stop(struct mddev *mddev, int mode, - struct block_device *bdev) +static int do_md_stop(struct mddev *mddev, int mode) { struct gendisk *disk = mddev->gendisk; struct md_rdev *rdev; @@ -6457,22 +6549,16 @@ static int do_md_stop(struct mddev *mddev, int mode, if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { did_freeze = 1; set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); - md_wakeup_thread(mddev->thread); } stop_sync_thread(mddev, true, false); - mutex_lock(&mddev->open_mutex); - if ((mddev->pers && atomic_read(&mddev->openers) > !!bdev) || - mddev->sysfs_active || - mddev->sync_thread || + if (mddev->sysfs_active || test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) { pr_warn("md: %s still in use.\n",mdname(mddev)); - mutex_unlock(&mddev->open_mutex); if (did_freeze) { clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); } return -EBUSY; } @@ -6491,13 +6577,11 @@ static int do_md_stop(struct mddev *mddev, int mode, sysfs_unlink_rdev(mddev, rdev); set_capacity_and_notify(disk, 0); - mutex_unlock(&mddev->open_mutex); mddev->changed = 1; if (!md_is_rdwr(mddev)) mddev->ro = MD_RDWR; - } else - mutex_unlock(&mddev->open_mutex); + } /* * Free resources if final stop */ @@ -6543,7 +6627,7 @@ static void autorun_array(struct mddev *mddev) err = do_md_run(mddev); if (err) { pr_warn("md: do_md_run() returned %d\n", err); - do_md_stop(mddev, 0, NULL); + do_md_stop(mddev, 0); } } @@ -7013,9 +7097,7 @@ kick_rdev: md_kick_rdev_from_array(rdev); set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags); - if (mddev->thread) - md_wakeup_thread(mddev->thread); - else + if (!mddev->thread) md_update_sb(mddev, 1); md_new_event(); @@ -7090,14 +7172,13 @@ static int hot_add_disk(struct mddev *mddev, dev_t dev) if (!bdev_nowait(rdev->bdev)) { pr_info("%s: Disabling nowait because %pg does not support nowait\n", mdname(mddev), rdev->bdev); - blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->queue); + blk_queue_flag_clear(QUEUE_FLAG_NOWAIT, mddev->gendisk->queue); } /* * Kick recovery, maybe this spare has to be added to the * array immediately. */ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); - md_wakeup_thread(mddev->thread); md_new_event(); return 0; @@ -7311,8 +7392,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) * of each device. If num_sectors is zero, we find the largest size * that fits. */ - if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - mddev->sync_thread) + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) return -EBUSY; if (!md_is_rdwr(mddev)) return -EROFS; @@ -7329,10 +7409,9 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) if (!rv) { if (mddev_is_clustered(mddev)) md_cluster_ops->update_size(mddev, old_dev_sectors); - else if (mddev->queue) { + else if (!mddev_is_dm(mddev)) set_capacity_and_notify(mddev->gendisk, mddev->array_sectors); - } } return rv; } @@ -7349,8 +7428,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks) if (raid_disks <= 0 || (mddev->max_disks && raid_disks >= mddev->max_disks)) return -EINVAL; - if (mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || test_bit(MD_RESYNCING_REMOTE, &mddev->recovery) || mddev->reshape_position != MaxSector) return -EBUSY; @@ -7546,16 +7624,17 @@ static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -static inline bool md_ioctl_valid(unsigned int cmd) +static inline int md_ioctl_valid(unsigned int cmd) { switch (cmd) { - case ADD_NEW_DISK: case GET_ARRAY_INFO: - case GET_BITMAP_FILE: case GET_DISK_INFO: + case RAID_VERSION: + return 0; + case ADD_NEW_DISK: + case GET_BITMAP_FILE: case HOT_ADD_DISK: case HOT_REMOVE_DISK: - case RAID_VERSION: case RESTART_ARRAY_RW: case RUN_ARRAY: case SET_ARRAY_INFO: @@ -7564,9 +7643,11 @@ static inline bool md_ioctl_valid(unsigned int cmd) case STOP_ARRAY: case STOP_ARRAY_RO: case CLUSTERED_DISK_NACK: - return true; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + return 0; default: - return false; + return -ENOTTY; } } @@ -7624,31 +7705,17 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, int err = 0; void __user *argp = (void __user *)arg; struct mddev *mddev = NULL; - bool did_set_md_closing = false; - if (!md_ioctl_valid(cmd)) - return -ENOTTY; - - switch (cmd) { - case RAID_VERSION: - case GET_ARRAY_INFO: - case GET_DISK_INFO: - break; - default: - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - } + err = md_ioctl_valid(cmd); + if (err) + return err; /* * Commands dealing with the RAID driver but not any * particular array: */ - switch (cmd) { - case RAID_VERSION: - err = get_version(argp); - goto out; - default:; - } + if (cmd == RAID_VERSION) + return get_version(argp); /* * Commands creating/starting a new array: @@ -7656,35 +7723,23 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, mddev = bdev->bd_disk->private_data; - if (!mddev) { - BUG(); - goto out; - } - /* Some actions do not requires the mutex */ switch (cmd) { case GET_ARRAY_INFO: if (!mddev->raid_disks && !mddev->external) - err = -ENODEV; - else - err = get_array_info(mddev, argp); - goto out; + return -ENODEV; + return get_array_info(mddev, argp); case GET_DISK_INFO: if (!mddev->raid_disks && !mddev->external) - err = -ENODEV; - else - err = get_disk_info(mddev, argp); - goto out; + return -ENODEV; + return get_disk_info(mddev, argp); case SET_DISK_FAULTY: - err = set_disk_faulty(mddev, new_decode_dev(arg)); - goto out; + return set_disk_faulty(mddev, new_decode_dev(arg)); case GET_BITMAP_FILE: - err = get_bitmap_file(mddev, argp); - goto out; - + return get_bitmap_file(mddev, argp); } if (cmd == HOT_REMOVE_DISK) @@ -7697,20 +7752,9 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, /* Need to flush page cache, and ensure no-one else opens * and writes */ - mutex_lock(&mddev->open_mutex); - if (mddev->pers && atomic_read(&mddev->openers) > 1) { - mutex_unlock(&mddev->open_mutex); - err = -EBUSY; - goto out; - } - if (test_and_set_bit(MD_CLOSING, &mddev->flags)) { - mutex_unlock(&mddev->open_mutex); - err = -EBUSY; - goto out; - } - did_set_md_closing = true; - mutex_unlock(&mddev->open_mutex); - sync_blockdev(bdev); + err = mddev_set_closing_and_sync_blockdev(mddev, 1); + if (err) + return err; } if (!md_is_rdwr(mddev)) @@ -7751,11 +7795,12 @@ static int md_ioctl(struct block_device *bdev, blk_mode_t mode, goto unlock; case STOP_ARRAY: - err = do_md_stop(mddev, 0, bdev); + err = do_md_stop(mddev, 0); goto unlock; case STOP_ARRAY_RO: - err = md_set_readonly(mddev, bdev); + if (mddev->pers) + err = md_set_readonly(mddev); goto unlock; case HOT_REMOVE_DISK: @@ -7850,7 +7895,7 @@ unlock: mddev_unlock(mddev); out: - if(did_set_md_closing) + if (cmd == STOP_ARRAY_RO || (err && cmd == STOP_ARRAY)) clear_bit(MD_CLOSING, &mddev->flags); return err; } @@ -8687,10 +8732,7 @@ void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, bio_chain(discard_bio, bio); bio_clone_blkg_association(discard_bio, bio); - if (mddev->gendisk) - trace_block_bio_remap(discard_bio, - disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); + mddev_trace_remap(mddev, discard_bio, bio->bi_iter.bi_sector); submit_bio_noacct(discard_bio); } EXPORT_SYMBOL_GPL(md_submit_discard_bio); @@ -8737,6 +8779,23 @@ void md_account_bio(struct mddev *mddev, struct bio **bio) } EXPORT_SYMBOL_GPL(md_account_bio); +void md_free_cloned_bio(struct bio *bio) +{ + struct md_io_clone *md_io_clone = bio->bi_private; + struct bio *orig_bio = md_io_clone->orig_bio; + struct mddev *mddev = md_io_clone->mddev; + + if (bio->bi_status && !orig_bio->bi_status) + orig_bio->bi_status = bio->bi_status; + + if (md_io_clone->start_time) + bio_end_io_acct(orig_bio, md_io_clone->start_time); + + bio_put(bio); + percpu_ref_put(&mddev->active_io); +} +EXPORT_SYMBOL_GPL(md_free_cloned_bio); + /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before @@ -9170,7 +9229,7 @@ void md_do_sync(struct md_thread *thread) mddev->delta_disks > 0 && mddev->pers->finish_reshape && mddev->pers->size && - mddev->queue) { + !mddev_is_dm(mddev)) { mddev_lock_nointr(mddev); md_set_array_sectors(mddev, mddev->pers->size(mddev, 0, 0)); mddev_unlock(mddev); @@ -9270,9 +9329,14 @@ static bool md_spares_need_change(struct mddev *mddev) { struct md_rdev *rdev; - rdev_for_each(rdev, mddev) - if (rdev_removeable(rdev) || rdev_addable(rdev)) + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { + if (rdev_removeable(rdev) || rdev_addable(rdev)) { + rcu_read_unlock(); return true; + } + } + rcu_read_unlock(); return false; } diff --git a/drivers/md/md.h b/drivers/md/md.h index a079ee9b6190..097d9dbd69b8 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -18,6 +18,7 @@ #include <linux/timer.h> #include <linux/wait.h> #include <linux/workqueue.h> +#include <trace/events/block.h> #include "md-cluster.h" #define MaxSector (~(sector_t)0) @@ -207,6 +208,7 @@ enum flag_bits { * check if there is collision between raid1 * serial bios. */ + Nonrot, /* non-rotational device (SSD) */ }; static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, @@ -222,6 +224,16 @@ static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, } return 0; } + +static inline int rdev_has_badblock(struct md_rdev *rdev, sector_t s, + int sectors) +{ + sector_t first_bad; + int bad_sectors; + + return is_badblock(rdev, s, sectors, &first_bad, &bad_sectors); +} + extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, int is_new); extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, @@ -468,7 +480,6 @@ struct mddev { struct timer_list safemode_timer; struct percpu_ref writes_pending; int sync_checkers; /* # of threads checking writes_pending */ - struct request_queue *queue; /* for plugging ... */ struct bitmap *bitmap; /* the bitmap for the device */ struct { @@ -558,6 +569,37 @@ enum recovery_flags { MD_RESYNCING_REMOTE, /* remote node is running resync thread */ }; +enum md_ro_state { + MD_RDWR, + MD_RDONLY, + MD_AUTO_READ, + MD_MAX_STATE +}; + +static inline bool md_is_rdwr(struct mddev *mddev) +{ + return (mddev->ro == MD_RDWR); +} + +static inline bool reshape_interrupted(struct mddev *mddev) +{ + /* reshape never start */ + if (mddev->reshape_position == MaxSector) + return false; + + /* interrupted */ + if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) + return true; + + /* running reshape will be interrupted soon. */ + if (test_bit(MD_RECOVERY_WAIT, &mddev->recovery) || + test_bit(MD_RECOVERY_INTR, &mddev->recovery) || + test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) + return true; + + return false; +} + static inline int __must_check mddev_lock(struct mddev *mddev) { return mutex_lock_interruptible(&mddev->reconfig_mutex); @@ -617,6 +659,7 @@ struct md_personality int (*start_reshape) (struct mddev *mddev); void (*finish_reshape) (struct mddev *mddev); void (*update_reshape_pos) (struct mddev *mddev); + void (*prepare_suspend) (struct mddev *mddev); /* quiesce suspends or resumes internal processing. * 1 - stop new actions and wait for action io to complete * 0 - return to normal behaviour @@ -750,6 +793,7 @@ extern void md_finish_reshape(struct mddev *mddev); void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, struct bio *bio, sector_t start, sector_t size); void md_account_bio(struct mddev *mddev, struct bio **bio); +void md_free_cloned_bio(struct bio *bio); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, @@ -778,9 +822,12 @@ extern void md_stop_writes(struct mddev *mddev); extern int md_rdev_init(struct md_rdev *rdev); extern void md_rdev_clear(struct md_rdev *rdev); -extern void md_handle_request(struct mddev *mddev, struct bio *bio); +extern bool md_handle_request(struct mddev *mddev, struct bio *bio); extern int mddev_suspend(struct mddev *mddev, bool interruptible); extern void mddev_resume(struct mddev *mddev); +extern void md_idle_sync_thread(struct mddev *mddev); +extern void md_frozen_sync_thread(struct mddev *mddev); +extern void md_unfrozen_sync_thread(struct mddev *mddev); extern void md_reload_sb(struct mddev *mddev, int raid_disk); extern void md_update_sb(struct mddev *mddev, int force); @@ -821,7 +868,7 @@ static inline void mddev_check_write_zeroes(struct mddev *mddev, struct bio *bio { if (bio_op(bio) == REQ_OP_WRITE_ZEROES && !bio->bi_bdev->bd_disk->queue->limits.max_write_zeroes_sectors) - mddev->queue->limits.max_write_zeroes_sectors = 0; + mddev->gendisk->queue->limits.max_write_zeroes_sectors = 0; } static inline int mddev_suspend_and_lock(struct mddev *mddev) @@ -860,7 +907,31 @@ void md_autostart_arrays(int part); int md_set_array_info(struct mddev *mddev, struct mdu_array_info_s *info); int md_add_new_disk(struct mddev *mddev, struct mdu_disk_info_s *info); int do_md_run(struct mddev *mddev); +void mddev_stack_rdev_limits(struct mddev *mddev, struct queue_limits *lim); +int mddev_stack_new_rdev(struct mddev *mddev, struct md_rdev *rdev); +void mddev_update_io_opt(struct mddev *mddev, unsigned int nr_stripes); extern const struct block_device_operations md_fops; +/* + * MD devices can be used undeneath by DM, in which case ->gendisk is NULL. + */ +static inline bool mddev_is_dm(struct mddev *mddev) +{ + return !mddev->gendisk; +} + +static inline void mddev_trace_remap(struct mddev *mddev, struct bio *bio, + sector_t sector) +{ + if (!mddev_is_dm(mddev)) + trace_block_bio_remap(bio, disk_devt(mddev->gendisk), sector); +} + +#define mddev_add_trace_msg(mddev, fmt, args...) \ +do { \ + if (!mddev_is_dm(mddev)) \ + blk_add_trace_msg((mddev)->gendisk->queue, fmt, ##args); \ +} while (0) + #endif /* _MD_MD_H */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c50a7abda744..c5d4aeb68404 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -379,6 +379,19 @@ static void raid0_free(struct mddev *mddev, void *priv) free_conf(mddev, conf); } +static int raid0_set_limits(struct mddev *mddev) +{ + struct queue_limits lim; + + blk_set_stacking_limits(&lim); + lim.max_hw_sectors = mddev->chunk_sectors; + lim.max_write_zeroes_sectors = mddev->chunk_sectors; + lim.io_min = mddev->chunk_sectors << 9; + lim.io_opt = lim.io_min * mddev->raid_disks; + mddev_stack_rdev_limits(mddev, &lim); + return queue_limits_set(mddev->gendisk->queue, &lim); +} + static int raid0_run(struct mddev *mddev) { struct r0conf *conf; @@ -399,20 +412,10 @@ static int raid0_run(struct mddev *mddev) mddev->private = conf; } conf = mddev->private; - if (mddev->queue) { - struct md_rdev *rdev; - - blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors); - blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors); - - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - blk_queue_io_opt(mddev->queue, - (mddev->chunk_sectors << 9) * mddev->raid_disks); - - rdev_for_each(rdev, mddev) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - } + if (!mddev_is_dm(mddev)) { + ret = raid0_set_limits(mddev); + if (ret) + goto out_free_conf; } /* calculate array device size */ @@ -426,8 +429,10 @@ static int raid0_run(struct mddev *mddev) ret = md_integrity_register(mddev); if (ret) - free_conf(mddev, conf); - + goto out_free_conf; + return 0; +out_free_conf: + free_conf(mddev, conf); return ret; } @@ -578,10 +583,7 @@ static void raid0_map_submit_bio(struct mddev *mddev, struct bio *bio) bio_set_dev(bio, tmp_dev->bdev); bio->bi_iter.bi_sector = sector + zone->dev_start + tmp_dev->data_offset; - - if (mddev->gendisk) - trace_block_bio_remap(bio, disk_devt(mddev->gendisk), - bio_sector); + mddev_trace_remap(mddev, bio, bio_sector); mddev_check_write_zeroes(mddev, bio); submit_bio_noacct(bio); } diff --git a/drivers/md/raid1-10.c b/drivers/md/raid1-10.c index 512746551f36..2ea1710a3b70 100644 --- a/drivers/md/raid1-10.c +++ b/drivers/md/raid1-10.c @@ -227,3 +227,72 @@ static inline bool exceed_read_errors(struct mddev *mddev, struct md_rdev *rdev) return false; } + +/** + * raid1_check_read_range() - check a given read range for bad blocks, + * available read length is returned; + * @rdev: the rdev to read; + * @this_sector: read position; + * @len: read length; + * + * helper function for read_balance() + * + * 1) If there are no bad blocks in the range, @len is returned; + * 2) If the range are all bad blocks, 0 is returned; + * 3) If there are partial bad blocks: + * - If the bad block range starts after @this_sector, the length of first + * good region is returned; + * - If the bad block range starts before @this_sector, 0 is returned and + * the @len is updated to the offset into the region before we get to the + * good blocks; + */ +static inline int raid1_check_read_range(struct md_rdev *rdev, + sector_t this_sector, int *len) +{ + sector_t first_bad; + int bad_sectors; + + /* no bad block overlap */ + if (!is_badblock(rdev, this_sector, *len, &first_bad, &bad_sectors)) + return *len; + + /* + * bad block range starts offset into our range so we can return the + * number of sectors before the bad blocks start. + */ + if (first_bad > this_sector) + return first_bad - this_sector; + + /* read range is fully consumed by bad blocks. */ + if (this_sector + *len <= first_bad + bad_sectors) + return 0; + + /* + * final case, bad block range starts before or at the start of our + * range but does not cover our entire range so we still return 0 but + * update the length with the number of sectors before we get to the + * good ones. + */ + *len = first_bad + bad_sectors - this_sector; + return 0; +} + +/* + * Check if read should choose the first rdev. + * + * Balance on the whole device if no resync is going on (recovery is ok) or + * below the resync window. Otherwise, take the first readable disk. + */ +static inline bool raid1_should_read_first(struct mddev *mddev, + sector_t this_sector, int len) +{ + if ((mddev->recovery_cp < this_sector + len)) + return true; + + if (mddev_is_clustered(mddev) && + md_cluster_ops->area_resyncing(mddev, READ, this_sector, + this_sector + len)) + return true; + + return false; +} diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 286f8b16c7bd..be8ac24f50b6 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -46,9 +46,6 @@ static void allow_barrier(struct r1conf *conf, sector_t sector_nr); static void lower_barrier(struct r1conf *conf, sector_t sector_nr); -#define raid1_log(md, fmt, args...) \ - do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid1 " fmt, ##args); } while (0) - #define RAID_1_10_NAME "raid1" #include "raid1-10.c" @@ -498,9 +495,6 @@ static void raid1_end_write_request(struct bio *bio) * to user-side. So if something waits for IO, then it * will wait for the 'master' bio. */ - sector_t first_bad; - int bad_sectors; - r1_bio->bios[mirror] = NULL; to_put = bio; /* @@ -516,8 +510,8 @@ static void raid1_end_write_request(struct bio *bio) set_bit(R1BIO_Uptodate, &r1_bio->state); /* Maybe we can clear some bad blocks. */ - if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors) && !discard_error) { + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && + !discard_error) { r1_bio->bios[mirror] = IO_MADE_GOOD; set_bit(R1BIO_MadeGood, &r1_bio->state); } @@ -582,211 +576,312 @@ static sector_t align_to_barrier_unit_end(sector_t start_sector, return len; } -/* - * This routine returns the disk from which the requested read should - * be done. There is a per-array 'next expected sequential IO' sector - * number - if this matches on the next IO then we use the last disk. - * There is also a per-disk 'last know head position' sector that is - * maintained from IRQ contexts, both the normal and the resync IO - * completion handlers update this position correctly. If there is no - * perfect sequential match then we pick the disk whose head is closest. - * - * If there are 2 mirrors in the same 2 devices, performance degrades - * because position is mirror, not device based. - * - * The rdev for the device selected will have nr_pending incremented. - */ -static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sectors) +static void update_read_sectors(struct r1conf *conf, int disk, + sector_t this_sector, int len) { - const sector_t this_sector = r1_bio->sector; - int sectors; - int best_good_sectors; - int best_disk, best_dist_disk, best_pending_disk; - int has_nonrot_disk; + struct raid1_info *info = &conf->mirrors[disk]; + + atomic_inc(&info->rdev->nr_pending); + if (info->next_seq_sect != this_sector) + info->seq_start = this_sector; + info->next_seq_sect = this_sector + len; +} + +static int choose_first_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int len = r1_bio->sectors; int disk; - sector_t best_dist; - unsigned int min_pending; - struct md_rdev *rdev; - int choose_first; - int choose_next_idle; - /* - * Check if we can balance. We can balance on the whole - * device if no resync is going on, or below the resync window. - * We take the first readable disk when above the resync window. - */ - retry: - sectors = r1_bio->sectors; - best_disk = -1; - best_dist_disk = -1; - best_dist = MaxSector; - best_pending_disk = -1; - min_pending = UINT_MAX; - best_good_sectors = 0; - has_nonrot_disk = 0; - choose_next_idle = 0; - clear_bit(R1BIO_FailFast, &r1_bio->state); + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + int read_len; - if ((conf->mddev->recovery_cp < this_sector + sectors) || - (mddev_is_clustered(conf->mddev) && - md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, - this_sector + sectors))) - choose_first = 1; - else - choose_first = 0; + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags)) + continue; + + /* choose the first disk even if it has some bad blocks. */ + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len > 0) { + update_read_sectors(conf, disk, this_sector, read_len); + *max_sectors = read_len; + return disk; + } + } + + return -1; +} + +static int choose_bb_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int best_disk = -1; + int best_len = 0; + int disk; for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { - sector_t dist; - sector_t first_bad; - int bad_sectors; - unsigned int pending; - bool nonrot; + struct md_rdev *rdev; + int len; + int read_len; + + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; rdev = conf->mirrors[disk].rdev; - if (r1_bio->bios[disk] == IO_BLOCKED - || rdev == NULL - || test_bit(Faulty, &rdev->flags)) + if (!rdev || test_bit(Faulty, &rdev->flags) || + test_bit(WriteMostly, &rdev->flags)) continue; - if (!test_bit(In_sync, &rdev->flags) && - rdev->recovery_offset < this_sector + sectors) + + /* keep track of the disk with the most readable sectors. */ + len = r1_bio->sectors; + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len > best_len) { + best_disk = disk; + best_len = read_len; + } + } + + if (best_disk != -1) { + *max_sectors = best_len; + update_read_sectors(conf, best_disk, this_sector, best_len); + } + + return best_disk; +} + +static int choose_slow_rdev(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + sector_t this_sector = r1_bio->sector; + int bb_disk = -1; + int bb_read_len = 0; + int disk; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + int len; + int read_len; + + if (r1_bio->bios[disk] == IO_BLOCKED) continue; - if (test_bit(WriteMostly, &rdev->flags)) { - /* Don't balance among write-mostly, just - * use the first as a last resort */ - if (best_dist_disk < 0) { - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (first_bad <= this_sector) - /* Cannot use this */ - continue; - best_good_sectors = first_bad - this_sector; - } else - best_good_sectors = sectors; - best_dist_disk = disk; - best_pending_disk = disk; - } + + rdev = conf->mirrors[disk].rdev; + if (!rdev || test_bit(Faulty, &rdev->flags) || + !test_bit(WriteMostly, &rdev->flags)) continue; + + /* there are no bad blocks, we can use this disk */ + len = r1_bio->sectors; + read_len = raid1_check_read_range(rdev, this_sector, &len); + if (read_len == r1_bio->sectors) { + update_read_sectors(conf, disk, this_sector, read_len); + return disk; } - /* This is a reasonable device to use. It might - * even be best. + + /* + * there are partial bad blocks, choose the rdev with largest + * read length. */ - if (is_badblock(rdev, this_sector, sectors, - &first_bad, &bad_sectors)) { - if (best_dist < MaxSector) - /* already have a better device */ - continue; - if (first_bad <= this_sector) { - /* cannot read here. If this is the 'primary' - * device, then we must not read beyond - * bad_sectors from another device.. - */ - bad_sectors -= (this_sector - first_bad); - if (choose_first && sectors > bad_sectors) - sectors = bad_sectors; - if (best_good_sectors > sectors) - best_good_sectors = sectors; - - } else { - sector_t good_sectors = first_bad - this_sector; - if (good_sectors > best_good_sectors) { - best_good_sectors = good_sectors; - best_disk = disk; - } - if (choose_first) - break; - } - continue; - } else { - if ((sectors > best_good_sectors) && (best_disk >= 0)) - best_disk = -1; - best_good_sectors = sectors; + if (read_len > bb_read_len) { + bb_disk = disk; + bb_read_len = read_len; } + } + + if (bb_disk != -1) { + *max_sectors = bb_read_len; + update_read_sectors(conf, bb_disk, this_sector, bb_read_len); + } + + return bb_disk; +} + +static bool is_sequential(struct r1conf *conf, int disk, struct r1bio *r1_bio) +{ + /* TODO: address issues with this check and concurrency. */ + return conf->mirrors[disk].next_seq_sect == r1_bio->sector || + conf->mirrors[disk].head_position == r1_bio->sector; +} + +/* + * If buffered sequential IO size exceeds optimal iosize, check if there is idle + * disk. If yes, choose the idle disk. + */ +static bool should_choose_next(struct r1conf *conf, int disk) +{ + struct raid1_info *mirror = &conf->mirrors[disk]; + int opt_iosize; + + if (!test_bit(Nonrot, &mirror->rdev->flags)) + return false; + + opt_iosize = bdev_io_opt(mirror->rdev->bdev) >> 9; + return opt_iosize > 0 && mirror->seq_start != MaxSector && + mirror->next_seq_sect > opt_iosize && + mirror->next_seq_sect - opt_iosize >= mirror->seq_start; +} + +static bool rdev_readable(struct md_rdev *rdev, struct r1bio *r1_bio) +{ + if (!rdev || test_bit(Faulty, &rdev->flags)) + return false; + + /* still in recovery */ + if (!test_bit(In_sync, &rdev->flags) && + rdev->recovery_offset < r1_bio->sector + r1_bio->sectors) + return false; + + /* don't read from slow disk unless have to */ + if (test_bit(WriteMostly, &rdev->flags)) + return false; + + /* don't split IO for bad blocks unless have to */ + if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors)) + return false; + + return true; +} + +struct read_balance_ctl { + sector_t closest_dist; + int closest_dist_disk; + int min_pending; + int min_pending_disk; + int sequential_disk; + int readable_disks; +}; + +static int choose_best_rdev(struct r1conf *conf, struct r1bio *r1_bio) +{ + int disk; + struct read_balance_ctl ctl = { + .closest_dist_disk = -1, + .closest_dist = MaxSector, + .min_pending_disk = -1, + .min_pending = UINT_MAX, + .sequential_disk = -1, + }; + + for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) { + struct md_rdev *rdev; + sector_t dist; + unsigned int pending; - if (best_disk >= 0) - /* At least two disks to choose from so failfast is OK */ + if (r1_bio->bios[disk] == IO_BLOCKED) + continue; + + rdev = conf->mirrors[disk].rdev; + if (!rdev_readable(rdev, r1_bio)) + continue; + + /* At least two disks to choose from so failfast is OK */ + if (ctl.readable_disks++ == 1) set_bit(R1BIO_FailFast, &r1_bio->state); - nonrot = bdev_nonrot(rdev->bdev); - has_nonrot_disk |= nonrot; pending = atomic_read(&rdev->nr_pending); - dist = abs(this_sector - conf->mirrors[disk].head_position); - if (choose_first) { - best_disk = disk; - break; - } + dist = abs(r1_bio->sector - conf->mirrors[disk].head_position); + /* Don't change to another disk for sequential reads */ - if (conf->mirrors[disk].next_seq_sect == this_sector - || dist == 0) { - int opt_iosize = bdev_io_opt(rdev->bdev) >> 9; - struct raid1_info *mirror = &conf->mirrors[disk]; + if (is_sequential(conf, disk, r1_bio)) { + if (!should_choose_next(conf, disk)) + return disk; - best_disk = disk; /* - * If buffered sequential IO size exceeds optimal - * iosize, check if there is idle disk. If yes, choose - * the idle disk. read_balance could already choose an - * idle disk before noticing it's a sequential IO in - * this disk. This doesn't matter because this disk - * will idle, next time it will be utilized after the - * first disk has IO size exceeds optimal iosize. In - * this way, iosize of the first disk will be optimal - * iosize at least. iosize of the second disk might be - * small, but not a big deal since when the second disk - * starts IO, the first disk is likely still busy. + * Add 'pending' to avoid choosing this disk if + * there is other idle disk. */ - if (nonrot && opt_iosize > 0 && - mirror->seq_start != MaxSector && - mirror->next_seq_sect > opt_iosize && - mirror->next_seq_sect - opt_iosize >= - mirror->seq_start) { - choose_next_idle = 1; - continue; - } - break; + pending++; + /* + * If there is no other idle disk, this disk + * will be chosen. + */ + ctl.sequential_disk = disk; } - if (choose_next_idle) - continue; - - if (min_pending > pending) { - min_pending = pending; - best_pending_disk = disk; + if (ctl.min_pending > pending) { + ctl.min_pending = pending; + ctl.min_pending_disk = disk; } - if (dist < best_dist) { - best_dist = dist; - best_dist_disk = disk; + if (ctl.closest_dist > dist) { + ctl.closest_dist = dist; + ctl.closest_dist_disk = disk; } } /* + * sequential IO size exceeds optimal iosize, however, there is no other + * idle disk, so choose the sequential disk. + */ + if (ctl.sequential_disk != -1 && ctl.min_pending != 0) + return ctl.sequential_disk; + + /* * If all disks are rotational, choose the closest disk. If any disk is * non-rotational, choose the disk with less pending request even the * disk is rotational, which might/might not be optimal for raids with * mixed ratation/non-rotational disks depending on workload. */ - if (best_disk == -1) { - if (has_nonrot_disk || min_pending == 0) - best_disk = best_pending_disk; - else - best_disk = best_dist_disk; - } + if (ctl.min_pending_disk != -1 && + (READ_ONCE(conf->nonrot_disks) || ctl.min_pending == 0)) + return ctl.min_pending_disk; + else + return ctl.closest_dist_disk; +} - if (best_disk >= 0) { - rdev = conf->mirrors[best_disk].rdev; - if (!rdev) - goto retry; - atomic_inc(&rdev->nr_pending); - sectors = best_good_sectors; +/* + * This routine returns the disk from which the requested read should be done. + * + * 1) If resync is in progress, find the first usable disk and use it even if it + * has some bad blocks. + * + * 2) Now that there is no resync, loop through all disks and skipping slow + * disks and disks with bad blocks for now. Only pay attention to key disk + * choice. + * + * 3) If we've made it this far, now look for disks with bad blocks and choose + * the one with most number of sectors. + * + * 4) If we are all the way at the end, we have no choice but to use a disk even + * if it is write mostly. + * + * The rdev for the device selected will have nr_pending incremented. + */ +static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, + int *max_sectors) +{ + int disk; - if (conf->mirrors[best_disk].next_seq_sect != this_sector) - conf->mirrors[best_disk].seq_start = this_sector; + clear_bit(R1BIO_FailFast, &r1_bio->state); + + if (raid1_should_read_first(conf->mddev, r1_bio->sector, + r1_bio->sectors)) + return choose_first_rdev(conf, r1_bio, max_sectors); - conf->mirrors[best_disk].next_seq_sect = this_sector + sectors; + disk = choose_best_rdev(conf, r1_bio); + if (disk >= 0) { + *max_sectors = r1_bio->sectors; + update_read_sectors(conf, disk, r1_bio->sector, + r1_bio->sectors); + return disk; } - *max_sectors = sectors; - return best_disk; + /* + * If we are here it means we didn't find a perfectly good disk so + * now spend a bit more time trying to find one with the most good + * sectors. + */ + disk = choose_bb_rdev(conf, r1_bio, max_sectors); + if (disk >= 0) + return disk; + + return choose_slow_rdev(conf, r1_bio, max_sectors); } static void wake_up_barrier(struct r1conf *conf) @@ -1098,7 +1193,7 @@ static void freeze_array(struct r1conf *conf, int extra) */ spin_lock_irq(&conf->resync_lock); conf->array_frozen = 1; - raid1_log(conf->mddev, "wait freeze"); + mddev_add_trace_msg(conf->mddev, "raid1 wait freeze"); wait_event_lock_irq_cmd( conf->wait_barrier, get_unqueued_pending(conf) == extra, @@ -1287,7 +1382,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, * Reading from a write-mostly device must take care not to * over-take any writes that are 'behind' */ - raid1_log(mddev, "wait behind writes"); + mddev_add_trace_msg(mddev, "raid1 wait behind writes"); wait_event(bitmap->behind_wait, atomic_read(&bitmap->behind_writes) == 0); } @@ -1320,11 +1415,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio, test_bit(R1BIO_FailFast, &r1_bio->state)) read_bio->bi_opf |= MD_FAILFAST; read_bio->bi_private = r1_bio; - - if (mddev->gendisk) - trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), - r1_bio->sector); - + mddev_trace_remap(mddev, read_bio, r1_bio->sector); submit_bio_noacct(read_bio); } @@ -1474,7 +1565,8 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, bio_wouldblock_error(bio); return; } - raid1_log(mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); + mddev_add_trace_msg(mddev, "raid1 wait rdev %d blocked", + blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf, bio->bi_iter.bi_sector, false); goto retry_write; @@ -1557,10 +1649,7 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, mbio->bi_private = r1_bio; atomic_inc(&r1_bio->remaining); - - if (mddev->gendisk) - trace_block_bio_remap(mbio, disk_devt(mddev->gendisk), - r1_bio->sector); + mddev_trace_remap(mddev, mbio, r1_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; if (!raid1_add_bio_to_plug(mddev, mbio, raid1_unplug, disks)) { @@ -1760,6 +1849,52 @@ static int raid1_spare_active(struct mddev *mddev) return count; } +static bool raid1_add_conf(struct r1conf *conf, struct md_rdev *rdev, int disk, + bool replacement) +{ + struct raid1_info *info = conf->mirrors + disk; + + if (replacement) + info += conf->raid_disks; + + if (info->rdev) + return false; + + if (bdev_nonrot(rdev->bdev)) { + set_bit(Nonrot, &rdev->flags); + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks + 1); + } + + rdev->raid_disk = disk; + info->head_position = 0; + info->seq_start = MaxSector; + WRITE_ONCE(info->rdev, rdev); + + return true; +} + +static bool raid1_remove_conf(struct r1conf *conf, int disk) +{ + struct raid1_info *info = conf->mirrors + disk; + struct md_rdev *rdev = info->rdev; + + if (!rdev || test_bit(In_sync, &rdev->flags) || + atomic_read(&rdev->nr_pending)) + return false; + + /* Only remove non-faulty devices if recovery is not possible. */ + if (!test_bit(Faulty, &rdev->flags) && + rdev->mddev->recovery_disabled != conf->recovery_disabled && + rdev->mddev->degraded < conf->raid_disks) + return false; + + if (test_and_clear_bit(Nonrot, &rdev->flags)) + WRITE_ONCE(conf->nonrot_disks, conf->nonrot_disks - 1); + + WRITE_ONCE(info->rdev, NULL); + return true; +} + static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) { struct r1conf *conf = mddev->private; @@ -1791,19 +1926,16 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) for (mirror = first; mirror <= last; mirror++) { p = conf->mirrors + mirror; if (!p->rdev) { - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + err = mddev_stack_new_rdev(mddev, rdev); + if (err) + return err; - p->head_position = 0; - rdev->raid_disk = mirror; - err = 0; + raid1_add_conf(conf, rdev, mirror, false); /* As all devices are equivalent, we don't need a full recovery * if this was recently any drive of the array */ if (rdev->saved_raid_disk < 0) conf->fullsync = 1; - WRITE_ONCE(p->rdev, rdev); break; } if (test_bit(WantReplacement, &p->rdev->flags) && @@ -1813,13 +1945,11 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) if (err && repl_slot >= 0) { /* Add this device as a replacement */ - p = conf->mirrors + repl_slot; clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); - rdev->raid_disk = repl_slot; + raid1_add_conf(conf, rdev, repl_slot, true); err = 0; conf->fullsync = 1; - WRITE_ONCE(p[conf->raid_disks].rdev, rdev); } print_conf(conf); @@ -1836,27 +1966,20 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev) if (unlikely(number >= conf->raid_disks)) goto abort; - if (rdev != p->rdev) - p = conf->mirrors + conf->raid_disks + number; + if (rdev != p->rdev) { + number += conf->raid_disks; + p = conf->mirrors + number; + } print_conf(conf); if (rdev == p->rdev) { - if (test_bit(In_sync, &rdev->flags) || - atomic_read(&rdev->nr_pending)) { + if (!raid1_remove_conf(conf, number)) { err = -EBUSY; goto abort; } - /* Only remove non-faulty devices if recovery - * is not possible. - */ - if (!test_bit(Faulty, &rdev->flags) && - mddev->recovery_disabled != conf->recovery_disabled && - mddev->degraded < conf->raid_disks) { - err = -EBUSY; - goto abort; - } - WRITE_ONCE(p->rdev, NULL); - if (conf->mirrors[conf->raid_disks + number].rdev) { + + if (number < conf->raid_disks && + conf->mirrors[conf->raid_disks + number].rdev) { /* We just removed a device that is being replaced. * Move down the replacement. We drain all IO before * doing this to avoid confusion. @@ -1944,8 +2067,6 @@ static void end_sync_write(struct bio *bio) struct r1bio *r1_bio = get_resync_r1bio(bio); struct mddev *mddev = r1_bio->mddev; struct r1conf *conf = mddev->private; - sector_t first_bad; - int bad_sectors; struct md_rdev *rdev = conf->mirrors[find_bio_disk(r1_bio, bio)].rdev; if (!uptodate) { @@ -1955,14 +2076,11 @@ static void end_sync_write(struct bio *bio) set_bit(MD_RECOVERY_NEEDED, & mddev->recovery); set_bit(R1BIO_WriteError, &r1_bio->state); - } else if (is_badblock(rdev, r1_bio->sector, r1_bio->sectors, - &first_bad, &bad_sectors) && - !is_badblock(conf->mirrors[r1_bio->read_disk].rdev, - r1_bio->sector, - r1_bio->sectors, - &first_bad, &bad_sectors) - ) + } else if (rdev_has_badblock(rdev, r1_bio->sector, r1_bio->sectors) && + !rdev_has_badblock(conf->mirrors[r1_bio->read_disk].rdev, + r1_bio->sector, r1_bio->sectors)) { set_bit(R1BIO_MadeGood, &r1_bio->state); + } put_sync_write_buf(r1_bio, uptodate); } @@ -2279,16 +2397,12 @@ static void fix_read_error(struct r1conf *conf, struct r1bio *r1_bio) s = PAGE_SIZE >> 9; do { - sector_t first_bad; - int bad_sectors; - rdev = conf->mirrors[d].rdev; if (rdev && (test_bit(In_sync, &rdev->flags) || (!test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sect + s)) && - is_badblock(rdev, sect, s, - &first_bad, &bad_sectors) == 0) { + rdev_has_badblock(rdev, sect, s) == 0) { atomic_inc(&rdev->nr_pending); if (sync_page_io(rdev, sect, s<<9, conf->tmppage, REQ_OP_READ, false)) @@ -3006,23 +3120,17 @@ static struct r1conf *setup_conf(struct mddev *mddev) err = -EINVAL; spin_lock_init(&conf->device_lock); + conf->raid_disks = mddev->raid_disks; rdev_for_each(rdev, mddev) { int disk_idx = rdev->raid_disk; - if (disk_idx >= mddev->raid_disks - || disk_idx < 0) + + if (disk_idx >= conf->raid_disks || disk_idx < 0) continue; - if (test_bit(Replacement, &rdev->flags)) - disk = conf->mirrors + mddev->raid_disks + disk_idx; - else - disk = conf->mirrors + disk_idx; - if (disk->rdev) + if (!raid1_add_conf(conf, rdev, disk_idx, + test_bit(Replacement, &rdev->flags))) goto abort; - disk->rdev = rdev; - disk->head_position = 0; - disk->seq_start = MaxSector; } - conf->raid_disks = mddev->raid_disks; conf->mddev = mddev; INIT_LIST_HEAD(&conf->retry_list); INIT_LIST_HEAD(&conf->bio_end_io_list); @@ -3086,12 +3194,21 @@ static struct r1conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } +static int raid1_set_limits(struct mddev *mddev) +{ + struct queue_limits lim; + + blk_set_stacking_limits(&lim); + lim.max_write_zeroes_sectors = 0; + mddev_stack_rdev_limits(mddev, &lim); + return queue_limits_set(mddev->gendisk->queue, &lim); +} + static void raid1_free(struct mddev *mddev, void *priv); static int raid1_run(struct mddev *mddev) { struct r1conf *conf; int i; - struct md_rdev *rdev; int ret; if (mddev->level != 1) { @@ -3118,14 +3235,10 @@ static int raid1_run(struct mddev *mddev) if (IS_ERR(conf)) return PTR_ERR(conf); - if (mddev->queue) - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); - - rdev_for_each(rdev, mddev) { - if (!mddev->gendisk) - continue; - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + if (!mddev_is_dm(mddev)) { + ret = raid1_set_limits(mddev); + if (ret) + goto abort; } mddev->degraded = 0; diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h index 14d4211a123a..5300cbaa58a4 100644 --- a/drivers/md/raid1.h +++ b/drivers/md/raid1.h @@ -71,6 +71,7 @@ struct r1conf { * allow for replacements. */ int raid_disks; + int nonrot_disks; spinlock_t device_lock; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index a5f8419e2df1..a4556d2e46bf 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -76,9 +76,6 @@ static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); static void end_reshape_write(struct bio *bio); static void end_reshape(struct r10conf *conf); -#define raid10_log(md, fmt, args...) \ - do { if ((md)->queue) blk_add_trace_msg((md)->queue, "raid10 " fmt, ##args); } while (0) - #include "raid1-10.c" #define NULL_CMD @@ -518,11 +515,7 @@ static void raid10_end_write_request(struct bio *bio) * The 'master' represents the composite IO operation to * user-side. So if something waits for IO, then it will * wait for the 'master' bio. - */ - sector_t first_bad; - int bad_sectors; - - /* + * * Do not set R10BIO_Uptodate if the current device is * rebuilding or Faulty. This is because we cannot use * such device for properly reading the data back (we could @@ -535,10 +528,9 @@ static void raid10_end_write_request(struct bio *bio) set_bit(R10BIO_Uptodate, &r10_bio->state); /* Maybe we can clear some bad blocks. */ - if (is_badblock(rdev, - r10_bio->devs[slot].addr, - r10_bio->sectors, - &first_bad, &bad_sectors) && !discard_error) { + if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, + r10_bio->sectors) && + !discard_error) { bio_put(bio); if (repl) r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; @@ -753,17 +745,8 @@ static struct md_rdev *read_balance(struct r10conf *conf, best_good_sectors = 0; do_balance = 1; clear_bit(R10BIO_FailFast, &r10_bio->state); - /* - * Check if we can balance. We can balance on the whole - * device if no resync is going on (recovery is ok), or below - * the resync window. We take the first readable disk when - * above the resync window. - */ - if ((conf->mddev->recovery_cp < MaxSector - && (this_sector + sectors >= conf->next_resync)) || - (mddev_is_clustered(conf->mddev) && - md_cluster_ops->area_resyncing(conf->mddev, READ, this_sector, - this_sector + sectors))) + + if (raid1_should_read_first(conf->mddev, this_sector, sectors)) do_balance = 0; for (slot = 0; slot < conf->copies ; slot++) { @@ -1033,7 +1016,7 @@ static bool wait_barrier(struct r10conf *conf, bool nowait) ret = false; } else { conf->nr_waiting++; - raid10_log(conf->mddev, "wait barrier"); + mddev_add_trace_msg(conf->mddev, "raid10 wait barrier"); wait_event_barrier(conf, stop_waiting_barrier(conf)); conf->nr_waiting--; } @@ -1152,7 +1135,7 @@ static bool regular_request_wait(struct mddev *mddev, struct r10conf *conf, bio_wouldblock_error(bio); return false; } - raid10_log(conf->mddev, "wait reshape"); + mddev_add_trace_msg(conf->mddev, "raid10 wait reshape"); wait_event(conf->wait_barrier, conf->reshape_progress <= bio->bi_iter.bi_sector || conf->reshape_progress >= bio->bi_iter.bi_sector + @@ -1249,10 +1232,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio, test_bit(R10BIO_FailFast, &r10_bio->state)) read_bio->bi_opf |= MD_FAILFAST; read_bio->bi_private = r10_bio; - - if (mddev->gendisk) - trace_block_bio_remap(read_bio, disk_devt(mddev->gendisk), - r10_bio->sector); + mddev_trace_remap(mddev, read_bio, r10_bio->sector); submit_bio_noacct(read_bio); return; } @@ -1288,10 +1268,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, && enough(conf, devnum)) mbio->bi_opf |= MD_FAILFAST; mbio->bi_private = r10_bio; - - if (conf->mddev->gendisk) - trace_block_bio_remap(mbio, disk_devt(conf->mddev->gendisk), - r10_bio->sector); + mddev_trace_remap(mddev, mbio, r10_bio->sector); /* flush_pending_writes() needs access to the rdev so...*/ mbio->bi_bdev = (void *)rdev; @@ -1330,10 +1307,7 @@ retry_wait: } if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; - int is_bad; /* * Discard request doesn't care the write result @@ -1342,9 +1316,8 @@ retry_wait: if (!r10_bio->sectors) continue; - is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, - &first_bad, &bad_sectors); - if (is_bad < 0) { + if (rdev_has_badblock(rdev, dev_sector, + r10_bio->sectors) < 0) { /* * Mustn't write here until the bad block * is acknowledged @@ -1360,8 +1333,9 @@ retry_wait: if (unlikely(blocked_rdev)) { /* Have to wait for this device to get unblocked, then retry */ allow_barrier(conf); - raid10_log(conf->mddev, "%s wait rdev %d blocked", - __func__, blocked_rdev->raid_disk); + mddev_add_trace_msg(conf->mddev, + "raid10 %s wait rdev %d blocked", + __func__, blocked_rdev->raid_disk); md_wait_for_blocked_rdev(blocked_rdev, mddev); wait_barrier(conf, false); goto retry_wait; @@ -1416,7 +1390,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, bio_wouldblock_error(bio); return; } - raid10_log(conf->mddev, "wait reshape metadata"); + mddev_add_trace_msg(conf->mddev, + "raid10 wait reshape metadata"); wait_event(mddev->sb_wait, !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags)); @@ -2131,10 +2106,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) continue; } - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - + err = mddev_stack_new_rdev(mddev, rdev); + if (err) + return err; p->head_position = 0; p->recovery_disabled = mddev->recovery_disabled - 1; rdev->raid_disk = mirror; @@ -2150,10 +2124,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(In_sync, &rdev->flags); set_bit(Replacement, &rdev->flags); rdev->raid_disk = repl_slot; - err = 0; - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); + err = mddev_stack_new_rdev(mddev, rdev); + if (err) + return err; conf->fullsync = 1; WRITE_ONCE(p->replacement, rdev); } @@ -2290,8 +2263,6 @@ static void end_sync_write(struct bio *bio) struct mddev *mddev = r10_bio->mddev; struct r10conf *conf = mddev->private; int d; - sector_t first_bad; - int bad_sectors; int slot; int repl; struct md_rdev *rdev = NULL; @@ -2312,11 +2283,10 @@ static void end_sync_write(struct bio *bio) &rdev->mddev->recovery); set_bit(R10BIO_WriteError, &r10_bio->state); } - } else if (is_badblock(rdev, - r10_bio->devs[slot].addr, - r10_bio->sectors, - &first_bad, &bad_sectors)) + } else if (rdev_has_badblock(rdev, r10_bio->devs[slot].addr, + r10_bio->sectors)) { set_bit(R10BIO_MadeGood, &r10_bio->state); + } rdev_dec_pending(rdev, mddev); @@ -2597,11 +2567,8 @@ static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, int sectors, struct page *page, enum req_op op) { - sector_t first_bad; - int bad_sectors; - - if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) - && (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) + if (rdev_has_badblock(rdev, sector, sectors) && + (op == REQ_OP_READ || test_bit(WriteErrorSeen, &rdev->flags))) return -1; if (sync_page_io(rdev, sector, sectors << 9, page, op, false)) /* success */ @@ -2658,16 +2625,14 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 s = PAGE_SIZE >> 9; do { - sector_t first_bad; - int bad_sectors; - d = r10_bio->devs[sl].devnum; rdev = conf->mirrors[d].rdev; if (rdev && test_bit(In_sync, &rdev->flags) && !test_bit(Faulty, &rdev->flags) && - is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, - &first_bad, &bad_sectors) == 0) { + rdev_has_badblock(rdev, + r10_bio->devs[sl].addr + sect, + s) == 0) { atomic_inc(&rdev->nr_pending); success = sync_page_io(rdev, r10_bio->devs[sl].addr + @@ -4002,14 +3967,26 @@ static struct r10conf *setup_conf(struct mddev *mddev) return ERR_PTR(err); } -static void raid10_set_io_opt(struct r10conf *conf) +static unsigned int raid10_nr_stripes(struct r10conf *conf) { - int raid_disks = conf->geo.raid_disks; + unsigned int raid_disks = conf->geo.raid_disks; - if (!(conf->geo.raid_disks % conf->geo.near_copies)) - raid_disks /= conf->geo.near_copies; - blk_queue_io_opt(conf->mddev->queue, (conf->mddev->chunk_sectors << 9) * - raid_disks); + if (conf->geo.raid_disks % conf->geo.near_copies) + return raid_disks; + return raid_disks / conf->geo.near_copies; +} + +static int raid10_set_queue_limits(struct mddev *mddev) +{ + struct r10conf *conf = mddev->private; + struct queue_limits lim; + + blk_set_stacking_limits(&lim); + lim.max_write_zeroes_sectors = 0; + lim.io_min = mddev->chunk_sectors << 9; + lim.io_opt = lim.io_min * raid10_nr_stripes(conf); + mddev_stack_rdev_limits(mddev, &lim); + return queue_limits_set(mddev->gendisk->queue, &lim); } static int raid10_run(struct mddev *mddev) @@ -4021,6 +3998,7 @@ static int raid10_run(struct mddev *mddev) sector_t size; sector_t min_offset_diff = 0; int first = 1; + int ret = -EIO; if (mddev->private == NULL) { conf = setup_conf(mddev); @@ -4047,12 +4025,6 @@ static int raid10_run(struct mddev *mddev) } } - if (mddev->queue) { - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); - blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); - raid10_set_io_opt(conf); - } - rdev_for_each(rdev, mddev) { long long diff; @@ -4081,14 +4053,16 @@ static int raid10_run(struct mddev *mddev) if (first || diff < min_offset_diff) min_offset_diff = diff; - if (mddev->gendisk) - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - disk->head_position = 0; first = 0; } + if (!mddev_is_dm(conf->mddev)) { + ret = raid10_set_queue_limits(mddev); + if (ret) + goto out_free_conf; + } + /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { pr_err("md/raid10:%s: not enough operational mirrors.\n", @@ -4185,7 +4159,7 @@ out_free_conf: raid10_free_conf(conf); mddev->private = NULL; out: - return -EIO; + return ret; } static void raid10_free(struct mddev *mddev, void *priv) @@ -4954,8 +4928,7 @@ static void end_reshape(struct r10conf *conf) conf->reshape_safe = MaxSector; spin_unlock_irq(&conf->device_lock); - if (conf->mddev->queue) - raid10_set_io_opt(conf); + mddev_update_io_opt(conf->mddev, raid10_nr_stripes(conf)); conf->fullsync = 0; } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index da4ba736c4f0..a70cbec12ed0 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -1393,7 +1393,8 @@ int ppl_init_log(struct r5conf *conf) ppl_conf->signature = ~crc32c_le(~0, mddev->uuid, sizeof(mddev->uuid)); ppl_conf->block_size = 512; } else { - ppl_conf->block_size = queue_logical_block_size(mddev->queue); + ppl_conf->block_size = + queue_logical_block_size(mddev->gendisk->queue); } for (i = 0; i < ppl_conf->count; i++) { diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6a7a32f7fb91..d874abfc1836 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -36,6 +36,7 @@ */ #include <linux/blkdev.h> +#include <linux/delay.h> #include <linux/kthread.h> #include <linux/raid/pq.h> #include <linux/async_tx.h> @@ -760,6 +761,7 @@ enum stripe_result { STRIPE_RETRY, STRIPE_SCHEDULE_AND_RETRY, STRIPE_FAIL, + STRIPE_WAIT_RESHAPE, }; struct stripe_request_ctx { @@ -1210,10 +1212,8 @@ again: */ while (op_is_write(op) && rdev && test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - int bad_sectors; - int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors); + int bad = rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf)); if (!bad) break; @@ -1295,10 +1295,7 @@ again: if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); - if (conf->mddev->gendisk) - trace_block_bio_remap(bi, - disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + mddev_trace_remap(conf->mddev, bi, sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, bi); else @@ -1342,10 +1339,7 @@ again: */ if (op == REQ_OP_DISCARD) rbi->bi_vcnt = 0; - if (conf->mddev->gendisk) - trace_block_bio_remap(rbi, - disk_devt(conf->mddev->gendisk), - sh->dev[i].sector); + mddev_trace_remap(conf->mddev, rbi, sh->dev[i].sector); if (should_defer && op_is_write(op)) bio_list_add(&pending_bios, rbi); else @@ -2412,7 +2406,7 @@ static int grow_one_stripe(struct r5conf *conf, gfp_t gfp) atomic_inc(&conf->active_stripes); raid5_release_stripe(sh); - conf->max_nr_stripes++; + WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes + 1); return 1; } @@ -2422,12 +2416,12 @@ static int grow_stripes(struct r5conf *conf, int num) size_t namelen = sizeof(conf->cache_name[0]); int devs = max(conf->raid_disks, conf->previous_raid_disks); - if (conf->mddev->gendisk) + if (mddev_is_dm(conf->mddev)) snprintf(conf->cache_name[0], namelen, - "raid%d-%s", conf->level, mdname(conf->mddev)); + "raid%d-%p", conf->level, conf->mddev); else snprintf(conf->cache_name[0], namelen, - "raid%d-%p", conf->level, conf->mddev); + "raid%d-%s", conf->level, mdname(conf->mddev)); snprintf(conf->cache_name[1], namelen, "%.27s-alt", conf->cache_name[0]); conf->active_name = 0; @@ -2707,7 +2701,7 @@ static int drop_one_stripe(struct r5conf *conf) shrink_buffers(sh); free_stripe(conf->slab_cache, sh); atomic_dec(&conf->active_stripes); - conf->max_nr_stripes--; + WRITE_ONCE(conf->max_nr_stripes, conf->max_nr_stripes - 1); return 1; } @@ -2855,8 +2849,6 @@ static void raid5_end_write_request(struct bio *bi) struct r5conf *conf = sh->raid_conf; int disks = sh->disks, i; struct md_rdev *rdev; - sector_t first_bad; - int bad_sectors; int replacement = 0; for (i = 0 ; i < disks; i++) { @@ -2888,9 +2880,8 @@ static void raid5_end_write_request(struct bio *bi) if (replacement) { if (bi->bi_status) md_error(conf->mddev, rdev); - else if (is_badblock(rdev, sh->sector, - RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors)) + else if (rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf))) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { if (bi->bi_status) { @@ -2900,9 +2891,8 @@ static void raid5_end_write_request(struct bio *bi) if (!test_and_set_bit(WantReplacement, &rdev->flags)) set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); - } else if (is_badblock(rdev, sh->sector, - RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors)) { + } else if (rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf))) { set_bit(R5_MadeGood, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) /* That was a successful write so make @@ -4205,10 +4195,9 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ - if (conf->mddev->queue) - blk_add_trace_msg(conf->mddev->queue, - "raid5 rmw %llu %d", - (unsigned long long)sh->sector, rmw); + mddev_add_trace_msg(conf->mddev, "raid5 rmw %llu %d", + sh->sector, rmw); + for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; if (test_bit(R5_InJournal, &dev->flags) && @@ -4285,10 +4274,11 @@ static int handle_stripe_dirtying(struct r5conf *conf, set_bit(STRIPE_DELAYED, &sh->state); } } - if (rcw && conf->mddev->queue) - blk_add_trace_msg(conf->mddev->queue, "raid5 rcw %llu %d %d %d", - (unsigned long long)sh->sector, - rcw, qread, test_bit(STRIPE_DELAYED, &sh->state)); + if (rcw && !mddev_is_dm(conf->mddev)) + blk_add_trace_msg(conf->mddev->gendisk->queue, + "raid5 rcw %llu %d %d %d", + (unsigned long long)sh->sector, rcw, qread, + test_bit(STRIPE_DELAYED, &sh->state)); } if (rcw > disks && rmw > disks && @@ -4674,8 +4664,6 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) /* Now to look around and see what can be done */ for (i=disks; i--; ) { struct md_rdev *rdev; - sector_t first_bad; - int bad_sectors; int is_bad = 0; dev = &sh->dev[i]; @@ -4719,8 +4707,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) rdev = conf->disks[i].replacement; if (rdev && !test_bit(Faulty, &rdev->flags) && rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && - !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors)) + !rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf))) set_bit(R5_ReadRepl, &dev->flags); else { if (rdev && !test_bit(Faulty, &rdev->flags)) @@ -4733,8 +4721,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) { - is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), - &first_bad, &bad_sectors); + is_bad = rdev_has_badblock(rdev, sh->sector, + RAID5_STRIPE_SECTORS(conf)); if (s->blocked_rdev == NULL && (test_bit(Blocked, &rdev->flags) || is_bad < 0)) { @@ -5463,8 +5451,8 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) struct r5conf *conf = mddev->private; struct bio *align_bio; struct md_rdev *rdev; - sector_t sector, end_sector, first_bad; - int bad_sectors, dd_idx; + sector_t sector, end_sector; + int dd_idx; bool did_inc; if (!in_chunk_boundary(mddev, raid_bio)) { @@ -5493,8 +5481,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) atomic_inc(&rdev->nr_pending); - if (is_badblock(rdev, sector, bio_sectors(raid_bio), &first_bad, - &bad_sectors)) { + if (rdev_has_badblock(rdev, sector, bio_sectors(raid_bio))) { rdev_dec_pending(rdev, mddev); return 0; } @@ -5530,9 +5517,7 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) spin_unlock_irq(&conf->device_lock); } - if (mddev->gendisk) - trace_block_bio_remap(align_bio, disk_devt(mddev->gendisk), - raid_bio->bi_iter.bi_sector); + mddev_trace_remap(mddev, align_bio, raid_bio->bi_iter.bi_sector); submit_bio_noacct(align_bio); return 1; } @@ -5701,8 +5686,8 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) } release_inactive_stripe_list(conf, cb->temp_inactive_list, NR_STRIPE_HASH_LOCKS); - if (mddev->queue) - trace_block_unplug(mddev->queue, cnt, !from_schedule); + if (!mddev_is_dm(mddev)) + trace_block_unplug(mddev->gendisk->queue, cnt, !from_schedule); kfree(cb); } @@ -5946,7 +5931,8 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, if (ahead_of_reshape(mddev, logical_sector, conf->reshape_safe)) { spin_unlock_irq(&conf->device_lock); - return STRIPE_SCHEDULE_AND_RETRY; + ret = STRIPE_SCHEDULE_AND_RETRY; + goto out; } } spin_unlock_irq(&conf->device_lock); @@ -6025,6 +6011,12 @@ static enum stripe_result make_stripe_request(struct mddev *mddev, out_release: raid5_release_stripe(sh); +out: + if (ret == STRIPE_SCHEDULE_AND_RETRY && reshape_interrupted(mddev)) { + bi->bi_status = BLK_STS_RESOURCE; + ret = STRIPE_WAIT_RESHAPE; + pr_err_ratelimited("dm-raid456: io across reshape position while reshape can't make progress"); + } return ret; } @@ -6146,7 +6138,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) while (1) { res = make_stripe_request(mddev, conf, &ctx, logical_sector, bi); - if (res == STRIPE_FAIL) + if (res == STRIPE_FAIL || res == STRIPE_WAIT_RESHAPE) break; if (res == STRIPE_RETRY) @@ -6184,6 +6176,11 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) if (rw == WRITE) md_write_end(mddev); + if (res == STRIPE_WAIT_RESHAPE) { + md_free_cloned_bio(bi); + return false; + } + bio_endio(bi); return true; } @@ -6773,7 +6770,18 @@ static void raid5d(struct md_thread *thread) spin_unlock_irq(&conf->device_lock); md_check_recovery(mddev); spin_lock_irq(&conf->device_lock); + + /* + * Waiting on MD_SB_CHANGE_PENDING below may deadlock + * seeing md_check_recovery() is needed to clear + * the flag when using mdmon. + */ + continue; } + + wait_event_lock_irq(mddev->sb_wait, + !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags), + conf->device_lock); } pr_debug("%d stripes handled\n", handled); @@ -6820,7 +6828,7 @@ raid5_set_cache_size(struct mddev *mddev, int size) if (size <= 16 || size > 32768) return -EINVAL; - conf->min_nr_stripes = size; + WRITE_ONCE(conf->min_nr_stripes, size); mutex_lock(&conf->cache_size_mutex); while (size < conf->max_nr_stripes && drop_one_stripe(conf)) @@ -6832,7 +6840,7 @@ raid5_set_cache_size(struct mddev *mddev, int size) mutex_lock(&conf->cache_size_mutex); while (size > conf->max_nr_stripes) if (!grow_one_stripe(conf, GFP_KERNEL)) { - conf->min_nr_stripes = conf->max_nr_stripes; + WRITE_ONCE(conf->min_nr_stripes, conf->max_nr_stripes); result = -ENOMEM; break; } @@ -6967,10 +6975,8 @@ raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) pr_debug("md/raid: change stripe_size from %lu to %lu\n", conf->stripe_size, new); - if (mddev->sync_thread || - test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || - mddev->reshape_position != MaxSector || - mddev->sysfs_active) { + if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || + mddev->reshape_position != MaxSector || mddev->sysfs_active) { err = -EBUSY; goto out_unlock; } @@ -7084,7 +7090,7 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) if (!conf) err = -ENODEV; else if (new != conf->skip_copy) { - struct request_queue *q = mddev->queue; + struct request_queue *q = mddev->gendisk->queue; conf->skip_copy = new; if (new) @@ -7390,11 +7396,13 @@ static unsigned long raid5_cache_count(struct shrinker *shrink, struct shrink_control *sc) { struct r5conf *conf = shrink->private_data; + int max_stripes = READ_ONCE(conf->max_nr_stripes); + int min_stripes = READ_ONCE(conf->min_nr_stripes); - if (conf->max_nr_stripes < conf->min_nr_stripes) + if (max_stripes < min_stripes) /* unlikely, but not impossible */ return 0; - return conf->max_nr_stripes - conf->min_nr_stripes; + return max_stripes - min_stripes; } static struct r5conf *setup_conf(struct mddev *mddev) @@ -7684,10 +7692,65 @@ static int only_parity(int raid_disk, int algo, int raid_disks, int max_degraded return 0; } -static void raid5_set_io_opt(struct r5conf *conf) +static int raid5_set_limits(struct mddev *mddev) { - blk_queue_io_opt(conf->mddev->queue, (conf->chunk_sectors << 9) * - (conf->raid_disks - conf->max_degraded)); + struct r5conf *conf = mddev->private; + struct queue_limits lim; + int data_disks, stripe; + struct md_rdev *rdev; + + /* + * The read-ahead size must cover two whole stripes, which is + * 2 * (datadisks) * chunksize where 'n' is the number of raid devices. + */ + data_disks = conf->previous_raid_disks - conf->max_degraded; + + /* + * We can only discard a whole stripe. It doesn't make sense to + * discard data disk but write parity disk + */ + stripe = roundup_pow_of_two(data_disks * (mddev->chunk_sectors << 9)); + + blk_set_stacking_limits(&lim); + lim.io_min = mddev->chunk_sectors << 9; + lim.io_opt = lim.io_min * (conf->raid_disks - conf->max_degraded); + lim.raid_partial_stripes_expensive = 1; + lim.discard_granularity = stripe; + lim.max_write_zeroes_sectors = 0; + mddev_stack_rdev_limits(mddev, &lim); + rdev_for_each(rdev, mddev) + queue_limits_stack_bdev(&lim, rdev->bdev, rdev->new_data_offset, + mddev->gendisk->disk_name); + + /* + * Zeroing is required for discard, otherwise data could be lost. + * + * Consider a scenario: discard a stripe (the stripe could be + * inconsistent if discard_zeroes_data is 0); write one disk of the + * stripe (the stripe could be inconsistent again depending on which + * disks are used to calculate parity); the disk is broken; The stripe + * data of this disk is lost. + * + * We only allow DISCARD if the sysadmin has confirmed that only safe + * devices are in use by setting a module parameter. A better idea + * might be to turn DISCARD into WRITE_ZEROES requests, as that is + * required to be safe. + */ + if (!devices_handle_discard_safely || + lim.max_discard_sectors < (stripe >> 9) || + lim.discard_granularity < stripe) + lim.max_hw_discard_sectors = 0; + + /* + * Requests require having a bitmap for each stripe. + * Limit the max sectors based on this. + */ + lim.max_hw_sectors = RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf); + + /* No restrictions on the number of segments in the request */ + lim.max_segments = USHRT_MAX; + + return queue_limits_set(mddev->gendisk->queue, &lim); } static int raid5_run(struct mddev *mddev) @@ -7700,6 +7763,7 @@ static int raid5_run(struct mddev *mddev) int i; long long min_offset_diff = 0; int first = 1; + int ret = -EIO; if (mddev->recovery_cp != MaxSector) pr_notice("md/raid:%s: not clean -- starting background reconstruction\n", @@ -7948,66 +8012,10 @@ static int raid5_run(struct mddev *mddev) mdname(mddev)); md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); - if (mddev->queue) { - int chunk_size; - /* read-ahead size must cover two whole stripes, which - * is 2 * (datadisks) * chunksize where 'n' is the - * number of raid devices - */ - int data_disks = conf->previous_raid_disks - conf->max_degraded; - int stripe = data_disks * - ((mddev->chunk_sectors << 9) / PAGE_SIZE); - - chunk_size = mddev->chunk_sectors << 9; - blk_queue_io_min(mddev->queue, chunk_size); - raid5_set_io_opt(conf); - mddev->queue->limits.raid_partial_stripes_expensive = 1; - /* - * We can only discard a whole stripe. It doesn't make sense to - * discard data disk but write parity disk - */ - stripe = stripe * PAGE_SIZE; - stripe = roundup_pow_of_two(stripe); - mddev->queue->limits.discard_granularity = stripe; - - blk_queue_max_write_zeroes_sectors(mddev->queue, 0); - - rdev_for_each(rdev, mddev) { - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->data_offset << 9); - disk_stack_limits(mddev->gendisk, rdev->bdev, - rdev->new_data_offset << 9); - } - - /* - * zeroing is required, otherwise data - * could be lost. Consider a scenario: discard a stripe - * (the stripe could be inconsistent if - * discard_zeroes_data is 0); write one disk of the - * stripe (the stripe could be inconsistent again - * depending on which disks are used to calculate - * parity); the disk is broken; The stripe data of this - * disk is lost. - * - * We only allow DISCARD if the sysadmin has confirmed that - * only safe devices are in use by setting a module parameter. - * A better idea might be to turn DISCARD into WRITE_ZEROES - * requests, as that is required to be safe. - */ - if (!devices_handle_discard_safely || - mddev->queue->limits.max_discard_sectors < (stripe >> 9) || - mddev->queue->limits.discard_granularity < stripe) - blk_queue_max_discard_sectors(mddev->queue, 0); - - /* - * Requests require having a bitmap for each stripe. - * Limit the max sectors based on this. - */ - blk_queue_max_hw_sectors(mddev->queue, - RAID5_MAX_REQ_STRIPES << RAID5_STRIPE_SHIFT(conf)); - - /* No restrictions on the number of segments in the request */ - blk_queue_max_segments(mddev->queue, USHRT_MAX); + if (!mddev_is_dm(mddev)) { + ret = raid5_set_limits(mddev); + if (ret) + goto abort; } if (log_init(conf, journal_dev, raid5_has_ppl(conf))) @@ -8020,7 +8028,7 @@ abort: free_conf(conf); mddev->private = NULL; pr_warn("md/raid:%s: failed to run raid set.\n", mdname(mddev)); - return -EIO; + return ret; } static void raid5_free(struct mddev *mddev, void *priv) @@ -8531,8 +8539,8 @@ static void end_reshape(struct r5conf *conf) spin_unlock_irq(&conf->device_lock); wake_up(&conf->wait_for_overlap); - if (conf->mddev->queue) - raid5_set_io_opt(conf); + mddev_update_io_opt(conf->mddev, + conf->raid_disks - conf->max_degraded); } } @@ -8909,6 +8917,18 @@ static int raid5_start(struct mddev *mddev) return r5l_start(conf->log); } +/* + * This is only used for dm-raid456, caller already frozen sync_thread, hence + * if rehsape is still in progress, io that is waiting for reshape can never be + * done now, hence wake up and handle those IO. + */ +static void raid5_prepare_suspend(struct mddev *mddev) +{ + struct r5conf *conf = mddev->private; + + wake_up(&conf->wait_for_overlap); +} + static struct md_personality raid6_personality = { .name = "raid6", @@ -8932,6 +8952,7 @@ static struct md_personality raid6_personality = .quiesce = raid5_quiesce, .takeover = raid6_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static struct md_personality raid5_personality = { @@ -8956,6 +8977,7 @@ static struct md_personality raid5_personality = .quiesce = raid5_quiesce, .takeover = raid5_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static struct md_personality raid4_personality = @@ -8981,6 +9003,7 @@ static struct md_personality raid4_personality = .quiesce = raid5_quiesce, .takeover = raid4_takeover, .change_consistency_policy = raid5_change_consistency_policy, + .prepare_suspend = raid5_prepare_suspend, }; static int __init raid5_init(void) diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 04115cd92433..47a314a4eb6f 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -2078,6 +2078,12 @@ static const struct blk_mq_ops msb_mq_ops = { static int msb_init_disk(struct memstick_dev *card) { struct msb_data *msb = memstick_get_drvdata(card); + struct queue_limits lim = { + .logical_block_size = msb->page_size, + .max_hw_sectors = MS_BLOCK_MAX_PAGES, + .max_segments = MS_BLOCK_MAX_SEGS, + .max_segment_size = MS_BLOCK_MAX_PAGES * msb->page_size, + }; int rc; unsigned long capacity; @@ -2093,19 +2099,13 @@ static int msb_init_disk(struct memstick_dev *card) if (rc) goto out_release_id; - msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); + msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card); if (IS_ERR(msb->disk)) { rc = PTR_ERR(msb->disk); goto out_free_tag_set; } msb->queue = msb->disk->queue; - blk_queue_max_hw_sectors(msb->queue, MS_BLOCK_MAX_PAGES); - blk_queue_max_segments(msb->queue, MS_BLOCK_MAX_SEGS); - blk_queue_max_segment_size(msb->queue, - MS_BLOCK_MAX_PAGES * msb->page_size); - blk_queue_logical_block_size(msb->queue, msb->page_size); - sprintf(msb->disk->disk_name, "msblk%d", msb->disk_id); msb->disk->fops = &msb_bdops; msb->disk->private_data = msb; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 5a69ed33999b..49accfdc89d6 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1103,6 +1103,12 @@ static const struct blk_mq_ops mspro_mq_ops = { static int mspro_block_init_disk(struct memstick_dev *card) { struct mspro_block_data *msb = memstick_get_drvdata(card); + struct queue_limits lim = { + .logical_block_size = msb->page_size, + .max_hw_sectors = MSPRO_BLOCK_MAX_PAGES, + .max_segments = MSPRO_BLOCK_MAX_SEGS, + .max_segment_size = MSPRO_BLOCK_MAX_PAGES * msb->page_size, + }; struct mspro_devinfo *dev_info = NULL; struct mspro_sys_info *sys_info = NULL; struct mspro_sys_attr *s_attr = NULL; @@ -1138,18 +1144,13 @@ static int mspro_block_init_disk(struct memstick_dev *card) if (rc) goto out_release_id; - msb->disk = blk_mq_alloc_disk(&msb->tag_set, card); + msb->disk = blk_mq_alloc_disk(&msb->tag_set, &lim, card); if (IS_ERR(msb->disk)) { rc = PTR_ERR(msb->disk); goto out_free_tag_set; } msb->queue = msb->disk->queue; - blk_queue_max_hw_sectors(msb->queue, MSPRO_BLOCK_MAX_PAGES); - blk_queue_max_segments(msb->queue, MSPRO_BLOCK_MAX_SEGS); - blk_queue_max_segment_size(msb->queue, - MSPRO_BLOCK_MAX_PAGES * msb->page_size); - msb->disk->major = major; msb->disk->first_minor = disk_id << MSPRO_BLOCK_PART_SHIFT; msb->disk->minors = 1 << MSPRO_BLOCK_PART_SHIFT; @@ -1158,8 +1159,6 @@ static int mspro_block_init_disk(struct memstick_dev *card) sprintf(msb->disk->disk_name, "mspblk%d", disk_id); - blk_queue_logical_block_size(msb->queue, msb->page_size); - capacity = be16_to_cpu(sys_info->user_block_count); capacity *= be16_to_cpu(sys_info->block_size); capacity *= msb->page_size >> 9; diff --git a/drivers/mmc/core/queue.c b/drivers/mmc/core/queue.c index a0a2412f62a7..2ae60d208cdf 100644 --- a/drivers/mmc/core/queue.c +++ b/drivers/mmc/core/queue.c @@ -174,8 +174,8 @@ static struct scatterlist *mmc_alloc_sg(unsigned short sg_len, gfp_t gfp) return sg; } -static void mmc_queue_setup_discard(struct request_queue *q, - struct mmc_card *card) +static void mmc_queue_setup_discard(struct mmc_card *card, + struct queue_limits *lim) { unsigned max_discard; @@ -183,15 +183,17 @@ static void mmc_queue_setup_discard(struct request_queue *q, if (!max_discard) return; - blk_queue_max_discard_sectors(q, max_discard); - q->limits.discard_granularity = card->pref_erase << 9; - /* granularity must not be greater than max. discard */ - if (card->pref_erase > max_discard) - q->limits.discard_granularity = SECTOR_SIZE; + lim->max_hw_discard_sectors = max_discard; if (mmc_can_secure_erase_trim(card)) - blk_queue_max_secure_erase_sectors(q, max_discard); + lim->max_secure_erase_sectors = max_discard; if (mmc_can_trim(card) && card->erased_byte == 0) - blk_queue_max_write_zeroes_sectors(q, max_discard); + lim->max_write_zeroes_sectors = max_discard; + + /* granularity must not be greater than max. discard */ + if (card->pref_erase > max_discard) + lim->discard_granularity = SECTOR_SIZE; + else + lim->discard_granularity = card->pref_erase << 9; } static unsigned short mmc_get_max_segments(struct mmc_host *host) @@ -341,40 +343,53 @@ static const struct blk_mq_ops mmc_mq_ops = { .timeout = mmc_mq_timed_out, }; -static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) +static struct gendisk *mmc_alloc_disk(struct mmc_queue *mq, + struct mmc_card *card) { struct mmc_host *host = card->host; - unsigned block_size = 512; + struct queue_limits lim = { }; + struct gendisk *disk; - blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue); - blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue); if (mmc_can_erase(card)) - mmc_queue_setup_discard(mq->queue, card); + mmc_queue_setup_discard(card, &lim); if (!mmc_dev(host)->dma_mask || !*mmc_dev(host)->dma_mask) - blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_HIGH); - blk_queue_max_hw_sectors(mq->queue, - min(host->max_blk_count, host->max_req_size / 512)); - if (host->can_dma_map_merge) - WARN(!blk_queue_can_use_dma_map_merging(mq->queue, - mmc_dev(host)), - "merging was advertised but not possible"); - blk_queue_max_segments(mq->queue, mmc_get_max_segments(host)); - - if (mmc_card_mmc(card) && card->ext_csd.data_sector_size) { - block_size = card->ext_csd.data_sector_size; - WARN_ON(block_size != 512 && block_size != 4096); - } + lim.bounce = BLK_BOUNCE_HIGH; + + lim.max_hw_sectors = min(host->max_blk_count, host->max_req_size / 512); + + if (mmc_card_mmc(card) && card->ext_csd.data_sector_size) + lim.logical_block_size = card->ext_csd.data_sector_size; + else + lim.logical_block_size = 512; + + WARN_ON_ONCE(lim.logical_block_size != 512 && + lim.logical_block_size != 4096); - blk_queue_logical_block_size(mq->queue, block_size); /* - * After blk_queue_can_use_dma_map_merging() was called with succeed, - * since it calls blk_queue_virt_boundary(), the mmc should not call - * both blk_queue_max_segment_size(). + * Setting a virt_boundary implicity sets a max_segment_size, so try + * to set the hardware one here. */ - if (!host->can_dma_map_merge) - blk_queue_max_segment_size(mq->queue, - round_down(host->max_seg_size, block_size)); + if (host->can_dma_map_merge) { + lim.virt_boundary_mask = dma_get_merge_boundary(mmc_dev(host)); + lim.max_segments = MMC_DMA_MAP_MERGE_SEGMENTS; + } else { + lim.max_segment_size = + round_down(host->max_seg_size, lim.logical_block_size); + lim.max_segments = host->max_segs; + } + + disk = blk_mq_alloc_disk(&mq->tag_set, &lim, mq); + if (IS_ERR(disk)) + return disk; + mq->queue = disk->queue; + + if (mmc_host_is_spi(host) && host->use_spi_crc) + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue); + blk_queue_rq_timeout(mq->queue, 60 * HZ); + + blk_queue_flag_set(QUEUE_FLAG_NONROT, mq->queue); + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, mq->queue); dma_set_max_seg_size(mmc_dev(host), queue_max_segment_size(mq->queue)); @@ -386,6 +401,7 @@ static void mmc_setup_queue(struct mmc_queue *mq, struct mmc_card *card) init_waitqueue_head(&mq->wait); mmc_crypto_setup_queue(mq->queue, host); + return disk; } static inline bool mmc_merge_capable(struct mmc_host *host) @@ -447,18 +463,9 @@ struct gendisk *mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card) return ERR_PTR(ret); - disk = blk_mq_alloc_disk(&mq->tag_set, mq); - if (IS_ERR(disk)) { + disk = mmc_alloc_disk(mq, card); + if (IS_ERR(disk)) blk_mq_free_tag_set(&mq->tag_set); - return disk; - } - mq->queue = disk->queue; - - if (mmc_host_is_spi(host) && host->use_spi_crc) - blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, mq->queue); - blk_queue_rq_timeout(mq->queue, 60 * HZ); - - mmc_setup_queue(mq, card); return disk; } diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index f0526dcc2162..3caa0717d46c 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -277,6 +277,7 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) { struct mtd_blktrans_ops *tr = new->tr; struct mtd_blktrans_dev *d; + struct queue_limits lim = { }; int last_devnum = -1; struct gendisk *gd; int ret; @@ -331,9 +332,13 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING); if (ret) goto out_kfree_tag_set; + + lim.logical_block_size = tr->blksize; + if (tr->discard) + lim.max_hw_discard_sectors = UINT_MAX; /* Create gendisk */ - gd = blk_mq_alloc_disk(new->tag_set, new); + gd = blk_mq_alloc_disk(new->tag_set, &lim, new); if (IS_ERR(gd)) { ret = PTR_ERR(gd); goto out_free_tag_set; @@ -371,14 +376,9 @@ int add_mtd_blktrans_dev(struct mtd_blktrans_dev *new) if (tr->flush) blk_queue_write_cache(new->rq, true, false); - blk_queue_logical_block_size(new->rq, tr->blksize); - blk_queue_flag_set(QUEUE_FLAG_NONROT, new->rq); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, new->rq); - if (tr->discard) - blk_queue_max_discard_sectors(new->rq, UINT_MAX); - gd->queue = new->rq; if (new->readonly) diff --git a/drivers/mtd/ubi/block.c b/drivers/mtd/ubi/block.c index 654bd7372cd8..5c8fdcc088a0 100644 --- a/drivers/mtd/ubi/block.c +++ b/drivers/mtd/ubi/block.c @@ -348,6 +348,9 @@ static int calc_disk_capacity(struct ubi_volume_info *vi, u64 *disk_capacity) int ubiblock_create(struct ubi_volume_info *vi) { + struct queue_limits lim = { + .max_segments = UBI_MAX_SG_COUNT, + }; struct ubiblock *dev; struct gendisk *gd; u64 disk_capacity; @@ -393,7 +396,7 @@ int ubiblock_create(struct ubi_volume_info *vi) /* Initialize the gendisk of this ubiblock device */ - gd = blk_mq_alloc_disk(&dev->tag_set, dev); + gd = blk_mq_alloc_disk(&dev->tag_set, &lim, dev); if (IS_ERR(gd)) { ret = PTR_ERR(gd); goto out_free_tags; @@ -416,7 +419,6 @@ int ubiblock_create(struct ubi_volume_info *vi) dev->gd = gd; dev->rq = gd->queue; - blk_queue_max_segments(dev->rq, UBI_MAX_SG_COUNT); list_add_tail(&dev->list, &ubiblock_devices); diff --git a/drivers/nvdimm/btt.c b/drivers/nvdimm/btt.c index bb3726b622ad..4d0c527e8576 100644 --- a/drivers/nvdimm/btt.c +++ b/drivers/nvdimm/btt.c @@ -1496,19 +1496,21 @@ static int btt_blk_init(struct btt *btt) { struct nd_btt *nd_btt = btt->nd_btt; struct nd_namespace_common *ndns = nd_btt->ndns; - int rc = -ENOMEM; + struct queue_limits lim = { + .logical_block_size = btt->sector_size, + .max_hw_sectors = UINT_MAX, + }; + int rc; - btt->btt_disk = blk_alloc_disk(NUMA_NO_NODE); - if (!btt->btt_disk) - return -ENOMEM; + btt->btt_disk = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(btt->btt_disk)) + return PTR_ERR(btt->btt_disk); nvdimm_namespace_disk_name(ndns, btt->btt_disk->disk_name); btt->btt_disk->first_minor = 0; btt->btt_disk->fops = &btt_fops; btt->btt_disk->private_data = btt; - blk_queue_logical_block_size(btt->btt_disk->queue, btt->sector_size); - blk_queue_max_hw_sectors(btt->btt_disk->queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, btt->btt_disk->queue); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, btt->btt_disk->queue); diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 4e8fdcb3f1c8..8dcc10b6db5b 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -451,6 +451,11 @@ static int pmem_attach_disk(struct device *dev, { struct nd_namespace_io *nsio = to_nd_namespace_io(&ndns->dev); struct nd_region *nd_region = to_nd_region(dev->parent); + struct queue_limits lim = { + .logical_block_size = pmem_sector_size(ndns), + .physical_block_size = PAGE_SIZE, + .max_hw_sectors = UINT_MAX, + }; int nid = dev_to_node(dev), fua; struct resource *res = &nsio->res; struct range bb_range; @@ -497,9 +502,9 @@ static int pmem_attach_disk(struct device *dev, return -EBUSY; } - disk = blk_alloc_disk(nid); - if (!disk) - return -ENOMEM; + disk = blk_alloc_disk(&lim, nid); + if (IS_ERR(disk)) + return PTR_ERR(disk); q = disk->queue; pmem->disk = disk; @@ -539,9 +544,6 @@ static int pmem_attach_disk(struct device *dev, pmem->virt_addr = addr; blk_queue_write_cache(q, true, fua); - blk_queue_physical_block_size(q, PAGE_SIZE); - blk_queue_logical_block_size(q, pmem_sector_size(ndns)); - blk_queue_max_hw_sectors(q, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_NONROT, q); blk_queue_flag_set(QUEUE_FLAG_SYNCHRONOUS, q); if (pmem->pfn_flags & PFN_MAP) diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c index c727cd1f264b..a480cdeac288 100644 --- a/drivers/nvme/host/apple.c +++ b/drivers/nvme/host/apple.c @@ -1516,7 +1516,7 @@ static int apple_nvme_probe(struct platform_device *pdev) goto put_dev; } - anv->ctrl.admin_q = blk_mq_init_queue(&anv->admin_tagset); + anv->ctrl.admin_q = blk_mq_alloc_queue(&anv->admin_tagset, NULL, NULL); if (IS_ERR(anv->ctrl.admin_q)) { ret = -ENOMEM; goto put_dev; diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 0a96362912ce..00864a634470 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -114,12 +114,21 @@ static DEFINE_MUTEX(nvme_subsystems_lock); static DEFINE_IDA(nvme_instance_ida); static dev_t nvme_ctrl_base_chr_devt; -static struct class *nvme_class; -static struct class *nvme_subsys_class; +static int nvme_class_uevent(const struct device *dev, struct kobj_uevent_env *env); +static const struct class nvme_class = { + .name = "nvme", + .dev_uevent = nvme_class_uevent, +}; + +static const struct class nvme_subsys_class = { + .name = "nvme-subsystem", +}; static DEFINE_IDA(nvme_ns_chr_minor_ida); static dev_t nvme_ns_chr_devt; -static struct class *nvme_ns_chr_class; +static const struct class nvme_ns_chr_class = { + .name = "nvme-generic", +}; static void nvme_put_subsystem(struct nvme_subsystem *subsys); static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl, @@ -1398,8 +1407,10 @@ static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, sizeof(struct nvme_id_ctrl)); - if (error) + if (error) { kfree(*id); + *id = NULL; + } return error; } @@ -1528,6 +1539,7 @@ int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); kfree(*id); + *id = NULL; } return error; } @@ -1727,12 +1739,23 @@ int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) return 0; } -#ifdef CONFIG_BLK_DEV_INTEGRITY -static void nvme_init_integrity(struct gendisk *disk, - struct nvme_ns_head *head, u32 max_integrity_segments) +static bool nvme_init_integrity(struct gendisk *disk, struct nvme_ns_head *head) { struct blk_integrity integrity = { }; + blk_integrity_unregister(disk); + + if (!head->ms) + return true; + + /* + * PI can always be supported as we can ask the controller to simply + * insert/strip it, which is not possible for other kinds of metadata. + */ + if (!IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) || + !(head->features & NVME_NS_METADATA_SUPPORTED)) + return nvme_ns_has_pi(head); + switch (head->pi_type) { case NVME_NS_DPS_PI_TYPE3: switch (head->guard_type) { @@ -1775,53 +1798,32 @@ static void nvme_init_integrity(struct gendisk *disk, } integrity.tuple_size = head->ms; + integrity.pi_offset = head->pi_offset; blk_integrity_register(disk, &integrity); - blk_queue_max_integrity_segments(disk->queue, max_integrity_segments); -} -#else -static void nvme_init_integrity(struct gendisk *disk, - struct nvme_ns_head *head, u32 max_integrity_segments) -{ + return true; } -#endif /* CONFIG_BLK_DEV_INTEGRITY */ -static void nvme_config_discard(struct nvme_ctrl *ctrl, struct gendisk *disk, - struct nvme_ns_head *head) +static void nvme_config_discard(struct nvme_ns *ns, struct queue_limits *lim) { - struct request_queue *queue = disk->queue; - u32 max_discard_sectors; - - if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(head, UINT_MAX)) { - max_discard_sectors = nvme_lba_to_sect(head, ctrl->dmrsl); - } else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) { - max_discard_sectors = UINT_MAX; - } else { - blk_queue_max_discard_sectors(queue, 0); - return; - } + struct nvme_ctrl *ctrl = ns->ctrl; BUILD_BUG_ON(PAGE_SIZE / sizeof(struct nvme_dsm_range) < NVME_DSM_MAX_RANGES); - /* - * If discard is already enabled, don't reset queue limits. - * - * This works around the fact that the block layer can't cope well with - * updating the hardware limits when overridden through sysfs. This is - * harmless because discard limits in NVMe are purely advisory. - */ - if (queue->limits.max_discard_sectors) - return; + if (ctrl->dmrsl && ctrl->dmrsl <= nvme_sect_to_lba(ns->head, UINT_MAX)) + lim->max_hw_discard_sectors = + nvme_lba_to_sect(ns->head, ctrl->dmrsl); + else if (ctrl->oncs & NVME_CTRL_ONCS_DSM) + lim->max_hw_discard_sectors = UINT_MAX; + else + lim->max_hw_discard_sectors = 0; + + lim->discard_granularity = lim->logical_block_size; - blk_queue_max_discard_sectors(queue, max_discard_sectors); if (ctrl->dmrl) - blk_queue_max_discard_segments(queue, ctrl->dmrl); + lim->max_discard_segments = ctrl->dmrl; else - blk_queue_max_discard_segments(queue, NVME_DSM_MAX_RANGES); - queue->limits.discard_granularity = queue_logical_block_size(queue); - - if (ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) - blk_queue_max_write_zeroes_sectors(queue, UINT_MAX); + lim->max_discard_segments = NVME_DSM_MAX_RANGES; } static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) @@ -1832,42 +1834,38 @@ static bool nvme_ns_ids_equal(struct nvme_ns_ids *a, struct nvme_ns_ids *b) a->csi == b->csi; } -static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, - struct nvme_id_ns *id) +static int nvme_identify_ns_nvm(struct nvme_ctrl *ctrl, unsigned int nsid, + struct nvme_id_ns_nvm **nvmp) { - bool first = id->dps & NVME_NS_DPS_PI_FIRST; - unsigned lbaf = nvme_lbaf_index(id->flbas); - struct nvme_command c = { }; + struct nvme_command c = { + .identify.opcode = nvme_admin_identify, + .identify.nsid = cpu_to_le32(nsid), + .identify.cns = NVME_ID_CNS_CS_NS, + .identify.csi = NVME_CSI_NVM, + }; struct nvme_id_ns_nvm *nvm; - int ret = 0; - u32 elbaf; - - head->pi_size = 0; - head->ms = le16_to_cpu(id->lbaf[lbaf].ms); - if (!(ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) { - head->pi_size = sizeof(struct t10_pi_tuple); - head->guard_type = NVME_NVM_NS_16B_GUARD; - goto set_pi; - } + int ret; nvm = kzalloc(sizeof(*nvm), GFP_KERNEL); if (!nvm) return -ENOMEM; - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cpu_to_le32(head->ns_id); - c.identify.cns = NVME_ID_CNS_CS_NS; - c.identify.csi = NVME_CSI_NVM; - ret = nvme_submit_sync_cmd(ctrl->admin_q, &c, nvm, sizeof(*nvm)); if (ret) - goto free_data; + kfree(nvm); + else + *nvmp = nvm; + return ret; +} - elbaf = le32_to_cpu(nvm->elbaf[lbaf]); +static void nvme_configure_pi_elbas(struct nvme_ns_head *head, + struct nvme_id_ns *id, struct nvme_id_ns_nvm *nvm) +{ + u32 elbaf = le32_to_cpu(nvm->elbaf[nvme_lbaf_index(id->flbas)]); /* no support for storage tag formats right now */ if (nvme_elbaf_sts(elbaf)) - goto free_data; + return; head->guard_type = nvme_elbaf_guard_type(elbaf); switch (head->guard_type) { @@ -1880,30 +1878,31 @@ static int nvme_init_ms(struct nvme_ctrl *ctrl, struct nvme_ns_head *head, default: break; } - -free_data: - kfree(nvm); -set_pi: - if (head->pi_size && (first || head->ms == head->pi_size)) - head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; - else - head->pi_type = 0; - - return ret; } -static int nvme_configure_metadata(struct nvme_ctrl *ctrl, - struct nvme_ns_head *head, struct nvme_id_ns *id) +static void nvme_configure_metadata(struct nvme_ctrl *ctrl, + struct nvme_ns_head *head, struct nvme_id_ns *id, + struct nvme_id_ns_nvm *nvm) { - int ret; - - ret = nvme_init_ms(ctrl, head, id); - if (ret) - return ret; - head->features &= ~(NVME_NS_METADATA_SUPPORTED | NVME_NS_EXT_LBAS); + head->pi_type = 0; + head->pi_size = 0; + head->pi_offset = 0; + head->ms = le16_to_cpu(id->lbaf[nvme_lbaf_index(id->flbas)].ms); if (!head->ms || !(ctrl->ops->flags & NVME_F_METADATA_SUPPORTED)) - return 0; + return; + + if (nvm && (ctrl->ctratt & NVME_CTRL_ATTR_ELBAS)) { + nvme_configure_pi_elbas(head, id, nvm); + } else { + head->pi_size = sizeof(struct t10_pi_tuple); + head->guard_type = NVME_NVM_NS_16B_GUARD; + } + + if (head->pi_size && head->ms >= head->pi_size) + head->pi_type = id->dps & NVME_NS_DPS_PI_MASK; + if (!(id->dps & NVME_NS_DPS_PI_FIRST)) + head->pi_offset = head->ms - head->pi_size; if (ctrl->ops->flags & NVME_F_FABRICS) { /* @@ -1912,7 +1911,7 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl, * remap the separate metadata buffer from the block layer. */ if (WARN_ON_ONCE(!(id->flbas & NVME_NS_FLBAS_META_EXT))) - return 0; + return; head->features |= NVME_NS_EXT_LBAS; @@ -1939,33 +1938,32 @@ static int nvme_configure_metadata(struct nvme_ctrl *ctrl, else head->features |= NVME_NS_METADATA_SUPPORTED; } - return 0; } -static void nvme_set_queue_limits(struct nvme_ctrl *ctrl, - struct request_queue *q) +static u32 nvme_max_drv_segments(struct nvme_ctrl *ctrl) { - bool vwc = ctrl->vwc & NVME_CTRL_VWC_PRESENT; - - if (ctrl->max_hw_sectors) { - u32 max_segments = - (ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> 9)) + 1; + return ctrl->max_hw_sectors / (NVME_CTRL_PAGE_SIZE >> SECTOR_SHIFT) + 1; +} - max_segments = min_not_zero(max_segments, ctrl->max_segments); - blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors); - blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX)); - } - blk_queue_virt_boundary(q, NVME_CTRL_PAGE_SIZE - 1); - blk_queue_dma_alignment(q, 3); - blk_queue_write_cache(q, vwc, vwc); +static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl, + struct queue_limits *lim) +{ + lim->max_hw_sectors = ctrl->max_hw_sectors; + lim->max_segments = min_t(u32, USHRT_MAX, + min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments)); + lim->max_integrity_segments = ctrl->max_integrity_segments; + lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1; + lim->max_segment_size = UINT_MAX; + lim->dma_alignment = 3; } -static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, - struct nvme_ns_head *head, struct nvme_id_ns *id) +static bool nvme_update_disk_info(struct nvme_ns *ns, struct nvme_id_ns *id, + struct queue_limits *lim) { - sector_t capacity = nvme_lba_to_sect(head, le64_to_cpu(id->nsze)); + struct nvme_ns_head *head = ns->head; u32 bs = 1U << head->lba_shift; u32 atomic_bs, phys_bs, io_opt = 0; + bool valid = true; /* * The block layer can't support LBA sizes larger than the page size @@ -1973,12 +1971,10 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, * allow block I/O. */ if (head->lba_shift > PAGE_SHIFT || head->lba_shift < SECTOR_SHIFT) { - capacity = 0; bs = (1 << 9); + valid = false; } - blk_integrity_unregister(disk); - atomic_bs = phys_bs = bs; if (id->nabo == 0) { /* @@ -1989,7 +1985,7 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, if (id->nsfeat & NVME_NS_FEAT_ATOMICS && id->nawupf) atomic_bs = (1 + le16_to_cpu(id->nawupf)) * bs; else - atomic_bs = (1 + ctrl->subsys->awupf) * bs; + atomic_bs = (1 + ns->ctrl->subsys->awupf) * bs; } if (id->nsfeat & NVME_NS_FEAT_IO_OPT) { @@ -1999,36 +1995,20 @@ static void nvme_update_disk_info(struct nvme_ctrl *ctrl, struct gendisk *disk, io_opt = bs * (1 + le16_to_cpu(id->nows)); } - blk_queue_logical_block_size(disk->queue, bs); /* * Linux filesystems assume writing a single physical block is * an atomic operation. Hence limit the physical block size to the * value of the Atomic Write Unit Power Fail parameter. */ - blk_queue_physical_block_size(disk->queue, min(phys_bs, atomic_bs)); - blk_queue_io_min(disk->queue, phys_bs); - blk_queue_io_opt(disk->queue, io_opt); - - /* - * Register a metadata profile for PI, or the plain non-integrity NVMe - * metadata masquerading as Type 0 if supported, otherwise reject block - * I/O to namespaces with metadata except when the namespace supports - * PI, as it can strip/insert in that case. - */ - if (head->ms) { - if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) && - (head->features & NVME_NS_METADATA_SUPPORTED)) - nvme_init_integrity(disk, head, - ctrl->max_integrity_segments); - else if (!nvme_ns_has_pi(head)) - capacity = 0; - } - - set_capacity_and_notify(disk, capacity); - - nvme_config_discard(ctrl, disk, head); - blk_queue_max_write_zeroes_sectors(disk->queue, - ctrl->max_zeroes_sectors); + lim->logical_block_size = bs; + lim->physical_block_size = min(phys_bs, atomic_bs); + lim->io_min = phys_bs; + lim->io_opt = io_opt; + if (ns->ctrl->quirks & NVME_QUIRK_DEALLOCATE_ZEROES) + lim->max_write_zeroes_sectors = UINT_MAX; + else + lim->max_write_zeroes_sectors = ns->ctrl->max_zeroes_sectors; + return valid; } static bool nvme_ns_is_readonly(struct nvme_ns *ns, struct nvme_ns_info *info) @@ -2042,7 +2022,8 @@ static inline bool nvme_first_scan(struct gendisk *disk) return !disk_live(disk); } -static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) +static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id, + struct queue_limits *lim) { struct nvme_ctrl *ctrl = ns->ctrl; u32 iob; @@ -2070,38 +2051,36 @@ static void nvme_set_chunk_sectors(struct nvme_ns *ns, struct nvme_id_ns *id) return; } - blk_queue_chunk_sectors(ns->queue, iob); + lim->chunk_sectors = iob; } static int nvme_update_ns_info_generic(struct nvme_ns *ns, struct nvme_ns_info *info) { + struct queue_limits lim; + int ret; + blk_mq_freeze_queue(ns->disk->queue); - nvme_set_queue_limits(ns->ctrl, ns->queue); + lim = queue_limits_start_update(ns->disk->queue); + nvme_set_ctrl_limits(ns->ctrl, &lim); + ret = queue_limits_commit_update(ns->disk->queue, &lim); set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); blk_mq_unfreeze_queue(ns->disk->queue); - if (nvme_ns_head_multipath(ns->head)) { - blk_mq_freeze_queue(ns->head->disk->queue); - set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); - nvme_mpath_revalidate_paths(ns); - blk_stack_limits(&ns->head->disk->queue->limits, - &ns->queue->limits, 0); - ns->head->disk->flags |= GENHD_FL_HIDDEN; - blk_mq_unfreeze_queue(ns->head->disk->queue); - } - /* Hide the block-interface for these devices */ - ns->disk->flags |= GENHD_FL_HIDDEN; - set_bit(NVME_NS_READY, &ns->flags); - - return 0; + if (!ret) + ret = -ENODEV; + return ret; } static int nvme_update_ns_info_block(struct nvme_ns *ns, struct nvme_ns_info *info) { + bool vwc = ns->ctrl->vwc & NVME_CTRL_VWC_PRESENT; + struct queue_limits lim; + struct nvme_id_ns_nvm *nvm = NULL; struct nvme_id_ns *id; + sector_t capacity; unsigned lbaf; int ret; @@ -2113,30 +2092,52 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, /* namespace not allocated or attached */ info->is_removed = true; ret = -ENODEV; - goto error; + goto out; + } + + if (ns->ctrl->ctratt & NVME_CTRL_ATTR_ELBAS) { + ret = nvme_identify_ns_nvm(ns->ctrl, info->nsid, &nvm); + if (ret < 0) + goto out; } blk_mq_freeze_queue(ns->disk->queue); lbaf = nvme_lbaf_index(id->flbas); ns->head->lba_shift = id->lbaf[lbaf].ds; ns->head->nuse = le64_to_cpu(id->nuse); - nvme_set_queue_limits(ns->ctrl, ns->queue); - - ret = nvme_configure_metadata(ns->ctrl, ns->head, id); - if (ret < 0) { - blk_mq_unfreeze_queue(ns->disk->queue); - goto out; - } - nvme_set_chunk_sectors(ns, id); - nvme_update_disk_info(ns->ctrl, ns->disk, ns->head, id); + capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id->nsze)); - if (ns->head->ids.csi == NVME_CSI_ZNS) { - ret = nvme_update_zone_info(ns, lbaf); + lim = queue_limits_start_update(ns->disk->queue); + nvme_set_ctrl_limits(ns->ctrl, &lim); + nvme_configure_metadata(ns->ctrl, ns->head, id, nvm); + nvme_set_chunk_sectors(ns, id, &lim); + if (!nvme_update_disk_info(ns, id, &lim)) + capacity = 0; + nvme_config_discard(ns, &lim); + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + ns->head->ids.csi == NVME_CSI_ZNS) { + ret = nvme_update_zone_info(ns, lbaf, &lim); if (ret) { blk_mq_unfreeze_queue(ns->disk->queue); goto out; } } + ret = queue_limits_commit_update(ns->disk->queue, &lim); + if (ret) { + blk_mq_unfreeze_queue(ns->disk->queue); + goto out; + } + + /* + * Register a metadata profile for PI, or the plain non-integrity NVMe + * metadata masquerading as Type 0 if supported, otherwise reject block + * I/O to namespaces with metadata except when the namespace supports + * PI, as it can strip/insert in that case. + */ + if (!nvme_init_integrity(ns->disk, ns->head)) + capacity = 0; + + set_capacity_and_notify(ns->disk, capacity); /* * Only set the DEAC bit if the device guarantees that reads from @@ -2147,62 +2148,81 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns, if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3))) ns->head->features |= NVME_NS_DEAC; set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info)); + blk_queue_write_cache(ns->disk->queue, vwc, vwc); set_bit(NVME_NS_READY, &ns->flags); blk_mq_unfreeze_queue(ns->disk->queue); if (blk_queue_is_zoned(ns->queue)) { - ret = nvme_revalidate_zones(ns); + ret = blk_revalidate_disk_zones(ns->disk, NULL); if (ret && !nvme_first_scan(ns->disk)) goto out; } - if (nvme_ns_head_multipath(ns->head)) { - blk_mq_freeze_queue(ns->head->disk->queue); - nvme_update_disk_info(ns->ctrl, ns->head->disk, ns->head, id); - set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); - nvme_mpath_revalidate_paths(ns); - blk_stack_limits(&ns->head->disk->queue->limits, - &ns->queue->limits, 0); - disk_update_readahead(ns->head->disk); - blk_mq_unfreeze_queue(ns->head->disk->queue); - } - ret = 0; out: - /* - * If probing fails due an unsupported feature, hide the block device, - * but still allow other access. - */ - if (ret == -ENODEV) { - ns->disk->flags |= GENHD_FL_HIDDEN; - set_bit(NVME_NS_READY, &ns->flags); - ret = 0; - } - -error: + kfree(nvm); kfree(id); return ret; } static int nvme_update_ns_info(struct nvme_ns *ns, struct nvme_ns_info *info) { + bool unsupported = false; + int ret; + switch (info->ids.csi) { case NVME_CSI_ZNS: if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED)) { dev_info(ns->ctrl->device, "block device for nsid %u not supported without CONFIG_BLK_DEV_ZONED\n", info->nsid); - return nvme_update_ns_info_generic(ns, info); + ret = nvme_update_ns_info_generic(ns, info); + break; } - return nvme_update_ns_info_block(ns, info); + ret = nvme_update_ns_info_block(ns, info); + break; case NVME_CSI_NVM: - return nvme_update_ns_info_block(ns, info); + ret = nvme_update_ns_info_block(ns, info); + break; default: dev_info(ns->ctrl->device, "block device for nsid %u not supported (csi %u)\n", info->nsid, info->ids.csi); - return nvme_update_ns_info_generic(ns, info); + ret = nvme_update_ns_info_generic(ns, info); + break; + } + + /* + * If probing fails due an unsupported feature, hide the block device, + * but still allow other access. + */ + if (ret == -ENODEV) { + ns->disk->flags |= GENHD_FL_HIDDEN; + set_bit(NVME_NS_READY, &ns->flags); + unsupported = true; + ret = 0; } + + if (!ret && nvme_ns_head_multipath(ns->head)) { + struct queue_limits lim; + + blk_mq_freeze_queue(ns->head->disk->queue); + if (unsupported) + ns->head->disk->flags |= GENHD_FL_HIDDEN; + else + nvme_init_integrity(ns->head->disk, ns->head); + set_capacity_and_notify(ns->head->disk, get_capacity(ns->disk)); + set_disk_ro(ns->head->disk, nvme_ns_is_readonly(ns, info)); + nvme_mpath_revalidate_paths(ns); + + lim = queue_limits_start_update(ns->head->disk->queue); + queue_limits_stack_bdev(&lim, ns->disk->part0, 0, + ns->head->disk->disk_name); + ret = queue_limits_commit_update(ns->head->disk->queue, &lim); + blk_mq_unfreeze_queue(ns->head->disk->queue); + } + + return ret; } #ifdef CONFIG_BLK_SED_OPAL @@ -2877,7 +2897,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys->awupf = le16_to_cpu(id->awupf); nvme_mpath_default_iopolicy(subsys); - subsys->dev.class = nvme_subsys_class; + subsys->dev.class = &nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; subsys->dev.groups = nvme_subsys_attrs_groups; dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); @@ -3117,11 +3137,17 @@ static int nvme_check_ctrl_fabric_info(struct nvme_ctrl *ctrl, struct nvme_id_ct return -EINVAL; } + if (!ctrl->maxcmd) { + dev_err(ctrl->device, "Maximum outstanding commands is 0\n"); + return -EINVAL; + } + return 0; } static int nvme_init_identify(struct nvme_ctrl *ctrl) { + struct queue_limits lim; struct nvme_id_ctrl *id; u32 max_hw_sectors; bool prev_apst_enabled; @@ -3188,7 +3214,12 @@ static int nvme_init_identify(struct nvme_ctrl *ctrl) ctrl->max_hw_sectors = min_not_zero(ctrl->max_hw_sectors, max_hw_sectors); - nvme_set_queue_limits(ctrl, ctrl->admin_q); + lim = queue_limits_start_update(ctrl->admin_q); + nvme_set_ctrl_limits(ctrl, &lim); + ret = queue_limits_commit_update(ctrl->admin_q, &lim); + if (ret) + goto out_free; + ctrl->sgls = le32_to_cpu(id->sgls); ctrl->kas = le16_to_cpu(id->kas); ctrl->max_namespaces = le32_to_cpu(id->mnan); @@ -3420,7 +3451,7 @@ int nvme_cdev_add(struct cdev *cdev, struct device *cdev_device, if (minor < 0) return minor; cdev_device->devt = MKDEV(MAJOR(nvme_ns_chr_devt), minor); - cdev_device->class = nvme_ns_chr_class; + cdev_device->class = &nvme_ns_chr_class; cdev_device->release = nvme_cdev_rel; device_initialize(cdev_device); cdev_init(cdev, fops); @@ -3692,7 +3723,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info) if (!ns) return; - disk = blk_mq_alloc_disk(ctrl->tagset, ns); + disk = blk_mq_alloc_disk(ctrl->tagset, NULL, ns); if (IS_ERR(disk)) goto out_free_ns; disk->fops = &nvme_bdev_ops; @@ -4353,6 +4384,7 @@ EXPORT_SYMBOL_GPL(nvme_complete_async_event); int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, const struct blk_mq_ops *ops, unsigned int cmd_size) { + struct queue_limits lim = {}; int ret; memset(set, 0, sizeof(*set)); @@ -4372,14 +4404,14 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, if (ret) return ret; - ctrl->admin_q = blk_mq_init_queue(set); + ctrl->admin_q = blk_mq_alloc_queue(set, &lim, NULL); if (IS_ERR(ctrl->admin_q)) { ret = PTR_ERR(ctrl->admin_q); goto out_free_tagset; } if (ctrl->ops->flags & NVME_F_FABRICS) { - ctrl->fabrics_q = blk_mq_init_queue(set); + ctrl->fabrics_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->fabrics_q)) { ret = PTR_ERR(ctrl->fabrics_q); goto out_cleanup_admin_q; @@ -4443,7 +4475,7 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set, return ret; if (ctrl->ops->flags & NVME_F_FABRICS) { - ctrl->connect_q = blk_mq_init_queue(set); + ctrl->connect_q = blk_mq_alloc_queue(set, NULL, NULL); if (IS_ERR(ctrl->connect_q)) { ret = PTR_ERR(ctrl->connect_q); goto out_free_tag_set; @@ -4613,7 +4645,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, ctrl->device = &ctrl->ctrl_device; ctrl->device->devt = MKDEV(MAJOR(nvme_ctrl_base_chr_devt), ctrl->instance); - ctrl->device->class = nvme_class; + ctrl->device->class = &nvme_class; ctrl->device->parent = ctrl->dev; if (ops->dev_attr_groups) ctrl->device->groups = ops->dev_attr_groups; @@ -4846,42 +4878,36 @@ static int __init nvme_core_init(void) if (result < 0) goto destroy_delete_wq; - nvme_class = class_create("nvme"); - if (IS_ERR(nvme_class)) { - result = PTR_ERR(nvme_class); + result = class_register(&nvme_class); + if (result) goto unregister_chrdev; - } - nvme_class->dev_uevent = nvme_class_uevent; - nvme_subsys_class = class_create("nvme-subsystem"); - if (IS_ERR(nvme_subsys_class)) { - result = PTR_ERR(nvme_subsys_class); + result = class_register(&nvme_subsys_class); + if (result) goto destroy_class; - } result = alloc_chrdev_region(&nvme_ns_chr_devt, 0, NVME_MINORS, "nvme-generic"); if (result < 0) goto destroy_subsys_class; - nvme_ns_chr_class = class_create("nvme-generic"); - if (IS_ERR(nvme_ns_chr_class)) { - result = PTR_ERR(nvme_ns_chr_class); + result = class_register(&nvme_ns_chr_class); + if (result) goto unregister_generic_ns; - } + result = nvme_init_auth(); if (result) goto destroy_ns_chr; return 0; destroy_ns_chr: - class_destroy(nvme_ns_chr_class); + class_unregister(&nvme_ns_chr_class); unregister_generic_ns: unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); destroy_subsys_class: - class_destroy(nvme_subsys_class); + class_unregister(&nvme_subsys_class); destroy_class: - class_destroy(nvme_class); + class_unregister(&nvme_class); unregister_chrdev: unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_delete_wq: @@ -4897,9 +4923,9 @@ out: static void __exit nvme_core_exit(void) { nvme_exit_auth(); - class_destroy(nvme_ns_chr_class); - class_destroy(nvme_subsys_class); - class_destroy(nvme_class); + class_unregister(&nvme_ns_chr_class); + class_unregister(&nvme_subsys_class); + class_unregister(&nvme_class); unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS); unregister_chrdev_region(nvme_ctrl_base_chr_devt, NVME_MINORS); destroy_workqueue(nvme_delete_wq); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 495c171daead..1f0ea1f32d22 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -638,7 +638,7 @@ static struct key *nvmf_parse_key(int key_id) } key = key_lookup(key_id); - if (!IS_ERR(key)) + if (IS_ERR(key)) pr_err("key id %08x not found\n", key_id); else pr_debug("Using key id %08x\n", key_id); @@ -1319,7 +1319,10 @@ out_free_opts: return ERR_PTR(ret); } -static struct class *nvmf_class; +static const struct class nvmf_class = { + .name = "nvme-fabrics", +}; + static struct device *nvmf_device; static DEFINE_MUTEX(nvmf_dev_mutex); @@ -1439,15 +1442,14 @@ static int __init nvmf_init(void) if (!nvmf_default_host) return -ENOMEM; - nvmf_class = class_create("nvme-fabrics"); - if (IS_ERR(nvmf_class)) { + ret = class_register(&nvmf_class); + if (ret) { pr_err("couldn't register class nvme-fabrics\n"); - ret = PTR_ERR(nvmf_class); goto out_free_host; } nvmf_device = - device_create(nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); + device_create(&nvmf_class, NULL, MKDEV(0, 0), NULL, "ctl"); if (IS_ERR(nvmf_device)) { pr_err("couldn't create nvme-fabrics device!\n"); ret = PTR_ERR(nvmf_device); @@ -1463,9 +1465,9 @@ static int __init nvmf_init(void) return 0; out_destroy_device: - device_destroy(nvmf_class, MKDEV(0, 0)); + device_destroy(&nvmf_class, MKDEV(0, 0)); out_destroy_class: - class_destroy(nvmf_class); + class_unregister(&nvmf_class); out_free_host: nvmf_host_put(nvmf_default_host); return ret; @@ -1474,8 +1476,8 @@ out_free_host: static void __exit nvmf_exit(void) { misc_deregister(&nvmf_misc); - device_destroy(nvmf_class, MKDEV(0, 0)); - class_destroy(nvmf_class); + device_destroy(&nvmf_class, MKDEV(0, 0)); + class_unregister(&nvmf_class); nvmf_host_put(nvmf_default_host); BUILD_BUG_ON(sizeof(struct nvmf_common_command) != 64); diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 74de1e64aeea..5397fb428b24 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -516,6 +516,7 @@ static void nvme_requeue_work(struct work_struct *work) int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) { + struct queue_limits lim; bool vwc = false; mutex_init(&head->lock); @@ -532,9 +533,14 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) !nvme_is_unique_nsid(ctrl, head) || !multipath) return 0; - head->disk = blk_alloc_disk(ctrl->numa_node); - if (!head->disk) - return -ENOMEM; + blk_set_stacking_limits(&lim); + lim.dma_alignment = 3; + if (head->ids.csi != NVME_CSI_ZNS) + lim.max_zone_append_sectors = 0; + + head->disk = blk_alloc_disk(&lim, ctrl->numa_node); + if (IS_ERR(head->disk)) + return PTR_ERR(head->disk); head->disk->fops = &nvme_ns_head_ops; head->disk->private_data = head; sprintf(head->disk->disk_name, "nvme%dn%d", @@ -553,11 +559,6 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) ctrl->tagset->map[HCTX_TYPE_POLL].nr_queues) blk_queue_flag_set(QUEUE_FLAG_POLL, head->disk->queue); - /* set to a default value of 512 until the disk is validated */ - blk_queue_logical_block_size(head->disk->queue, 512); - blk_set_stacking_limits(&head->disk->queue->limits); - blk_queue_dma_alignment(head->disk->queue, 3); - /* we need to propagate up the VMC settings */ if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) vwc = true; diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 7b87763e2f8a..24193fcb8bd5 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -464,6 +464,7 @@ struct nvme_ns_head { u16 ms; u16 pi_size; u8 pi_type; + u8 pi_offset; u8 guard_type; u16 sgs; u32 sws; @@ -1035,11 +1036,11 @@ static inline bool nvme_disk_is_ns_head(struct gendisk *disk) } #endif /* CONFIG_NVME_MULTIPATH */ -int nvme_revalidate_zones(struct nvme_ns *ns); int nvme_ns_report_zones(struct nvme_ns *ns, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, + struct queue_limits *lim); #ifdef CONFIG_BLK_DEV_ZONED -int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf); blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, struct request *req, struct nvme_command *cmnd, enum nvme_zone_mgmt_action action); @@ -1050,13 +1051,6 @@ static inline blk_status_t nvme_setup_zone_mgmt_send(struct nvme_ns *ns, { return BLK_STS_NOTSUPP; } - -static inline int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) -{ - dev_warn(ns->ctrl->device, - "Please enable CONFIG_BLK_DEV_ZONED to support ZNS devices\n"); - return -EPROTONOSUPPORT; -} #endif static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 20fdd40b1879..366f0bb4ebfc 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -1006,6 +1006,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) { int ret; bool changed; + u16 max_queue_size; ret = nvme_rdma_configure_admin_queue(ctrl, new); if (ret) @@ -1030,11 +1031,16 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new) ctrl->ctrl.opts->queue_size, ctrl->ctrl.sqsize + 1); } - if (ctrl->ctrl.sqsize + 1 > NVME_RDMA_MAX_QUEUE_SIZE) { + if (ctrl->ctrl.max_integrity_segments) + max_queue_size = NVME_RDMA_MAX_METADATA_QUEUE_SIZE; + else + max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; + + if (ctrl->ctrl.sqsize + 1 > max_queue_size) { dev_warn(ctrl->ctrl.device, - "ctrl sqsize %u > max queue size %u, clamping down\n", - ctrl->ctrl.sqsize + 1, NVME_RDMA_MAX_QUEUE_SIZE); - ctrl->ctrl.sqsize = NVME_RDMA_MAX_QUEUE_SIZE - 1; + "ctrl sqsize %u > max queue size %u, clamping down\n", + ctrl->ctrl.sqsize + 1, max_queue_size); + ctrl->ctrl.sqsize = max_queue_size - 1; } if (ctrl->ctrl.sqsize + 1 > ctrl->ctrl.maxcmd) { diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index f2832f70e7e0..09fcaa519e5b 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -221,14 +221,11 @@ static int ns_update_nuse(struct nvme_ns *ns) ret = nvme_identify_ns(ns->ctrl, ns->head->ns_id, &id); if (ret) - goto out_free_id; + return ret; ns->head->nuse = le64_to_cpu(id->nuse); - -out_free_id: kfree(id); - - return ret; + return 0; } static ssize_t nuse_show(struct device *dev, struct device_attribute *attr, diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 499bbb0eee8d..722384bcc765 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -7,16 +7,6 @@ #include <linux/vmalloc.h> #include "nvme.h" -int nvme_revalidate_zones(struct nvme_ns *ns) -{ - struct request_queue *q = ns->queue; - - blk_queue_chunk_sectors(q, ns->head->zsze); - blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); - - return blk_revalidate_disk_zones(ns->disk, NULL); -} - static int nvme_set_max_append(struct nvme_ctrl *ctrl) { struct nvme_command c = { }; @@ -45,10 +35,10 @@ static int nvme_set_max_append(struct nvme_ctrl *ctrl) return 0; } -int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) +int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf, + struct queue_limits *lim) { struct nvme_effects_log *log = ns->head->effects; - struct request_queue *q = ns->queue; struct nvme_command c = { }; struct nvme_id_ns_zns *id; int status; @@ -109,10 +99,12 @@ int nvme_update_zone_info(struct nvme_ns *ns, unsigned lbaf) goto free_data; } - disk_set_zoned(ns->disk); - blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); - disk_set_max_open_zones(ns->disk, le32_to_cpu(id->mor) + 1); - disk_set_max_active_zones(ns->disk, le32_to_cpu(id->mar) + 1); + blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, ns->queue); + lim->zoned = 1; + lim->max_open_zones = le32_to_cpu(id->mor) + 1; + lim->max_active_zones = le32_to_cpu(id->mar) + 1; + lim->chunk_sectors = ns->head->zsze; + lim->max_zone_append_sectors = ns->ctrl->max_zone_append; free_data: kfree(id); return status; diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 39cb570f833d..f5b7054a4a05 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -428,7 +428,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) id->cqes = (0x4 << 4) | 0x4; /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ - id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl)); id->nn = cpu_to_le32(NVMET_MAX_NAMESPACES); id->mnan = cpu_to_le32(NVMET_MAX_NAMESPACES); diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 2482a0db2504..77a6e817b315 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -273,6 +273,32 @@ static ssize_t nvmet_param_inline_data_size_store(struct config_item *item, CONFIGFS_ATTR(nvmet_, param_inline_data_size); +static ssize_t nvmet_param_max_queue_size_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", port->max_queue_size); +} + +static ssize_t nvmet_param_max_queue_size_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + int ret; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + ret = kstrtoint(page, 0, &port->max_queue_size); + if (ret) { + pr_err("Invalid value '%s' for max_queue_size\n", page); + return -EINVAL; + } + return count; +} + +CONFIGFS_ATTR(nvmet_, param_max_queue_size); + #ifdef CONFIG_BLK_DEV_INTEGRITY static ssize_t nvmet_param_pi_enable_show(struct config_item *item, char *page) @@ -1859,6 +1885,7 @@ static struct configfs_attribute *nvmet_port_attrs[] = { &nvmet_attr_addr_trtype, &nvmet_attr_addr_tsas, &nvmet_attr_param_inline_data_size, + &nvmet_attr_param_max_queue_size, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_attr_param_pi_enable, #endif @@ -1917,6 +1944,7 @@ static struct config_group *nvmet_ports_make(struct config_group *group, INIT_LIST_HEAD(&port->subsystems); INIT_LIST_HEAD(&port->referrals); port->inline_data_size = -1; /* < 0 == let the transport choose */ + port->max_queue_size = -1; /* < 0 == let the transport choose */ port->disc_addr.portid = cpu_to_le16(portid); port->disc_addr.adrfam = NVMF_ADDR_FAMILY_MAX; diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 8658e9c08534..6bbe4df0166c 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -358,6 +358,18 @@ int nvmet_enable_port(struct nvmet_port *port) if (port->inline_data_size < 0) port->inline_data_size = 0; + /* + * If the transport didn't set the max_queue_size properly, then clamp + * it to the target limits. Also set default values in case the + * transport didn't set it at all. + */ + if (port->max_queue_size < 0) + port->max_queue_size = NVMET_MAX_QUEUE_SIZE; + else + port->max_queue_size = clamp_t(int, port->max_queue_size, + NVMET_MIN_QUEUE_SIZE, + NVMET_MAX_QUEUE_SIZE); + port->enabled = true; port->tr_ops = ops; return 0; @@ -1223,9 +1235,10 @@ static void nvmet_init_cap(struct nvmet_ctrl *ctrl) ctrl->cap |= (15ULL << 24); /* maximum queue entries supported: */ if (ctrl->ops->get_max_queue_size) - ctrl->cap |= ctrl->ops->get_max_queue_size(ctrl) - 1; + ctrl->cap |= min_t(u16, ctrl->ops->get_max_queue_size(ctrl), + ctrl->port->max_queue_size) - 1; else - ctrl->cap |= NVMET_QUEUE_SIZE - 1; + ctrl->cap |= ctrl->port->max_queue_size - 1; if (nvmet_is_passthru_subsys(ctrl->subsys)) nvmet_passthrough_override_cap(ctrl); @@ -1411,6 +1424,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, kref_init(&ctrl->ref); ctrl->subsys = subsys; + ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support; nvmet_init_cap(ctrl); WRITE_ONCE(ctrl->aen_enabled, NVMET_AEN_CFG_OPTIONAL); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index 68e82ccc0e4e..ce54da8c6b36 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -282,7 +282,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) id->lpa = (1 << 2); /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ - id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl)); id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ if (ctrl->ops->flags & NVMF_KEYED_SGLS) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index 9964ffe347d2..b23f4cf840bd 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -157,7 +157,8 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; } - if (sqsize > mqes) { + /* for fabrics, this value applies to only the I/O Submission Queues */ + if (qid && sqsize > mqes) { pr_warn("sqsize %u is larger than MQES supported %u cntlid %d\n", sqsize, mqes, ctrl->cntlid); req->error_loc = offsetof(struct nvmf_connect_command, sqsize); @@ -251,8 +252,6 @@ static void nvmet_execute_admin_connect(struct nvmet_req *req) if (status) goto out; - ctrl->pi_support = ctrl->port->pi_enable && ctrl->subsys->pi_support; - uuid_copy(&ctrl->hostid, &d->hostid); ret = nvmet_setup_auth(ctrl); diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c index 1471af250ea6..913cd2ec7a6f 100644 --- a/drivers/nvme/target/fcloop.c +++ b/drivers/nvme/target/fcloop.c @@ -1556,7 +1556,9 @@ static const struct attribute_group *fcloop_dev_attr_groups[] = { NULL, }; -static struct class *fcloop_class; +static const struct class fcloop_class = { + .name = "fcloop", +}; static struct device *fcloop_device; @@ -1564,15 +1566,14 @@ static int __init fcloop_init(void) { int ret; - fcloop_class = class_create("fcloop"); - if (IS_ERR(fcloop_class)) { + ret = class_register(&fcloop_class); + if (ret) { pr_err("couldn't register class fcloop\n"); - ret = PTR_ERR(fcloop_class); return ret; } fcloop_device = device_create_with_groups( - fcloop_class, NULL, MKDEV(0, 0), NULL, + &fcloop_class, NULL, MKDEV(0, 0), NULL, fcloop_dev_attr_groups, "ctl"); if (IS_ERR(fcloop_device)) { pr_err("couldn't create ctl device!\n"); @@ -1585,7 +1586,7 @@ static int __init fcloop_init(void) return 0; out_destroy_class: - class_destroy(fcloop_class); + class_unregister(&fcloop_class); return ret; } @@ -1643,8 +1644,8 @@ static void __exit fcloop_exit(void) put_device(fcloop_device); - device_destroy(fcloop_class, MKDEV(0, 0)); - class_destroy(fcloop_class); + device_destroy(&fcloop_class, MKDEV(0, 0)); + class_unregister(&fcloop_class); } module_init(fcloop_init); diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h index 33e61b4f478b..f460728e1df1 100644 --- a/drivers/nvme/target/nvmet.h +++ b/drivers/nvme/target/nvmet.h @@ -163,6 +163,7 @@ struct nvmet_port { void *priv; bool enabled; int inline_data_size; + int max_queue_size; const struct nvmet_fabrics_ops *tr_ops; bool pi_enable; }; @@ -543,9 +544,10 @@ void nvmet_subsys_disc_changed(struct nvmet_subsys *subsys, void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, u8 event_info, u8 log_page); -#define NVMET_QUEUE_SIZE 1024 +#define NVMET_MIN_QUEUE_SIZE 16 +#define NVMET_MAX_QUEUE_SIZE 1024 #define NVMET_NR_QUEUES 128 -#define NVMET_MAX_CMD NVMET_QUEUE_SIZE +#define NVMET_MAX_CMD(ctrl) (NVME_CAP_MQES(ctrl->cap) + 1) /* * Nice round number that makes a list of nsids fit into a page. diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index f2d963e1fe94..bb4a69d538fd 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -132,7 +132,7 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) id->sqes = min_t(__u8, ((0x6 << 4) | 0x6), id->sqes); id->cqes = min_t(__u8, ((0x4 << 4) | 0x4), id->cqes); - id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD(ctrl)); /* don't support fuse commands */ id->fuses = 0; diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c index 3a0f2c170f4c..f2bb9d95ecf4 100644 --- a/drivers/nvme/target/rdma.c +++ b/drivers/nvme/target/rdma.c @@ -1956,6 +1956,14 @@ static int nvmet_rdma_add_port(struct nvmet_port *nport) nport->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE; } + if (nport->max_queue_size < 0) { + nport->max_queue_size = NVME_RDMA_DEFAULT_QUEUE_SIZE; + } else if (nport->max_queue_size > NVME_RDMA_MAX_QUEUE_SIZE) { + pr_warn("max_queue_size %u is too large, reducing to %u\n", + nport->max_queue_size, NVME_RDMA_MAX_QUEUE_SIZE); + nport->max_queue_size = NVME_RDMA_MAX_QUEUE_SIZE; + } + ret = inet_pton_with_scope(&init_net, af, nport->disc_addr.traddr, nport->disc_addr.trsvcid, &port->addr); if (ret) { @@ -2015,6 +2023,8 @@ static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl) static u16 nvmet_rdma_get_max_queue_size(const struct nvmet_ctrl *ctrl) { + if (ctrl->pi_support) + return NVME_RDMA_MAX_METADATA_QUEUE_SIZE; return NVME_RDMA_MAX_QUEUE_SIZE; } diff --git a/drivers/nvme/target/zns.c b/drivers/nvme/target/zns.c index 5b5c1e481722..3148d9f1bde6 100644 --- a/drivers/nvme/target/zns.c +++ b/drivers/nvme/target/zns.c @@ -456,8 +456,7 @@ static u16 nvmet_bdev_execute_zmgmt_send_all(struct nvmet_req *req) switch (zsa_req_op(req->cmd->zms.zsa)) { case REQ_OP_ZONE_RESET: ret = blkdev_zone_mgmt(req->ns->bdev, REQ_OP_ZONE_RESET, 0, - get_capacity(req->ns->bdev->bd_disk), - GFP_KERNEL); + get_capacity(req->ns->bdev->bd_disk)); if (ret < 0) return blkdev_zone_mgmt_errno_to_nvme_status(ret); break; @@ -508,7 +507,7 @@ static void nvmet_bdev_zmgmt_send_work(struct work_struct *w) goto out; } - ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors, GFP_KERNEL); + ret = blkdev_zone_mgmt(bdev, op, sect, zone_sectors); if (ret < 0) status = blkdev_zone_mgmt_errno_to_nvme_status(ret); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index c833a7c7d7b2..cead018c3f06 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -8,9 +8,6 @@ * Copyright IBM Corp. 1999, 2009 */ -#define KMSG_COMPONENT "dasd" -#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt - #include <linux/kmod.h> #include <linux/init.h> #include <linux/interrupt.h> @@ -30,9 +27,6 @@ #include <asm/itcw.h> #include <asm/diag.h> -/* This is ugly... */ -#define PRINTK_HEADER "dasd:" - #include "dasd_int.h" /* * SECTION: Constant definitions to be used within this file @@ -313,39 +307,57 @@ static int dasd_state_basic_to_known(struct dasd_device *device) */ static int dasd_state_basic_to_ready(struct dasd_device *device) { - int rc; - struct dasd_block *block; - struct gendisk *disk; + struct dasd_block *block = device->block; + struct queue_limits lim; + int rc = 0; - rc = 0; - block = device->block; /* make disk known with correct capacity */ - if (block) { - if (block->base->discipline->do_analysis != NULL) - rc = block->base->discipline->do_analysis(block); - if (rc) { - if (rc != -EAGAIN) { - device->state = DASD_STATE_UNFMT; - disk = device->block->gdp; - kobject_uevent(&disk_to_dev(disk)->kobj, - KOBJ_CHANGE); - goto out; - } - return rc; - } - if (device->discipline->setup_blk_queue) - device->discipline->setup_blk_queue(block); - set_capacity(block->gdp, - block->blocks << block->s2b_shift); + if (!block) { device->state = DASD_STATE_READY; - rc = dasd_scan_partitions(block); - if (rc) { - device->state = DASD_STATE_BASIC; + goto out; + } + + if (block->base->discipline->do_analysis != NULL) + rc = block->base->discipline->do_analysis(block); + if (rc) { + if (rc == -EAGAIN) return rc; - } - } else { - device->state = DASD_STATE_READY; + device->state = DASD_STATE_UNFMT; + kobject_uevent(&disk_to_dev(device->block->gdp)->kobj, + KOBJ_CHANGE); + goto out; + } + + lim = queue_limits_start_update(block->gdp->queue); + lim.max_dev_sectors = device->discipline->max_sectors(block); + lim.max_hw_sectors = lim.max_dev_sectors; + lim.logical_block_size = block->bp_block; + + if (device->discipline->has_discard) { + unsigned int max_bytes; + + lim.discard_granularity = block->bp_block; + + /* Calculate max_discard_sectors and make it PAGE aligned */ + max_bytes = USHRT_MAX * block->bp_block; + max_bytes = ALIGN_DOWN(max_bytes, PAGE_SIZE); + + lim.max_hw_discard_sectors = max_bytes / block->bp_block; + lim.max_write_zeroes_sectors = lim.max_hw_discard_sectors; } + rc = queue_limits_commit_update(block->gdp->queue, &lim); + if (rc) + return rc; + + set_capacity(block->gdp, block->blocks << block->s2b_shift); + device->state = DASD_STATE_READY; + + rc = dasd_scan_partitions(block); + if (rc) { + device->state = DASD_STATE_BASIC; + return rc; + } + out: if (device->discipline->basic_to_ready) rc = device->discipline->basic_to_ready(device); @@ -1301,7 +1313,6 @@ int dasd_term_IO(struct dasd_ccw_req *cqr) { struct dasd_device *device; int retries, rc; - char errorstring[ERRORLENGTH]; /* Check the cqr */ rc = dasd_check_cqr(cqr); @@ -1340,10 +1351,8 @@ int dasd_term_IO(struct dasd_ccw_req *cqr) rc = 0; break; default: - /* internal error 10 - unknown rc*/ - snprintf(errorstring, ERRORLENGTH, "10 %d", rc); - dev_err(&device->cdev->dev, "An error occurred in the " - "DASD device driver, reason=%s\n", errorstring); + dev_err(&device->cdev->dev, + "Unexpected error during request termination %d\n", rc); BUG(); break; } @@ -1362,7 +1371,6 @@ int dasd_start_IO(struct dasd_ccw_req *cqr) { struct dasd_device *device; int rc; - char errorstring[ERRORLENGTH]; /* Check the cqr */ rc = dasd_check_cqr(cqr); @@ -1382,10 +1390,8 @@ int dasd_start_IO(struct dasd_ccw_req *cqr) return -EPERM; } if (cqr->retries < 0) { - /* internal error 14 - start_IO run out of retries */ - sprintf(errorstring, "14 %p", cqr); - dev_err(&device->cdev->dev, "An error occurred in the DASD " - "device driver, reason=%s\n", errorstring); + dev_err(&device->cdev->dev, + "Start I/O ran out of retries\n"); cqr->status = DASD_CQR_ERROR; return -EIO; } @@ -1463,11 +1469,8 @@ int dasd_start_IO(struct dasd_ccw_req *cqr) "not accessible"); break; default: - /* internal error 11 - unknown rc */ - snprintf(errorstring, ERRORLENGTH, "11 %d", rc); dev_err(&device->cdev->dev, - "An error occurred in the DASD device driver, " - "reason=%s\n", errorstring); + "Unexpected error during request start %d", rc); BUG(); break; } @@ -1904,8 +1907,6 @@ static void __dasd_device_process_ccw_queue(struct dasd_device *device, static void __dasd_process_cqr(struct dasd_device *device, struct dasd_ccw_req *cqr) { - char errorstring[ERRORLENGTH]; - switch (cqr->status) { case DASD_CQR_SUCCESS: cqr->status = DASD_CQR_DONE; @@ -1917,11 +1918,8 @@ static void __dasd_process_cqr(struct dasd_device *device, cqr->status = DASD_CQR_TERMINATED; break; default: - /* internal error 12 - wrong cqr status*/ - snprintf(errorstring, ERRORLENGTH, "12 %p %x02", cqr, cqr->status); dev_err(&device->cdev->dev, - "An error occurred in the DASD device driver, " - "reason=%s\n", errorstring); + "Unexpected CQR status %02x", cqr->status); BUG(); } if (cqr->callback) @@ -1986,16 +1984,14 @@ static void __dasd_device_check_expire(struct dasd_device *device) if (device->discipline->term_IO(cqr) != 0) { /* Hmpf, try again in 5 sec */ dev_err(&device->cdev->dev, - "cqr %p timed out (%lus) but cannot be " - "ended, retrying in 5 s\n", - cqr, (cqr->expires/HZ)); + "CQR timed out (%lus) but cannot be ended, retrying in 5s\n", + (cqr->expires / HZ)); cqr->expires += 5*HZ; dasd_device_set_timer(device, 5*HZ); } else { dev_err(&device->cdev->dev, - "cqr %p timed out (%lus), %i retries " - "remaining\n", cqr, (cqr->expires/HZ), - cqr->retries); + "CQR timed out (%lus), %i retries remaining\n", + (cqr->expires / HZ), cqr->retries); } __dasd_device_check_autoquiesce_timeout(device, cqr); } @@ -2116,8 +2112,7 @@ int dasd_flush_device_queue(struct dasd_device *device) if (rc) { /* unable to terminate requeust */ dev_err(&device->cdev->dev, - "Flushing the DASD request queue " - "failed for request %p\n", cqr); + "Flushing the DASD request queue failed\n"); /* stop flush processing */ goto finished; } @@ -2633,8 +2628,7 @@ static int __dasd_cancel_req(struct dasd_ccw_req *cqr) rc = device->discipline->term_IO(cqr); if (rc) { dev_err(&device->cdev->dev, - "Cancelling request %p failed with rc=%d\n", - cqr, rc); + "Cancelling request failed with rc=%d\n", rc); } else { cqr->stopclk = get_tod_clock(); } @@ -3402,8 +3396,7 @@ static void dasd_generic_auto_online(void *data, async_cookie_t cookie) ret = ccw_device_set_online(cdev); if (ret) - pr_warn("%s: Setting the DASD online failed with rc=%d\n", - dev_name(&cdev->dev), ret); + dev_warn(&cdev->dev, "Setting the DASD online failed with rc=%d\n", ret); } /* @@ -3490,8 +3483,11 @@ int dasd_generic_set_online(struct ccw_device *cdev, { struct dasd_discipline *discipline; struct dasd_device *device; + struct device *dev; int rc; + dev = &cdev->dev; + /* first online clears initial online feature flag */ dasd_set_feature(cdev, DASD_FEATURE_INITIAL_ONLINE, 0); device = dasd_create_device(cdev); @@ -3504,11 +3500,10 @@ int dasd_generic_set_online(struct ccw_device *cdev, /* Try to load the required module. */ rc = request_module(DASD_DIAG_MOD); if (rc) { - pr_warn("%s Setting the DASD online failed " - "because the required module %s " - "could not be loaded (rc=%d)\n", - dev_name(&cdev->dev), DASD_DIAG_MOD, - rc); + dev_warn(dev, "Setting the DASD online failed " + "because the required module %s " + "could not be loaded (rc=%d)\n", + DASD_DIAG_MOD, rc); dasd_delete_device(device); return -ENODEV; } @@ -3516,8 +3511,7 @@ int dasd_generic_set_online(struct ccw_device *cdev, /* Module init could have failed, so check again here after * request_module(). */ if (!dasd_diag_discipline_pointer) { - pr_warn("%s Setting the DASD online failed because of missing DIAG discipline\n", - dev_name(&cdev->dev)); + dev_warn(dev, "Setting the DASD online failed because of missing DIAG discipline\n"); dasd_delete_device(device); return -ENODEV; } @@ -3527,37 +3521,33 @@ int dasd_generic_set_online(struct ccw_device *cdev, dasd_delete_device(device); return -EINVAL; } + device->base_discipline = base_discipline; if (!try_module_get(discipline->owner)) { - module_put(base_discipline->owner); dasd_delete_device(device); return -EINVAL; } - device->base_discipline = base_discipline; device->discipline = discipline; /* check_device will allocate block device if necessary */ rc = discipline->check_device(device); if (rc) { - pr_warn("%s Setting the DASD online with discipline %s failed with rc=%i\n", - dev_name(&cdev->dev), discipline->name, rc); - module_put(discipline->owner); - module_put(base_discipline->owner); + dev_warn(dev, "Setting the DASD online with discipline %s failed with rc=%i\n", + discipline->name, rc); dasd_delete_device(device); return rc; } dasd_set_target_state(device, DASD_STATE_ONLINE); if (device->state <= DASD_STATE_KNOWN) { - pr_warn("%s Setting the DASD online failed because of a missing discipline\n", - dev_name(&cdev->dev)); + dev_warn(dev, "Setting the DASD online failed because of a missing discipline\n"); rc = -ENODEV; dasd_set_target_state(device, DASD_STATE_NEW); if (device->block) dasd_free_block(device->block); dasd_delete_device(device); - } else - pr_debug("dasd_generic device %s found\n", - dev_name(&cdev->dev)); + } else { + dev_dbg(dev, "dasd_generic device found\n"); + } wait_event(dasd_init_waitq, _wait_for_device(device)); @@ -3568,10 +3558,13 @@ EXPORT_SYMBOL_GPL(dasd_generic_set_online); int dasd_generic_set_offline(struct ccw_device *cdev) { + int max_count, open_count, rc; struct dasd_device *device; struct dasd_block *block; - int max_count, open_count, rc; unsigned long flags; + struct device *dev; + + dev = &cdev->dev; rc = 0; spin_lock_irqsave(get_ccwdev_lock(cdev), flags); @@ -3592,11 +3585,10 @@ int dasd_generic_set_offline(struct ccw_device *cdev) open_count = atomic_read(&device->block->open_count); if (open_count > max_count) { if (open_count > 0) - pr_warn("%s: The DASD cannot be set offline with open count %i\n", - dev_name(&cdev->dev), open_count); + dev_warn(dev, "The DASD cannot be set offline with open count %i\n", + open_count); else - pr_warn("%s: The DASD cannot be set offline while it is in use\n", - dev_name(&cdev->dev)); + dev_warn(dev, "The DASD cannot be set offline while it is in use\n"); rc = -EBUSY; goto out_err; } @@ -3956,8 +3948,8 @@ static int dasd_handle_autoquiesce(struct dasd_device *device, if (dasd_eer_enabled(device)) dasd_eer_write(device, NULL, DASD_EER_AUTOQUIESCE); - pr_info("%s: The DASD has been put in the quiesce state\n", - dev_name(&device->cdev->dev)); + dev_info(&device->cdev->dev, + "The DASD has been put in the quiesce state\n"); dasd_device_set_stop_bits(device, DASD_STOPPED_QUIESCE); if (device->features & DASD_FEATURE_REQUEUEQUIESCE) @@ -3977,10 +3969,8 @@ static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device, NULL); if (IS_ERR(cqr)) { - /* internal error 13 - Allocating the RDC request failed*/ - dev_err(&device->cdev->dev, - "An error occurred in the DASD device driver, " - "reason=%s\n", "13"); + DBF_EVENT_DEVID(DBF_WARNING, device->cdev, "%s", + "Could not allocate RDC request"); return cqr; } diff --git a/drivers/s390/block/dasd_3990_erp.c b/drivers/s390/block/dasd_3990_erp.c index 89957bb7244d..459b7f8ac883 100644 --- a/drivers/s390/block/dasd_3990_erp.c +++ b/drivers/s390/block/dasd_3990_erp.c @@ -7,13 +7,9 @@ * */ -#define KMSG_COMPONENT "dasd-eckd" - #include <linux/timer.h> #include <asm/idals.h> -#define PRINTK_HEADER "dasd_erp(3990): " - #include "dasd_int.h" #include "dasd_eckd.h" @@ -398,7 +394,6 @@ dasd_3990_handle_env_data(struct dasd_ccw_req * erp, char *sense) struct dasd_device *device = erp->startdev; char msg_format = (sense[7] & 0xF0); char msg_no = (sense[7] & 0x0F); - char errorstring[ERRORLENGTH]; switch (msg_format) { case 0x00: /* Format 0 - Program or System Checks */ @@ -1004,12 +999,9 @@ dasd_3990_handle_env_data(struct dasd_ccw_req * erp, char *sense) } break; - default: /* unknown message format - should not happen - internal error 03 - unknown message format */ - snprintf(errorstring, ERRORLENGTH, "03 %x02", msg_format); + default: dev_err(&device->cdev->dev, - "An error occurred in the DASD device driver, " - "reason=%s\n", errorstring); + "Unknown message format %02x", msg_format); break; } /* end switch message format */ @@ -1056,11 +1048,9 @@ dasd_3990_erp_com_rej(struct dasd_ccw_req * erp, char *sense) set_bit(DASD_CQR_SUPPRESS_CR, &erp->refers->flags); erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); } else { - /* fatal error - set status to FAILED - internal error 09 - Command Reject */ if (!test_bit(DASD_CQR_SUPPRESS_CR, &erp->flags)) dev_err(&device->cdev->dev, - "An error occurred in the DASD device driver, reason=09\n"); + "An I/O command request was rejected\n"); erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); } @@ -1128,13 +1118,7 @@ dasd_3990_erp_equip_check(struct dasd_ccw_req * erp, char *sense) erp->function = dasd_3990_erp_equip_check; if (sense[1] & SNS1_WRITE_INHIBITED) { - dev_info(&device->cdev->dev, - "Write inhibited path encountered\n"); - - /* vary path offline - internal error 04 - Path should be varied off-line.*/ - dev_err(&device->cdev->dev, "An error occurred in the DASD " - "device driver, reason=%s\n", "04"); + dev_err(&device->cdev->dev, "Write inhibited path encountered\n"); erp = dasd_3990_erp_action_1(erp); @@ -1285,11 +1269,7 @@ dasd_3990_erp_inv_format(struct dasd_ccw_req * erp, char *sense) erp = dasd_3990_erp_action_4(erp, sense); } else { - /* internal error 06 - The track format is not valid*/ - dev_err(&device->cdev->dev, - "An error occurred in the DASD device driver, " - "reason=%s\n", "06"); - + dev_err(&device->cdev->dev, "Track format is not valid\n"); erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); } @@ -1663,9 +1643,8 @@ dasd_3990_erp_action_1B_32(struct dasd_ccw_req * default_erp, char *sense) sizeof(struct LO_eckd_data), device); if (IS_ERR(erp)) { - /* internal error 01 - Unable to allocate ERP */ - dev_err(&device->cdev->dev, "An error occurred in the DASD " - "device driver, reason=%s\n", "01"); + DBF_DEV_EVENT(DBF_ERR, device, "%s", + "Unable to allocate ERP request (1B 32)"); return dasd_3990_erp_cleanup(default_erp, DASD_CQR_FAILED); } @@ -1807,10 +1786,8 @@ dasd_3990_update_1B(struct dasd_ccw_req * previous_erp, char *sense) cpa = previous_erp->irb.scsw.cmd.cpa; if (cpa == 0) { - /* internal error 02 - - Unable to determine address of the CCW to be restarted */ - dev_err(&device->cdev->dev, "An error occurred in the DASD " - "device driver, reason=%s\n", "02"); + dev_err(&device->cdev->dev, + "Unable to determine address of to be restarted CCW\n"); previous_erp->status = DASD_CQR_FAILED; @@ -2009,15 +1986,9 @@ dasd_3990_erp_compound_config(struct dasd_ccw_req * erp, char *sense) { if ((sense[25] & DASD_SENSE_BIT_1) && (sense[26] & DASD_SENSE_BIT_2)) { - - /* set to suspended duplex state then restart - internal error 05 - Set device to suspended duplex state - should be done */ struct dasd_device *device = erp->startdev; dev_err(&device->cdev->dev, - "An error occurred in the DASD device driver, " - "reason=%s\n", "05"); - + "Compound configuration error occurred\n"); } erp->function = dasd_3990_erp_compound_config; @@ -2153,10 +2124,9 @@ dasd_3990_erp_inspect_32(struct dasd_ccw_req * erp, char *sense) erp = dasd_3990_erp_int_req(erp); break; - case 0x0F: /* length mismatch during update write command - internal error 08 - update write command error*/ - dev_err(&device->cdev->dev, "An error occurred in the " - "DASD device driver, reason=%s\n", "08"); + case 0x0F: + dev_err(&device->cdev->dev, + "Update write command error occurred\n"); erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); break; @@ -2165,12 +2135,9 @@ dasd_3990_erp_inspect_32(struct dasd_ccw_req * erp, char *sense) erp = dasd_3990_erp_action_10_32(erp, sense); break; - case 0x15: /* next track outside defined extend - internal error 07 - The next track is not - within the defined storage extent */ + case 0x15: dev_err(&device->cdev->dev, - "An error occurred in the DASD device driver, " - "reason=%s\n", "07"); + "Track outside defined extent error occurred\n"); erp = dasd_3990_erp_cleanup(erp, DASD_CQR_FAILED); break; @@ -2663,7 +2630,7 @@ dasd_3990_erp_further_erp(struct dasd_ccw_req *erp) * necessary */ dev_err(&device->cdev->dev, - "ERP %p has run out of retries and failed\n", erp); + "ERP %px has run out of retries and failed\n", erp); erp->status = DASD_CQR_FAILED; } @@ -2704,8 +2671,7 @@ dasd_3990_erp_handle_match_erp(struct dasd_ccw_req *erp_head, while (erp_done != erp) { if (erp_done == NULL) /* end of chain reached */ - panic(PRINTK_HEADER "Programming error in ERP! The " - "original request was lost\n"); + panic("Programming error in ERP! The original request was lost\n"); /* remove the request from the device queue */ list_del(&erp_done->blocklist); @@ -2786,11 +2752,9 @@ dasd_3990_erp_action(struct dasd_ccw_req * cqr) "ERP chain at BEGINNING of ERP-ACTION\n"); for (temp_erp = cqr; temp_erp != NULL; temp_erp = temp_erp->refers) { - dev_err(&device->cdev->dev, - "ERP %p (%02x) refers to %p\n", - temp_erp, temp_erp->status, - temp_erp->refers); + "ERP %px (%02x) refers to %px\n", + temp_erp, temp_erp->status, temp_erp->refers); } } @@ -2837,11 +2801,9 @@ dasd_3990_erp_action(struct dasd_ccw_req * cqr) "ERP chain at END of ERP-ACTION\n"); for (temp_erp = erp; temp_erp != NULL; temp_erp = temp_erp->refers) { - dev_err(&device->cdev->dev, - "ERP %p (%02x) refers to %p\n", - temp_erp, temp_erp->status, - temp_erp->refers); + "ERP %px (%02x) refers to %px\n", + temp_erp, temp_erp->status, temp_erp->refers); } } diff --git a/drivers/s390/block/dasd_alias.c b/drivers/s390/block/dasd_alias.c index c9740ae88d1a..e84cd5436556 100644 --- a/drivers/s390/block/dasd_alias.c +++ b/drivers/s390/block/dasd_alias.c @@ -6,20 +6,12 @@ * Author(s): Stefan Weinhuber <wein@de.ibm.com> */ -#define KMSG_COMPONENT "dasd-eckd" - #include <linux/list.h> #include <linux/slab.h> #include <asm/ebcdic.h> #include "dasd_int.h" #include "dasd_eckd.h" -#ifdef PRINTK_HEADER -#undef PRINTK_HEADER -#endif /* PRINTK_HEADER */ -#define PRINTK_HEADER "dasd(eckd):" - - /* * General concept of alias management: * - PAV and DASD alias management is specific to the eckd discipline. diff --git a/drivers/s390/block/dasd_devmap.c b/drivers/s390/block/dasd_devmap.c index c4e36650c426..0316c20823ee 100644 --- a/drivers/s390/block/dasd_devmap.c +++ b/drivers/s390/block/dasd_devmap.c @@ -13,8 +13,6 @@ * */ -#define KMSG_COMPONENT "dasd" - #include <linux/ctype.h> #include <linux/init.h> #include <linux/module.h> @@ -24,8 +22,6 @@ #include <linux/uaccess.h> #include <asm/ipl.h> -/* This is ugly... */ -#define PRINTK_HEADER "dasd_devmap:" #define DASD_MAX_PARAMS 256 #include "dasd_int.h" @@ -1114,7 +1110,7 @@ dasd_use_diag_show(struct device *dev, struct device_attribute *attr, char *buf) use_diag = (devmap->features & DASD_FEATURE_USEDIAG) != 0; else use_diag = (DASD_FEATURE_DEFAULT & DASD_FEATURE_USEDIAG) != 0; - return sprintf(buf, use_diag ? "1\n" : "0\n"); + return sysfs_emit(buf, use_diag ? "1\n" : "0\n"); } static ssize_t @@ -1163,7 +1159,7 @@ dasd_use_raw_show(struct device *dev, struct device_attribute *attr, char *buf) use_raw = (devmap->features & DASD_FEATURE_USERAW) != 0; else use_raw = (DASD_FEATURE_DEFAULT & DASD_FEATURE_USERAW) != 0; - return sprintf(buf, use_raw ? "1\n" : "0\n"); + return sysfs_emit(buf, use_raw ? "1\n" : "0\n"); } static ssize_t @@ -1259,7 +1255,7 @@ dasd_access_show(struct device *dev, struct device_attribute *attr, if (count < 0) return count; - return sprintf(buf, "%d\n", count); + return sysfs_emit(buf, "%d\n", count); } static DEVICE_ATTR(host_access_count, 0444, dasd_access_show, NULL); @@ -1338,19 +1334,19 @@ static ssize_t dasd_alias_show(struct device *dev, device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device)) - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); if (device->discipline && device->discipline->get_uid && !device->discipline->get_uid(device, &uid)) { if (uid.type == UA_BASE_PAV_ALIAS || uid.type == UA_HYPER_PAV_ALIAS) { dasd_put_device(device); - return sprintf(buf, "1\n"); + return sysfs_emit(buf, "1\n"); } } dasd_put_device(device); - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); } static DEVICE_ATTR(alias, 0444, dasd_alias_show, NULL); @@ -1412,15 +1408,9 @@ dasd_uid_show(struct device *dev, struct device_attribute *attr, char *buf) break; } - if (strlen(uid.vduit) > 0) - snprintf(uid_string, sizeof(uid_string), - "%s.%s.%04x.%s.%s", - uid.vendor, uid.serial, uid.ssid, ua_string, - uid.vduit); - else - snprintf(uid_string, sizeof(uid_string), - "%s.%s.%04x.%s", - uid.vendor, uid.serial, uid.ssid, ua_string); + snprintf(uid_string, sizeof(uid_string), "%s.%s.%04x.%s%s%s", + uid.vendor, uid.serial, uid.ssid, ua_string, + uid.vduit[0] ? "." : "", uid.vduit); } dasd_put_device(device); @@ -1862,7 +1852,7 @@ static ssize_t dasd_pm_show(struct device *dev, device = dasd_device_from_cdev(to_ccwdev(dev)); if (IS_ERR(device)) - return sprintf(buf, "0\n"); + return sysfs_emit(buf, "0\n"); opm = dasd_path_get_opm(device); nppm = dasd_path_get_nppm(device); @@ -1872,8 +1862,8 @@ static ssize_t dasd_pm_show(struct device *dev, ifccpm = dasd_path_get_ifccpm(device); dasd_put_device(device); - return sprintf(buf, "%02x %02x %02x %02x %02x %02x\n", opm, nppm, - cablepm, cuirpm, hpfpm, ifccpm); + return sysfs_emit(buf, "%02x %02x %02x %02x %02x %02x\n", opm, nppm, + cablepm, cuirpm, hpfpm, ifccpm); } static DEVICE_ATTR(path_masks, 0444, dasd_pm_show, NULL); diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index 2e4e555b37c3..ea4b1d01bb76 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -8,8 +8,6 @@ * */ -#define KMSG_COMPONENT "dasd" - #include <linux/kernel_stat.h> #include <linux/stddef.h> #include <linux/kernel.h> @@ -31,8 +29,6 @@ #include "dasd_int.h" #include "dasd_diag.h" -#define PRINTK_HEADER "dasd(diag):" - MODULE_LICENSE("GPL"); /* The maximum number of blocks per request (max_blocks) is dependent on the @@ -621,25 +617,9 @@ dasd_diag_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, "dump sense not available for DIAG data"); } -/* - * Initialize block layer request queue. - */ -static void dasd_diag_setup_blk_queue(struct dasd_block *block) +static unsigned int dasd_diag_max_sectors(struct dasd_block *block) { - unsigned int logical_block_size = block->bp_block; - struct request_queue *q = block->gdp->queue; - int max; - - max = DIAG_MAX_BLOCKS << block->s2b_shift; - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - q->limits.max_dev_sectors = max; - blk_queue_logical_block_size(q, logical_block_size); - blk_queue_max_hw_sectors(q, max); - blk_queue_max_segments(q, USHRT_MAX); - /* With page sized segments each segment can be translated into one idaw/tidaw */ - blk_queue_max_segment_size(q, PAGE_SIZE); - blk_queue_segment_boundary(q, PAGE_SIZE - 1); - blk_queue_dma_alignment(q, PAGE_SIZE - 1); + return DIAG_MAX_BLOCKS << block->s2b_shift; } static int dasd_diag_pe_handler(struct dasd_device *device, @@ -652,10 +632,10 @@ static struct dasd_discipline dasd_diag_discipline = { .owner = THIS_MODULE, .name = "DIAG", .ebcname = "DIAG", + .max_sectors = dasd_diag_max_sectors, .check_device = dasd_diag_check_device, .pe_handler = dasd_diag_pe_handler, .fill_geometry = dasd_diag_fill_geometry, - .setup_blk_queue = dasd_diag_setup_blk_queue, .start_IO = dasd_start_diag, .term_IO = dasd_diag_term_IO, .handle_terminated_request = dasd_diag_handle_terminated_request, diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index bd89b032968a..373c1a86c33e 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -10,8 +10,6 @@ * Author.........: Nigel Hislop <hislop_nigel@emc.com> */ -#define KMSG_COMPONENT "dasd-eckd" - #include <linux/stddef.h> #include <linux/kernel.h> #include <linux/slab.h> @@ -37,11 +35,6 @@ #include "dasd_int.h" #include "dasd_eckd.h" -#ifdef PRINTK_HEADER -#undef PRINTK_HEADER -#endif /* PRINTK_HEADER */ -#define PRINTK_HEADER "dasd(eckd):" - /* * raw track access always map to 64k in memory * so it maps to 16 blocks of 4k per track @@ -1072,22 +1065,14 @@ static void dasd_eckd_read_fc_security(struct dasd_device *device) } } -static void dasd_eckd_get_uid_string(struct dasd_conf *conf, - char *print_uid) +static void dasd_eckd_get_uid_string(struct dasd_conf *conf, char *print_uid) { struct dasd_uid uid; create_uid(conf, &uid); - if (strlen(uid.vduit) > 0) - snprintf(print_uid, DASD_UID_STRLEN, - "%s.%s.%04x.%02x.%s", - uid.vendor, uid.serial, uid.ssid, - uid.real_unit_addr, uid.vduit); - else - snprintf(print_uid, DASD_UID_STRLEN, - "%s.%s.%04x.%02x", - uid.vendor, uid.serial, uid.ssid, - uid.real_unit_addr); + snprintf(print_uid, DASD_UID_STRLEN, "%s.%s.%04x.%02x%s%s", + uid.vendor, uid.serial, uid.ssid, uid.real_unit_addr, + uid.vduit[0] ? "." : "", uid.vduit); } static int dasd_eckd_check_cabling(struct dasd_device *device, @@ -5529,15 +5514,15 @@ dasd_eckd_ioctl(struct dasd_block *block, unsigned int cmd, void __user *argp) * and return number of printed chars. */ static void -dasd_eckd_dump_ccw_range(struct ccw1 *from, struct ccw1 *to, char *page) +dasd_eckd_dump_ccw_range(struct dasd_device *device, struct ccw1 *from, + struct ccw1 *to, char *page) { int len, count; char *datap; len = 0; while (from <= to) { - len += sprintf(page + len, PRINTK_HEADER - " CCW %p: %08X %08X DAT:", + len += sprintf(page + len, "CCW %px: %08X %08X DAT:", from, ((int *) from)[0], ((int *) from)[1]); /* get pointer to data (consider IDALs) */ @@ -5560,7 +5545,7 @@ dasd_eckd_dump_ccw_range(struct ccw1 *from, struct ccw1 *to, char *page) from++; } if (len > 0) - printk(KERN_ERR "%s", page); + dev_err(&device->cdev->dev, "%s", page); } static void @@ -5591,9 +5576,12 @@ dasd_eckd_dump_sense_dbf(struct dasd_device *device, struct irb *irb, static void dasd_eckd_dump_sense_ccw(struct dasd_device *device, struct dasd_ccw_req *req, struct irb *irb) { - char *page; struct ccw1 *first, *last, *fail, *from, *to; + struct device *dev; int len, sl, sct; + char *page; + + dev = &device->cdev->dev; page = (char *) get_zeroed_page(GFP_ATOMIC); if (page == NULL) { @@ -5602,24 +5590,18 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device, return; } /* dump the sense data */ - len = sprintf(page, PRINTK_HEADER - " I/O status report for device %s:\n", - dev_name(&device->cdev->dev)); - len += sprintf(page + len, PRINTK_HEADER - " in req: %p CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X " - "CS:%02X RC:%d\n", + len = sprintf(page, "I/O status report:\n"); + len += sprintf(page + len, + "in req: %px CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X CS:%02X RC:%d\n", req, scsw_cc(&irb->scsw), scsw_fctl(&irb->scsw), scsw_actl(&irb->scsw), scsw_stctl(&irb->scsw), scsw_dstat(&irb->scsw), scsw_cstat(&irb->scsw), req ? req->intrc : 0); - len += sprintf(page + len, PRINTK_HEADER - " device %s: Failing CCW: %p\n", - dev_name(&device->cdev->dev), + len += sprintf(page + len, "Failing CCW: %px\n", phys_to_virt(irb->scsw.cmd.cpa)); if (irb->esw.esw0.erw.cons) { for (sl = 0; sl < 4; sl++) { - len += sprintf(page + len, PRINTK_HEADER - " Sense(hex) %2d-%2d:", + len += sprintf(page + len, "Sense(hex) %2d-%2d:", (8 * sl), ((8 * sl) + 7)); for (sct = 0; sct < 8; sct++) { @@ -5631,23 +5613,20 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device, if (irb->ecw[27] & DASD_SENSE_BIT_0) { /* 24 Byte Sense Data */ - sprintf(page + len, PRINTK_HEADER - " 24 Byte: %x MSG %x, " - "%s MSGb to SYSOP\n", + sprintf(page + len, + "24 Byte: %x MSG %x, %s MSGb to SYSOP\n", irb->ecw[7] >> 4, irb->ecw[7] & 0x0f, irb->ecw[1] & 0x10 ? "" : "no"); } else { /* 32 Byte Sense Data */ - sprintf(page + len, PRINTK_HEADER - " 32 Byte: Format: %x " - "Exception class %x\n", + sprintf(page + len, + "32 Byte: Format: %x Exception class %x\n", irb->ecw[6] & 0x0f, irb->ecw[22] >> 4); } } else { - sprintf(page + len, PRINTK_HEADER - " SORRY - NO VALID SENSE AVAILABLE\n"); + sprintf(page + len, "SORRY - NO VALID SENSE AVAILABLE\n"); } - printk(KERN_ERR "%s", page); + dev_err(dev, "%s", page); if (req) { /* req == NULL for unsolicited interrupts */ @@ -5656,8 +5635,8 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device, first = req->cpaddr; for (last = first; last->flags & (CCW_FLAG_CC | CCW_FLAG_DC); last++); to = min(first + 6, last); - printk(KERN_ERR PRINTK_HEADER " Related CP in req: %p\n", req); - dasd_eckd_dump_ccw_range(first, to, page); + dev_err(dev, "Related CP in req: %px\n", req); + dasd_eckd_dump_ccw_range(device, first, to, page); /* print failing CCW area (maximum 4) */ /* scsw->cda is either valid or zero */ @@ -5665,19 +5644,19 @@ static void dasd_eckd_dump_sense_ccw(struct dasd_device *device, fail = phys_to_virt(irb->scsw.cmd.cpa); /* failing CCW */ if (from < fail - 2) { from = fail - 2; /* there is a gap - print header */ - printk(KERN_ERR PRINTK_HEADER "......\n"); + dev_err(dev, "......\n"); } to = min(fail + 1, last); - dasd_eckd_dump_ccw_range(from, to, page + len); + dasd_eckd_dump_ccw_range(device, from, to, page + len); /* print last CCWs (maximum 2) */ len = 0; from = max(from, ++to); if (from < last - 1) { from = last - 1; /* there is a gap - print header */ - printk(KERN_ERR PRINTK_HEADER "......\n"); + dev_err(dev, "......\n"); } - dasd_eckd_dump_ccw_range(from, last, page + len); + dasd_eckd_dump_ccw_range(device, from, last, page + len); } free_page((unsigned long) page); } @@ -5701,11 +5680,9 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device, return; } /* dump the sense data */ - len = sprintf(page, PRINTK_HEADER - " I/O status report for device %s:\n", - dev_name(&device->cdev->dev)); - len += sprintf(page + len, PRINTK_HEADER - " in req: %p CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X " + len = sprintf(page, "I/O status report:\n"); + len += sprintf(page + len, + "in req: %px CC:%02X FC:%02X AC:%02X SC:%02X DS:%02X " "CS:%02X fcxs:%02X schxs:%02X RC:%d\n", req, scsw_cc(&irb->scsw), scsw_fctl(&irb->scsw), scsw_actl(&irb->scsw), scsw_stctl(&irb->scsw), @@ -5713,9 +5690,7 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device, irb->scsw.tm.fcxs, (irb->scsw.tm.ifob << 7) | irb->scsw.tm.sesq, req ? req->intrc : 0); - len += sprintf(page + len, PRINTK_HEADER - " device %s: Failing TCW: %p\n", - dev_name(&device->cdev->dev), + len += sprintf(page + len, "Failing TCW: %px\n", phys_to_virt(irb->scsw.tm.tcw)); tsb = NULL; @@ -5724,47 +5699,37 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device, tsb = tcw_get_tsb(phys_to_virt(irb->scsw.tm.tcw)); if (tsb) { - len += sprintf(page + len, PRINTK_HEADER - " tsb->length %d\n", tsb->length); - len += sprintf(page + len, PRINTK_HEADER - " tsb->flags %x\n", tsb->flags); - len += sprintf(page + len, PRINTK_HEADER - " tsb->dcw_offset %d\n", tsb->dcw_offset); - len += sprintf(page + len, PRINTK_HEADER - " tsb->count %d\n", tsb->count); + len += sprintf(page + len, "tsb->length %d\n", tsb->length); + len += sprintf(page + len, "tsb->flags %x\n", tsb->flags); + len += sprintf(page + len, "tsb->dcw_offset %d\n", tsb->dcw_offset); + len += sprintf(page + len, "tsb->count %d\n", tsb->count); residual = tsb->count - 28; - len += sprintf(page + len, PRINTK_HEADER - " residual %d\n", residual); + len += sprintf(page + len, "residual %d\n", residual); switch (tsb->flags & 0x07) { case 1: /* tsa_iostat */ - len += sprintf(page + len, PRINTK_HEADER - " tsb->tsa.iostat.dev_time %d\n", + len += sprintf(page + len, "tsb->tsa.iostat.dev_time %d\n", tsb->tsa.iostat.dev_time); - len += sprintf(page + len, PRINTK_HEADER - " tsb->tsa.iostat.def_time %d\n", + len += sprintf(page + len, "tsb->tsa.iostat.def_time %d\n", tsb->tsa.iostat.def_time); - len += sprintf(page + len, PRINTK_HEADER - " tsb->tsa.iostat.queue_time %d\n", + len += sprintf(page + len, "tsb->tsa.iostat.queue_time %d\n", tsb->tsa.iostat.queue_time); - len += sprintf(page + len, PRINTK_HEADER - " tsb->tsa.iostat.dev_busy_time %d\n", + len += sprintf(page + len, "tsb->tsa.iostat.dev_busy_time %d\n", tsb->tsa.iostat.dev_busy_time); - len += sprintf(page + len, PRINTK_HEADER - " tsb->tsa.iostat.dev_act_time %d\n", + len += sprintf(page + len, "tsb->tsa.iostat.dev_act_time %d\n", tsb->tsa.iostat.dev_act_time); sense = tsb->tsa.iostat.sense; break; case 2: /* ts_ddpc */ - len += sprintf(page + len, PRINTK_HEADER - " tsb->tsa.ddpc.rc %d\n", tsb->tsa.ddpc.rc); + len += sprintf(page + len, "tsb->tsa.ddpc.rc %d\n", + tsb->tsa.ddpc.rc); for (sl = 0; sl < 2; sl++) { - len += sprintf(page + len, PRINTK_HEADER - " tsb->tsa.ddpc.rcq %2d-%2d: ", + len += sprintf(page + len, + "tsb->tsa.ddpc.rcq %2d-%2d: ", (8 * sl), ((8 * sl) + 7)); rcq = tsb->tsa.ddpc.rcq; for (sct = 0; sct < 8; sct++) { - len += sprintf(page + len, " %02x", + len += sprintf(page + len, "%02x", rcq[8 * sl + sct]); } len += sprintf(page + len, "\n"); @@ -5772,15 +5737,15 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device, sense = tsb->tsa.ddpc.sense; break; case 3: /* tsa_intrg */ - len += sprintf(page + len, PRINTK_HEADER - " tsb->tsa.intrg.: not supported yet\n"); + len += sprintf(page + len, + "tsb->tsa.intrg.: not supported yet\n"); break; } if (sense) { for (sl = 0; sl < 4; sl++) { - len += sprintf(page + len, PRINTK_HEADER - " Sense(hex) %2d-%2d:", + len += sprintf(page + len, + "Sense(hex) %2d-%2d:", (8 * sl), ((8 * sl) + 7)); for (sct = 0; sct < 8; sct++) { len += sprintf(page + len, " %02x", @@ -5791,27 +5756,23 @@ static void dasd_eckd_dump_sense_tcw(struct dasd_device *device, if (sense[27] & DASD_SENSE_BIT_0) { /* 24 Byte Sense Data */ - sprintf(page + len, PRINTK_HEADER - " 24 Byte: %x MSG %x, " - "%s MSGb to SYSOP\n", + sprintf(page + len, + "24 Byte: %x MSG %x, %s MSGb to SYSOP\n", sense[7] >> 4, sense[7] & 0x0f, sense[1] & 0x10 ? "" : "no"); } else { /* 32 Byte Sense Data */ - sprintf(page + len, PRINTK_HEADER - " 32 Byte: Format: %x " - "Exception class %x\n", + sprintf(page + len, + "32 Byte: Format: %x Exception class %x\n", sense[6] & 0x0f, sense[22] >> 4); } } else { - sprintf(page + len, PRINTK_HEADER - " SORRY - NO VALID SENSE AVAILABLE\n"); + sprintf(page + len, "SORRY - NO VALID SENSE AVAILABLE\n"); } } else { - sprintf(page + len, PRINTK_HEADER - " SORRY - NO TSB DATA AVAILABLE\n"); + sprintf(page + len, "SORRY - NO TSB DATA AVAILABLE\n"); } - printk(KERN_ERR "%s", page); + dev_err(&device->cdev->dev, "%s", page); free_page((unsigned long) page); } @@ -6865,17 +6826,9 @@ static void dasd_eckd_handle_hpf_error(struct dasd_device *device, dasd_schedule_requeue(device); } -/* - * Initialize block layer request queue. - */ -static void dasd_eckd_setup_blk_queue(struct dasd_block *block) +static unsigned int dasd_eckd_max_sectors(struct dasd_block *block) { - unsigned int logical_block_size = block->bp_block; - struct request_queue *q = block->gdp->queue; - struct dasd_device *device = block->base; - int max; - - if (device->features & DASD_FEATURE_USERAW) { + if (block->base->features & DASD_FEATURE_USERAW) { /* * the max_blocks value for raw_track access is 256 * it is higher than the native ECKD value because we @@ -6883,19 +6836,10 @@ static void dasd_eckd_setup_blk_queue(struct dasd_block *block) * so the max_hw_sectors are * 2048 x 512B = 1024kB = 16 tracks */ - max = DASD_ECKD_MAX_BLOCKS_RAW << block->s2b_shift; - } else { - max = DASD_ECKD_MAX_BLOCKS << block->s2b_shift; + return DASD_ECKD_MAX_BLOCKS_RAW << block->s2b_shift; } - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - q->limits.max_dev_sectors = max; - blk_queue_logical_block_size(q, logical_block_size); - blk_queue_max_hw_sectors(q, max); - blk_queue_max_segments(q, USHRT_MAX); - /* With page sized segments each segment can be translated into one idaw/tidaw */ - blk_queue_max_segment_size(q, PAGE_SIZE); - blk_queue_segment_boundary(q, PAGE_SIZE - 1); - blk_queue_dma_alignment(q, PAGE_SIZE - 1); + + return DASD_ECKD_MAX_BLOCKS << block->s2b_shift; } static struct ccw_driver dasd_eckd_driver = { @@ -6927,7 +6871,7 @@ static struct dasd_discipline dasd_eckd_discipline = { .basic_to_ready = dasd_eckd_basic_to_ready, .online_to_ready = dasd_eckd_online_to_ready, .basic_to_known = dasd_eckd_basic_to_known, - .setup_blk_queue = dasd_eckd_setup_blk_queue, + .max_sectors = dasd_eckd_max_sectors, .fill_geometry = dasd_eckd_fill_geometry, .start_IO = dasd_start_IO, .term_IO = dasd_term_IO, diff --git a/drivers/s390/block/dasd_eer.c b/drivers/s390/block/dasd_eer.c index c956de711cf7..5064a616e041 100644 --- a/drivers/s390/block/dasd_eer.c +++ b/drivers/s390/block/dasd_eer.c @@ -7,8 +7,6 @@ * Author(s): Stefan Weinhuber <wein@de.ibm.com> */ -#define KMSG_COMPONENT "dasd-eckd" - #include <linux/init.h> #include <linux/fs.h> #include <linux/kernel.h> @@ -28,11 +26,6 @@ #include "dasd_int.h" #include "dasd_eckd.h" -#ifdef PRINTK_HEADER -#undef PRINTK_HEADER -#endif /* PRINTK_HEADER */ -#define PRINTK_HEADER "dasd(eer):" - /* * SECTION: the internal buffer */ diff --git a/drivers/s390/block/dasd_erp.c b/drivers/s390/block/dasd_erp.c index c07e6e713518..4c0d3a704513 100644 --- a/drivers/s390/block/dasd_erp.c +++ b/drivers/s390/block/dasd_erp.c @@ -9,8 +9,6 @@ * */ -#define KMSG_COMPONENT "dasd" - #include <linux/ctype.h> #include <linux/init.h> @@ -18,9 +16,6 @@ #include <asm/ebcdic.h> #include <linux/uaccess.h> -/* This is ugly... */ -#define PRINTK_HEADER "dasd_erp:" - #include "dasd_int.h" struct dasd_ccw_req * @@ -170,12 +165,12 @@ dasd_log_sense(struct dasd_ccw_req *cqr, struct irb *irb) device = cqr->startdev; if (cqr->intrc == -ETIMEDOUT) { dev_err(&device->cdev->dev, - "A timeout error occurred for cqr %p\n", cqr); + "A timeout error occurred for cqr %px\n", cqr); return; } if (cqr->intrc == -ENOLINK) { dev_err(&device->cdev->dev, - "A transport error occurred for cqr %p\n", cqr); + "A transport error occurred for cqr %px\n", cqr); return; } /* dump sense data */ diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index c06fa2b27120..bcbb2f8e91fe 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -25,11 +25,6 @@ #include "dasd_int.h" #include "dasd_fba.h" -#ifdef PRINTK_HEADER -#undef PRINTK_HEADER -#endif /* PRINTK_HEADER */ -#define PRINTK_HEADER "dasd(fba):" - #define FBA_DEFAULT_RETRIES 32 #define DASD_FBA_CCW_WRITE 0x41 @@ -660,30 +655,27 @@ static void dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, struct irb *irb) { - char *page; struct ccw1 *act, *end, *last; int len, sl, sct, count; + struct device *dev; + char *page; + + dev = &device->cdev->dev; page = (char *) get_zeroed_page(GFP_ATOMIC); if (page == NULL) { DBF_DEV_EVENT(DBF_WARNING, device, "%s", - "No memory to dump sense data"); + "No memory to dump sense data"); return; } - len = sprintf(page, PRINTK_HEADER - " I/O status report for device %s:\n", - dev_name(&device->cdev->dev)); - len += sprintf(page + len, PRINTK_HEADER - " in req: %p CS: 0x%02X DS: 0x%02X\n", req, - irb->scsw.cmd.cstat, irb->scsw.cmd.dstat); - len += sprintf(page + len, PRINTK_HEADER - " device %s: Failing CCW: %p\n", - dev_name(&device->cdev->dev), + len = sprintf(page, "I/O status report:\n"); + len += sprintf(page + len, "in req: %px CS: 0x%02X DS: 0x%02X\n", + req, irb->scsw.cmd.cstat, irb->scsw.cmd.dstat); + len += sprintf(page + len, "Failing CCW: %px\n", (void *) (addr_t) irb->scsw.cmd.cpa); if (irb->esw.esw0.erw.cons) { for (sl = 0; sl < 4; sl++) { - len += sprintf(page + len, PRINTK_HEADER - " Sense(hex) %2d-%2d:", + len += sprintf(page + len, "Sense(hex) %2d-%2d:", (8 * sl), ((8 * sl) + 7)); for (sct = 0; sct < 8; sct++) { @@ -693,20 +685,18 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, len += sprintf(page + len, "\n"); } } else { - len += sprintf(page + len, PRINTK_HEADER - " SORRY - NO VALID SENSE AVAILABLE\n"); + len += sprintf(page + len, "SORRY - NO VALID SENSE AVAILABLE\n"); } - printk(KERN_ERR "%s", page); + dev_err(dev, "%s", page); /* dump the Channel Program */ /* print first CCWs (maximum 8) */ act = req->cpaddr; - for (last = act; last->flags & (CCW_FLAG_CC | CCW_FLAG_DC); last++); + for (last = act; last->flags & (CCW_FLAG_CC | CCW_FLAG_DC); last++); end = min(act + 8, last); - len = sprintf(page, PRINTK_HEADER " Related CP in req: %p\n", req); + len = sprintf(page, "Related CP in req: %px\n", req); while (act <= end) { - len += sprintf(page + len, PRINTK_HEADER - " CCW %p: %08X %08X DAT:", + len += sprintf(page + len, "CCW %px: %08X %08X DAT:", act, ((int *) act)[0], ((int *) act)[1]); for (count = 0; count < 32 && count < act->count; count += sizeof(int)) @@ -716,19 +706,17 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, len += sprintf(page + len, "\n"); act++; } - printk(KERN_ERR "%s", page); - + dev_err(dev, "%s", page); /* print failing CCW area */ len = 0; if (act < ((struct ccw1 *)(addr_t) irb->scsw.cmd.cpa) - 2) { act = ((struct ccw1 *)(addr_t) irb->scsw.cmd.cpa) - 2; - len += sprintf(page + len, PRINTK_HEADER "......\n"); + len += sprintf(page + len, "......\n"); } end = min((struct ccw1 *)(addr_t) irb->scsw.cmd.cpa + 2, last); while (act <= end) { - len += sprintf(page + len, PRINTK_HEADER - " CCW %p: %08X %08X DAT:", + len += sprintf(page + len, "CCW %px: %08X %08X DAT:", act, ((int *) act)[0], ((int *) act)[1]); for (count = 0; count < 32 && count < act->count; count += sizeof(int)) @@ -742,11 +730,10 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, /* print last CCWs */ if (act < last - 2) { act = last - 2; - len += sprintf(page + len, PRINTK_HEADER "......\n"); + len += sprintf(page + len, "......\n"); } while (act <= last) { - len += sprintf(page + len, PRINTK_HEADER - " CCW %p: %08X %08X DAT:", + len += sprintf(page + len, "CCW %px: %08X %08X DAT:", act, ((int *) act)[0], ((int *) act)[1]); for (count = 0; count < 32 && count < act->count; count += sizeof(int)) @@ -757,39 +744,13 @@ dasd_fba_dump_sense(struct dasd_device *device, struct dasd_ccw_req * req, act++; } if (len > 0) - printk(KERN_ERR "%s", page); + dev_err(dev, "%s", page); free_page((unsigned long) page); } -/* - * Initialize block layer request queue. - */ -static void dasd_fba_setup_blk_queue(struct dasd_block *block) +static unsigned int dasd_fba_max_sectors(struct dasd_block *block) { - unsigned int logical_block_size = block->bp_block; - struct request_queue *q = block->gdp->queue; - unsigned int max_bytes, max_discard_sectors; - int max; - - max = DASD_FBA_MAX_BLOCKS << block->s2b_shift; - blk_queue_flag_set(QUEUE_FLAG_NONROT, q); - q->limits.max_dev_sectors = max; - blk_queue_logical_block_size(q, logical_block_size); - blk_queue_max_hw_sectors(q, max); - blk_queue_max_segments(q, USHRT_MAX); - /* With page sized segments each segment can be translated into one idaw/tidaw */ - blk_queue_max_segment_size(q, PAGE_SIZE); - blk_queue_segment_boundary(q, PAGE_SIZE - 1); - - q->limits.discard_granularity = logical_block_size; - - /* Calculate max_discard_sectors and make it PAGE aligned */ - max_bytes = USHRT_MAX * logical_block_size; - max_bytes = ALIGN_DOWN(max_bytes, PAGE_SIZE); - max_discard_sectors = max_bytes / logical_block_size; - - blk_queue_max_discard_sectors(q, max_discard_sectors); - blk_queue_max_write_zeroes_sectors(q, max_discard_sectors); + return DASD_FBA_MAX_BLOCKS << block->s2b_shift; } static int dasd_fba_pe_handler(struct dasd_device *device, @@ -802,10 +763,11 @@ static struct dasd_discipline dasd_fba_discipline = { .owner = THIS_MODULE, .name = "FBA ", .ebcname = "FBA ", + .has_discard = true, .check_device = dasd_fba_check_characteristics, .do_analysis = dasd_fba_do_analysis, .pe_handler = dasd_fba_pe_handler, - .setup_blk_queue = dasd_fba_setup_blk_queue, + .max_sectors = dasd_fba_max_sectors, .fill_geometry = dasd_fba_fill_geometry, .start_IO = dasd_start_IO, .term_IO = dasd_term_IO, diff --git a/drivers/s390/block/dasd_genhd.c b/drivers/s390/block/dasd_genhd.c index 8bf2cf0ccc15..4533dd055ca8 100644 --- a/drivers/s390/block/dasd_genhd.c +++ b/drivers/s390/block/dasd_genhd.c @@ -11,8 +11,6 @@ * */ -#define KMSG_COMPONENT "dasd" - #include <linux/interrupt.h> #include <linux/major.h> #include <linux/fs.h> @@ -20,9 +18,6 @@ #include <linux/uaccess.h> -/* This is ugly... */ -#define PRINTK_HEADER "dasd_gendisk:" - #include "dasd_int.h" static unsigned int queue_depth = 32; @@ -39,6 +34,16 @@ MODULE_PARM_DESC(nr_hw_queues, "Default number of hardware queues for new DASD d */ int dasd_gendisk_alloc(struct dasd_block *block) { + struct queue_limits lim = { + /* + * With page sized segments, each segment can be translated into + * one idaw/tidaw. + */ + .max_segment_size = PAGE_SIZE, + .seg_boundary_mask = PAGE_SIZE - 1, + .dma_alignment = PAGE_SIZE - 1, + .max_segments = USHRT_MAX, + }; struct gendisk *gdp; struct dasd_device *base; int len, rc; @@ -58,11 +63,12 @@ int dasd_gendisk_alloc(struct dasd_block *block) if (rc) return rc; - gdp = blk_mq_alloc_disk(&block->tag_set, block); + gdp = blk_mq_alloc_disk(&block->tag_set, &lim, block); if (IS_ERR(gdp)) { blk_mq_free_tag_set(&block->tag_set); return PTR_ERR(gdp); } + blk_queue_flag_set(QUEUE_FLAG_NONROT, gdp->queue); /* Initialize gendisk structure. */ gdp->major = DASD_MAJOR; diff --git a/drivers/s390/block/dasd_int.h b/drivers/s390/block/dasd_int.h index aecd502aec51..e5f40536b425 100644 --- a/drivers/s390/block/dasd_int.h +++ b/drivers/s390/block/dasd_int.h @@ -113,9 +113,6 @@ do { \ __dev_id.ssid, __dev_id.devno, d_data); \ } while (0) -/* limit size for an errorstring */ -#define ERRORLENGTH 30 - /* definition of dbf debug levels */ #define DBF_EMERG 0 /* system is unusable */ #define DBF_ALERT 1 /* action must be taken immediately */ @@ -126,32 +123,6 @@ do { \ #define DBF_INFO 6 /* informational */ #define DBF_DEBUG 6 /* debug-level messages */ -/* messages to be written via klogd and dbf */ -#define DEV_MESSAGE(d_loglevel,d_device,d_string,d_args...)\ -do { \ - printk(d_loglevel PRINTK_HEADER " %s: " d_string "\n", \ - dev_name(&d_device->cdev->dev), d_args); \ - DBF_DEV_EVENT(DBF_ALERT, d_device, d_string, d_args); \ -} while(0) - -#define MESSAGE(d_loglevel,d_string,d_args...)\ -do { \ - printk(d_loglevel PRINTK_HEADER " " d_string "\n", d_args); \ - DBF_EVENT(DBF_ALERT, d_string, d_args); \ -} while(0) - -/* messages to be written via klogd only */ -#define DEV_MESSAGE_LOG(d_loglevel,d_device,d_string,d_args...)\ -do { \ - printk(d_loglevel PRINTK_HEADER " %s: " d_string "\n", \ - dev_name(&d_device->cdev->dev), d_args); \ -} while(0) - -#define MESSAGE_LOG(d_loglevel,d_string,d_args...)\ -do { \ - printk(d_loglevel PRINTK_HEADER " " d_string "\n", d_args); \ -} while(0) - /* Macro to calculate number of blocks per page */ #define BLOCKS_PER_PAGE(blksize) (PAGE_SIZE / blksize) @@ -322,6 +293,7 @@ struct dasd_discipline { struct module *owner; char ebcname[8]; /* a name used for tagging and printks */ char name[8]; /* a name used for tagging and printks */ + bool has_discard; struct list_head list; /* used for list of disciplines */ @@ -360,10 +332,7 @@ struct dasd_discipline { int (*online_to_ready) (struct dasd_device *); int (*basic_to_known)(struct dasd_device *); - /* - * Initialize block layer request queue. - */ - void (*setup_blk_queue)(struct dasd_block *); + unsigned int (*max_sectors)(struct dasd_block *); /* (struct dasd_device *); * Device operation functions. build_cp creates a ccw chain for * a block device request, start_io starts the request and diff --git a/drivers/s390/block/dasd_ioctl.c b/drivers/s390/block/dasd_ioctl.c index de85a5e4e21b..7e0ed7032f76 100644 --- a/drivers/s390/block/dasd_ioctl.c +++ b/drivers/s390/block/dasd_ioctl.c @@ -10,8 +10,6 @@ * i/o controls for the dasd driver. */ -#define KMSG_COMPONENT "dasd" - #include <linux/interrupt.h> #include <linux/compat.h> #include <linux/major.h> @@ -24,12 +22,8 @@ #include <linux/uaccess.h> #include <linux/dasd_mod.h> -/* This is ugly... */ -#define PRINTK_HEADER "dasd_ioctl:" - #include "dasd_int.h" - static int dasd_ioctl_api_version(void __user *argp) { diff --git a/drivers/s390/block/dasd_proc.c b/drivers/s390/block/dasd_proc.c index 62a859ea67f8..0faaa437d9be 100644 --- a/drivers/s390/block/dasd_proc.c +++ b/drivers/s390/block/dasd_proc.c @@ -11,8 +11,6 @@ * */ -#define KMSG_COMPONENT "dasd" - #include <linux/ctype.h> #include <linux/slab.h> #include <linux/string.h> @@ -23,9 +21,6 @@ #include <asm/debug.h> #include <linux/uaccess.h> -/* This is ugly... */ -#define PRINTK_HEADER "dasd_proc:" - #include "dasd_int.h" static struct proc_dir_entry *dasd_proc_root_entry = NULL; diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index 4b7ecd4fd431..9c8f529b827c 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -546,6 +546,9 @@ static const struct attribute_group *dcssblk_dev_attr_groups[] = { static ssize_t dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { + struct queue_limits lim = { + .logical_block_size = 4096, + }; int rc, i, j, num_of_segments; struct dcssblk_dev_info *dev_info; struct segment_info *seg_info, *temp; @@ -629,9 +632,9 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->dev.release = dcssblk_release_segment; dev_info->dev.groups = dcssblk_dev_attr_groups; INIT_LIST_HEAD(&dev_info->lh); - dev_info->gd = blk_alloc_disk(NUMA_NO_NODE); - if (dev_info->gd == NULL) { - rc = -ENOMEM; + dev_info->gd = blk_alloc_disk(&lim, NUMA_NO_NODE); + if (IS_ERR(dev_info->gd)) { + rc = PTR_ERR(dev_info->gd); goto seg_list_del; } dev_info->gd->major = dcssblk_major; @@ -639,7 +642,6 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->gd->fops = &dcssblk_devops; dev_info->gd->private_data = dev_info; dev_info->gd->flags |= GENHD_FL_NO_PART; - blk_queue_logical_block_size(dev_info->gd->queue, 4096); blk_queue_flag_set(QUEUE_FLAG_DAX, dev_info->gd->queue); seg_byte_size = (dev_info->end - dev_info->start + 1); diff --git a/drivers/s390/block/scm_blk.c b/drivers/s390/block/scm_blk.c index ade95e91b3c8..9f6fdd0daa74 100644 --- a/drivers/s390/block/scm_blk.c +++ b/drivers/s390/block/scm_blk.c @@ -435,10 +435,17 @@ static const struct blk_mq_ops scm_mq_ops = { int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) { - unsigned int devindex, nr_max_blk; + struct queue_limits lim = { + .logical_block_size = 1 << 12, + }; + unsigned int devindex; struct request_queue *rq; int len, ret; + lim.max_segments = min(scmdev->nr_max_block, + (unsigned int) (PAGE_SIZE / sizeof(struct aidaw))); + lim.max_hw_sectors = lim.max_segments << 3; /* 8 * 512 = blk_size */ + devindex = atomic_inc_return(&nr_devices) - 1; /* scma..scmz + scmaa..scmzz */ if (devindex > 701) { @@ -462,18 +469,12 @@ int scm_blk_dev_setup(struct scm_blk_dev *bdev, struct scm_device *scmdev) if (ret) goto out; - bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, scmdev); + bdev->gendisk = blk_mq_alloc_disk(&bdev->tag_set, &lim, scmdev); if (IS_ERR(bdev->gendisk)) { ret = PTR_ERR(bdev->gendisk); goto out_tag; } rq = bdev->rq = bdev->gendisk->queue; - nr_max_blk = min(scmdev->nr_max_block, - (unsigned int) (PAGE_SIZE / sizeof(struct aidaw))); - - blk_queue_logical_block_size(rq, 1 << 12); - blk_queue_max_hw_sectors(rq, nr_max_blk << 3); /* 8 * 512 = blk_size */ - blk_queue_max_segments(rq, nr_max_blk); blk_queue_flag_set(QUEUE_FLAG_NONROT, rq); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, rq); diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c index 44680f65ea14..9969f4e2f1c3 100644 --- a/drivers/scsi/scsi_scan.c +++ b/drivers/scsi/scsi_scan.c @@ -332,7 +332,7 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget, sdev->sg_reserved_size = INT_MAX; - q = blk_mq_init_queue(&sdev->host->tag_set); + q = blk_mq_alloc_queue(&sdev->host->tag_set, NULL, NULL); if (IS_ERR(q)) { /* release fn is set up in scsi_sysfs_device_initialise, so * have to free and put manually here */ diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 3b89c9d4aa40..eac7fff6992d 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -10593,7 +10593,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq) err = blk_mq_alloc_tag_set(&hba->tmf_tag_set); if (err < 0) goto out_remove_scsi_host; - hba->tmf_queue = blk_mq_init_queue(&hba->tmf_tag_set); + hba->tmf_queue = blk_mq_alloc_queue(&hba->tmf_tag_set, NULL, NULL); if (IS_ERR(hba->tmf_queue)) { err = PTR_ERR(hba->tmf_queue); goto free_tmf_tag_set; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 5f750fa53a2b..aea51fd850cd 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -824,11 +824,14 @@ static int sb_log_location(struct block_device *bdev, struct blk_zone *zones, reset = &zones[1]; if (reset && reset->cond != BLK_ZONE_COND_EMPTY) { + unsigned int nofs_flags; + ASSERT(sb_zone_is_full(reset)); + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - reset->start, reset->len, - GFP_NOFS); + reset->start, reset->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -974,11 +977,14 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) * explicit ZONE_FINISH is not necessary. */ if (zone->wp != zone->start + zone->capacity) { + unsigned int nofs_flags; int ret; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, zone->start, - zone->len, GFP_NOFS); + zone->len); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; } @@ -996,11 +1002,13 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) { + unsigned int nofs_flags; sector_t zone_sectors; sector_t nr_sectors; u8 zone_sectors_shift; u32 sb_zone; u32 nr_zones; + int ret; zone_sectors = bdev_zone_sectors(bdev); zone_sectors_shift = ilog2(zone_sectors); @@ -1011,9 +1019,12 @@ int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror) if (sb_zone + 1 >= nr_zones) return -ENOENT; - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - zone_start_sector(sb_zone, bdev), - zone_sectors * BTRFS_NR_SB_LOG_ZONES, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + zone_start_sector(sb_zone, bdev), + zone_sectors * BTRFS_NR_SB_LOG_ZONES); + memalloc_nofs_restore(nofs_flags); + return ret; } /* @@ -1124,12 +1135,14 @@ static void btrfs_dev_clear_active_zone(struct btrfs_device *device, u64 pos) int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, u64 length, u64 *bytes) { + unsigned int nofs_flags; int ret; *bytes = 0; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_RESET, - physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT, - GFP_NOFS); + physical >> SECTOR_SHIFT, length >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; @@ -2244,14 +2257,16 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_device *device = map->stripes[i].dev; const u64 physical = map->stripes[i].physical; struct btrfs_zoned_device_info *zinfo = device->zone_info; + unsigned int nofs_flags; if (zinfo->max_active_zones == 0) continue; + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH, physical >> SECTOR_SHIFT, - zinfo->zone_size >> SECTOR_SHIFT, - GFP_NOFS); + zinfo->zone_size >> SECTOR_SHIFT); + memalloc_nofs_restore(nofs_flags); if (ret) return ret; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4c8836ded90f..e1065ba70207 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1971,9 +1971,15 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) { + unsigned int nofs_flags; + int ret; + trace_f2fs_issue_reset_zone(bdev, blkstart); - return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, - sector, nr_sects, GFP_NOFS); + nofs_flags = memalloc_nofs_save(); + ret = blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, + sector, nr_sects); + memalloc_nofs_restore(nofs_flags); + return ret; } __queue_zone_reset_cmd(sbi, bdev, blkstart, lblkstart, blklen); @@ -4865,6 +4871,7 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, block_t zone_block, valid_block_cnt; unsigned int log_sectors_per_block = sbi->log_blocksize - SECTOR_SHIFT; int ret; + unsigned int nofs_flags; if (zone->type != BLK_ZONE_TYPE_SEQWRITE_REQ) return 0; @@ -4912,8 +4919,10 @@ static int check_zone_write_pointer(struct f2fs_sb_info *sbi, "pointer: valid block[0x%x,0x%x] cond[0x%x]", zone_segno, valid_block_cnt, zone->cond); + nofs_flags = memalloc_nofs_save(); ret = blkdev_zone_mgmt(fdev->bdev, REQ_OP_ZONE_FINISH, - zone->start, zone->len, GFP_NOFS); + zone->start, zone->len); + memalloc_nofs_restore(nofs_flags); if (ret == -EOPNOTSUPP) { ret = blkdev_issue_zeroout(fdev->bdev, zone->wp, zone->len - (zone->wp - zone->start), diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index b6e8e7c96251..aadad16738df 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -113,7 +113,7 @@ static int zonefs_zone_mgmt(struct super_block *sb, trace_zonefs_zone_mgmt(sb, z, op); ret = blkdev_zone_mgmt(sb->s_bdev, op, z->z_sector, - z->z_size >> SECTOR_SHIFT, GFP_NOFS); + z->z_size >> SECTOR_SHIFT); if (ret) { zonefs_err(sb, "Zone management operation %s at %llu failed %d\n", diff --git a/include/linux/blk-integrity.h b/include/linux/blk-integrity.h index 378b2459efe2..e253e7bd0d17 100644 --- a/include/linux/blk-integrity.h +++ b/include/linux/blk-integrity.h @@ -20,6 +20,7 @@ struct blk_integrity_iter { unsigned int data_size; unsigned short interval; unsigned char tuple_size; + unsigned char pi_offset; const char *disk_name; }; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 492b0128b5d9..d3d8fd8e229b 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -684,17 +684,19 @@ enum { #define BLK_MQ_NO_HCTX_IDX (-1U) -struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata, struct lock_class_key *lkclass); -#define blk_mq_alloc_disk(set, queuedata) \ +#define blk_mq_alloc_disk(set, lim, queuedata) \ ({ \ static struct lock_class_key __key; \ \ - __blk_mq_alloc_disk(set, queuedata, &__key); \ + __blk_mq_alloc_disk(set, lim, queuedata, &__key); \ }) struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, struct lock_class_key *lkclass); -struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); +struct request_queue *blk_mq_alloc_queue(struct blk_mq_tag_set *set, + struct queue_limits *lim, void *queuedata); int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); void blk_mq_destroy_queue(struct request_queue *); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 12d87cef2c03..cb1526ec44b5 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -207,52 +207,10 @@ static inline bool blk_path_error(blk_status_t error) return true; } -/* - * From most significant bit: - * 1 bit: reserved for other usage, see below - * 12 bits: original size of bio - * 51 bits: issue time of bio - */ -#define BIO_ISSUE_RES_BITS 1 -#define BIO_ISSUE_SIZE_BITS 12 -#define BIO_ISSUE_RES_SHIFT (64 - BIO_ISSUE_RES_BITS) -#define BIO_ISSUE_SIZE_SHIFT (BIO_ISSUE_RES_SHIFT - BIO_ISSUE_SIZE_BITS) -#define BIO_ISSUE_TIME_MASK ((1ULL << BIO_ISSUE_SIZE_SHIFT) - 1) -#define BIO_ISSUE_SIZE_MASK \ - (((1ULL << BIO_ISSUE_SIZE_BITS) - 1) << BIO_ISSUE_SIZE_SHIFT) -#define BIO_ISSUE_RES_MASK (~((1ULL << BIO_ISSUE_RES_SHIFT) - 1)) - -/* Reserved bit for blk-throtl */ -#define BIO_ISSUE_THROTL_SKIP_LATENCY (1ULL << 63) - struct bio_issue { u64 value; }; -static inline u64 __bio_issue_time(u64 time) -{ - return time & BIO_ISSUE_TIME_MASK; -} - -static inline u64 bio_issue_time(struct bio_issue *issue) -{ - return __bio_issue_time(issue->value); -} - -static inline sector_t bio_issue_size(struct bio_issue *issue) -{ - return ((issue->value & BIO_ISSUE_SIZE_MASK) >> BIO_ISSUE_SIZE_SHIFT); -} - -static inline void bio_issue_init(struct bio_issue *issue, - sector_t size) -{ - size &= (1ULL << BIO_ISSUE_SIZE_BITS) - 1; - issue->value = ((issue->value & BIO_ISSUE_RES_MASK) | - (ktime_get_ns() & BIO_ISSUE_TIME_MASK) | - ((u64)size << BIO_ISSUE_SIZE_SHIFT)); -} - typedef __u32 __bitwise blk_opf_t; typedef unsigned int blk_qc_t; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2f5dbde23094..f9b87c39cab0 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -43,7 +43,7 @@ struct blk_crypto_profile; extern const struct device_type disk_type; extern const struct device_type part_type; -extern struct class block_class; +extern const struct class block_class; /* * Maximum number of blkcg policies allowed to be registered concurrently. @@ -109,6 +109,7 @@ struct blk_integrity { const struct blk_integrity_profile *profile; unsigned char flags; unsigned char tuple_size; + unsigned char pi_offset; unsigned char interval_exp; unsigned char tag_size; }; @@ -190,8 +191,6 @@ struct gendisk { * blk_mq_unfreeze_queue(). */ unsigned int nr_zones; - unsigned int max_open_zones; - unsigned int max_active_zones; unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; #endif /* CONFIG_BLK_DEV_ZONED */ @@ -293,6 +292,7 @@ struct queue_limits { unsigned int io_opt; unsigned int max_discard_sectors; unsigned int max_hw_discard_sectors; + unsigned int max_user_discard_sectors; unsigned int max_secure_erase_sectors; unsigned int max_write_zeroes_sectors; unsigned int max_zone_append_sectors; @@ -308,6 +308,8 @@ struct queue_limits { unsigned char discard_misaligned; unsigned char raid_partial_stripes_expensive; bool zoned; + unsigned int max_open_zones; + unsigned int max_active_zones; /* * Drivers that set dma_alignment to less than 511 must be prepared to @@ -326,7 +328,7 @@ void disk_set_zoned(struct gendisk *disk); int blkdev_report_zones(struct block_device *bdev, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, - sector_t sectors, sector_t nr_sectors, gfp_t gfp_mask); + sector_t sectors, sector_t nr_sectors); int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)); @@ -474,6 +476,7 @@ struct request_queue { struct mutex sysfs_lock; struct mutex sysfs_dir_lock; + struct mutex limits_lock; /* * for reusing dead hctx instance in case of updating @@ -640,23 +643,23 @@ static inline bool disk_zone_is_seq(struct gendisk *disk, sector_t sector) static inline void disk_set_max_open_zones(struct gendisk *disk, unsigned int max_open_zones) { - disk->max_open_zones = max_open_zones; + disk->queue->limits.max_open_zones = max_open_zones; } static inline void disk_set_max_active_zones(struct gendisk *disk, unsigned int max_active_zones) { - disk->max_active_zones = max_active_zones; + disk->queue->limits.max_active_zones = max_active_zones; } static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { - return bdev->bd_disk->max_open_zones; + return bdev->bd_disk->queue->limits.max_open_zones; } static inline unsigned int bdev_max_active_zones(struct block_device *bdev) { - return bdev->bd_disk->max_active_zones; + return bdev->bd_disk->queue->limits.max_active_zones; } #else /* CONFIG_BLK_DEV_ZONED */ @@ -764,22 +767,26 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) int bdev_disk_changed(struct gendisk *disk, bool invalidate); void put_disk(struct gendisk *disk); -struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass); +struct gendisk *__blk_alloc_disk(struct queue_limits *lim, int node, + struct lock_class_key *lkclass); /** * blk_alloc_disk - allocate a gendisk structure + * @lim: queue limits to be used for this disk. * @node_id: numa node to allocate on * * Allocate and pre-initialize a gendisk structure for use with BIO based * drivers. * + * Returns an ERR_PTR on error, else the allocated disk. + * * Context: can sleep */ -#define blk_alloc_disk(node_id) \ +#define blk_alloc_disk(lim, node_id) \ ({ \ static struct lock_class_key __key; \ \ - __blk_alloc_disk(node_id, &__key); \ + __blk_alloc_disk(lim, node_id, &__key); \ }) int __register_blkdev(unsigned int major, const char *name, @@ -862,6 +869,29 @@ static inline unsigned int blk_chunk_sectors_left(sector_t offset, return chunk_sectors - (offset & (chunk_sectors - 1)); } +/** + * queue_limits_start_update - start an atomic update of queue limits + * @q: queue to update + * + * This functions starts an atomic update of the queue limits. It takes a lock + * to prevent other updates and returns a snapshot of the current limits that + * the caller can modify. The caller must call queue_limits_commit_update() + * to finish the update. + * + * Context: process context. The caller must have frozen the queue or ensured + * that there is outstanding I/O by other means. + */ +static inline struct queue_limits +queue_limits_start_update(struct request_queue *q) + __acquires(q->limits_lock) +{ + mutex_lock(&q->limits_lock); + return q->limits; +} +int queue_limits_commit_update(struct request_queue *q, + struct queue_limits *lim); +int queue_limits_set(struct request_queue *q, struct queue_limits *lim); + /* * Access functions for manipulating queue properties */ @@ -895,8 +925,8 @@ extern void blk_set_queue_depth(struct request_queue *q, unsigned int depth); extern void blk_set_stacking_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); -extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, - sector_t offset); +void queue_limits_stack_bdev(struct queue_limits *t, struct block_device *bdev, + sector_t offset, const char *pfx); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); extern void blk_queue_virt_boundary(struct request_queue *, unsigned long); @@ -943,6 +973,7 @@ struct blk_plug { /* if ios_left is > 1, we can batch tag/rq allocations */ struct request *cached_rq; + u64 cur_ktime; unsigned short nr_ios; unsigned short rq_count; @@ -973,6 +1004,18 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) __blk_flush_plug(plug, async); } +/* + * tsk == current here + */ +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ + struct blk_plug *plug = tsk->plug; + + if (plug) + plug->cur_ktime = 0; + current->flags &= ~PF_BLOCK_TS; +} + int blkdev_issue_flush(struct block_device *bdev); long nr_blockdev_pages(void); #else /* CONFIG_BLOCK */ @@ -996,6 +1039,10 @@ static inline void blk_flush_plug(struct blk_plug *plug, bool async) { } +static inline void blk_plug_invalidate_ts(struct task_struct *tsk) +{ +} + static inline int blkdev_issue_flush(struct block_device *bdev) { return 0; diff --git a/include/linux/nvme-rdma.h b/include/linux/nvme-rdma.h index 4dd7e6fe92fb..eb2f04d636c8 100644 --- a/include/linux/nvme-rdma.h +++ b/include/linux/nvme-rdma.h @@ -6,7 +6,11 @@ #ifndef _LINUX_NVME_RDMA_H #define _LINUX_NVME_RDMA_H -#define NVME_RDMA_MAX_QUEUE_SIZE 128 +#define NVME_RDMA_IP_PORT 4420 + +#define NVME_RDMA_MAX_QUEUE_SIZE 256 +#define NVME_RDMA_MAX_METADATA_QUEUE_SIZE 128 +#define NVME_RDMA_DEFAULT_QUEUE_SIZE 128 enum nvme_rdma_cm_fmt { NVME_RDMA_CM_FMT_1_0 = 0x0, diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3ef4053ea950..425573202295 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -23,8 +23,6 @@ #define NVME_DISC_SUBSYS_NAME "nqn.2014-08.org.nvmexpress.discovery" -#define NVME_RDMA_IP_PORT 4420 - #define NVME_NSID_ALL 0xffffffff enum nvme_subsys_type { diff --git a/include/linux/sched.h b/include/linux/sched.h index ffe8f618ab86..15b7cb478d16 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1642,7 +1642,7 @@ extern struct pid *cad_pid; #define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_mask */ #define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */ #define PF_MEMALLOC_PIN 0x10000000 /* Allocation context constrained to zones which allow long term pinning. */ -#define PF__HOLE__20000000 0x20000000 +#define PF_BLOCK_TS 0x20000000 /* plug has ts that needs updating */ #define PF__HOLE__40000000 0x40000000 #define PF_SUSPEND_TASK 0x80000000 /* This thread called freeze_processes() and should not be frozen */ diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index a6e04b4a21d7..11e0e00e0bb9 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -176,6 +176,7 @@ extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); +bool cpus_equal_capacity(int this_cpu, int that_cpu); bool cpus_share_cache(int this_cpu, int that_cpu); bool cpus_share_resources(int this_cpu, int that_cpu); @@ -226,6 +227,11 @@ partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], { } +static inline bool cpus_equal_capacity(int this_cpu, int that_cpu) +{ + return true; +} + static inline bool cpus_share_cache(int this_cpu, int that_cpu) { return true; diff --git a/include/uapi/linux/ublk_cmd.h b/include/uapi/linux/ublk_cmd.h index b9cfc5c96268..c8dc5f8ea699 100644 --- a/include/uapi/linux/ublk_cmd.h +++ b/include/uapi/linux/ublk_cmd.h @@ -49,6 +49,8 @@ _IOR('u', UBLK_CMD_GET_DEV_INFO2, struct ublksrv_ctrl_cmd) #define UBLK_U_CMD_GET_FEATURES \ _IOR('u', 0x13, struct ublksrv_ctrl_cmd) +#define UBLK_U_CMD_DEL_DEV_ASYNC \ + _IOR('u', 0x14, struct ublksrv_ctrl_cmd) /* * 64bits are enough now, and it should be easy to extend in case of diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9116bcc90346..540f229700b6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3955,6 +3955,17 @@ void wake_up_if_idle(int cpu) } } +bool cpus_equal_capacity(int this_cpu, int that_cpu) +{ + if (!sched_asym_cpucap_active()) + return true; + + if (this_cpu == that_cpu) + return true; + + return arch_scale_cpu_capacity(this_cpu) == arch_scale_cpu_capacity(that_cpu); +} + bool cpus_share_cache(int this_cpu, int that_cpu) { if (this_cpu == that_cpu) @@ -6787,10 +6798,12 @@ static inline void sched_submit_work(struct task_struct *tsk) static void sched_update_worker(struct task_struct *tsk) { - if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER)) { + if (tsk->flags & (PF_WQ_WORKER | PF_IO_WORKER | PF_BLOCK_TS)) { + if (tsk->flags & PF_BLOCK_TS) + blk_plug_invalidate_ts(tsk); if (tsk->flags & PF_WQ_WORKER) wq_worker_running(tsk); - else + else if (tsk->flags & PF_IO_WORKER) io_wq_worker_running(tsk); } } |