Diffstat (limited to 'block')
-rw-r--r-- | block/bio-integrity.c | 13
-rw-r--r-- | block/bio.c | 82
-rw-r--r-- | block/blk-core.c | 34
-rw-r--r-- | block/blk-merge.c | 18
-rw-r--r-- | block/blk-mq-cpumap.c | 4
-rw-r--r-- | block/blk-mq-sysfs.c | 34
-rw-r--r-- | block/blk-mq-tag.c | 98
-rw-r--r-- | block/blk-mq-tag.h | 1
-rw-r--r-- | block/blk-mq.c | 248
-rw-r--r-- | block/blk-mq.h | 6
-rw-r--r-- | block/blk-settings.c | 4
-rw-r--r-- | block/blk-sysfs.c | 12
-rw-r--r-- | block/blk-timeout.c | 3
-rw-r--r-- | block/elevator.c | 6
-rw-r--r-- | block/genhd.c | 11
-rw-r--r-- | block/ioprio.c | 14
-rw-r--r-- | block/scsi_ioctl.c | 13
17 files changed, 428 insertions, 173 deletions
diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 0984232e429f..5cbd5d9ea61d 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -216,9 +216,10 @@ static int bio_integrity_process(struct bio *bio, { struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev); struct blk_integrity_iter iter; - struct bio_vec *bv; + struct bvec_iter bviter; + struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); - unsigned int i, ret = 0; + unsigned int ret = 0; void *prot_buf = page_address(bip->bip_vec->bv_page) + bip->bip_vec->bv_offset; @@ -227,11 +228,11 @@ static int bio_integrity_process(struct bio *bio, iter.seed = bip_get_seed(bip); iter.prot_buf = prot_buf; - bio_for_each_segment_all(bv, bio, i) { - void *kaddr = kmap_atomic(bv->bv_page); + bio_for_each_segment(bv, bio, bviter) { + void *kaddr = kmap_atomic(bv.bv_page); - iter.data_buf = kaddr + bv->bv_offset; - iter.data_size = bv->bv_len; + iter.data_buf = kaddr + bv.bv_offset; + iter.data_size = bv.bv_len; ret = proc_fn(&iter); if (ret) { diff --git a/block/bio.c b/block/bio.c index 3e6e1986a5b2..471d7382c7d1 100644 --- a/block/bio.c +++ b/block/bio.c @@ -748,6 +748,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page } } + bio->bi_iter.bi_size += len; goto done; } @@ -764,29 +765,32 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page return 0; /* - * we might lose a segment or two here, but rather that than - * make this too complex. + * setup the new entry, we might clear it again later if we + * cannot add the page + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + bio->bi_vcnt++; + bio->bi_phys_segments++; + bio->bi_iter.bi_size += len; + + /* + * Perform a recount if the number of segments is greater + * than queue_max_segments(q). 
*/ - while (bio->bi_phys_segments >= queue_max_segments(q)) { + while (bio->bi_phys_segments > queue_max_segments(q)) { if (retried_segments) - return 0; + goto failed; retried_segments = 1; blk_recount_segments(q, bio); } /* - * setup the new entry, we might clear it again later if we - * cannot add the page - */ - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; - - /* * if queue has other restrictions (eg varying max sector size * depending on offset), it can specify a merge_bvec_fn in the * queue to get further control @@ -795,7 +799,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page struct bvec_merge_data bvm = { .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_iter.bi_sector, - .bi_size = bio->bi_iter.bi_size, + .bi_size = bio->bi_iter.bi_size - len, .bi_rw = bio->bi_rw, }; @@ -803,23 +807,25 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { - bvec->bv_page = NULL; - bvec->bv_len = 0; - bvec->bv_offset = 0; - return 0; - } + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) + goto failed; } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt > 1 && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); - bio->bi_vcnt++; - bio->bi_phys_segments++; done: - bio->bi_iter.bi_size += len; return len; + + failed: + bvec->bv_page = NULL; + bvec->bv_len = 0; + bvec->bv_offset = 0; + bio->bi_vcnt--; + bio->bi_iter.bi_size -= len; + blk_recount_segments(q, bio); + return 0; } /** @@ -1739,6 +1745,34 @@ void bio_check_pages_dirty(struct bio *bio) } } +void generic_start_io_acct(int rw, unsigned long sectors, + struct hd_struct *part) +{ + int cpu = part_stat_lock(); + + part_round_stats(cpu, part); + part_stat_inc(cpu, part, ios[rw]); + part_stat_add(cpu, part, sectors[rw], sectors); + part_inc_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_start_io_acct); + +void generic_end_io_acct(int rw, struct hd_struct *part, + unsigned long start_time) +{ + unsigned long duration = jiffies - start_time; + int cpu = part_stat_lock(); + + part_stat_add(cpu, part, ticks[rw], duration); + part_round_stats(cpu, part); + part_dec_in_flight(part, rw); + + part_stat_unlock(); +} +EXPORT_SYMBOL(generic_end_io_acct); + #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE void bio_flush_dcache_pages(struct bio *bi) { diff --git a/block/blk-core.c b/block/blk-core.c index 0421b53e6431..3ad405571dcc 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -473,6 +473,25 @@ void blk_queue_bypass_end(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_queue_bypass_end); +void blk_set_queue_dying(struct request_queue *q) +{ + queue_flag_set_unlocked(QUEUE_FLAG_DYING, q); + + if (q->mq_ops) + blk_mq_wake_waiters(q); + else { + struct request_list *rl; + + blk_queue_for_each_rl(rl, q) { + if (rl->rq_pool) { + wake_up(&rl->wait[BLK_RW_SYNC]); + wake_up(&rl->wait[BLK_RW_ASYNC]); + } + } + } +} +EXPORT_SYMBOL_GPL(blk_set_queue_dying); + /** * blk_cleanup_queue - shutdown a request queue * @q: request queue to shutdown @@ -486,7 +505,7 @@ void blk_cleanup_queue(struct request_queue *q) /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); - queue_flag_set_unlocked(QUEUE_FLAG_DYING, q); + 
blk_set_queue_dying(q); spin_lock_irq(lock); /* @@ -525,6 +544,9 @@ void blk_cleanup_queue(struct request_queue *q) del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); blk_sync_queue(q); + if (q->mq_ops) + blk_mq_free_queue(q); + spin_lock_irq(lock); if (q->queue_lock != &q->__queue_lock) q->queue_lock = &q->__queue_lock; @@ -1266,7 +1288,7 @@ void blk_requeue_request(struct request_queue *q, struct request *rq) blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); - if (blk_rq_tagged(rq)) + if (rq->cmd_flags & REQ_QUEUED) blk_queue_end_tag(q, rq); BUG_ON(blk_queued_rq(rq)); @@ -1325,7 +1347,7 @@ void part_round_stats(int cpu, struct hd_struct *part) } EXPORT_SYMBOL_GPL(part_round_stats); -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM static void blk_pm_put_request(struct request *rq) { if (rq->q->dev && !(rq->cmd_flags & REQ_PM) && !--rq->q->nr_pending) @@ -2134,7 +2156,7 @@ void blk_account_io_done(struct request *req) } } -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM /* * Don't process normal requests when queue is suspended * or in the process of suspending/resuming @@ -2554,7 +2576,7 @@ EXPORT_SYMBOL_GPL(blk_unprep_request); */ void blk_finish_request(struct request *req, int error) { - if (blk_rq_tagged(req)) + if (req->cmd_flags & REQ_QUEUED) blk_queue_end_tag(req->q, req); BUG_ON(blk_queued_rq(req)); @@ -3159,7 +3181,7 @@ void blk_finish_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_finish_plug); -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM /** * blk_pm_runtime_init - Block layer runtime PM initialization routine * @q: the queue of the device diff --git a/block/blk-merge.c b/block/blk-merge.c index ba99351c0f58..89b97b5e0881 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -97,18 +97,22 @@ void blk_recalc_rq_segments(struct request *rq) void blk_recount_segments(struct request_queue *q, struct bio *bio) { - bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, - &q->queue_flags); + unsigned short seg_cnt; + + /* estimate segment number by bi_vcnt for non-cloned bio */ + if (bio_flagged(bio, BIO_CLONED)) + seg_cnt = bio_segments(bio); + else + seg_cnt = bio->bi_vcnt; - if (no_sg_merge && !bio_flagged(bio, BIO_CLONED) && - bio->bi_vcnt < queue_max_segments(q)) - bio->bi_phys_segments = bio->bi_vcnt; + if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) && + (seg_cnt < queue_max_segments(q))) + bio->bi_phys_segments = seg_cnt; else { struct bio *nxt = bio->bi_next; bio->bi_next = NULL; - bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, - no_sg_merge); + bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false); bio->bi_next = nxt; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 1065d7c65fa1..5f13f4d0bcce 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -17,7 +17,7 @@ static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues, const int cpu) { - return cpu / ((nr_cpus + nr_queues - 1) / nr_queues); + return cpu * nr_queues / nr_cpus; } static int get_first_sibling(unsigned int cpu) @@ -90,7 +90,7 @@ unsigned int *blk_mq_make_queue_map(struct blk_mq_tag_set *set) unsigned int *map; /* If cpus are offline, map them to first hctx */ - map = kzalloc_node(sizeof(*map) * num_possible_cpus(), GFP_KERNEL, + map = kzalloc_node(sizeof(*map) * nr_cpu_ids, GFP_KERNEL, set->numa_node); if (!map) return NULL; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 371d8800b48a..6774a0e69867 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -15,6 +15,26 @@ static void blk_mq_sysfs_release(struct 
kobject *kobj) { + struct request_queue *q; + + q = container_of(kobj, struct request_queue, mq_kobj); + free_percpu(q->queue_ctx); +} + +static void blk_mq_ctx_release(struct kobject *kobj) +{ + struct blk_mq_ctx *ctx; + + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + kobject_put(&ctx->queue->mq_kobj); +} + +static void blk_mq_hctx_release(struct kobject *kobj) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); + kfree(hctx); } struct blk_mq_ctx_sysfs_entry { @@ -318,13 +338,13 @@ static struct kobj_type blk_mq_ktype = { static struct kobj_type blk_mq_ctx_ktype = { .sysfs_ops = &blk_mq_sysfs_ops, .default_attrs = default_ctx_attrs, - .release = blk_mq_sysfs_release, + .release = blk_mq_ctx_release, }; static struct kobj_type blk_mq_hw_ktype = { .sysfs_ops = &blk_mq_hw_sysfs_ops, .default_attrs = default_hw_ctx_attrs, - .release = blk_mq_sysfs_release, + .release = blk_mq_hctx_release, }; static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) @@ -355,6 +375,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; hctx_for_each_ctx(hctx, ctx, i) { + kobject_get(&q->mq_kobj); ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); if (ret) break; @@ -390,16 +411,15 @@ static void blk_mq_sysfs_init(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; - int i, j; + int i; kobject_init(&q->mq_kobj, &blk_mq_ktype); - queue_for_each_hw_ctx(q, hctx, i) { + queue_for_each_hw_ctx(q, hctx, i) kobject_init(&hctx->kobj, &blk_mq_hw_ktype); - hctx_for_each_ctx(hctx, ctx, j) - kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); - } + queue_for_each_ctx(q, ctx, i) + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } /* see blk_register_queue() */ diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 8317175a3009..60c9d4a93fe4 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -68,9 +68,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) } /* - * Wakeup all potentially sleeping on normal (non-reserved) tags + * Wakeup all potentially sleeping on tags */ -static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) +void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { struct blk_mq_bitmap_tags *bt; int i, wake_index; @@ -85,6 +85,12 @@ static void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags) wake_index = bt_index_inc(wake_index); } + + if (include_reserve) { + bt = &tags->breserved_tags; + if (waitqueue_active(&bt->bs[0].wait)) + wake_up(&bt->bs[0].wait); + } } /* @@ -100,7 +106,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) atomic_dec(&tags->active_queues); - blk_mq_tag_wakeup_all(tags); + blk_mq_tag_wakeup_all(tags, false); } /* @@ -137,6 +143,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag) { int tag, org_last_tag, end; + bool wrap = last_tag != 0; org_last_tag = last_tag; end = bm->depth; @@ -148,15 +155,16 @@ restart: * We started with an offset, start from 0 to * exhaust the map. 
*/ - if (org_last_tag && last_tag) { - end = last_tag; + if (wrap) { + wrap = false; + end = org_last_tag; last_tag = 0; goto restart; } return -1; } last_tag = tag + 1; - } while (test_and_set_bit_lock(tag, &bm->word)); + } while (test_and_set_bit(tag, &bm->word)); return tag; } @@ -254,6 +262,21 @@ static int bt_get(struct blk_mq_alloc_data *data, if (tag != -1) break; + /* + * We're out of tags on this hardware queue, kick any + * pending IO submits before going to sleep waiting for + * some to complete. + */ + blk_mq_run_hw_queue(hctx, false); + + /* + * Retry tag allocation after running the hardware queue, + * as running the queue may also have found completions. + */ + tag = __bt_get(hctx, bt, last_tag); + if (tag != -1) + break; + blk_mq_put_ctx(data->ctx); io_schedule(); @@ -340,11 +363,10 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) struct bt_wait_state *bs; int wait_cnt; - /* - * The unlock memory barrier need to order access to req in free - * path and clearing tag bit - */ - clear_bit_unlock(TAG_TO_BIT(bt, tag), &bt->map[index].word); + clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); bs = bt_wake_ptr(bt); if (!bs) @@ -360,21 +382,6 @@ static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) } } -static void __blk_mq_put_tag(struct blk_mq_tags *tags, unsigned int tag) -{ - BUG_ON(tag >= tags->nr_tags); - - bt_clear_tag(&tags->bitmap_tags, tag); -} - -static void __blk_mq_put_reserved_tag(struct blk_mq_tags *tags, - unsigned int tag) -{ - BUG_ON(tag >= tags->nr_reserved_tags); - - bt_clear_tag(&tags->breserved_tags, tag); -} - void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag) { @@ -383,10 +390,13 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, if (tag >= tags->nr_reserved_tags) { const int real_tag = tag - tags->nr_reserved_tags; - __blk_mq_put_tag(tags, real_tag); + BUG_ON(real_tag >= tags->nr_tags); + bt_clear_tag(&tags->bitmap_tags, real_tag); *last_tag = real_tag; - } else - __blk_mq_put_reserved_tag(tags, tag); + } else { + BUG_ON(tag >= tags->nr_reserved_tags); + bt_clear_tag(&tags->breserved_tags, tag); + } } static void bt_for_each(struct blk_mq_hw_ctx *hctx, @@ -580,10 +590,38 @@ int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) * static and should never need resizing. */ bt_update_count(&tags->bitmap_tags, tdepth); - blk_mq_tag_wakeup_all(tags); + blk_mq_tag_wakeup_all(tags, false); return 0; } +/** + * blk_mq_unique_tag() - return a tag that is unique queue-wide + * @rq: request for which to compute a unique tag + * + * The tag field in struct request is unique per hardware queue but not over + * all hardware queues. Hence this function that returns a tag with the + * hardware context index in the upper bits and the per hardware queue tag in + * the lower bits. + * + * Note: When called for a request that is queued on a non-multiqueue request + * queue, the hardware context index is set to zero. 
+ */ +u32 blk_mq_unique_tag(struct request *rq) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx; + int hwq = 0; + + if (q->mq_ops) { + hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + hwq = hctx->queue_num; + } + + return (hwq << BLK_MQ_UNIQUE_TAG_BITS) | + (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); +} +EXPORT_SYMBOL(blk_mq_unique_tag); + ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) { char *orig_page = page; diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index 6206ed17ef76..a6fa0fc9d41a 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -54,6 +54,7 @@ extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); +extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); enum { BLK_MQ_TAG_CACHE_MIN = 1, diff --git a/block/blk-mq.c b/block/blk-mq.c index 68929bad9a6a..9ee3b87c4498 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -107,11 +107,7 @@ static void blk_mq_usage_counter_release(struct percpu_ref *ref) wake_up_all(&q->mq_freeze_wq); } -/* - * Guarantee no request is in use, so we can change any data structure of - * the queue afterward. - */ -void blk_mq_freeze_queue(struct request_queue *q) +void blk_mq_freeze_queue_start(struct request_queue *q) { bool freeze; @@ -123,10 +119,25 @@ void blk_mq_freeze_queue(struct request_queue *q) percpu_ref_kill(&q->mq_usage_counter); blk_mq_run_queues(q, false); } +} +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); + +static void blk_mq_freeze_queue_wait(struct request_queue *q) +{ wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); } -static void blk_mq_unfreeze_queue(struct request_queue *q) +/* + * Guarantee no request is in use, so we can change any data structure of + * the queue afterward. + */ +void blk_mq_freeze_queue(struct request_queue *q) +{ + blk_mq_freeze_queue_start(q); + blk_mq_freeze_queue_wait(q); +} + +void blk_mq_unfreeze_queue(struct request_queue *q) { bool wake; @@ -139,6 +150,24 @@ static void blk_mq_unfreeze_queue(struct request_queue *q) wake_up_all(&q->mq_freeze_wq); } } +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); + +void blk_mq_wake_waiters(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_wakeup_all(hctx->tags, true); + + /* + * If we are called because the queue has now been marked as + * dying, we need to ensure that processes currently waiting on + * the queue are notified as well. 
+ */ + wake_up_all(&q->mq_freeze_wq); +} bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) { @@ -248,8 +277,10 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, ctx = alloc_data.ctx; } blk_mq_put_ctx(ctx); - if (!rq) + if (!rq) { + blk_mq_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); + } return rq; } EXPORT_SYMBOL(blk_mq_alloc_request); @@ -269,17 +300,25 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, blk_mq_queue_exit(q); } -void blk_mq_free_request(struct request *rq) +void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx; - struct request_queue *q = rq->q; ctx->rq_completed[rq_is_sync(rq)]++; - - hctx = q->mq_ops->map_queue(q, ctx->cpu); __blk_mq_free_request(hctx, ctx, rq); + +} +EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); + +void blk_mq_free_request(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx; + struct request_queue *q = rq->q; + + hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); + blk_mq_free_hctx_request(hctx, rq); } +EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, int error) { @@ -365,6 +404,12 @@ void blk_mq_complete_request(struct request *rq) } EXPORT_SYMBOL(blk_mq_complete_request); +int blk_mq_request_started(struct request *rq) +{ + return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); +} +EXPORT_SYMBOL_GPL(blk_mq_request_started); + void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; @@ -482,12 +527,38 @@ void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) } EXPORT_SYMBOL(blk_mq_add_to_requeue_list); +void blk_mq_cancel_requeue_work(struct request_queue *q) +{ + cancel_work_sync(&q->requeue_work); +} +EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); + void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_schedule_work(&q->requeue_work); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); +void blk_mq_abort_requeue_list(struct request_queue *q) +{ + unsigned long flags; + LIST_HEAD(rq_list); + + spin_lock_irqsave(&q->requeue_lock, flags); + list_splice_init(&q->requeue_list, &rq_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); + + while (!list_empty(&rq_list)) { + struct request *rq; + + rq = list_first_entry(&rq_list, struct request, queuelist); + list_del_init(&rq->queuelist); + rq->errors = -EIO; + blk_mq_end_request(rq, rq->errors); + } +} +EXPORT_SYMBOL(blk_mq_abort_requeue_list); + static inline bool is_flush_request(struct request *rq, struct blk_flush_queue *fq, unsigned int tag) { @@ -548,13 +619,24 @@ void blk_mq_rq_timed_out(struct request *req, bool reserved) break; } } - + static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct blk_mq_timeout_data *data = priv; - if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) + if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { + /* + * If a request wasn't started before the queue was + * marked dying, kill it here or it'll go unnoticed. 
+ */ + if (unlikely(blk_queue_dying(rq->q))) { + rq->errors = -EIO; + blk_mq_complete_request(rq); + } + return; + } + if (rq->cmd_flags & REQ_NO_TIMEOUT) return; if (time_after_eq(jiffies, rq->deadline)) { @@ -581,7 +663,7 @@ static void blk_mq_rq_timer(unsigned long priv) * If not software queues are currently mapped to this * hardware queue, there's nothing to check */ - if (!hctx->nr_ctx || !hctx->tags) + if (!blk_mq_hw_queue_mapped(hctx)) continue; blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); @@ -680,6 +762,8 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct request *rq; LIST_HEAD(rq_list); + LIST_HEAD(driver_list); + struct list_head *dptr; int queued; WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); @@ -706,16 +790,27 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) } /* + * Start off with dptr being NULL, so we start the first request + * immediately, even if we have more pending. + */ + dptr = NULL; + + /* * Now process all the entries, sending them to the driver. */ queued = 0; while (!list_empty(&rq_list)) { + struct blk_mq_queue_data bd; int ret; rq = list_first_entry(&rq_list, struct request, queuelist); list_del_init(&rq->queuelist); - ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); + bd.rq = rq; + bd.list = dptr; + bd.last = list_empty(&rq_list); + + ret = q->mq_ops->queue_rq(hctx, &bd); switch (ret) { case BLK_MQ_RQ_QUEUE_OK: queued++; @@ -734,6 +829,13 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) if (ret == BLK_MQ_RQ_QUEUE_BUSY) break; + + /* + * We've done the first request. If we have more than 1 + * left in the list, set dptr to defer issue. + */ + if (!dptr && rq_list.next != rq_list.prev) + dptr = &driver_list; } if (!queued) @@ -760,10 +862,11 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) */ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) { - int cpu = hctx->next_cpu; + if (hctx->queue->nr_hw_queues == 1) + return WORK_CPU_UNBOUND; if (--hctx->next_cpu_batch <= 0) { - int next_cpu; + int cpu = hctx->next_cpu, next_cpu; next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); if (next_cpu >= nr_cpu_ids) @@ -771,26 +874,32 @@ static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) hctx->next_cpu = next_cpu; hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + + return cpu; } - return cpu; + return hctx->next_cpu; } void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) + if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || + !blk_mq_hw_queue_mapped(hctx))) return; - if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) - __blk_mq_run_hw_queue(hctx); - else if (hctx->queue->nr_hw_queues == 1) - kblockd_schedule_delayed_work(&hctx->run_work, 0); - else { - unsigned int cpu; + if (!async) { + int cpu = get_cpu(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); + put_cpu(); + return; + } - cpu = blk_mq_hctx_next_cpu(hctx); - kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); + put_cpu(); } + + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->run_work, 0); } void blk_mq_run_queues(struct request_queue *q, bool async) @@ -804,9 +913,7 @@ void blk_mq_run_queues(struct request_queue *q, bool async) test_bit(BLK_MQ_S_STOPPED, &hctx->state)) continue; - preempt_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); } } EXPORT_SYMBOL(blk_mq_run_queues); @@ -833,9 
+940,7 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); blk_mq_run_hw_queue(hctx, false); - preempt_enable(); } EXPORT_SYMBOL(blk_mq_start_hw_queue); @@ -860,9 +965,7 @@ void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) continue; clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - preempt_disable(); blk_mq_run_hw_queue(hctx, async); - preempt_enable(); } } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); @@ -888,16 +991,11 @@ static void blk_mq_delay_work_fn(struct work_struct *work) void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { - unsigned long tmo = msecs_to_jiffies(msecs); - - if (hctx->queue->nr_hw_queues == 1) - kblockd_schedule_delayed_work(&hctx->delay_work, tmo); - else { - unsigned int cpu; + if (unlikely(!blk_mq_hw_queue_mapped(hctx))) + return; - cpu = blk_mq_hctx_next_cpu(hctx); - kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); - } + kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), + &hctx->delay_work, msecs_to_jiffies(msecs)); } EXPORT_SYMBOL(blk_mq_delay_queue); @@ -1152,7 +1250,17 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) goto run_queue; } - if (is_sync) { + /* + * If the driver supports defer issued based on 'last', then + * queue it up like normal since we can potentially save some + * CPU this way. + */ + if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { + struct blk_mq_queue_data bd = { + .rq = rq, + .list = NULL, + .last = 1 + }; int ret; blk_mq_bio_to_request(rq, bio); @@ -1162,7 +1270,7 @@ static void blk_mq_make_request(struct request_queue *q, struct bio *bio) * error (busy), just add it to our list as we previously * would have done */ - ret = q->mq_ops->queue_rq(data.hctx, rq, true); + ret = q->mq_ops->queue_rq(data.hctx, &bd); if (ret == BLK_MQ_RQ_QUEUE_OK) goto done; else { @@ -1533,10 +1641,8 @@ static void blk_mq_free_hw_queues(struct request_queue *q, struct blk_mq_hw_ctx *hctx; unsigned int i; - queue_for_each_hw_ctx(q, hctx, i) { + queue_for_each_hw_ctx(q, hctx, i) free_cpumask_var(hctx->cpumask); - kfree(hctx); - } } static int blk_mq_init_hctx(struct request_queue *q, @@ -1557,7 +1663,6 @@ static int blk_mq_init_hctx(struct request_queue *q, hctx->queue = q; hctx->queue_num = hctx_idx; hctx->flags = set->flags; - hctx->cmd_size = set->cmd_size; blk_mq_init_cpu_notifier(&hctx->cpu_notifier, blk_mq_hctx_notify, hctx); @@ -1774,16 +1879,6 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) if (!ctx) return ERR_PTR(-ENOMEM); - /* - * If a crashdump is active, then we are potentially in a very - * memory constrained environment. Limit us to 1 queue and - * 64 tags to prevent using too much memory. 
- */ - if (is_kdump_kernel()) { - set->nr_hw_queues = 1; - set->queue_depth = min(64U, set->queue_depth); - } - hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, set->numa_node); @@ -1905,11 +2000,9 @@ void blk_mq_free_queue(struct request_queue *q) percpu_ref_exit(&q->mq_usage_counter); - free_percpu(q->queue_ctx); kfree(q->queue_hw_ctx); kfree(q->mq_map); - q->queue_ctx = NULL; q->queue_hw_ctx = NULL; q->mq_map = NULL; @@ -1921,7 +2014,7 @@ void blk_mq_free_queue(struct request_queue *q) /* Basically redo blk_mq_init_queue with queue frozen */ static void blk_mq_queue_reinit(struct request_queue *q) { - blk_mq_freeze_queue(q); + WARN_ON_ONCE(!q->mq_freeze_depth); blk_mq_sysfs_unregister(q); @@ -1936,8 +2029,6 @@ static void blk_mq_queue_reinit(struct request_queue *q) blk_mq_map_swqueue(q); blk_mq_sysfs_register(q); - - blk_mq_unfreeze_queue(q); } static int blk_mq_queue_reinit_notify(struct notifier_block *nb, @@ -1956,8 +2047,25 @@ static int blk_mq_queue_reinit_notify(struct notifier_block *nb, return NOTIFY_OK; mutex_lock(&all_q_mutex); + + /* + * We need to freeze and reinit all existing queues. Freezing + * involves synchronous wait for an RCU grace period and doing it + * one by one may take a long time. Start freezing all queues in + * one swoop and then wait for the completions so that freezing can + * take place in parallel. + */ + list_for_each_entry(q, &all_q_list, all_q_node) + blk_mq_freeze_queue_start(q); + list_for_each_entry(q, &all_q_list, all_q_node) + blk_mq_freeze_queue_wait(q); + list_for_each_entry(q, &all_q_list, all_q_node) blk_mq_queue_reinit(q); + + list_for_each_entry(q, &all_q_list, all_q_node) + blk_mq_unfreeze_queue(q); + mutex_unlock(&all_q_mutex); return NOTIFY_OK; } @@ -2024,6 +2132,8 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { + BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); + if (!set->nr_hw_queues) return -EINVAL; if (!set->queue_depth) @@ -2040,6 +2150,16 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->queue_depth = BLK_MQ_MAX_DEPTH; } + /* + * If a crashdump is active, then we are potentially in a very + * memory constrained environment. Limit us to 1 queue and + * 64 tags to prevent using too much memory. 
+ */ + if (is_kdump_kernel()) { + set->nr_hw_queues = 1; + set->queue_depth = min(64U, set->queue_depth); + } + set->tags = kmalloc_node(set->nr_hw_queues * sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); diff --git a/block/blk-mq.h b/block/blk-mq.h index d567d5283ffa..4f4f943c22c3 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -32,6 +32,7 @@ void blk_mq_free_queue(struct request_queue *q); void blk_mq_clone_flush_request(struct request *flush_rq, struct request *orig_rq); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); +void blk_mq_wake_waiters(struct request_queue *q); /* * CPU hotplug helpers @@ -115,4 +116,9 @@ static inline void blk_mq_set_alloc_data(struct blk_mq_alloc_data *data, data->hctx = hctx; } +static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) +{ + return hctx->nr_ctx && hctx->tags; +} + #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index aa02247d227e..6ed2cbe5e8c9 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -257,9 +257,7 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_ __func__, max_hw_sectors); } - limits->max_hw_sectors = max_hw_sectors; - limits->max_sectors = min_t(unsigned int, max_hw_sectors, - BLK_DEF_MAX_SECTORS); + limits->max_sectors = limits->max_hw_sectors = max_hw_sectors; } EXPORT_SYMBOL(blk_limits_max_hw_sectors); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 1fac43408911..935ea2aa0730 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -492,17 +492,15 @@ static void blk_free_queue_rcu(struct rcu_head *rcu_head) * Currently, its primary task it to free all the &struct request * structures that were allocated to the queue and the queue itself. * - * Caveat: - * Hopefully the low level driver will have finished any - * outstanding requests first... + * Note: + * The low level driver must have finished any outstanding requests first + * via blk_cleanup_queue(). 
**/ static void blk_release_queue(struct kobject *kobj) { struct request_queue *q = container_of(kobj, struct request_queue, kobj); - blk_sync_queue(q); - blkcg_exit_queue(q); if (q->elevator) { @@ -517,9 +515,7 @@ static void blk_release_queue(struct kobject *kobj) if (q->queue_tags) __blk_queue_free_tags(q); - if (q->mq_ops) - blk_mq_free_queue(q); - else + if (!q->mq_ops) blk_free_flush_queue(q->fq); blk_trace_shutdown(q); diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 56c025894cdf..246dfb16c3d9 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -190,6 +190,9 @@ void blk_add_timer(struct request *req) struct request_queue *q = req->q; unsigned long expiry; + if (req->cmd_flags & REQ_NO_TIMEOUT) + return; + /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; diff --git a/block/elevator.c b/block/elevator.c index 24c28b659bb3..59794d0d38e3 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -229,7 +229,9 @@ int elevator_init(struct request_queue *q, char *name) } err = e->ops.elevator_init_fn(q, e); - return 0; + if (err) + elevator_put(e); + return err; } EXPORT_SYMBOL(elevator_init); @@ -537,7 +539,7 @@ void elv_bio_merged(struct request_queue *q, struct request *rq, e->type->ops.elevator_bio_merged_fn(q, rq, bio); } -#ifdef CONFIG_PM_RUNTIME +#ifdef CONFIG_PM static void blk_pm_requeue_request(struct request *rq) { if (rq->q->dev && !(rq->cmd_flags & REQ_PM)) diff --git a/block/genhd.c b/block/genhd.c index bd3060684ab2..0a536dc05f3b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1070,9 +1070,16 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno) struct disk_part_tbl *old_ptbl = disk->part_tbl; struct disk_part_tbl *new_ptbl; int len = old_ptbl ? old_ptbl->len : 0; - int target = partno + 1; + int i, target; size_t size; - int i; + + /* + * check for int overflow, since we can get here from blkpg_ioctl() + * with a user passed 'partno'. 
+ */ + target = partno + 1; + if (target < 0) + return -EINVAL; /* disk_max_parts() is zero during initialization, ignore if so */ if (disk_max_parts(disk) && target > disk_max_parts(disk)) diff --git a/block/ioprio.c b/block/ioprio.c index e50170ca7c33..31666c92b46a 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -157,14 +157,16 @@ out: int ioprio_best(unsigned short aprio, unsigned short bprio) { - unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); - unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); + unsigned short aclass; + unsigned short bclass; - if (aclass == IOPRIO_CLASS_NONE) - aclass = IOPRIO_CLASS_BE; - if (bclass == IOPRIO_CLASS_NONE) - bclass = IOPRIO_CLASS_BE; + if (!ioprio_valid(aprio)) + aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + if (!ioprio_valid(bprio)) + bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + aclass = IOPRIO_PRIO_CLASS(aprio); + bclass = IOPRIO_PRIO_CLASS(bprio); if (aclass == bclass) return min(aprio, bprio); if (aclass > bclass) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index abb2e65b24cc..28163fad3c5d 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -142,7 +142,7 @@ static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) __set_bit(GPCMD_VERIFY_10, filter->read_ok); __set_bit(VERIFY_16, filter->read_ok); __set_bit(REPORT_LUNS, filter->read_ok); - __set_bit(SERVICE_ACTION_IN, filter->read_ok); + __set_bit(SERVICE_ACTION_IN_16, filter->read_ok); __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); __set_bit(MAINTENANCE_IN, filter->read_ok); __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); @@ -458,7 +458,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); if (IS_ERR(rq)) { err = PTR_ERR(rq); - goto error; + goto error_free_buffer; } blk_rq_set_block_pc(rq); @@ -508,7 +508,7 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { err = DRIVER_ERROR << 24; - goto out; + goto error; } memset(sense, 0, sizeof(sense)); @@ -517,7 +517,6 @@ int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, blk_execute_rq(q, disk, rq, 0); -out: err = rq->errors & 0xff; /* only 8 bit SCSI status */ if (err) { if (rq->sense_len && rq->sense) { @@ -532,9 +531,11 @@ out: } error: + blk_put_request(rq); + +error_free_buffer: kfree(buffer); - if (rq) - blk_put_request(rq); + return err; } EXPORT_SYMBOL_GPL(sg_scsi_ioctl);
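
The generic_start_io_acct()/generic_end_io_acct() pair exported from bio.c above gives bio-based drivers the same per-partition accounting (ios, sectors, ticks, in-flight) that the request-based path maintains. A minimal sketch of how a bio-based driver might call the pair; "struct my_dev", the queuedata wiring, and my_handle_bio() are hypothetical, and the I/O is assumed to complete synchronously so both calls can sit in one function:

static void my_make_request(struct request_queue *q, struct bio *bio)
{
	struct my_dev *dev = q->queuedata;	/* hypothetical driver state */
	int rw = bio_data_dir(bio);
	unsigned long start_time = jiffies;

	generic_start_io_acct(rw, bio_sectors(bio), &dev->disk->part0);
	my_handle_bio(dev, bio);		/* hypothetical: performs the I/O */
	generic_end_io_acct(rw, &dev->disk->part0, start_time);
}

A real driver would normally stash start_time in its per-bio state and call generic_end_io_acct() from the completion path instead.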
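
The cpu_to_queue_index() change in blk-mq-cpumap.c swaps the ceiling-divisor mapping for cpu * nr_queues / nr_cpus. The effect is easiest to see numerically: with the old formula, high-numbered hardware queues could end up with no CPUs mapped to them at all. A standalone comparison, with 4 CPUs and 3 queues chosen purely for illustration:

#include <stdio.h>

static int old_map(int cpu, int nr_cpus, int nr_queues)
{
	return cpu / ((nr_cpus + nr_queues - 1) / nr_queues);
}

static int new_map(int cpu, int nr_cpus, int nr_queues)
{
	return cpu * nr_queues / nr_cpus;
}

int main(void)
{
	int nr_cpus = 4, nr_queues = 3, cpu;

	for (cpu = 0; cpu < nr_cpus; cpu++)
		printf("cpu%d: old -> q%d, new -> q%d\n", cpu,
		       old_map(cpu, nr_cpus, nr_queues),
		       new_map(cpu, nr_cpus, nr_queues));
	/* old: q0 q0 q1 q1 (queue 2 never used); new: q0 q0 q1 q2 */
	return 0;
}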
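
The __bt_get_word() rework latches a wrap flag from last_tag != 0 and bounds the wrapped pass with the saved org_last_tag, so a scan that started mid-word wraps exactly once instead of potentially restarting over and over. A single-threaded, standalone sketch of the fixed search; the kernel version operates on a struct blk_align_bitmap word and uses find_next_zero_bit() plus test_and_set_bit() to cope with concurrent allocators:

#include <stdio.h>

static int find_free_tag(unsigned long *word, int depth, int last_tag)
{
	int tag, org_last_tag = last_tag;
	int end = depth;
	int wrap = last_tag != 0;	/* wrap at most once */

restart:
	for (tag = last_tag; tag < end; tag++) {
		if (!(*word & (1UL << tag))) {
			*word |= 1UL << tag;	/* test_and_set_bit() in the kernel */
			return tag;
		}
	}
	if (wrap) {
		wrap = 0;
		end = org_last_tag;	/* the fix: bound by org_last_tag, not last_tag */
		last_tag = 0;
		goto restart;
	}
	return -1;
}

int main(void)
{
	unsigned long word = 0x0fUL;	/* tags 0-3 busy */

	printf("tag = %d\n", find_free_tag(&word, 8, 2));	/* finds tag 4 */
	return 0;
}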
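
The kernel-doc above blk_mq_unique_tag() spells out the encoding: hardware queue index in the upper bits, per-queue tag in the low BLK_MQ_UNIQUE_TAG_BITS bits. A sketch of the matching decode; the 16-bit split is an assumption taken from the BLK_MQ_UNIQUE_TAG_BITS/MASK constants this series adds to blk-mq.h, not something visible in the diff itself:

/* assumed values, mirroring the blk-mq.h constants referenced above */
#define BLK_MQ_UNIQUE_TAG_BITS	16
#define BLK_MQ_UNIQUE_TAG_MASK	((1U << BLK_MQ_UNIQUE_TAG_BITS) - 1)

static inline unsigned int unique_tag_to_hwq(unsigned int unique_tag)
{
	return unique_tag >> BLK_MQ_UNIQUE_TAG_BITS;
}

static inline unsigned int unique_tag_to_tag(unsigned int unique_tag)
{
	return unique_tag & BLK_MQ_UNIQUE_TAG_MASK;
}

/* e.g. unique_tag_to_hwq(0x0003002a) == 3, unique_tag_to_tag(0x0003002a) == 0x2a */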
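
Both queue_rq() call sites above now hand the driver a struct blk_mq_queue_data instead of a bare request, so the driver sees the request, a "last" hint for doorbell batching, and (with BLK_MQ_F_DEFER_ISSUE) the deferral list. A hedged sketch of the driver side of the new convention; my_queue_cmd() and my_ring_doorbell() are hypothetical:

static int my_queue_rq(struct blk_mq_hw_ctx *hctx,
		       const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);

	if (my_queue_cmd(hctx->driver_data, rq))	/* hypothetical submit */
		return BLK_MQ_RQ_QUEUE_BUSY;

	if (bd->last)
		my_ring_doorbell(hctx->driver_data);	/* flush batched submissions */

	return BLK_MQ_RQ_QUEUE_OK;
}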
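
The ioprio_best() fix normalizes invalid (IOPRIO_CLASS_NONE) inputs to a complete best-effort value before the classes are extracted; previously only the local class variable was patched up, so min(aprio, bprio) could still hand back the raw CLASS_NONE value. A standalone demo; the macros are mirrored from include/linux/ioprio.h so it compiles on its own:

#include <stdio.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_PRIO_CLASS(mask)		((mask) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_CLASS_NONE	0
#define IOPRIO_CLASS_BE		2
#define IOPRIO_NORM		4
#define ioprio_valid(mask)	(IOPRIO_PRIO_CLASS(mask) != IOPRIO_CLASS_NONE)

static unsigned short ioprio_best(unsigned short aprio, unsigned short bprio)
{
	unsigned short aclass, bclass;

	if (!ioprio_valid(aprio))
		aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
	if (!ioprio_valid(bprio))
		bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);

	aclass = IOPRIO_PRIO_CLASS(aprio);
	bclass = IOPRIO_PRIO_CLASS(bprio);
	if (aclass == bclass)
		return aprio < bprio ? aprio : bprio;
	return aclass > bclass ? bprio : aprio;
}

int main(void)
{
	unsigned short a = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
	unsigned short b = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 7);

	/* old code: min(0x0000, 0x4007) == 0, an invalid ioprio;
	 * the fixed version returns the normalized 0x4004 */
	printf("best = 0x%x\n", ioprio_best(a, b));
	return 0;
}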