diff options
Diffstat (limited to 'block')
55 files changed, 1946 insertions, 10228 deletions
diff --git a/block/Kconfig b/block/Kconfig index f7045aa47edb..8044452a4fd3 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -155,12 +155,6 @@ config BLK_CGROUP_IOLATENCY Note, this is an experimental interface and could be changed someday. -config BLK_WBT_SQ - bool "Single queue writeback throttling" - depends on BLK_WBT - ---help--- - Enable writeback throttling by default on legacy single queue devices - config BLK_WBT_MQ bool "Multiqueue writeback throttling" default y diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index f95a48b0d7b2..4626b88b2d5a 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -3,67 +3,6 @@ if BLOCK menu "IO Schedulers" -config IOSCHED_NOOP - bool - default y - ---help--- - The no-op I/O scheduler is a minimal scheduler that does basic merging - and sorting. Its main uses include non-disk based block devices like - memory devices, and specialised software or hardware environments - that do their own scheduling and require only minimal assistance from - the kernel. - -config IOSCHED_DEADLINE - tristate "Deadline I/O scheduler" - default y - ---help--- - The deadline I/O scheduler is simple and compact. It will provide - CSCAN service with FIFO expiration of requests, switching to - a new point in the service tree and doing a batch of IO from there - in case of expiry. - -config IOSCHED_CFQ - tristate "CFQ I/O scheduler" - default y - ---help--- - The CFQ I/O scheduler tries to distribute bandwidth equally - among all processes in the system. It should provide a fair - and low latency working environment, suitable for both desktop - and server systems. - - This is the default I/O scheduler. - -config CFQ_GROUP_IOSCHED - bool "CFQ Group Scheduling support" - depends on IOSCHED_CFQ && BLK_CGROUP - ---help--- - Enable group IO scheduling in CFQ. - -choice - - prompt "Default I/O scheduler" - default DEFAULT_CFQ - help - Select the I/O scheduler which will be used by default for all - block devices. - - config DEFAULT_DEADLINE - bool "Deadline" if IOSCHED_DEADLINE=y - - config DEFAULT_CFQ - bool "CFQ" if IOSCHED_CFQ=y - - config DEFAULT_NOOP - bool "No-op" - -endchoice - -config DEFAULT_IOSCHED - string - default "deadline" if DEFAULT_DEADLINE - default "cfq" if DEFAULT_CFQ - default "noop" if DEFAULT_NOOP - config MQ_IOSCHED_DEADLINE tristate "MQ deadline I/O scheduler" default y diff --git a/block/Makefile b/block/Makefile index 27eac600474f..eee1b4ceecf9 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,7 +3,7 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ @@ -18,9 +18,6 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o -obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o -obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o -obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 9fe5952d117d..c6113af31960 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -334,7 +334,7 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) parent = bfqg_parent(bfqg); - lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); + lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); if (unlikely(!parent)) return; @@ -642,7 +642,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) uint64_t serial_nr; rcu_read_lock(); - serial_nr = bio_blkcg(bio)->css.serial_nr; + serial_nr = __bio_blkcg(bio)->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger @@ -651,7 +651,7 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) goto out; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); + bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3a27d31fcda6..cd307767a134 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -399,9 +399,9 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, unsigned long flags; struct bfq_io_cq *icq; - spin_lock_irqsave(q->queue_lock, flags); + spin_lock_irqsave(&q->queue_lock, flags); icq = icq_to_bic(ioc_lookup_icq(ioc, q)); - spin_unlock_irqrestore(q->queue_lock, flags); + spin_unlock_irqrestore(&q->queue_lock, flags); return icq; } @@ -638,7 +638,7 @@ static bool bfq_varied_queue_weights_or_active_groups(struct bfq_data *bfqd) bfqd->queue_weights_tree.rb_node->rb_right) #ifdef CONFIG_BFQ_GROUP_IOSCHED ) || - (bfqd->num_active_groups > 0 + (bfqd->num_groups_with_pending_reqs > 0 #endif ); } @@ -802,7 +802,21 @@ void bfq_weights_tree_remove(struct bfq_data *bfqd, */ break; } - bfqd->num_active_groups--; + + /* + * The decrement of num_groups_with_pending_reqs is + * not performed immediately upon the deactivation of + * entity, but it is delayed to when it also happens + * that the first leaf descendant bfqq of entity gets + * all its pending requests completed. The following + * instructions perform this delayed decrement, if + * needed. See the comments on + * num_groups_with_pending_reqs for details. + */ + if (entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = false; + bfqd->num_groups_with_pending_reqs--; + } } } @@ -3529,27 +3543,44 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * fact, if there are active groups, then, for condition (i) * to become false, it is enough that an active group contains * more active processes or sub-groups than some other active - * group. We address this issue with the following bi-modal - * behavior, implemented in the function + * group. More precisely, for condition (i) to hold because of + * such a group, it is not even necessary that the group is + * (still) active: it is sufficient that, even if the group + * has become inactive, some of its descendant processes still + * have some request already dispatched but still waiting for + * completion. In fact, requests have still to be guaranteed + * their share of the throughput even after being + * dispatched. In this respect, it is easy to show that, if a + * group frequently becomes inactive while still having + * in-flight requests, and if, when this happens, the group is + * not considered in the calculation of whether the scenario + * is asymmetric, then the group may fail to be guaranteed its + * fair share of the throughput (basically because idling may + * not be performed for the descendant processes of the group, + * but it had to be). We address this issue with the + * following bi-modal behavior, implemented in the function * bfq_symmetric_scenario(). * - * If there are active groups, then the scenario is tagged as + * If there are groups with requests waiting for completion + * (as commented above, some of these groups may even be + * already inactive), then the scenario is tagged as * asymmetric, conservatively, without checking any of the * conditions (i) and (ii). So the device is idled for bfqq. * This behavior matches also the fact that groups are created - * exactly if controlling I/O (to preserve bandwidth and - * latency guarantees) is a primary concern. + * exactly if controlling I/O is a primary concern (to + * preserve bandwidth and latency guarantees). * - * On the opposite end, if there are no active groups, then - * only condition (i) is actually controlled, i.e., provided - * that condition (i) holds, idling is not performed, - * regardless of whether condition (ii) holds. In other words, - * only if condition (i) does not hold, then idling is - * allowed, and the device tends to be prevented from queueing - * many requests, possibly of several processes. Since there - * are no active groups, then, to control condition (i) it is - * enough to check whether all active queues have the same - * weight. + * On the opposite end, if there are no groups with requests + * waiting for completion, then only condition (i) is actually + * controlled, i.e., provided that condition (i) holds, idling + * is not performed, regardless of whether condition (ii) + * holds. In other words, only if condition (i) does not hold, + * then idling is allowed, and the device tends to be + * prevented from queueing many requests, possibly of several + * processes. Since there are no groups with requests waiting + * for completion, then, to control condition (i) it is enough + * to check just whether all the queues with requests waiting + * for completion also have the same weight. * * Not checking condition (ii) evidently exposes bfqq to the * risk of getting less throughput than its fair share. @@ -3607,10 +3638,11 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq) * bfqq is weight-raised is checked explicitly here. More * precisely, the compound condition below takes into account * also the fact that, even if bfqq is being weight-raised, - * the scenario is still symmetric if all active queues happen - * to be weight-raised. Actually, we should be even more - * precise here, and differentiate between interactive weight - * raising and soft real-time weight raising. + * the scenario is still symmetric if all queues with requests + * waiting for completion happen to be + * weight-raised. Actually, we should be even more precise + * here, and differentiate between interactive weight raising + * and soft real-time weight raising. * * As a side note, it is worth considering that the above * device-idling countermeasures may however fail in the @@ -4034,7 +4066,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q, * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. */ - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (idle_timer_disabled) /* * Since the idle timer has been disabled, @@ -4053,7 +4085,7 @@ static void bfq_update_dispatch_stats(struct request_queue *q, bfqg_stats_set_start_empty_time(bfqg); bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } #else static inline void bfq_update_dispatch_stats(struct request_queue *q, @@ -4384,7 +4416,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, rcu_read_lock(); - bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); + bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); if (!bfqg) { bfqq = &bfqd->oom_bfqq; goto out; @@ -4637,11 +4669,11 @@ static void bfq_update_insert_stats(struct request_queue *q, * In addition, the following queue lock guarantees that * bfqq_group(bfqq) exists as well. */ - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); if (idle_timer_disabled) bfqg_stats_update_idle_time(bfqq_group(bfqq)); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } #else static inline void bfq_update_insert_stats(struct request_queue *q, @@ -5382,9 +5414,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) } eq->elevator_data = bfqd; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); q->elevator = eq; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); /* * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. @@ -5417,7 +5449,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->idle_slice_timer.function = bfq_idle_slice_timer; bfqd->queue_weights_tree = RB_ROOT; - bfqd->num_active_groups = 0; + bfqd->num_groups_with_pending_reqs = 0; INIT_LIST_HEAD(&bfqd->active_list); INIT_LIST_HEAD(&bfqd->idle_list); @@ -5724,7 +5756,7 @@ static struct elv_fs_entry bfq_attrs[] = { }; static struct elevator_type iosched_bfq_mq = { - .ops.mq = { + .ops = { .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .requeue_request = bfq_finish_requeue_request, @@ -5745,7 +5777,6 @@ static struct elevator_type iosched_bfq_mq = { .exit_sched = bfq_exit_queue, }, - .uses_mq = true, .icq_size = sizeof(struct bfq_io_cq), .icq_align = __alignof__(struct bfq_io_cq), .elevator_attrs = bfq_attrs, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index 77651d817ecd..0b02bf302de0 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -196,6 +196,9 @@ struct bfq_entity { /* flag, set to request a weight, ioprio or ioprio_class change */ int prio_changed; + + /* flag, set if the entity is counted in groups_with_pending_reqs */ + bool in_groups_with_pending_reqs; }; struct bfq_group; @@ -448,10 +451,54 @@ struct bfq_data { * bfq_weights_tree_[add|remove] for further details). */ struct rb_root queue_weights_tree; + /* - * number of groups with requests still waiting for completion + * Number of groups with at least one descendant process that + * has at least one request waiting for completion. Note that + * this accounts for also requests already dispatched, but not + * yet completed. Therefore this number of groups may differ + * (be larger) than the number of active groups, as a group is + * considered active only if its corresponding entity has + * descendant queues with at least one request queued. This + * number is used to decide whether a scenario is symmetric. + * For a detailed explanation see comments on the computation + * of the variable asymmetric_scenario in the function + * bfq_better_to_idle(). + * + * However, it is hard to compute this number exactly, for + * groups with multiple descendant processes. Consider a group + * that is inactive, i.e., that has no descendant process with + * pending I/O inside BFQ queues. Then suppose that + * num_groups_with_pending_reqs is still accounting for this + * group, because the group has descendant processes with some + * I/O request still in flight. num_groups_with_pending_reqs + * should be decremented when the in-flight request of the + * last descendant process is finally completed (assuming that + * nothing else has changed for the group in the meantime, in + * terms of composition of the group and active/inactive state of child + * groups and processes). To accomplish this, an additional + * pending-request counter must be added to entities, and must + * be updated correctly. To avoid this additional field and operations, + * we resort to the following tradeoff between simplicity and + * accuracy: for an inactive group that is still counted in + * num_groups_with_pending_reqs, we decrement + * num_groups_with_pending_reqs when the first descendant + * process of the group remains with no request waiting for + * completion. + * + * Even this simpler decrement strategy requires a little + * carefulness: to avoid multiple decrements, we flag a group, + * more precisely an entity representing a group, as still + * counted in num_groups_with_pending_reqs when it becomes + * inactive. Then, when the first descendant queue of the + * entity remains with no request waiting for completion, + * num_groups_with_pending_reqs is decremented, and this flag + * is reset. After this flag is reset for the entity, + * num_groups_with_pending_reqs won't be decremented any + * longer in case a new descendant queue of the entity remains + * with no request waiting for completion. */ - unsigned int num_active_groups; + unsigned int num_groups_with_pending_reqs; /* * Number of bfq_queues containing requests (including the diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index 4b0d5fb69160..63e0f12be7c9 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -1012,7 +1012,10 @@ static void __bfq_activate_entity(struct bfq_entity *entity, container_of(entity, struct bfq_group, entity); struct bfq_data *bfqd = bfqg->bfqd; - bfqd->num_active_groups++; + if (!entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = true; + bfqd->num_groups_with_pending_reqs++; + } } #endif diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 290af497997b..1b633a3526d4 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -390,7 +390,6 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) bip->bip_iter.bi_sector += bytes_done >> 9; bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } -EXPORT_SYMBOL(bio_integrity_advance); /** * bio_integrity_trim - Trim integrity vector @@ -460,7 +459,6 @@ void bioset_integrity_free(struct bio_set *bs) mempool_exit(&bs->bio_integrity_pool); mempool_exit(&bs->bvec_integrity_pool); } -EXPORT_SYMBOL(bioset_integrity_free); void __init bio_integrity_init(void) { diff --git a/block/bio.c b/block/bio.c index d5368a445561..8281bfcbc265 100644 --- a/block/bio.c +++ b/block/bio.c @@ -244,7 +244,7 @@ fallback: void bio_uninit(struct bio *bio) { - bio_disassociate_task(bio); + bio_disassociate_blkg(bio); } EXPORT_SYMBOL(bio_uninit); @@ -571,14 +571,13 @@ void bio_put(struct bio *bio) } EXPORT_SYMBOL(bio_put); -inline int bio_phys_segments(struct request_queue *q, struct bio *bio) +int bio_phys_segments(struct request_queue *q, struct bio *bio) { if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) blk_recount_segments(q, bio); return bio->bi_phys_segments; } -EXPORT_SYMBOL(bio_phys_segments); /** * __bio_clone_fast - clone a bio that shares the original bio's biovec @@ -605,11 +604,13 @@ void __bio_clone_fast(struct bio *bio, struct bio *bio_src) if (bio_flagged(bio_src, BIO_THROTTLED)) bio_set_flag(bio, BIO_THROTTLED); bio->bi_opf = bio_src->bi_opf; + bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); } EXPORT_SYMBOL(__bio_clone_fast); @@ -900,7 +901,6 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) return 0; } -EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); static void submit_bio_wait_endio(struct bio *bio) { @@ -1260,6 +1260,8 @@ struct bio *bio_copy_user_iov(struct request_queue *q, if (ret) goto cleanup; } else { + if (bmd->is_our_pages) + zero_fill_bio(bio); iov_iter_advance(iter, bio->bi_iter.bi_size); } @@ -1589,7 +1591,6 @@ void bio_set_pages_dirty(struct bio *bio) set_page_dirty_lock(bvec->bv_page); } } -EXPORT_SYMBOL_GPL(bio_set_pages_dirty); static void bio_release_pages(struct bio *bio) { @@ -1659,17 +1660,33 @@ defer: spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } -EXPORT_SYMBOL_GPL(bio_check_pages_dirty); + +void update_io_ticks(struct hd_struct *part, unsigned long now) +{ + unsigned long stamp; +again: + stamp = READ_ONCE(part->stamp); + if (unlikely(stamp != now)) { + if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) { + __part_stat_add(part, io_ticks, 1); + } + } + if (part->partno) { + part = &part_to_disk(part)->part0; + goto again; + } +} void generic_start_io_acct(struct request_queue *q, int op, unsigned long sectors, struct hd_struct *part) { const int sgrp = op_stat_group(op); - int cpu = part_stat_lock(); - part_round_stats(q, cpu, part); - part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, sectors[sgrp], sectors); + part_stat_lock(); + + update_io_ticks(part, jiffies); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, sectors[sgrp], sectors); part_inc_in_flight(q, part, op_is_write(op)); part_stat_unlock(); @@ -1679,12 +1696,15 @@ EXPORT_SYMBOL(generic_start_io_acct); void generic_end_io_acct(struct request_queue *q, int req_op, struct hd_struct *part, unsigned long start_time) { - unsigned long duration = jiffies - start_time; + unsigned long now = jiffies; + unsigned long duration = now - start_time; const int sgrp = op_stat_group(req_op); - int cpu = part_stat_lock(); - part_stat_add(cpu, part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_round_stats(q, cpu, part); + part_stat_lock(); + + update_io_ticks(part, now); + part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_stat_add(part, time_in_queue, duration); part_dec_in_flight(q, part, op_is_write(req_op)); part_stat_unlock(); @@ -1954,102 +1974,133 @@ EXPORT_SYMBOL(bioset_init_from_src); #ifdef CONFIG_BLK_CGROUP -#ifdef CONFIG_MEMCG /** - * bio_associate_blkcg_from_page - associate a bio with the page's blkcg + * bio_disassociate_blkg - puts back the blkg reference if associated * @bio: target bio - * @page: the page to lookup the blkcg from * - * Associate @bio with the blkcg from @page's owning memcg. This works like - * every other associate function wrt references. + * Helper to disassociate the blkg from @bio if a blkg is associated. */ -int bio_associate_blkcg_from_page(struct bio *bio, struct page *page) +void bio_disassociate_blkg(struct bio *bio) { - struct cgroup_subsys_state *blkcg_css; - - if (unlikely(bio->bi_css)) - return -EBUSY; - if (!page->mem_cgroup) - return 0; - blkcg_css = cgroup_get_e_css(page->mem_cgroup->css.cgroup, - &io_cgrp_subsys); - bio->bi_css = blkcg_css; - return 0; + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } } -#endif /* CONFIG_MEMCG */ +EXPORT_SYMBOL_GPL(bio_disassociate_blkg); /** - * bio_associate_blkcg - associate a bio with the specified blkcg + * __bio_associate_blkg - associate a bio with the a blkg * @bio: target bio - * @blkcg_css: css of the blkcg to associate + * @blkg: the blkg to associate * - * Associate @bio with the blkcg specified by @blkcg_css. Block layer will - * treat @bio as if it were issued by a task which belongs to the blkcg. + * This tries to associate @bio with the specified @blkg. Association failure + * is handled by walking up the blkg tree. Therefore, the blkg associated can + * be anything between @blkg and the root_blkg. This situation only happens + * when a cgroup is dying and then the remaining bios will spill to the closest + * alive blkg. * - * This function takes an extra reference of @blkcg_css which will be put - * when @bio is released. The caller must own @bio and is responsible for - * synchronizing calls to this function. + * A reference will be taken on the @blkg and will be released when @bio is + * freed. */ -int bio_associate_blkcg(struct bio *bio, struct cgroup_subsys_state *blkcg_css) +static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) { - if (unlikely(bio->bi_css)) - return -EBUSY; - css_get(blkcg_css); - bio->bi_css = blkcg_css; - return 0; + bio_disassociate_blkg(bio); + + bio->bi_blkg = blkg_tryget_closest(blkg); } -EXPORT_SYMBOL_GPL(bio_associate_blkcg); /** - * bio_associate_blkg - associate a bio with the specified blkg + * bio_associate_blkg_from_css - associate a bio with a specified css * @bio: target bio - * @blkg: the blkg to associate + * @css: target css * - * Associate @bio with the blkg specified by @blkg. This is the queue specific - * blkcg information associated with the @bio, a reference will be taken on the - * @blkg and will be freed when the bio is freed. + * Associate @bio with the blkg found by combining the css's blkg and the + * request_queue of the @bio. This falls back to the queue's root_blkg if + * the association fails with the css. */ -int bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) { - if (unlikely(bio->bi_blkg)) - return -EBUSY; - if (!blkg_try_get(blkg)) - return -ENODEV; - bio->bi_blkg = blkg; - return 0; + struct request_queue *q = bio->bi_disk->queue; + struct blkcg_gq *blkg; + + rcu_read_lock(); + + if (!css || !css->parent) + blkg = q->root_blkg; + else + blkg = blkg_lookup_create(css_to_blkcg(css), q); + + __bio_associate_blkg(bio, blkg); + + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); +#ifdef CONFIG_MEMCG /** - * bio_disassociate_task - undo bio_associate_current() + * bio_associate_blkg_from_page - associate a bio with the page's blkg * @bio: target bio + * @page: the page to lookup the blkcg from + * + * Associate @bio with the blkg from @page's owning memcg and the respective + * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's + * root_blkg. */ -void bio_disassociate_task(struct bio *bio) +void bio_associate_blkg_from_page(struct bio *bio, struct page *page) { - if (bio->bi_ioc) { - put_io_context(bio->bi_ioc); - bio->bi_ioc = NULL; - } - if (bio->bi_css) { - css_put(bio->bi_css); - bio->bi_css = NULL; - } - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } + struct cgroup_subsys_state *css; + + if (!page->mem_cgroup) + return; + + rcu_read_lock(); + + css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); +} +#endif /* CONFIG_MEMCG */ + +/** + * bio_associate_blkg - associate a bio with a blkg + * @bio: target bio + * + * Associate @bio with the blkg found from the bio's css and request_queue. + * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is + * already associated, the css is reused and association redone as the + * request_queue may have changed. + */ +void bio_associate_blkg(struct bio *bio) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + if (bio->bi_blkg) + css = &bio_blkcg(bio)->css; + else + css = blkcg_css(); + + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(bio_associate_blkg); /** - * bio_clone_blkcg_association - clone blkcg association from src to dst bio + * bio_clone_blkg_association - clone blkg association from src to dst bio * @dst: destination bio * @src: source bio */ -void bio_clone_blkcg_association(struct bio *dst, struct bio *src) +void bio_clone_blkg_association(struct bio *dst, struct bio *src) { - if (src->bi_css) - WARN_ON(bio_associate_blkcg(dst, src->bi_css)); + if (src->bi_blkg) + __bio_associate_blkg(dst, src->bi_blkg); } -EXPORT_SYMBOL_GPL(bio_clone_blkcg_association); +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); #endif /* CONFIG_BLK_CGROUP */ static void __init biovec_init_slabs(void) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index c630e02836a8..c8cc1cbb6370 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -76,14 +76,42 @@ static void blkg_free(struct blkcg_gq *blkg) if (blkg->pd[i]) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - if (blkg->blkcg != &blkcg_root) - blk_exit_rl(blkg->q, &blkg->rl); - blkg_rwstat_exit(&blkg->stat_ios); blkg_rwstat_exit(&blkg->stat_bytes); kfree(blkg); } +static void __blkg_release(struct rcu_head *rcu) +{ + struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + + percpu_ref_exit(&blkg->refcnt); + + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); + + wb_congested_put(blkg->wb_congested); + + blkg_free(blkg); +} + +/* + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. + * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. + */ +static void blkg_release(struct percpu_ref *ref) +{ + struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); + + call_rcu(&blkg->rcu_head, __blkg_release); +} + /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with @@ -110,14 +138,6 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, blkg->q = q; INIT_LIST_HEAD(&blkg->q_node); blkg->blkcg = blkcg; - atomic_set(&blkg->refcnt, 1); - - /* root blkg uses @q->root_rl, init rl only for !root blkgs */ - if (blkcg != &blkcg_root) { - if (blk_init_rl(&blkg->rl, q, gfp_mask)) - goto err_free; - blkg->rl.blkg = blkg; - } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -157,7 +177,7 @@ struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); if (blkg && blkg->q == q) { if (update_hint) { - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); rcu_assign_pointer(blkcg->blkg_hint, blkg); } return blkg; @@ -180,7 +200,13 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, int i, ret; WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); + + /* request_queue is dying, do not create/recreate a blkg */ + if (blk_queue_dying(q)) { + ret = -ENODEV; + goto err_free_blkg; + } /* blkg holds a reference to blkcg */ if (!css_tryget_online(&blkcg->css)) { @@ -217,6 +243,11 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_get(blkg->parent); } + ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0, + GFP_NOWAIT | __GFP_NOWARN); + if (ret) + goto err_cancel_ref; + /* invoke per-policy init */ for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -249,6 +280,8 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); +err_cancel_ref: + percpu_ref_exit(&blkg->refcnt); err_put_congested: wb_congested_put(wb_congested); err_put_css: @@ -259,7 +292,7 @@ err_free_blkg: } /** - * blkg_lookup_create - lookup blkg, try to create one if not there + * __blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest * @q: request_queue of interest * @@ -268,24 +301,16 @@ err_free_blkg: * that all non-root blkg's have access to the parent blkg. This function * should be called under RCU read lock and @q->queue_lock. * - * Returns pointer to the looked up or created blkg on success, ERR_PTR() - * value on error. If @q is dead, returns ERR_PTR(-EINVAL). If @q is not - * dead and bypassing, returns ERR_PTR(-EBUSY). + * Returns the blkg or the closest blkg if blkg_create() fails as it walks + * down from root. */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) { struct blkcg_gq *blkg; WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(q->queue_lock); - - /* - * This could be the first entry point of blkcg implementation and - * we shouldn't allow anything to go through for a bypassing queue. - */ - if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); + lockdep_assert_held(&q->queue_lock); blkg = __blkg_lookup(blkcg, q, true); if (blkg) @@ -293,30 +318,64 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, /* * Create blkgs walking down from blkcg_root to @blkcg, so that all - * non-root blkgs have access to their parents. + * non-root blkgs have access to their parents. Returns the closest + * blkg to the intended blkg should blkg_create() fail. */ while (true) { struct blkcg *pos = blkcg; struct blkcg *parent = blkcg_parent(blkcg); - - while (parent && !__blkg_lookup(parent, q, false)) { + struct blkcg_gq *ret_blkg = q->root_blkg; + + while (parent) { + blkg = __blkg_lookup(parent, q, false); + if (blkg) { + /* remember closest blkg */ + ret_blkg = blkg; + break; + } pos = parent; parent = blkcg_parent(parent); } blkg = blkg_create(pos, q, NULL); - if (pos == blkcg || IS_ERR(blkg)) + if (IS_ERR(blkg)) + return ret_blkg; + if (pos == blkcg) return blkg; } } +/** + * blkg_lookup_create - find or create a blkg + * @blkcg: target block cgroup + * @q: target request_queue + * + * This looks up or creates the blkg representing the unique pair + * of the blkcg and the request_queue. + */ +struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) +{ + struct blkcg_gq *blkg = blkg_lookup(blkcg, q); + + if (unlikely(!blkg)) { + unsigned long flags; + + spin_lock_irqsave(&q->queue_lock, flags); + blkg = __blkg_lookup_create(blkcg, q); + spin_unlock_irqrestore(&q->queue_lock, flags); + } + + return blkg; +} + static void blkg_destroy(struct blkcg_gq *blkg) { struct blkcg *blkcg = blkg->blkcg; struct blkcg_gq *parent = blkg->parent; int i; - lockdep_assert_held(blkg->q->queue_lock); + lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); /* Something wrong if we are trying to remove same group twice */ @@ -353,7 +412,7 @@ static void blkg_destroy(struct blkcg_gq *blkg) * Put the reference taken at the time of creation so that when all * queues are gone, group can be destroyed. */ - blkg_put(blkg); + percpu_ref_kill(&blkg->refcnt); } /** @@ -366,8 +425,7 @@ static void blkg_destroy_all(struct request_queue *q) { struct blkcg_gq *blkg, *n; - lockdep_assert_held(q->queue_lock); - + spin_lock_irq(&q->queue_lock); list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; @@ -377,7 +435,7 @@ static void blkg_destroy_all(struct request_queue *q) } q->root_blkg = NULL; - q->root_rl.blkg = NULL; + spin_unlock_irq(&q->queue_lock); } /* @@ -403,41 +461,6 @@ void __blkg_release_rcu(struct rcu_head *rcu_head) } EXPORT_SYMBOL_GPL(__blkg_release_rcu); -/* - * The next function used by blk_queue_for_each_rl(). It's a bit tricky - * because the root blkg uses @q->root_rl instead of its own rl. - */ -struct request_list *__blk_queue_next_rl(struct request_list *rl, - struct request_queue *q) -{ - struct list_head *ent; - struct blkcg_gq *blkg; - - /* - * Determine the current blkg list_head. The first entry is - * root_rl which is off @q->blkg_list and mapped to the head. - */ - if (rl == &q->root_rl) { - ent = &q->blkg_list; - /* There are no more block groups, hence no request lists */ - if (list_empty(ent)) - return NULL; - } else { - blkg = container_of(rl, struct blkcg_gq, rl); - ent = &blkg->q_node; - } - - /* walk to the next list_head, skip root blkcg */ - ent = ent->next; - if (ent == &q->root_blkg->q_node) - ent = ent->next; - if (ent == &q->blkg_list) - return NULL; - - blkg = container_of(ent, struct blkcg_gq, q_node); - return &blkg->rl; -} - static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct cftype *cftype, u64 val) { @@ -477,7 +500,6 @@ const char *blkg_dev_name(struct blkcg_gq *blkg) return dev_name(blkg->q->backing_dev_info->dev); return NULL; } -EXPORT_SYMBOL_GPL(blkg_dev_name); /** * blkcg_print_blkgs - helper for printing per-blkg data @@ -508,10 +530,10 @@ void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, rcu_read_lock(); hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - spin_lock_irq(blkg->q->queue_lock); + spin_lock_irq(&blkg->q->queue_lock); if (blkcg_policy_enabled(blkg->q, pol)) total += prfill(sf, blkg->pd[pol->plid], data); - spin_unlock_irq(blkg->q->queue_lock); + spin_unlock_irq(&blkg->q->queue_lock); } rcu_read_unlock(); @@ -709,7 +731,7 @@ u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg, struct cgroup_subsys_state *pos_css; u64 sum = 0; - lockdep_assert_held(blkg->q->queue_lock); + lockdep_assert_held(&blkg->q->queue_lock); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { @@ -752,7 +774,7 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkg_rwstat sum = { }; int i; - lockdep_assert_held(blkg->q->queue_lock); + lockdep_assert_held(&blkg->q->queue_lock); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { @@ -783,18 +805,10 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, struct request_queue *q) { WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); if (!blkcg_policy_enabled(q, pol)) return ERR_PTR(-EOPNOTSUPP); - - /* - * This could be the first entry point of blkcg implementation and - * we shouldn't allow anything to go through for a bypassing queue. - */ - if (unlikely(blk_queue_bypass(q))) - return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY); - return __blkg_lookup(blkcg, q, true /* update_hint */); } @@ -812,7 +826,7 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(disk->queue->queue_lock) + __acquires(rcu) __acquires(&disk->queue->queue_lock) { struct gendisk *disk; struct request_queue *q; @@ -840,7 +854,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, q = disk->queue; rcu_read_lock(); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); blkg = blkg_lookup_check(blkcg, pol, q); if (IS_ERR(blkg)) { @@ -867,7 +881,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, } /* Drop locks to do new blkg allocation with GFP_KERNEL. */ - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); new_blkg = blkg_alloc(pos, q, GFP_KERNEL); @@ -877,7 +891,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, } rcu_read_lock(); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); blkg = blkg_lookup_check(pos, pol, q); if (IS_ERR(blkg)) { @@ -905,7 +919,7 @@ success: return 0; fail_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); fail: put_disk_and_module(disk); @@ -921,7 +935,6 @@ fail: } return ret; } -EXPORT_SYMBOL_GPL(blkg_conf_prep); /** * blkg_conf_finish - finish up per-blkg config update @@ -931,13 +944,12 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); * with blkg_conf_prep(). */ void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(ctx->disk->queue->queue_lock) __releases(rcu) + __releases(&ctx->disk->queue->queue_lock) __releases(rcu) { - spin_unlock_irq(ctx->disk->queue->queue_lock); + spin_unlock_irq(&ctx->disk->queue->queue_lock); rcu_read_unlock(); put_disk_and_module(ctx->disk); } -EXPORT_SYMBOL_GPL(blkg_conf_finish); static int blkcg_print_stat(struct seq_file *sf, void *v) { @@ -967,7 +979,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) */ off += scnprintf(buf+off, size-off, "%s ", dname); - spin_lock_irq(blkg->q->queue_lock); + spin_lock_irq(&blkg->q->queue_lock); rwstat = blkg_rwstat_recursive_sum(blkg, NULL, offsetof(struct blkcg_gq, stat_bytes)); @@ -981,7 +993,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]); dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]); - spin_unlock_irq(blkg->q->queue_lock); + spin_unlock_irq(&blkg->q->queue_lock); if (rbytes || wbytes || rios || wios) { has_stats = true; @@ -1102,9 +1114,9 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg) struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; - if (spin_trylock(q->queue_lock)) { + if (spin_trylock(&q->queue_lock)) { blkg_destroy(blkg); - spin_unlock(q->queue_lock); + spin_unlock(&q->queue_lock); } else { spin_unlock_irq(&blkcg->lock); cpu_relax(); @@ -1225,36 +1237,31 @@ int blkcg_init_queue(struct request_queue *q) /* Make sure the root blkg exists. */ rcu_read_lock(); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); blkg = blkg_create(&blkcg_root, q, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; - q->root_rl.blkg = blkg; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); ret = blk_iolatency_init(q); - if (ret) { - spin_lock_irq(q->queue_lock); - blkg_destroy_all(q); - spin_unlock_irq(q->queue_lock); - return ret; - } + if (ret) + goto err_destroy_all; ret = blk_throtl_init(q); - if (ret) { - spin_lock_irq(q->queue_lock); - blkg_destroy_all(q); - spin_unlock_irq(q->queue_lock); - } - return ret; + if (ret) + goto err_destroy_all; + return 0; +err_destroy_all: + blkg_destroy_all(q); + return ret; err_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); @@ -1269,7 +1276,7 @@ err_unlock: */ void blkcg_drain_queue(struct request_queue *q) { - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); /* * @q could be exiting and already have destroyed all blkgs as @@ -1289,10 +1296,7 @@ void blkcg_drain_queue(struct request_queue *q) */ void blkcg_exit_queue(struct request_queue *q) { - spin_lock_irq(q->queue_lock); blkg_destroy_all(q); - spin_unlock_irq(q->queue_lock); - blk_throtl_exit(q); } @@ -1396,10 +1400,8 @@ int blkcg_activate_policy(struct request_queue *q, if (blkcg_policy_enabled(q, pol)) return 0; - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_freeze_queue(q); - else - blk_queue_bypass_start(q); pd_prealloc: if (!pd_prealloc) { pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node); @@ -1409,7 +1411,7 @@ pd_prealloc: } } - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; @@ -1421,7 +1423,7 @@ pd_prealloc: if (!pd) swap(pd, pd_prealloc); if (!pd) { - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); goto pd_prealloc; } @@ -1435,12 +1437,10 @@ pd_prealloc: __set_bit(pol->plid, q->blkcg_pols); ret = 0; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); out_bypass_end: - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); - else - blk_queue_bypass_end(q); if (pd_prealloc) pol->pd_free_fn(pd_prealloc); return ret; @@ -1463,12 +1463,10 @@ void blkcg_deactivate_policy(struct request_queue *q, if (!blkcg_policy_enabled(q, pol)) return; - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_freeze_queue(q); - else - blk_queue_bypass_start(q); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); @@ -1481,12 +1479,10 @@ void blkcg_deactivate_policy(struct request_queue *q, } } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); - else - blk_queue_bypass_end(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); @@ -1748,8 +1744,7 @@ void blkcg_maybe_throttle_current(void) blkg = blkg_lookup(blkcg, q); if (!blkg) goto out; - blkg = blkg_try_get(blkg); - if (!blkg) + if (!blkg_tryget(blkg)) goto out; rcu_read_unlock(); @@ -1761,7 +1756,6 @@ out: rcu_read_unlock(); blk_put_queue(q); } -EXPORT_SYMBOL_GPL(blkcg_maybe_throttle_current); /** * blkcg_schedule_throttle - this task needs to check for throttling @@ -1795,7 +1789,6 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); } -EXPORT_SYMBOL_GPL(blkcg_schedule_throttle); /** * blkcg_add_delay - add delay to this blkg @@ -1810,7 +1803,6 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) blkcg_scale_delay(blkg, now); atomic64_add(delta, &blkg->delay_nsec); } -EXPORT_SYMBOL_GPL(blkcg_add_delay); module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/block/blk-core.c b/block/blk-core.c index ce12515f9b9b..c78042975737 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -58,11 +58,6 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); DEFINE_IDA(blk_queue_ida); /* - * For the allocated request tables - */ -struct kmem_cache *request_cachep; - -/* * For queue allocation */ struct kmem_cache *blk_requestq_cachep; @@ -79,11 +74,7 @@ static struct workqueue_struct *kblockd_workqueue; */ void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - queue_flag_set(flag, q); - spin_unlock_irqrestore(q->queue_lock, flags); + set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_set); @@ -94,11 +85,7 @@ EXPORT_SYMBOL(blk_queue_flag_set); */ void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - queue_flag_clear(flag, q); - spin_unlock_irqrestore(q->queue_lock, flags); + clear_bit(flag, &q->queue_flags); } EXPORT_SYMBOL(blk_queue_flag_clear); @@ -112,85 +99,15 @@ EXPORT_SYMBOL(blk_queue_flag_clear); */ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) { - unsigned long flags; - bool res; - - spin_lock_irqsave(q->queue_lock, flags); - res = queue_flag_test_and_set(flag, q); - spin_unlock_irqrestore(q->queue_lock, flags); - - return res; + return test_and_set_bit(flag, &q->queue_flags); } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); -/** - * blk_queue_flag_test_and_clear - atomically test and clear a queue flag - * @flag: flag to be cleared - * @q: request queue - * - * Returns the previous value of @flag - 0 if the flag was not set and 1 if - * the flag was set. - */ -bool blk_queue_flag_test_and_clear(unsigned int flag, struct request_queue *q) -{ - unsigned long flags; - bool res; - - spin_lock_irqsave(q->queue_lock, flags); - res = queue_flag_test_and_clear(flag, q); - spin_unlock_irqrestore(q->queue_lock, flags); - - return res; -} -EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_clear); - -static void blk_clear_congested(struct request_list *rl, int sync) -{ -#ifdef CONFIG_CGROUP_WRITEBACK - clear_wb_congested(rl->blkg->wb_congested, sync); -#else - /* - * If !CGROUP_WRITEBACK, all blkg's map to bdi->wb and we shouldn't - * flip its congestion state for events on other blkcgs. - */ - if (rl == &rl->q->root_rl) - clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync); -#endif -} - -static void blk_set_congested(struct request_list *rl, int sync) -{ -#ifdef CONFIG_CGROUP_WRITEBACK - set_wb_congested(rl->blkg->wb_congested, sync); -#else - /* see blk_clear_congested() */ - if (rl == &rl->q->root_rl) - set_wb_congested(rl->q->backing_dev_info->wb.congested, sync); -#endif -} - -void blk_queue_congestion_threshold(struct request_queue *q) -{ - int nr; - - nr = q->nr_requests - (q->nr_requests / 8) + 1; - if (nr > q->nr_requests) - nr = q->nr_requests; - q->nr_congestion_on = nr; - - nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; - if (nr < 1) - nr = 1; - q->nr_congestion_off = nr; -} - void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); INIT_LIST_HEAD(&rq->queuelist); - INIT_LIST_HEAD(&rq->timeout_list); - rq->cpu = -1; rq->q = q; rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); @@ -256,10 +173,11 @@ static void print_req_error(struct request *req, blk_status_t status) if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) return; - printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu\n", - __func__, blk_errors[idx].name, req->rq_disk ? - req->rq_disk->disk_name : "?", - (unsigned long long)blk_rq_pos(req)); + printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n", + __func__, blk_errors[idx].name, + req->rq_disk ? req->rq_disk->disk_name : "?", + (unsigned long long)blk_rq_pos(req), + req->cmd_flags); } static void req_bio_endio(struct request *rq, struct bio *bio, @@ -292,99 +210,6 @@ void blk_dump_rq_flags(struct request *rq, char *msg) } EXPORT_SYMBOL(blk_dump_rq_flags); -static void blk_delay_work(struct work_struct *work) -{ - struct request_queue *q; - - q = container_of(work, struct request_queue, delay_work.work); - spin_lock_irq(q->queue_lock); - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); -} - -/** - * blk_delay_queue - restart queueing after defined interval - * @q: The &struct request_queue in question - * @msecs: Delay in msecs - * - * Description: - * Sometimes queueing needs to be postponed for a little while, to allow - * resources to come back. This function will make sure that queueing is - * restarted around the specified time. - */ -void blk_delay_queue(struct request_queue *q, unsigned long msecs) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (likely(!blk_queue_dead(q))) - queue_delayed_work(kblockd_workqueue, &q->delay_work, - msecs_to_jiffies(msecs)); -} -EXPORT_SYMBOL(blk_delay_queue); - -/** - * blk_start_queue_async - asynchronously restart a previously stopped queue - * @q: The &struct request_queue in question - * - * Description: - * blk_start_queue_async() will clear the stop flag on the queue, and - * ensure that the request_fn for the queue is run from an async - * context. - **/ -void blk_start_queue_async(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - queue_flag_clear(QUEUE_FLAG_STOPPED, q); - blk_run_queue_async(q); -} -EXPORT_SYMBOL(blk_start_queue_async); - -/** - * blk_start_queue - restart a previously stopped queue - * @q: The &struct request_queue in question - * - * Description: - * blk_start_queue() will clear the stop flag on the queue, and call - * the request_fn for the queue if it was in a stopped state when - * entered. Also see blk_stop_queue(). - **/ -void blk_start_queue(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - queue_flag_clear(QUEUE_FLAG_STOPPED, q); - __blk_run_queue(q); -} -EXPORT_SYMBOL(blk_start_queue); - -/** - * blk_stop_queue - stop a queue - * @q: The &struct request_queue in question - * - * Description: - * The Linux block layer assumes that a block driver will consume all - * entries on the request queue when the request_fn strategy is called. - * Often this will not happen, because of hardware limitations (queue - * depth settings). If a device driver gets a 'queue full' response, - * or if it simply chooses not to queue more I/O at one point, it can - * call this function to prevent the request_fn from being called until - * the driver has signalled it's ready to go again. This happens by calling - * blk_start_queue() to restart queue operations. - **/ -void blk_stop_queue(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - cancel_delayed_work(&q->delay_work); - queue_flag_set(QUEUE_FLAG_STOPPED, q); -} -EXPORT_SYMBOL(blk_stop_queue); - /** * blk_sync_queue - cancel any pending callbacks on a queue * @q: the queue @@ -408,15 +233,13 @@ void blk_sync_queue(struct request_queue *q) del_timer_sync(&q->timeout); cancel_work_sync(&q->timeout_work); - if (q->mq_ops) { + if (queue_is_mq(q)) { struct blk_mq_hw_ctx *hctx; int i; cancel_delayed_work_sync(&q->requeue_work); queue_for_each_hw_ctx(q, hctx, i) cancel_delayed_work_sync(&hctx->run_work); - } else { - cancel_delayed_work_sync(&q->delay_work); } } EXPORT_SYMBOL(blk_sync_queue); @@ -442,250 +265,12 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); -/** - * __blk_run_queue_uncond - run a queue whether or not it has been stopped - * @q: The queue to run - * - * Description: - * Invoke request handling on a queue if there are any pending requests. - * May be used to restart request handling after a request has completed. - * This variant runs the queue whether or not the queue has been - * stopped. Must be called with the queue lock held and interrupts - * disabled. See also @blk_run_queue. - */ -inline void __blk_run_queue_uncond(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (unlikely(blk_queue_dead(q))) - return; - - /* - * Some request_fn implementations, e.g. scsi_request_fn(), unlock - * the queue lock internally. As a result multiple threads may be - * running such a request function concurrently. Keep track of the - * number of active request_fn invocations such that blk_drain_queue() - * can wait until all these request_fn calls have finished. - */ - q->request_fn_active++; - q->request_fn(q); - q->request_fn_active--; -} -EXPORT_SYMBOL_GPL(__blk_run_queue_uncond); - -/** - * __blk_run_queue - run a single device queue - * @q: The queue to run - * - * Description: - * See @blk_run_queue. - */ -void __blk_run_queue(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (unlikely(blk_queue_stopped(q))) - return; - - __blk_run_queue_uncond(q); -} -EXPORT_SYMBOL(__blk_run_queue); - -/** - * blk_run_queue_async - run a single device queue in workqueue context - * @q: The queue to run - * - * Description: - * Tells kblockd to perform the equivalent of @blk_run_queue on behalf - * of us. - * - * Note: - * Since it is not allowed to run q->delay_work after blk_cleanup_queue() - * has canceled q->delay_work, callers must hold the queue lock to avoid - * race conditions between blk_cleanup_queue() and blk_run_queue_async(). - */ -void blk_run_queue_async(struct request_queue *q) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (likely(!blk_queue_stopped(q) && !blk_queue_dead(q))) - mod_delayed_work(kblockd_workqueue, &q->delay_work, 0); -} -EXPORT_SYMBOL(blk_run_queue_async); - -/** - * blk_run_queue - run a single device queue - * @q: The queue to run - * - * Description: - * Invoke request handling on this queue, if it has pending work to do. - * May be used to restart queueing when a request has completed. - */ -void blk_run_queue(struct request_queue *q) -{ - unsigned long flags; - - WARN_ON_ONCE(q->mq_ops); - - spin_lock_irqsave(q->queue_lock, flags); - __blk_run_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(blk_run_queue); - void blk_put_queue(struct request_queue *q) { kobject_put(&q->kobj); } EXPORT_SYMBOL(blk_put_queue); -/** - * __blk_drain_queue - drain requests from request_queue - * @q: queue to drain - * @drain_all: whether to drain all requests or only the ones w/ ELVPRIV - * - * Drain requests from @q. If @drain_all is set, all requests are drained. - * If not, only ELVPRIV requests are drained. The caller is responsible - * for ensuring that no new requests which need to be drained are queued. - */ -static void __blk_drain_queue(struct request_queue *q, bool drain_all) - __releases(q->queue_lock) - __acquires(q->queue_lock) -{ - int i; - - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - while (true) { - bool drain = false; - - /* - * The caller might be trying to drain @q before its - * elevator is initialized. - */ - if (q->elevator) - elv_drain_elevator(q); - - blkcg_drain_queue(q); - - /* - * This function might be called on a queue which failed - * driver init after queue creation or is not yet fully - * active yet. Some drivers (e.g. fd and loop) get unhappy - * in such cases. Kick queue iff dispatch queue has - * something on it and @q has request_fn set. - */ - if (!list_empty(&q->queue_head) && q->request_fn) - __blk_run_queue(q); - - drain |= q->nr_rqs_elvpriv; - drain |= q->request_fn_active; - - /* - * Unfortunately, requests are queued at and tracked from - * multiple places and there's no single counter which can - * be drained. Check all the queues and counters. - */ - if (drain_all) { - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - drain |= !list_empty(&q->queue_head); - for (i = 0; i < 2; i++) { - drain |= q->nr_rqs[i]; - drain |= q->in_flight[i]; - if (fq) - drain |= !list_empty(&fq->flush_queue[i]); - } - } - - if (!drain) - break; - - spin_unlock_irq(q->queue_lock); - - msleep(10); - - spin_lock_irq(q->queue_lock); - } - - /* - * With queue marked dead, any woken up waiter will fail the - * allocation path, so the wakeup chaining is lost and we're - * left with hung waiters. We need to wake up those waiters. - */ - if (q->request_fn) { - struct request_list *rl; - - blk_queue_for_each_rl(rl, q) - for (i = 0; i < ARRAY_SIZE(rl->wait); i++) - wake_up_all(&rl->wait[i]); - } -} - -void blk_drain_queue(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - __blk_drain_queue(q, true); - spin_unlock_irq(q->queue_lock); -} - -/** - * blk_queue_bypass_start - enter queue bypass mode - * @q: queue of interest - * - * In bypass mode, only the dispatch FIFO queue of @q is used. This - * function makes @q enter bypass mode and drains all requests which were - * throttled or issued before. On return, it's guaranteed that no request - * is being throttled or has ELVPRIV set and blk_queue_bypass() %true - * inside queue or RCU read lock. - */ -void blk_queue_bypass_start(struct request_queue *q) -{ - WARN_ON_ONCE(q->mq_ops); - - spin_lock_irq(q->queue_lock); - q->bypass_depth++; - queue_flag_set(QUEUE_FLAG_BYPASS, q); - spin_unlock_irq(q->queue_lock); - - /* - * Queues start drained. Skip actual draining till init is - * complete. This avoids lenghty delays during queue init which - * can happen many times during boot. - */ - if (blk_queue_init_done(q)) { - spin_lock_irq(q->queue_lock); - __blk_drain_queue(q, false); - spin_unlock_irq(q->queue_lock); - - /* ensure blk_queue_bypass() is %true inside RCU read lock */ - synchronize_rcu(); - } -} -EXPORT_SYMBOL_GPL(blk_queue_bypass_start); - -/** - * blk_queue_bypass_end - leave queue bypass mode - * @q: queue of interest - * - * Leave bypass mode and restore the normal queueing behavior. - * - * Note: although blk_queue_bypass_start() is only called for blk-sq queues, - * this function is called for both blk-sq and blk-mq queues. - */ -void blk_queue_bypass_end(struct request_queue *q) -{ - spin_lock_irq(q->queue_lock); - if (!--q->bypass_depth) - queue_flag_clear(QUEUE_FLAG_BYPASS, q); - WARN_ON_ONCE(q->bypass_depth < 0); - spin_unlock_irq(q->queue_lock); -} -EXPORT_SYMBOL_GPL(blk_queue_bypass_end); - void blk_set_queue_dying(struct request_queue *q) { blk_queue_flag_set(QUEUE_FLAG_DYING, q); @@ -697,20 +282,8 @@ void blk_set_queue_dying(struct request_queue *q) */ blk_freeze_queue_start(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_wake_waiters(q); - else { - struct request_list *rl; - - spin_lock_irq(q->queue_lock); - blk_queue_for_each_rl(rl, q) { - if (rl->rq_pool) { - wake_up_all(&rl->wait[BLK_RW_SYNC]); - wake_up_all(&rl->wait[BLK_RW_ASYNC]); - } - } - spin_unlock_irq(q->queue_lock); - } /* Make blk_queue_enter() reexamine the DYING flag. */ wake_up_all(&q->mq_freeze_wq); @@ -755,29 +328,13 @@ void blk_exit_queue(struct request_queue *q) */ void blk_cleanup_queue(struct request_queue *q) { - spinlock_t *lock = q->queue_lock; - /* mark @q DYING, no new request or merges will be allowed afterwards */ mutex_lock(&q->sysfs_lock); blk_set_queue_dying(q); - spin_lock_irq(lock); - - /* - * A dying queue is permanently in bypass mode till released. Note - * that, unlike blk_queue_bypass_start(), we aren't performing - * synchronize_rcu() after entering bypass mode to avoid the delay - * as some drivers create and destroy a lot of queues while - * probing. This is still safe because blk_release_queue() will be - * called only after the queue refcnt drops to zero and nothing, - * RCU or not, would be traversing the queue by then. - */ - q->bypass_depth++; - queue_flag_set(QUEUE_FLAG_BYPASS, q); - queue_flag_set(QUEUE_FLAG_NOMERGES, q); - queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - queue_flag_set(QUEUE_FLAG_DYING, q); - spin_unlock_irq(lock); + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_DYING, q); mutex_unlock(&q->sysfs_lock); /* @@ -788,9 +345,7 @@ void blk_cleanup_queue(struct request_queue *q) rq_qos_exit(q); - spin_lock_irq(lock); - queue_flag_set(QUEUE_FLAG_DEAD, q); - spin_unlock_irq(lock); + blk_queue_flag_set(QUEUE_FLAG_DEAD, q); /* * make sure all in-progress dispatch are completed because @@ -798,11 +353,10 @@ void blk_cleanup_queue(struct request_queue *q) * dispatch may still be in-progress since we dispatch requests * from more than one contexts. * - * No need to quiesce queue if it isn't initialized yet since - * blk_freeze_queue() should be enough for cases of passthrough - * request. + * We rely on driver to deal with the race in case that queue + * initialization isn't done. */ - if (q->mq_ops && blk_queue_init_done(q)) + if (queue_is_mq(q) && blk_queue_init_done(q)) blk_mq_quiesce_queue(q); /* for synchronous bio-based driver finish in-flight integrity i/o */ @@ -820,98 +374,19 @@ void blk_cleanup_queue(struct request_queue *q) blk_exit_queue(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_free_queue(q); - percpu_ref_exit(&q->q_usage_counter); - spin_lock_irq(lock); - if (q->queue_lock != &q->__queue_lock) - q->queue_lock = &q->__queue_lock; - spin_unlock_irq(lock); + percpu_ref_exit(&q->q_usage_counter); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } EXPORT_SYMBOL(blk_cleanup_queue); -/* Allocate memory local to the request queue */ -static void *alloc_request_simple(gfp_t gfp_mask, void *data) -{ - struct request_queue *q = data; - - return kmem_cache_alloc_node(request_cachep, gfp_mask, q->node); -} - -static void free_request_simple(void *element, void *data) -{ - kmem_cache_free(request_cachep, element); -} - -static void *alloc_request_size(gfp_t gfp_mask, void *data) -{ - struct request_queue *q = data; - struct request *rq; - - rq = kmalloc_node(sizeof(struct request) + q->cmd_size, gfp_mask, - q->node); - if (rq && q->init_rq_fn && q->init_rq_fn(q, rq, gfp_mask) < 0) { - kfree(rq); - rq = NULL; - } - return rq; -} - -static void free_request_size(void *element, void *data) -{ - struct request_queue *q = data; - - if (q->exit_rq_fn) - q->exit_rq_fn(q, element); - kfree(element); -} - -int blk_init_rl(struct request_list *rl, struct request_queue *q, - gfp_t gfp_mask) -{ - if (unlikely(rl->rq_pool) || q->mq_ops) - return 0; - - rl->q = q; - rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; - rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; - init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); - init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); - - if (q->cmd_size) { - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, - alloc_request_size, free_request_size, - q, gfp_mask, q->node); - } else { - rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, - alloc_request_simple, free_request_simple, - q, gfp_mask, q->node); - } - if (!rl->rq_pool) - return -ENOMEM; - - if (rl != &q->root_rl) - WARN_ON_ONCE(!blk_get_queue(q)); - - return 0; -} - -void blk_exit_rl(struct request_queue *q, struct request_list *rl) -{ - if (rl->rq_pool) { - mempool_destroy(rl->rq_pool); - if (rl != &q->root_rl) - blk_put_queue(q); - } -} - struct request_queue *blk_alloc_queue(gfp_t gfp_mask) { - return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE, NULL); + return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE); } EXPORT_SYMBOL(blk_alloc_queue); @@ -991,17 +466,8 @@ static void blk_rq_timed_out_timer(struct timer_list *t) * blk_alloc_queue_node - allocate a request queue * @gfp_mask: memory allocation flags * @node_id: NUMA node to allocate memory from - * @lock: For legacy queues, pointer to a spinlock that will be used to e.g. - * serialize calls to the legacy .request_fn() callback. Ignored for - * blk-mq request queues. - * - * Note: pass the queue lock as the third argument to this function instead of - * setting the queue lock pointer explicitly to avoid triggering a sporadic - * crash in the blkcg code. This function namely calls blkcg_init_queue() and - * the queue lock pointer must be set before blkcg_init_queue() is called. */ -struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, - spinlock_t *lock) +struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; int ret; @@ -1013,8 +479,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, INIT_LIST_HEAD(&q->queue_head); q->last_merge = NULL; - q->end_sector = 0; - q->boundary_rq = NULL; q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask); if (q->id < 0) @@ -1042,12 +506,10 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, laptop_mode_timer_fn, 0); timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, NULL); - INIT_LIST_HEAD(&q->timeout_list); INIT_LIST_HEAD(&q->icq_list); #ifdef CONFIG_BLK_CGROUP INIT_LIST_HEAD(&q->blkg_list); #endif - INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); kobject_init(&q->kobj, &blk_queue_ktype); @@ -1055,18 +517,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id, mutex_init(&q->blk_trace_mutex); #endif mutex_init(&q->sysfs_lock); - spin_lock_init(&q->__queue_lock); - - q->queue_lock = lock ? : &q->__queue_lock; - - /* - * A queue starts its life with bypass turned on to avoid - * unnecessary bypass on/off overhead and nasty surprises during - * init. The initial bypass will be finished when the queue is - * registered by blk_register_queue(). - */ - q->bypass_depth = 1; - queue_flag_set_unlocked(QUEUE_FLAG_BYPASS, q); + spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); @@ -1100,105 +551,6 @@ fail_q: } EXPORT_SYMBOL(blk_alloc_queue_node); -/** - * blk_init_queue - prepare a request queue for use with a block device - * @rfn: The function to be called to process requests that have been - * placed on the queue. - * @lock: Request queue spin lock - * - * Description: - * If a block device wishes to use the standard request handling procedures, - * which sorts requests and coalesces adjacent requests, then it must - * call blk_init_queue(). The function @rfn will be called when there - * are requests on the queue that need to be processed. If the device - * supports plugging, then @rfn may not be called immediately when requests - * are available on the queue, but may be called at some time later instead. - * Plugged queues are generally unplugged when a buffer belonging to one - * of the requests on the queue is needed, or due to memory pressure. - * - * @rfn is not required, or even expected, to remove all requests off the - * queue, but only as many as it can handle at a time. If it does leave - * requests on the queue, it is responsible for arranging that the requests - * get dealt with eventually. - * - * The queue spin lock must be held while manipulating the requests on the - * request queue; this lock will be taken also from interrupt context, so irq - * disabling is needed for it. - * - * Function returns a pointer to the initialized request queue, or %NULL if - * it didn't succeed. - * - * Note: - * blk_init_queue() must be paired with a blk_cleanup_queue() call - * when the block device is deactivated (such as at module unload). - **/ - -struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) -{ - return blk_init_queue_node(rfn, lock, NUMA_NO_NODE); -} -EXPORT_SYMBOL(blk_init_queue); - -struct request_queue * -blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) -{ - struct request_queue *q; - - q = blk_alloc_queue_node(GFP_KERNEL, node_id, lock); - if (!q) - return NULL; - - q->request_fn = rfn; - if (blk_init_allocated_queue(q) < 0) { - blk_cleanup_queue(q); - return NULL; - } - - return q; -} -EXPORT_SYMBOL(blk_init_queue_node); - -static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio); - - -int blk_init_allocated_queue(struct request_queue *q) -{ - WARN_ON_ONCE(q->mq_ops); - - q->fq = blk_alloc_flush_queue(q, NUMA_NO_NODE, q->cmd_size, GFP_KERNEL); - if (!q->fq) - return -ENOMEM; - - if (q->init_rq_fn && q->init_rq_fn(q, q->fq->flush_rq, GFP_KERNEL)) - goto out_free_flush_queue; - - if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) - goto out_exit_flush_rq; - - INIT_WORK(&q->timeout_work, blk_timeout_work); - q->queue_flags |= QUEUE_FLAG_DEFAULT; - - /* - * This also sets hw/phys segments, boundary and size - */ - blk_queue_make_request(q, blk_queue_bio); - - q->sg_reserved_size = INT_MAX; - - if (elevator_init(q)) - goto out_exit_flush_rq; - return 0; - -out_exit_flush_rq: - if (q->exit_rq_fn) - q->exit_rq_fn(q, q->fq->flush_rq); -out_free_flush_queue: - blk_free_flush_queue(q->fq); - q->fq = NULL; - return -ENOMEM; -} -EXPORT_SYMBOL(blk_init_allocated_queue); - bool blk_get_queue(struct request_queue *q) { if (likely(!blk_queue_dying(q))) { @@ -1210,406 +562,6 @@ bool blk_get_queue(struct request_queue *q) } EXPORT_SYMBOL(blk_get_queue); -static inline void blk_free_request(struct request_list *rl, struct request *rq) -{ - if (rq->rq_flags & RQF_ELVPRIV) { - elv_put_request(rl->q, rq); - if (rq->elv.icq) - put_io_context(rq->elv.icq->ioc); - } - - mempool_free(rq, rl->rq_pool); -} - -/* - * ioc_batching returns true if the ioc is a valid batching request and - * should be given priority access to a request. - */ -static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) -{ - if (!ioc) - return 0; - - /* - * Make sure the process is able to allocate at least 1 request - * even if the batch times out, otherwise we could theoretically - * lose wakeups. - */ - return ioc->nr_batch_requests == q->nr_batching || - (ioc->nr_batch_requests > 0 - && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); -} - -/* - * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This - * will cause the process to be a "batcher" on all queues in the system. This - * is the behaviour we want though - once it gets a wakeup it should be given - * a nice run. - */ -static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) -{ - if (!ioc || ioc_batching(q, ioc)) - return; - - ioc->nr_batch_requests = q->nr_batching; - ioc->last_waited = jiffies; -} - -static void __freed_request(struct request_list *rl, int sync) -{ - struct request_queue *q = rl->q; - - if (rl->count[sync] < queue_congestion_off_threshold(q)) - blk_clear_congested(rl, sync); - - if (rl->count[sync] + 1 <= q->nr_requests) { - if (waitqueue_active(&rl->wait[sync])) - wake_up(&rl->wait[sync]); - - blk_clear_rl_full(rl, sync); - } -} - -/* - * A request has just been released. Account for it, update the full and - * congestion status, wake up any waiters. Called under q->queue_lock. - */ -static void freed_request(struct request_list *rl, bool sync, - req_flags_t rq_flags) -{ - struct request_queue *q = rl->q; - - q->nr_rqs[sync]--; - rl->count[sync]--; - if (rq_flags & RQF_ELVPRIV) - q->nr_rqs_elvpriv--; - - __freed_request(rl, sync); - - if (unlikely(rl->starved[sync ^ 1])) - __freed_request(rl, sync ^ 1); -} - -int blk_update_nr_requests(struct request_queue *q, unsigned int nr) -{ - struct request_list *rl; - int on_thresh, off_thresh; - - WARN_ON_ONCE(q->mq_ops); - - spin_lock_irq(q->queue_lock); - q->nr_requests = nr; - blk_queue_congestion_threshold(q); - on_thresh = queue_congestion_on_threshold(q); - off_thresh = queue_congestion_off_threshold(q); - - blk_queue_for_each_rl(rl, q) { - if (rl->count[BLK_RW_SYNC] >= on_thresh) - blk_set_congested(rl, BLK_RW_SYNC); - else if (rl->count[BLK_RW_SYNC] < off_thresh) - blk_clear_congested(rl, BLK_RW_SYNC); - - if (rl->count[BLK_RW_ASYNC] >= on_thresh) - blk_set_congested(rl, BLK_RW_ASYNC); - else if (rl->count[BLK_RW_ASYNC] < off_thresh) - blk_clear_congested(rl, BLK_RW_ASYNC); - - if (rl->count[BLK_RW_SYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_SYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_SYNC); - wake_up(&rl->wait[BLK_RW_SYNC]); - } - - if (rl->count[BLK_RW_ASYNC] >= q->nr_requests) { - blk_set_rl_full(rl, BLK_RW_ASYNC); - } else { - blk_clear_rl_full(rl, BLK_RW_ASYNC); - wake_up(&rl->wait[BLK_RW_ASYNC]); - } - } - - spin_unlock_irq(q->queue_lock); - return 0; -} - -/** - * __get_request - get a free request - * @rl: request list to allocate from - * @op: operation and flags - * @bio: bio to allocate request for (can be %NULL) - * @flags: BLQ_MQ_REQ_* flags - * @gfp_mask: allocator flags - * - * Get a free request from @q. This function may fail under memory - * pressure or if @q is dead. - * - * Must be called with @q->queue_lock held and, - * Returns ERR_PTR on failure, with @q->queue_lock held. - * Returns request pointer on success, with @q->queue_lock *not held*. - */ -static struct request *__get_request(struct request_list *rl, unsigned int op, - struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp_mask) -{ - struct request_queue *q = rl->q; - struct request *rq; - struct elevator_type *et = q->elevator->type; - struct io_context *ioc = rq_ioc(bio); - struct io_cq *icq = NULL; - const bool is_sync = op_is_sync(op); - int may_queue; - req_flags_t rq_flags = RQF_ALLOCED; - - lockdep_assert_held(q->queue_lock); - - if (unlikely(blk_queue_dying(q))) - return ERR_PTR(-ENODEV); - - may_queue = elv_may_queue(q, op); - if (may_queue == ELV_MQUEUE_NO) - goto rq_starved; - - if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { - if (rl->count[is_sync]+1 >= q->nr_requests) { - /* - * The queue will fill after this allocation, so set - * it as full, and mark this process as "batching". - * This process will be allowed to complete a batch of - * requests, others will be blocked. - */ - if (!blk_rl_full(rl, is_sync)) { - ioc_set_batching(q, ioc); - blk_set_rl_full(rl, is_sync); - } else { - if (may_queue != ELV_MQUEUE_MUST - && !ioc_batching(q, ioc)) { - /* - * The queue is full and the allocating - * process is not a "batcher", and not - * exempted by the IO scheduler - */ - return ERR_PTR(-ENOMEM); - } - } - } - blk_set_congested(rl, is_sync); - } - - /* - * Only allow batching queuers to allocate up to 50% over the defined - * limit of requests, otherwise we could have thousands of requests - * allocated with any setting of ->nr_requests - */ - if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) - return ERR_PTR(-ENOMEM); - - q->nr_rqs[is_sync]++; - rl->count[is_sync]++; - rl->starved[is_sync] = 0; - - /* - * Decide whether the new request will be managed by elevator. If - * so, mark @rq_flags and increment elvpriv. Non-zero elvpriv will - * prevent the current elevator from being destroyed until the new - * request is freed. This guarantees icq's won't be destroyed and - * makes creating new ones safe. - * - * Flush requests do not use the elevator so skip initialization. - * This allows a request to share the flush and elevator data. - * - * Also, lookup icq while holding queue_lock. If it doesn't exist, - * it will be created after releasing queue_lock. - */ - if (!op_is_flush(op) && !blk_queue_bypass(q)) { - rq_flags |= RQF_ELVPRIV; - q->nr_rqs_elvpriv++; - if (et->icq_cache && ioc) - icq = ioc_lookup_icq(ioc, q); - } - - if (blk_queue_io_stat(q)) - rq_flags |= RQF_IO_STAT; - spin_unlock_irq(q->queue_lock); - - /* allocate and init request */ - rq = mempool_alloc(rl->rq_pool, gfp_mask); - if (!rq) - goto fail_alloc; - - blk_rq_init(q, rq); - blk_rq_set_rl(rq, rl); - rq->cmd_flags = op; - rq->rq_flags = rq_flags; - if (flags & BLK_MQ_REQ_PREEMPT) - rq->rq_flags |= RQF_PREEMPT; - - /* init elvpriv */ - if (rq_flags & RQF_ELVPRIV) { - if (unlikely(et->icq_cache && !icq)) { - if (ioc) - icq = ioc_create_icq(ioc, q, gfp_mask); - if (!icq) - goto fail_elvpriv; - } - - rq->elv.icq = icq; - if (unlikely(elv_set_request(q, rq, bio, gfp_mask))) - goto fail_elvpriv; - - /* @rq->elv.icq holds io_context until @rq is freed */ - if (icq) - get_io_context(icq->ioc); - } -out: - /* - * ioc may be NULL here, and ioc_batching will be false. That's - * OK, if the queue is under the request limit then requests need - * not count toward the nr_batch_requests limit. There will always - * be some limit enforced by BLK_BATCH_TIME. - */ - if (ioc_batching(q, ioc)) - ioc->nr_batch_requests--; - - trace_block_getrq(q, bio, op); - return rq; - -fail_elvpriv: - /* - * elvpriv init failed. ioc, icq and elvpriv aren't mempool backed - * and may fail indefinitely under memory pressure and thus - * shouldn't stall IO. Treat this request as !elvpriv. This will - * disturb iosched and blkcg but weird is bettern than dead. - */ - printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", - __func__, dev_name(q->backing_dev_info->dev)); - - rq->rq_flags &= ~RQF_ELVPRIV; - rq->elv.icq = NULL; - - spin_lock_irq(q->queue_lock); - q->nr_rqs_elvpriv--; - spin_unlock_irq(q->queue_lock); - goto out; - -fail_alloc: - /* - * Allocation failed presumably due to memory. Undo anything we - * might have messed up. - * - * Allocating task should really be put onto the front of the wait - * queue, but this is pretty rare. - */ - spin_lock_irq(q->queue_lock); - freed_request(rl, is_sync, rq_flags); - - /* - * in the very unlikely event that allocation failed and no - * requests for this direction was pending, mark us starved so that - * freeing of a request in the other direction will notice - * us. another possible fix would be to split the rq mempool into - * READ and WRITE - */ -rq_starved: - if (unlikely(rl->count[is_sync] == 0)) - rl->starved[is_sync] = 1; - return ERR_PTR(-ENOMEM); -} - -/** - * get_request - get a free request - * @q: request_queue to allocate request from - * @op: operation and flags - * @bio: bio to allocate request for (can be %NULL) - * @flags: BLK_MQ_REQ_* flags. - * @gfp: allocator flags - * - * Get a free request from @q. If %BLK_MQ_REQ_NOWAIT is set in @flags, - * this function keeps retrying under memory pressure and fails iff @q is dead. - * - * Must be called with @q->queue_lock held and, - * Returns ERR_PTR on failure, with @q->queue_lock held. - * Returns request pointer on success, with @q->queue_lock *not held*. - */ -static struct request *get_request(struct request_queue *q, unsigned int op, - struct bio *bio, blk_mq_req_flags_t flags, gfp_t gfp) -{ - const bool is_sync = op_is_sync(op); - DEFINE_WAIT(wait); - struct request_list *rl; - struct request *rq; - - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - rl = blk_get_rl(q, bio); /* transferred to @rq on success */ -retry: - rq = __get_request(rl, op, bio, flags, gfp); - if (!IS_ERR(rq)) - return rq; - - if (op & REQ_NOWAIT) { - blk_put_rl(rl); - return ERR_PTR(-EAGAIN); - } - - if ((flags & BLK_MQ_REQ_NOWAIT) || unlikely(blk_queue_dying(q))) { - blk_put_rl(rl); - return rq; - } - - /* wait on @rl and retry */ - prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, - TASK_UNINTERRUPTIBLE); - - trace_block_sleeprq(q, bio, op); - - spin_unlock_irq(q->queue_lock); - io_schedule(); - - /* - * After sleeping, we become a "batching" process and will be able - * to allocate at least one request, and up to a big batch of them - * for a small period time. See ioc_batching, ioc_set_batching - */ - ioc_set_batching(q, current->io_context); - - spin_lock_irq(q->queue_lock); - finish_wait(&rl->wait[is_sync], &wait); - - goto retry; -} - -/* flags: BLK_MQ_REQ_PREEMPT and/or BLK_MQ_REQ_NOWAIT. */ -static struct request *blk_old_get_request(struct request_queue *q, - unsigned int op, blk_mq_req_flags_t flags) -{ - struct request *rq; - gfp_t gfp_mask = flags & BLK_MQ_REQ_NOWAIT ? GFP_ATOMIC : GFP_NOIO; - int ret = 0; - - WARN_ON_ONCE(q->mq_ops); - - /* create ioc upfront */ - create_io_context(gfp_mask, q->node); - - ret = blk_queue_enter(q, flags); - if (ret) - return ERR_PTR(ret); - spin_lock_irq(q->queue_lock); - rq = get_request(q, op, NULL, flags, gfp_mask); - if (IS_ERR(rq)) { - spin_unlock_irq(q->queue_lock); - blk_queue_exit(q); - return rq; - } - - /* q->queue_lock is unlocked at this point */ - rq->__data_len = 0; - rq->__sector = (sector_t) -1; - rq->bio = rq->biotail = NULL; - return rq; -} - /** * blk_get_request - allocate a request * @q: request queue to allocate a request for @@ -1624,170 +576,17 @@ struct request *blk_get_request(struct request_queue *q, unsigned int op, WARN_ON_ONCE(op & REQ_NOWAIT); WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); - if (q->mq_ops) { - req = blk_mq_alloc_request(q, op, flags); - if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) - q->mq_ops->initialize_rq_fn(req); - } else { - req = blk_old_get_request(q, op, flags); - if (!IS_ERR(req) && q->initialize_rq_fn) - q->initialize_rq_fn(req); - } + req = blk_mq_alloc_request(q, op, flags); + if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) + q->mq_ops->initialize_rq_fn(req); return req; } EXPORT_SYMBOL(blk_get_request); -/** - * blk_requeue_request - put a request back on queue - * @q: request queue where request should be inserted - * @rq: request to be inserted - * - * Description: - * Drivers often keep queueing requests until the hardware cannot accept - * more, when that condition happens we need to put the request back - * on the queue. Must be called with queue lock held. - */ -void blk_requeue_request(struct request_queue *q, struct request *rq) -{ - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - blk_delete_timer(rq); - blk_clear_rq_complete(rq); - trace_block_rq_requeue(q, rq); - rq_qos_requeue(q, rq); - - if (rq->rq_flags & RQF_QUEUED) - blk_queue_end_tag(q, rq); - - BUG_ON(blk_queued_rq(rq)); - - elv_requeue_request(q, rq); -} -EXPORT_SYMBOL(blk_requeue_request); - -static void add_acct_request(struct request_queue *q, struct request *rq, - int where) -{ - blk_account_io_start(rq, true); - __elv_add_request(q, rq, where); -} - -static void part_round_stats_single(struct request_queue *q, int cpu, - struct hd_struct *part, unsigned long now, - unsigned int inflight) -{ - if (inflight) { - __part_stat_add(cpu, part, time_in_queue, - inflight * (now - part->stamp)); - __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); - } - part->stamp = now; -} - -/** - * part_round_stats() - Round off the performance stats on a struct disk_stats. - * @q: target block queue - * @cpu: cpu number for stats access - * @part: target partition - * - * The average IO queue length and utilisation statistics are maintained - * by observing the current state of the queue length and the amount of - * time it has been in this state for. - * - * Normally, that accounting is done on IO completion, but that can result - * in more than a second's worth of IO being accounted for within any one - * second, leading to >100% utilisation. To deal with that, we call this - * function to do a round-off before returning the results when reading - * /proc/diskstats. This accounts immediately for all queue usage up to - * the current jiffies and restarts the counters again. - */ -void part_round_stats(struct request_queue *q, int cpu, struct hd_struct *part) -{ - struct hd_struct *part2 = NULL; - unsigned long now = jiffies; - unsigned int inflight[2]; - int stats = 0; - - if (part->stamp != now) - stats |= 1; - - if (part->partno) { - part2 = &part_to_disk(part)->part0; - if (part2->stamp != now) - stats |= 2; - } - - if (!stats) - return; - - part_in_flight(q, part, inflight); - - if (stats & 2) - part_round_stats_single(q, cpu, part2, now, inflight[1]); - if (stats & 1) - part_round_stats_single(q, cpu, part, now, inflight[0]); -} -EXPORT_SYMBOL_GPL(part_round_stats); - -void __blk_put_request(struct request_queue *q, struct request *req) -{ - req_flags_t rq_flags = req->rq_flags; - - if (unlikely(!q)) - return; - - if (q->mq_ops) { - blk_mq_free_request(req); - return; - } - - lockdep_assert_held(q->queue_lock); - - blk_req_zone_write_unlock(req); - blk_pm_put_request(req); - blk_pm_mark_last_busy(req); - - elv_completed_request(q, req); - - /* this is a bio leak */ - WARN_ON(req->bio != NULL); - - rq_qos_done(q, req); - - /* - * Request may not have originated from ll_rw_blk. if not, - * it didn't come out of our reserved rq pools - */ - if (rq_flags & RQF_ALLOCED) { - struct request_list *rl = blk_rq_rl(req); - bool sync = op_is_sync(req->cmd_flags); - - BUG_ON(!list_empty(&req->queuelist)); - BUG_ON(ELV_ON_HASH(req)); - - blk_free_request(rl, req); - freed_request(rl, sync, rq_flags); - blk_put_rl(rl); - blk_queue_exit(q); - } -} -EXPORT_SYMBOL_GPL(__blk_put_request); - void blk_put_request(struct request *req) { - struct request_queue *q = req->q; - - if (q->mq_ops) - blk_mq_free_request(req); - else { - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - __blk_put_request(q, req); - spin_unlock_irqrestore(q->queue_lock, flags); - } + blk_mq_free_request(req); } EXPORT_SYMBOL(blk_put_request); @@ -1807,7 +606,6 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; - req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); blk_account_io_start(req, false); return true; @@ -1831,7 +629,6 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req, req->__sector = bio->bi_iter.bi_sector; req->__data_len += bio->bi_iter.bi_size; - req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); blk_account_io_start(req, false); return true; @@ -1851,7 +648,6 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = bio; req->biotail = bio; req->__data_len += bio->bi_iter.bi_size; - req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); req->nr_phys_segments = segments + 1; blk_account_io_start(req, false); @@ -1884,7 +680,6 @@ no_merge: * Caller must ensure !blk_queue_nomerges(q) beforehand. */ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count, struct request **same_queue_rq) { struct blk_plug *plug; @@ -1894,25 +689,19 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, plug = current->plug; if (!plug) return false; - *request_count = 0; - if (q->mq_ops) - plug_list = &plug->mq_list; - else - plug_list = &plug->list; + plug_list = &plug->mq_list; list_for_each_entry_reverse(rq, plug_list, queuelist) { bool merged = false; - if (rq->q == q) { - (*request_count)++; + if (rq->q == q && same_queue_rq) { /* * Only blk-mq multiple hardware queues case checks the * rq in the same queue, there should be only one such * rq in a queue **/ - if (same_queue_rq) - *same_queue_rq = rq; + *same_queue_rq = rq; } if (rq->q != q || !blk_rq_merge_ok(rq, bio)) @@ -1939,176 +728,18 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, return false; } -unsigned int blk_plug_queued_count(struct request_queue *q) -{ - struct blk_plug *plug; - struct request *rq; - struct list_head *plug_list; - unsigned int ret = 0; - - plug = current->plug; - if (!plug) - goto out; - - if (q->mq_ops) - plug_list = &plug->mq_list; - else - plug_list = &plug->list; - - list_for_each_entry(rq, plug_list, queuelist) { - if (rq->q == q) - ret++; - } -out: - return ret; -} - void blk_init_request_from_bio(struct request *req, struct bio *bio) { - struct io_context *ioc = rq_ioc(bio); - if (bio->bi_opf & REQ_RAHEAD) req->cmd_flags |= REQ_FAILFAST_MASK; req->__sector = bio->bi_iter.bi_sector; - if (ioprio_valid(bio_prio(bio))) - req->ioprio = bio_prio(bio); - else if (ioc) - req->ioprio = ioc->ioprio; - else - req->ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0); + req->ioprio = bio_prio(bio); req->write_hint = bio->bi_write_hint; blk_rq_bio_prep(req->q, req, bio); } EXPORT_SYMBOL_GPL(blk_init_request_from_bio); -static blk_qc_t blk_queue_bio(struct request_queue *q, struct bio *bio) -{ - struct blk_plug *plug; - int where = ELEVATOR_INSERT_SORT; - struct request *req, *free; - unsigned int request_count = 0; - - /* - * low level driver can indicate that it wants pages above a - * certain limit bounced to low memory (ie for highmem, or even - * ISA dma in theory) - */ - blk_queue_bounce(q, &bio); - - blk_queue_split(q, &bio); - - if (!bio_integrity_prep(bio)) - return BLK_QC_T_NONE; - - if (op_is_flush(bio->bi_opf)) { - spin_lock_irq(q->queue_lock); - where = ELEVATOR_INSERT_FLUSH; - goto get_rq; - } - - /* - * Check if we can merge with the plugged list before grabbing - * any locks. - */ - if (!blk_queue_nomerges(q)) { - if (blk_attempt_plug_merge(q, bio, &request_count, NULL)) - return BLK_QC_T_NONE; - } else - request_count = blk_plug_queued_count(q); - - spin_lock_irq(q->queue_lock); - - switch (elv_merge(q, &req, bio)) { - case ELEVATOR_BACK_MERGE: - if (!bio_attempt_back_merge(q, req, bio)) - break; - elv_bio_merged(q, req, bio); - free = attempt_back_merge(q, req); - if (free) - __blk_put_request(q, free); - else - elv_merged_request(q, req, ELEVATOR_BACK_MERGE); - goto out_unlock; - case ELEVATOR_FRONT_MERGE: - if (!bio_attempt_front_merge(q, req, bio)) - break; - elv_bio_merged(q, req, bio); - free = attempt_front_merge(q, req); - if (free) - __blk_put_request(q, free); - else - elv_merged_request(q, req, ELEVATOR_FRONT_MERGE); - goto out_unlock; - default: - break; - } - -get_rq: - rq_qos_throttle(q, bio, q->queue_lock); - - /* - * Grab a free request. This is might sleep but can not fail. - * Returns with the queue unlocked. - */ - blk_queue_enter_live(q); - req = get_request(q, bio->bi_opf, bio, 0, GFP_NOIO); - if (IS_ERR(req)) { - blk_queue_exit(q); - rq_qos_cleanup(q, bio); - if (PTR_ERR(req) == -ENOMEM) - bio->bi_status = BLK_STS_RESOURCE; - else - bio->bi_status = BLK_STS_IOERR; - bio_endio(bio); - goto out_unlock; - } - - rq_qos_track(q, req, bio); - - /* - * After dropping the lock and possibly sleeping here, our request - * may now be mergeable after it had proven unmergeable (above). - * We don't worry about that case for efficiency. It won't happen - * often, and the elevators are able to handle it. - */ - blk_init_request_from_bio(req, bio); - - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) - req->cpu = raw_smp_processor_id(); - - plug = current->plug; - if (plug) { - /* - * If this is the first request added after a plug, fire - * of a plug trace. - * - * @request_count may become stale because of schedule - * out, so check plug list again. - */ - if (!request_count || list_empty(&plug->list)) - trace_block_plug(q); - else { - struct request *last = list_entry_rq(plug->list.prev); - if (request_count >= BLK_MAX_REQUEST_COUNT || - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } - } - list_add_tail(&req->queuelist, &plug->list); - blk_account_io_start(req, true); - } else { - spin_lock_irq(q->queue_lock); - add_acct_request(q, req, where); - __blk_run_queue(q); -out_unlock: - spin_unlock_irq(q->queue_lock); - } - - return BLK_QC_T_NONE; -} - static void handle_bad_sector(struct bio *bio, sector_t maxsector) { char b[BDEVNAME_SIZE]; @@ -2260,7 +891,7 @@ generic_make_request_checks(struct bio *bio) * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue is not a request based queue. */ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_rq_based(q)) + if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) goto not_supported; if (should_fail_bio(bio)) @@ -2290,6 +921,9 @@ generic_make_request_checks(struct bio *bio) } } + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + bio->bi_opf &= ~REQ_HIPRI; + switch (bio_op(bio)) { case REQ_OP_DISCARD: if (!blk_queue_discard(q)) @@ -2562,17 +1196,6 @@ blk_qc_t submit_bio(struct bio *bio) } EXPORT_SYMBOL(submit_bio); -bool blk_poll(struct request_queue *q, blk_qc_t cookie) -{ - if (!q->poll_fn || !blk_qc_t_valid(cookie)) - return false; - - if (current->plug) - blk_flush_plug_list(current->plug, false); - return q->poll_fn(q, cookie); -} -EXPORT_SYMBOL_GPL(blk_poll); - /** * blk_cloned_rq_check_limits - Helper function to check a cloned request * for new the queue limits @@ -2620,8 +1243,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q, */ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) { - unsigned long flags; - int where = ELEVATOR_INSERT_BACK; + blk_qc_t unused; if (blk_cloned_rq_check_limits(q, rq)) return BLK_STS_IOERR; @@ -2630,38 +1252,15 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request * should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) return BLK_STS_IOERR; - if (q->mq_ops) { - if (blk_queue_io_stat(q)) - blk_account_io_start(rq, true); - /* - * Since we have a scheduler attached on the top device, - * bypass a potential scheduler on the bottom device for - * insert. - */ - return blk_mq_request_issue_directly(rq); - } - - spin_lock_irqsave(q->queue_lock, flags); - if (unlikely(blk_queue_dying(q))) { - spin_unlock_irqrestore(q->queue_lock, flags); - return BLK_STS_IOERR; - } + if (blk_queue_io_stat(q)) + blk_account_io_start(rq, true); /* - * Submitting request must be dequeued before calling this function - * because it will be linked to another request_queue + * Since we have a scheduler attached on the top device, + * bypass a potential scheduler on the bottom device for + * insert. */ - BUG_ON(blk_queued_rq(rq)); - - if (op_is_flush(rq->cmd_flags)) - where = ELEVATOR_INSERT_FLUSH; - - add_acct_request(q, rq, where); - if (where == ELEVATOR_INSERT_FLUSH) - __blk_run_queue(q); - spin_unlock_irqrestore(q->queue_lock, flags); - - return BLK_STS_OK; + return blk_mq_try_issue_directly(rq->mq_hctx, rq, &unused, true, true); } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); @@ -2711,11 +1310,10 @@ void blk_account_io_completion(struct request *req, unsigned int bytes) if (blk_do_io_stat(req)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; - int cpu; - cpu = part_stat_lock(); + part_stat_lock(); part = req->part; - part_stat_add(cpu, part, sectors[sgrp], bytes >> 9); + part_stat_add(part, sectors[sgrp], bytes >> 9); part_stat_unlock(); } } @@ -2730,14 +1328,14 @@ void blk_account_io_done(struct request *req, u64 now) if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) { const int sgrp = op_stat_group(req_op(req)); struct hd_struct *part; - int cpu; - cpu = part_stat_lock(); + part_stat_lock(); part = req->part; - part_stat_inc(cpu, part, ios[sgrp]); - part_stat_add(cpu, part, nsecs[sgrp], now - req->start_time_ns); - part_round_stats(req->q, cpu, part); + update_io_ticks(part, jiffies); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns)); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); @@ -2749,16 +1347,15 @@ void blk_account_io_start(struct request *rq, bool new_io) { struct hd_struct *part; int rw = rq_data_dir(rq); - int cpu; if (!blk_do_io_stat(rq)) return; - cpu = part_stat_lock(); + part_stat_lock(); if (!new_io) { part = rq->part; - part_stat_inc(cpu, part, merges[rw]); + part_stat_inc(part, merges[rw]); } else { part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); if (!hd_struct_try_get(part)) { @@ -2773,232 +1370,14 @@ void blk_account_io_start(struct request *rq, bool new_io) part = &rq->rq_disk->part0; hd_struct_get(part); } - part_round_stats(rq->q, cpu, part); part_inc_in_flight(rq->q, part, rw); rq->part = part; } - part_stat_unlock(); -} - -static struct request *elv_next_request(struct request_queue *q) -{ - struct request *rq; - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - - WARN_ON_ONCE(q->mq_ops); - - while (1) { - list_for_each_entry(rq, &q->queue_head, queuelist) { -#ifdef CONFIG_PM - /* - * If a request gets queued in state RPM_SUSPENDED - * then that's a kernel bug. - */ - WARN_ON_ONCE(q->rpm_status == RPM_SUSPENDED); -#endif - return rq; - } - - /* - * Flush request is running and flush request isn't queueable - * in the drive, we can hold the queue till flush request is - * finished. Even we don't do this, driver can't dispatch next - * requests and will requeue them. And this can improve - * throughput too. For example, we have request flush1, write1, - * flush 2. flush1 is dispatched, then queue is hold, write1 - * isn't inserted to queue. After flush1 is finished, flush2 - * will be dispatched. Since disk cache is already clean, - * flush2 will be finished very soon, so looks like flush2 is - * folded to flush1. - * Since the queue is hold, a flag is set to indicate the queue - * should be restarted later. Please see flush_end_io() for - * details. - */ - if (fq->flush_pending_idx != fq->flush_running_idx && - !queue_flush_queueable(q)) { - fq->flush_queue_delayed = 1; - return NULL; - } - if (unlikely(blk_queue_bypass(q)) || - !q->elevator->type->ops.sq.elevator_dispatch_fn(q, 0)) - return NULL; - } -} - -/** - * blk_peek_request - peek at the top of a request queue - * @q: request queue to peek at - * - * Description: - * Return the request at the top of @q. The returned request - * should be started using blk_start_request() before LLD starts - * processing it. - * - * Return: - * Pointer to the request at the top of @q if available. Null - * otherwise. - */ -struct request *blk_peek_request(struct request_queue *q) -{ - struct request *rq; - int ret; - - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - while ((rq = elv_next_request(q)) != NULL) { - if (!(rq->rq_flags & RQF_STARTED)) { - /* - * This is the first time the device driver - * sees this request (possibly after - * requeueing). Notify IO scheduler. - */ - if (rq->rq_flags & RQF_SORTED) - elv_activate_rq(q, rq); - - /* - * just mark as started even if we don't start - * it, a request that has been delayed should - * not be passed by new incoming requests - */ - rq->rq_flags |= RQF_STARTED; - trace_block_rq_issue(q, rq); - } - - if (!q->boundary_rq || q->boundary_rq == rq) { - q->end_sector = rq_end_sector(rq); - q->boundary_rq = NULL; - } - - if (rq->rq_flags & RQF_DONTPREP) - break; - - if (q->dma_drain_size && blk_rq_bytes(rq)) { - /* - * make sure space for the drain appears we - * know we can do this because max_hw_segments - * has been adjusted to be one fewer than the - * device can handle - */ - rq->nr_phys_segments++; - } - - if (!q->prep_rq_fn) - break; - - ret = q->prep_rq_fn(q, rq); - if (ret == BLKPREP_OK) { - break; - } else if (ret == BLKPREP_DEFER) { - /* - * the request may have been (partially) prepped. - * we need to keep this request in the front to - * avoid resource deadlock. RQF_STARTED will - * prevent other fs requests from passing this one. - */ - if (q->dma_drain_size && blk_rq_bytes(rq) && - !(rq->rq_flags & RQF_DONTPREP)) { - /* - * remove the space for the drain we added - * so that we don't add it again - */ - --rq->nr_phys_segments; - } - - rq = NULL; - break; - } else if (ret == BLKPREP_KILL || ret == BLKPREP_INVALID) { - rq->rq_flags |= RQF_QUIET; - /* - * Mark this request as started so we don't trigger - * any debug logic in the end I/O path. - */ - blk_start_request(rq); - __blk_end_request_all(rq, ret == BLKPREP_INVALID ? - BLK_STS_TARGET : BLK_STS_IOERR); - } else { - printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); - break; - } - } - - return rq; -} -EXPORT_SYMBOL(blk_peek_request); - -static void blk_dequeue_request(struct request *rq) -{ - struct request_queue *q = rq->q; + update_io_ticks(part, jiffies); - BUG_ON(list_empty(&rq->queuelist)); - BUG_ON(ELV_ON_HASH(rq)); - - list_del_init(&rq->queuelist); - - /* - * the time frame between a request being removed from the lists - * and to it is freed is accounted as io that is in progress at - * the driver side. - */ - if (blk_account_rq(rq)) - q->in_flight[rq_is_sync(rq)]++; -} - -/** - * blk_start_request - start request processing on the driver - * @req: request to dequeue - * - * Description: - * Dequeue @req and start timeout timer on it. This hands off the - * request to the driver. - */ -void blk_start_request(struct request *req) -{ - lockdep_assert_held(req->q->queue_lock); - WARN_ON_ONCE(req->q->mq_ops); - - blk_dequeue_request(req); - - if (test_bit(QUEUE_FLAG_STATS, &req->q->queue_flags)) { - req->io_start_time_ns = ktime_get_ns(); -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - req->throtl_size = blk_rq_sectors(req); -#endif - req->rq_flags |= RQF_STATS; - rq_qos_issue(req->q, req); - } - - BUG_ON(blk_rq_is_complete(req)); - blk_add_timer(req); -} -EXPORT_SYMBOL(blk_start_request); - -/** - * blk_fetch_request - fetch a request from a request queue - * @q: request queue to fetch a request from - * - * Description: - * Return the request at the top of @q. The request is started on - * return and LLD can start processing it immediately. - * - * Return: - * Pointer to the request at the top of @q if available. Null - * otherwise. - */ -struct request *blk_fetch_request(struct request_queue *q) -{ - struct request *rq; - - lockdep_assert_held(q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - rq = blk_peek_request(q); - if (rq) - blk_start_request(rq); - return rq; + part_stat_unlock(); } -EXPORT_SYMBOL(blk_fetch_request); /* * Steal bios from a request and add them to a bio list. @@ -3125,255 +1504,6 @@ bool blk_update_request(struct request *req, blk_status_t error, } EXPORT_SYMBOL_GPL(blk_update_request); -static bool blk_update_bidi_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes, - unsigned int bidi_bytes) -{ - if (blk_update_request(rq, error, nr_bytes)) - return true; - - /* Bidi request must be completed as a whole */ - if (unlikely(blk_bidi_rq(rq)) && - blk_update_request(rq->next_rq, error, bidi_bytes)) - return true; - - if (blk_queue_add_random(rq->q)) - add_disk_randomness(rq->rq_disk); - - return false; -} - -/** - * blk_unprep_request - unprepare a request - * @req: the request - * - * This function makes a request ready for complete resubmission (or - * completion). It happens only after all error handling is complete, - * so represents the appropriate moment to deallocate any resources - * that were allocated to the request in the prep_rq_fn. The queue - * lock is held when calling this. - */ -void blk_unprep_request(struct request *req) -{ - struct request_queue *q = req->q; - - req->rq_flags &= ~RQF_DONTPREP; - if (q->unprep_rq_fn) - q->unprep_rq_fn(q, req); -} -EXPORT_SYMBOL_GPL(blk_unprep_request); - -void blk_finish_request(struct request *req, blk_status_t error) -{ - struct request_queue *q = req->q; - u64 now = ktime_get_ns(); - - lockdep_assert_held(req->q->queue_lock); - WARN_ON_ONCE(q->mq_ops); - - if (req->rq_flags & RQF_STATS) - blk_stat_add(req, now); - - if (req->rq_flags & RQF_QUEUED) - blk_queue_end_tag(q, req); - - BUG_ON(blk_queued_rq(req)); - - if (unlikely(laptop_mode) && !blk_rq_is_passthrough(req)) - laptop_io_completion(req->q->backing_dev_info); - - blk_delete_timer(req); - - if (req->rq_flags & RQF_DONTPREP) - blk_unprep_request(req); - - blk_account_io_done(req, now); - - if (req->end_io) { - rq_qos_done(q, req); - req->end_io(req, error); - } else { - if (blk_bidi_rq(req)) - __blk_put_request(req->next_rq->q, req->next_rq); - - __blk_put_request(q, req); - } -} -EXPORT_SYMBOL(blk_finish_request); - -/** - * blk_end_bidi_request - Complete a bidi request - * @rq: the request to complete - * @error: block status code - * @nr_bytes: number of bytes to complete @rq - * @bidi_bytes: number of bytes to complete @rq->next_rq - * - * Description: - * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. - * Drivers that supports bidi can safely call this member for any - * type of request, bidi or uni. In the later case @bidi_bytes is - * just ignored. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -static bool blk_end_bidi_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes, unsigned int bidi_bytes) -{ - struct request_queue *q = rq->q; - unsigned long flags; - - WARN_ON_ONCE(q->mq_ops); - - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) - return true; - - spin_lock_irqsave(q->queue_lock, flags); - blk_finish_request(rq, error); - spin_unlock_irqrestore(q->queue_lock, flags); - - return false; -} - -/** - * __blk_end_bidi_request - Complete a bidi request with queue lock held - * @rq: the request to complete - * @error: block status code - * @nr_bytes: number of bytes to complete @rq - * @bidi_bytes: number of bytes to complete @rq->next_rq - * - * Description: - * Identical to blk_end_bidi_request() except that queue lock is - * assumed to be locked on entry and remains so on return. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -static bool __blk_end_bidi_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes, unsigned int bidi_bytes) -{ - lockdep_assert_held(rq->q->queue_lock); - WARN_ON_ONCE(rq->q->mq_ops); - - if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) - return true; - - blk_finish_request(rq, error); - - return false; -} - -/** - * blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: block status code - * @nr_bytes: number of bytes to complete - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -bool blk_end_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes) -{ - WARN_ON_ONCE(rq->q->mq_ops); - return blk_end_bidi_request(rq, error, nr_bytes, 0); -} -EXPORT_SYMBOL(blk_end_request); - -/** - * blk_end_request_all - Helper function for drives to finish the request. - * @rq: the request to finish - * @error: block status code - * - * Description: - * Completely finish @rq. - */ -void blk_end_request_all(struct request *rq, blk_status_t error) -{ - bool pending; - unsigned int bidi_bytes = 0; - - if (unlikely(blk_bidi_rq(rq))) - bidi_bytes = blk_rq_bytes(rq->next_rq); - - pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); - BUG_ON(pending); -} -EXPORT_SYMBOL(blk_end_request_all); - -/** - * __blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: block status code - * @nr_bytes: number of bytes to complete - * - * Description: - * Must be called with queue lock held unlike blk_end_request(). - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -bool __blk_end_request(struct request *rq, blk_status_t error, - unsigned int nr_bytes) -{ - lockdep_assert_held(rq->q->queue_lock); - WARN_ON_ONCE(rq->q->mq_ops); - - return __blk_end_bidi_request(rq, error, nr_bytes, 0); -} -EXPORT_SYMBOL(__blk_end_request); - -/** - * __blk_end_request_all - Helper function for drives to finish the request. - * @rq: the request to finish - * @error: block status code - * - * Description: - * Completely finish @rq. Must be called with queue lock held. - */ -void __blk_end_request_all(struct request *rq, blk_status_t error) -{ - bool pending; - unsigned int bidi_bytes = 0; - - lockdep_assert_held(rq->q->queue_lock); - WARN_ON_ONCE(rq->q->mq_ops); - - if (unlikely(blk_bidi_rq(rq))) - bidi_bytes = blk_rq_bytes(rq->next_rq); - - pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); - BUG_ON(pending); -} -EXPORT_SYMBOL(__blk_end_request_all); - -/** - * __blk_end_request_cur - Helper function to finish the current request chunk. - * @rq: the request to finish the current chunk for - * @error: block status code - * - * Description: - * Complete the current consecutively mapped chunk from @rq. Must - * be called with queue lock held. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - */ -bool __blk_end_request_cur(struct request *rq, blk_status_t error) -{ - return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); -} -EXPORT_SYMBOL(__blk_end_request_cur); - void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { @@ -3429,8 +1559,8 @@ EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); */ int blk_lld_busy(struct request_queue *q) { - if (q->lld_busy_fn) - return q->lld_busy_fn(q); + if (queue_is_mq(q) && q->mq_ops->busy) + return q->mq_ops->busy(q); return 0; } @@ -3461,7 +1591,6 @@ EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); */ static void __blk_rq_prep_clone(struct request *dst, struct request *src) { - dst->cpu = src->cpu; dst->__sector = blk_rq_pos(src); dst->__data_len = blk_rq_bytes(src); if (src->rq_flags & RQF_SPECIAL_PAYLOAD) { @@ -3573,9 +1702,11 @@ void blk_start_plug(struct blk_plug *plug) if (tsk->plug) return; - INIT_LIST_HEAD(&plug->list); INIT_LIST_HEAD(&plug->mq_list); INIT_LIST_HEAD(&plug->cb_list); + plug->rq_count = 0; + plug->multiple_queues = false; + /* * Store ordering should not be needed here, since a potential * preempt will imply a full memory barrier @@ -3584,36 +1715,6 @@ void blk_start_plug(struct blk_plug *plug) } EXPORT_SYMBOL(blk_start_plug); -static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct request *rqa = container_of(a, struct request, queuelist); - struct request *rqb = container_of(b, struct request, queuelist); - - return !(rqa->q < rqb->q || - (rqa->q == rqb->q && blk_rq_pos(rqa) < blk_rq_pos(rqb))); -} - -/* - * If 'from_schedule' is true, then postpone the dispatch of requests - * until a safe kblockd context. We due this to avoid accidental big - * additional stack usage in driver dispatch, in places where the originally - * plugger did not intend it. - */ -static void queue_unplugged(struct request_queue *q, unsigned int depth, - bool from_schedule) - __releases(q->queue_lock) -{ - lockdep_assert_held(q->queue_lock); - - trace_block_unplug(q, depth, !from_schedule); - - if (from_schedule) - blk_run_queue_async(q); - else - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); -} - static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) { LIST_HEAD(callbacks); @@ -3658,65 +1759,10 @@ EXPORT_SYMBOL(blk_check_plugged); void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) { - struct request_queue *q; - struct request *rq; - LIST_HEAD(list); - unsigned int depth; - flush_plug_callbacks(plug, from_schedule); if (!list_empty(&plug->mq_list)) blk_mq_flush_plug_list(plug, from_schedule); - - if (list_empty(&plug->list)) - return; - - list_splice_init(&plug->list, &list); - - list_sort(NULL, &list, plug_rq_cmp); - - q = NULL; - depth = 0; - - while (!list_empty(&list)) { - rq = list_entry_rq(list.next); - list_del_init(&rq->queuelist); - BUG_ON(!rq->q); - if (rq->q != q) { - /* - * This drops the queue lock - */ - if (q) - queue_unplugged(q, depth, from_schedule); - q = rq->q; - depth = 0; - spin_lock_irq(q->queue_lock); - } - - /* - * Short-circuit if @q is dead - */ - if (unlikely(blk_queue_dying(q))) { - __blk_end_request_all(rq, BLK_STS_IOERR); - continue; - } - - /* - * rq is already accounted, so use raw insert - */ - if (op_is_flush(rq->cmd_flags)) - __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); - else - __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); - - depth++; - } - - /* - * This drops the queue lock - */ - if (q) - queue_unplugged(q, depth, from_schedule); } void blk_finish_plug(struct blk_plug *plug) @@ -3743,9 +1789,6 @@ int __init blk_dev_init(void) if (!kblockd_workqueue) panic("Failed to create kblockd\n"); - request_cachep = kmem_cache_create("blkdev_requests", - sizeof(struct request), 0, SLAB_PANIC, NULL); - blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); diff --git a/block/blk-exec.c b/block/blk-exec.c index f7b292f12449..a34b7d918742 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -48,8 +48,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, struct request *rq, int at_head, rq_end_io_fn *done) { - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; - WARN_ON(irqs_disabled()); WARN_ON(!blk_rq_is_passthrough(rq)); @@ -60,23 +58,7 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, * don't check dying flag for MQ because the request won't * be reused after dying flag is set */ - if (q->mq_ops) { - blk_mq_sched_insert_request(rq, at_head, true, false); - return; - } - - spin_lock_irq(q->queue_lock); - - if (unlikely(blk_queue_dying(q))) { - rq->rq_flags |= RQF_QUIET; - __blk_end_request_all(rq, BLK_STS_IOERR); - spin_unlock_irq(q->queue_lock); - return; - } - - __elv_add_request(q, rq, where); - __blk_run_queue(q); - spin_unlock_irq(q->queue_lock); + blk_mq_sched_insert_request(rq, at_head, true, false); } EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); diff --git a/block/blk-flush.c b/block/blk-flush.c index 8b44b86779da..a3fc7191c694 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -93,7 +93,7 @@ enum { FLUSH_PENDING_TIMEOUT = 5 * HZ, }; -static bool blk_kick_flush(struct request_queue *q, +static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, unsigned int flags); static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) @@ -132,18 +132,9 @@ static void blk_flush_restore_request(struct request *rq) rq->end_io = rq->flush.saved_end_io; } -static bool blk_flush_queue_rq(struct request *rq, bool add_front) +static void blk_flush_queue_rq(struct request *rq, bool add_front) { - if (rq->q->mq_ops) { - blk_mq_add_to_requeue_list(rq, add_front, true); - return false; - } else { - if (add_front) - list_add(&rq->queuelist, &rq->q->queue_head); - else - list_add_tail(&rq->queuelist, &rq->q->queue_head); - return true; - } + blk_mq_add_to_requeue_list(rq, add_front, true); } /** @@ -157,18 +148,17 @@ static bool blk_flush_queue_rq(struct request *rq, bool add_front) * completion and trigger the next step. * * CONTEXT: - * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) + * spin_lock_irq(fq->mq_flush_lock) * * RETURNS: * %true if requests were added to the dispatch queue, %false otherwise. */ -static bool blk_flush_complete_seq(struct request *rq, +static void blk_flush_complete_seq(struct request *rq, struct blk_flush_queue *fq, unsigned int seq, blk_status_t error) { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; - bool queued = false, kicked; unsigned int cmd_flags; BUG_ON(rq->flush.seq & seq); @@ -191,7 +181,7 @@ static bool blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); - queued = blk_flush_queue_rq(rq, true); + blk_flush_queue_rq(rq, true); break; case REQ_FSEQ_DONE: @@ -204,42 +194,34 @@ static bool blk_flush_complete_seq(struct request *rq, BUG_ON(!list_empty(&rq->queuelist)); list_del_init(&rq->flush.list); blk_flush_restore_request(rq); - if (q->mq_ops) - blk_mq_end_request(rq, error); - else - __blk_end_request_all(rq, error); + blk_mq_end_request(rq, error); break; default: BUG(); } - kicked = blk_kick_flush(q, fq, cmd_flags); - return kicked | queued; + blk_kick_flush(q, fq, cmd_flags); } static void flush_end_io(struct request *flush_rq, blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; - bool queued = false; struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); + struct blk_mq_hw_ctx *hctx; - if (q->mq_ops) { - struct blk_mq_hw_ctx *hctx; - - /* release the tag's ownership to the req cloned from */ - spin_lock_irqsave(&fq->mq_flush_lock, flags); - hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); - if (!q->elevator) { - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; - } else { - blk_mq_put_driver_tag_hctx(hctx, flush_rq); - flush_rq->internal_tag = -1; - } + /* release the tag's ownership to the req cloned from */ + spin_lock_irqsave(&fq->mq_flush_lock, flags); + hctx = flush_rq->mq_hctx; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); + flush_rq->tag = -1; + } else { + blk_mq_put_driver_tag_hctx(hctx, flush_rq); + flush_rq->internal_tag = -1; } running = &fq->flush_queue[fq->flush_running_idx]; @@ -248,35 +230,16 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) /* account completion of the flush request */ fq->flush_running_idx ^= 1; - if (!q->mq_ops) - elv_completed_request(q, flush_rq); - /* and push the waiting requests to the next stage */ list_for_each_entry_safe(rq, n, running, flush.list) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); - queued |= blk_flush_complete_seq(rq, fq, seq, error); + blk_flush_complete_seq(rq, fq, seq, error); } - /* - * Kick the queue to avoid stall for two cases: - * 1. Moving a request silently to empty queue_head may stall the - * queue. - * 2. When flush request is running in non-queueable queue, the - * queue is hold. Restart the queue after flush request is finished - * to avoid stall. - * This function is called from request completion path and calling - * directly into request_fn may confuse the driver. Always use - * kblockd. - */ - if (queued || fq->flush_queue_delayed) { - WARN_ON(q->mq_ops); - blk_run_queue_async(q); - } fq->flush_queue_delayed = 0; - if (q->mq_ops) - spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); } /** @@ -289,12 +252,10 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) * Please read the comment at the top of this file for more info. * * CONTEXT: - * spin_lock_irq(q->queue_lock or fq->mq_flush_lock) + * spin_lock_irq(fq->mq_flush_lock) * - * RETURNS: - * %true if flush was issued, %false otherwise. */ -static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, +static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, unsigned int flags) { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; @@ -304,7 +265,7 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, /* C1 described at the top of this file */ if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) - return false; + return; /* C2 and C3 * @@ -312,11 +273,10 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, * assigned to empty flushes, and we deadlock if we are expecting * other requests to make progress. Don't defer for that case. */ - if (!list_empty(&fq->flush_data_in_flight) && - !(q->mq_ops && q->elevator) && + if (!list_empty(&fq->flush_data_in_flight) && q->elevator && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) - return false; + return; /* * Issue flush and toggle pending_idx. This makes pending_idx @@ -334,19 +294,15 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, * In case of IO scheduler, flush rq need to borrow scheduler tag * just for cheating put/get driver tag. */ - if (q->mq_ops) { - struct blk_mq_hw_ctx *hctx; - - flush_rq->mq_ctx = first_rq->mq_ctx; - - if (!q->elevator) { - fq->orig_rq = first_rq; - flush_rq->tag = first_rq->tag; - hctx = blk_mq_map_queue(q, first_rq->mq_ctx->cpu); - blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); - } else { - flush_rq->internal_tag = first_rq->internal_tag; - } + flush_rq->mq_ctx = first_rq->mq_ctx; + flush_rq->mq_hctx = first_rq->mq_hctx; + + if (!q->elevator) { + fq->orig_rq = first_rq; + flush_rq->tag = first_rq->tag; + blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq); + } else { + flush_rq->internal_tag = first_rq->internal_tag; } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; @@ -355,62 +311,17 @@ static bool blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; - return blk_flush_queue_rq(flush_rq, false); -} - -static void flush_data_end_io(struct request *rq, blk_status_t error) -{ - struct request_queue *q = rq->q; - struct blk_flush_queue *fq = blk_get_flush_queue(q, NULL); - - lockdep_assert_held(q->queue_lock); - - /* - * Updating q->in_flight[] here for making this tag usable - * early. Because in blk_queue_start_tag(), - * q->in_flight[BLK_RW_ASYNC] is used to limit async I/O and - * reserve tags for sync I/O. - * - * More importantly this way can avoid the following I/O - * deadlock: - * - * - suppose there are 40 fua requests comming to flush queue - * and queue depth is 31 - * - 30 rqs are scheduled then blk_queue_start_tag() can't alloc - * tag for async I/O any more - * - all the 30 rqs are completed before FLUSH_PENDING_TIMEOUT - * and flush_data_end_io() is called - * - the other rqs still can't go ahead if not updating - * q->in_flight[BLK_RW_ASYNC] here, meantime these rqs - * are held in flush data queue and make no progress of - * handling post flush rq - * - only after the post flush rq is handled, all these rqs - * can be completed - */ - - elv_completed_request(q, rq); - - /* for avoiding double accounting */ - rq->rq_flags &= ~RQF_STARTED; - - /* - * After populating an empty queue, kick it to avoid stall. Read - * the comment in flush_end_io(). - */ - if (blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error)) - blk_run_queue_async(q); + blk_flush_queue_rq(flush_rq, false); } static void mq_flush_data_end_io(struct request *rq, blk_status_t error) { struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct blk_mq_ctx *ctx = rq->mq_ctx; unsigned long flags; struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); - hctx = blk_mq_map_queue(q, ctx->cpu); - if (q->elevator) { WARN_ON(rq->tag < 0); blk_mq_put_driver_tag_hctx(hctx, rq); @@ -443,9 +354,6 @@ void blk_insert_flush(struct request *rq) unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); - if (!q->mq_ops) - lockdep_assert_held(q->queue_lock); - /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. @@ -468,10 +376,7 @@ void blk_insert_flush(struct request *rq) * complete the request. */ if (!policy) { - if (q->mq_ops) - blk_mq_end_request(rq, 0); - else - __blk_end_request(rq, 0, 0); + blk_mq_end_request(rq, 0); return; } @@ -484,10 +389,7 @@ void blk_insert_flush(struct request *rq) */ if ((policy & REQ_FSEQ_DATA) && !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - if (q->mq_ops) - blk_mq_request_bypass_insert(rq, false); - else - list_add_tail(&rq->queuelist, &q->queue_head); + blk_mq_request_bypass_insert(rq, false); return; } @@ -499,17 +401,12 @@ void blk_insert_flush(struct request *rq) INIT_LIST_HEAD(&rq->flush.list); rq->rq_flags |= RQF_FLUSH_SEQ; rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ - if (q->mq_ops) { - rq->end_io = mq_flush_data_end_io; - spin_lock_irq(&fq->mq_flush_lock); - blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); - spin_unlock_irq(&fq->mq_flush_lock); - return; - } - rq->end_io = flush_data_end_io; + rq->end_io = mq_flush_data_end_io; + spin_lock_irq(&fq->mq_flush_lock); blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&fq->mq_flush_lock); } /** @@ -575,8 +472,7 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, if (!fq) goto fail; - if (q->mq_ops) - spin_lock_init(&fq->mq_flush_lock); + spin_lock_init(&fq->mq_flush_lock); rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); fq->flush_rq = kzalloc_node(rq_sz, flags, node); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 01580f88fcb3..5ed59ac6ae58 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -28,7 +28,6 @@ void get_io_context(struct io_context *ioc) BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); } -EXPORT_SYMBOL(get_io_context); static void icq_free_icq_rcu(struct rcu_head *head) { @@ -48,10 +47,8 @@ static void ioc_exit_icq(struct io_cq *icq) if (icq->flags & ICQ_EXITED) return; - if (et->uses_mq && et->ops.mq.exit_icq) - et->ops.mq.exit_icq(icq); - else if (!et->uses_mq && et->ops.sq.elevator_exit_icq_fn) - et->ops.sq.elevator_exit_icq_fn(icq); + if (et->ops.exit_icq) + et->ops.exit_icq(icq); icq->flags |= ICQ_EXITED; } @@ -113,9 +110,9 @@ static void ioc_release_fn(struct work_struct *work) struct io_cq, ioc_node); struct request_queue *q = icq->q; - if (spin_trylock(q->queue_lock)) { + if (spin_trylock(&q->queue_lock)) { ioc_destroy_icq(icq); - spin_unlock(q->queue_lock); + spin_unlock(&q->queue_lock); } else { spin_unlock_irqrestore(&ioc->lock, flags); cpu_relax(); @@ -162,7 +159,6 @@ void put_io_context(struct io_context *ioc) if (free_ioc) kmem_cache_free(iocontext_cachep, ioc); } -EXPORT_SYMBOL(put_io_context); /** * put_io_context_active - put active reference on ioc @@ -173,7 +169,6 @@ EXPORT_SYMBOL(put_io_context); */ void put_io_context_active(struct io_context *ioc) { - struct elevator_type *et; unsigned long flags; struct io_cq *icq; @@ -187,25 +182,12 @@ void put_io_context_active(struct io_context *ioc) * reverse double locking. Read comment in ioc_release_fn() for * explanation on the nested locking annotation. */ -retry: spin_lock_irqsave_nested(&ioc->lock, flags, 1); hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { if (icq->flags & ICQ_EXITED) continue; - et = icq->q->elevator->type; - if (et->uses_mq) { - ioc_exit_icq(icq); - } else { - if (spin_trylock(icq->q->queue_lock)) { - ioc_exit_icq(icq); - spin_unlock(icq->q->queue_lock); - } else { - spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); - goto retry; - } - } + ioc_exit_icq(icq); } spin_unlock_irqrestore(&ioc->lock, flags); @@ -232,7 +214,7 @@ static void __ioc_clear_queue(struct list_head *icq_list) while (!list_empty(icq_list)) { struct io_cq *icq = list_entry(icq_list->next, - struct io_cq, q_node); + struct io_cq, q_node); struct io_context *ioc = icq->ioc; spin_lock_irqsave(&ioc->lock, flags); @@ -251,16 +233,11 @@ void ioc_clear_queue(struct request_queue *q) { LIST_HEAD(icq_list); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); list_splice_init(&q->icq_list, &icq_list); + spin_unlock_irq(&q->queue_lock); - if (q->mq_ops) { - spin_unlock_irq(q->queue_lock); - __ioc_clear_queue(&icq_list); - } else { - __ioc_clear_queue(&icq_list); - spin_unlock_irq(q->queue_lock); - } + __ioc_clear_queue(&icq_list); } int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) @@ -336,7 +313,6 @@ struct io_context *get_task_io_context(struct task_struct *task, return NULL; } -EXPORT_SYMBOL(get_task_io_context); /** * ioc_lookup_icq - lookup io_cq from ioc @@ -350,7 +326,7 @@ struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) { struct io_cq *icq; - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); /* * icq's are indexed from @ioc using radix tree and hint pointer, @@ -409,16 +385,14 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, INIT_HLIST_NODE(&icq->ioc_node); /* lock both q and ioc and try to link @icq */ - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); spin_lock(&ioc->lock); if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { hlist_add_head(&icq->ioc_node, &ioc->icq_list); list_add(&icq->q_node, &q->icq_list); - if (et->uses_mq && et->ops.mq.init_icq) - et->ops.mq.init_icq(icq); - else if (!et->uses_mq && et->ops.sq.elevator_init_icq_fn) - et->ops.sq.elevator_init_icq_fn(icq); + if (et->ops.init_icq) + et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); icq = ioc_lookup_icq(ioc, q); @@ -427,7 +401,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, } spin_unlock(&ioc->lock); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); radix_tree_preload_end(); return icq; } diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index 38c35c32aff2..fc714ef402a6 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -262,29 +262,25 @@ static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, stat->rqs.mean); } -static inline bool iolatency_may_queue(struct iolatency_grp *iolat, - wait_queue_entry_t *wait, - bool first_block) +static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) { - struct rq_wait *rqw = &iolat->rq_wait; + atomic_dec(&rqw->inflight); + wake_up(&rqw->wait); +} - if (first_block && waitqueue_active(&rqw->wait) && - rqw->wait.head.next != &wait->entry) - return false; +static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) +{ + struct iolatency_grp *iolat = private_data; return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); } static void __blkcg_iolatency_throttle(struct rq_qos *rqos, struct iolatency_grp *iolat, - spinlock_t *lock, bool issue_as_root, + bool issue_as_root, bool use_memdelay) - __releases(lock) - __acquires(lock) { struct rq_wait *rqw = &iolat->rq_wait; unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); - DEFINE_WAIT(wait); - bool first_block = true; if (use_delay) blkcg_schedule_throttle(rqos->q, use_memdelay); @@ -301,27 +297,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, return; } - if (iolatency_may_queue(iolat, &wait, first_block)) - return; - - do { - prepare_to_wait_exclusive(&rqw->wait, &wait, - TASK_UNINTERRUPTIBLE); - - if (iolatency_may_queue(iolat, &wait, first_block)) - break; - first_block = false; - - if (lock) { - spin_unlock_irq(lock); - io_schedule(); - spin_lock_irq(lock); - } else { - io_schedule(); - } - } while (1); - - finish_wait(&rqw->wait, &wait); + rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb); } #define SCALE_DOWN_FACTOR 2 @@ -478,38 +454,15 @@ static void check_scale_change(struct iolatency_grp *iolat) scale_change(iolat, direction > 0); } -static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio, - spinlock_t *lock) +static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - struct blkcg *blkcg; - struct blkcg_gq *blkg; - struct request_queue *q = rqos->q; + struct blkcg_gq *blkg = bio->bi_blkg; bool issue_as_root = bio_issue_as_root_blkg(bio); if (!blk_iolatency_enabled(blkiolat)) return; - rcu_read_lock(); - blkcg = bio_blkcg(bio); - bio_associate_blkcg(bio, &blkcg->css); - blkg = blkg_lookup(blkcg, q); - if (unlikely(!blkg)) { - if (!lock) - spin_lock_irq(q->queue_lock); - blkg = blkg_lookup_create(blkcg, q); - if (IS_ERR(blkg)) - blkg = NULL; - if (!lock) - spin_unlock_irq(q->queue_lock); - } - if (!blkg) - goto out; - - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); - bio_associate_blkg(bio, blkg); -out: - rcu_read_unlock(); while (blkg && blkg->parent) { struct iolatency_grp *iolat = blkg_to_lat(blkg); if (!iolat) { @@ -518,7 +471,7 @@ out: } check_scale_change(iolat); - __blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root, + __blkcg_iolatency_throttle(rqos, iolat, issue_as_root, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); blkg = blkg->parent; } @@ -640,7 +593,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) bool enabled = false; blkg = bio->bi_blkg; - if (!blkg) + if (!blkg || !bio_flagged(bio, BIO_TRACKED)) return; iolat = blkg_to_lat(bio->bi_blkg); @@ -730,7 +683,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) * We could be exiting, don't access the pd unless we have a * ref on the blkg. */ - if (!blkg_try_get(blkg)) + if (!blkg_tryget(blkg)) continue; iolat = blkg_to_lat(blkg); diff --git a/block/blk-lib.c b/block/blk-lib.c index 76f867ea9a9b..5f2c429d4378 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -51,16 +51,14 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, if ((sector | nr_sects) & bs_mask) return -EINVAL; - while (nr_sects) { - unsigned int req_sects = nr_sects; - sector_t end_sect; + if (!nr_sects) + return -EINVAL; - if (!req_sects) - goto fail; - if (req_sects > UINT_MAX >> 9) - req_sects = UINT_MAX >> 9; + while (nr_sects) { + sector_t req_sects = min_t(sector_t, nr_sects, + bio_allowed_max_sectors(q)); - end_sect = sector + req_sects; + WARN_ON_ONCE((req_sects << 9) > UINT_MAX); bio = blk_next_bio(bio, 0, gfp_mask); bio->bi_iter.bi_sector = sector; @@ -68,8 +66,8 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, bio_set_op_attrs(bio, op, 0); bio->bi_iter.bi_size = req_sects << 9; + sector += req_sects; nr_sects -= req_sects; - sector = end_sect; /* * We can loop for a long time in here, if someone does @@ -82,14 +80,6 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, *biop = bio; return 0; - -fail: - if (bio) { - submit_bio_wait(bio); - bio_put(bio); - } - *biop = NULL; - return -EOPNOTSUPP; } EXPORT_SYMBOL(__blkdev_issue_discard); @@ -161,7 +151,7 @@ static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, return -EOPNOTSUPP; /* Ensure that max_write_same_sectors doesn't overflow bi_size */ - max_write_same_sectors = UINT_MAX >> 9; + max_write_same_sectors = bio_allowed_max_sectors(q); while (nr_sects) { bio = blk_next_bio(bio, 1, gfp_mask); diff --git a/block/blk-merge.c b/block/blk-merge.c index 4478d53cc6ee..71e9ac03f621 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -46,7 +46,7 @@ static inline bool bio_will_gap(struct request_queue *q, bio_get_first_bvec(prev_rq->bio, &pb); else bio_get_first_bvec(prev, &pb); - if (pb.bv_offset) + if (pb.bv_offset & queue_virt_boundary(q)) return true; /* @@ -90,7 +90,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(q->limits.discard_granularity >> 9, 1U); - max_discard_sectors = min(q->limits.max_discard_sectors, UINT_MAX >> 9); + max_discard_sectors = min(q->limits.max_discard_sectors, + bio_allowed_max_sectors(q)); max_discard_sectors -= max_discard_sectors % granularity; if (unlikely(!max_discard_sectors)) { @@ -387,7 +388,6 @@ void blk_recount_segments(struct request_queue *q, struct bio *bio) bio_set_flag(bio, BIO_SEG_VALID); } -EXPORT_SYMBOL(blk_recount_segments); static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, struct bio *nxt) @@ -591,17 +591,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, return ll_new_hw_segment(q, req, bio); } -/* - * blk-mq uses req->special to carry normal driver per-request payload, it - * does not indicate a prepared command that we cannot merge with. - */ -static bool req_no_special_merge(struct request *req) -{ - struct request_queue *q = req->q; - - return !q->mq_ops && req->special; -} - static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, struct request *next) { @@ -627,13 +616,6 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, unsigned int seg_size = req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; - /* - * First check if the either of the requests are re-queued - * requests. Can't merge them if they are. - */ - if (req_no_special_merge(req) || req_no_special_merge(next)) - return 0; - if (req_gap_back_merge(req, next->bio)) return 0; @@ -698,12 +680,10 @@ static void blk_account_io_merge(struct request *req) { if (blk_do_io_stat(req)) { struct hd_struct *part; - int cpu; - cpu = part_stat_lock(); + part_stat_lock(); part = req->part; - part_round_stats(req->q, cpu, part); part_dec_in_flight(req->q, part, rq_data_dir(req)); hd_struct_put(part); @@ -726,7 +706,8 @@ static inline bool blk_discard_mergable(struct request *req) return false; } -enum elv_merge blk_try_req_merge(struct request *req, struct request *next) +static enum elv_merge blk_try_req_merge(struct request *req, + struct request *next) { if (blk_discard_mergable(req)) return ELEVATOR_DISCARD_MERGE; @@ -743,9 +724,6 @@ enum elv_merge blk_try_req_merge(struct request *req, struct request *next) static struct request *attempt_merge(struct request_queue *q, struct request *req, struct request *next) { - if (!q->mq_ops) - lockdep_assert_held(q->queue_lock); - if (!rq_mergeable(req) || !rq_mergeable(next)) return NULL; @@ -753,8 +731,7 @@ static struct request *attempt_merge(struct request_queue *q, return NULL; if (rq_data_dir(req) != rq_data_dir(next) - || req->rq_disk != next->rq_disk - || req_no_special_merge(next)) + || req->rq_disk != next->rq_disk) return NULL; if (req_op(req) == REQ_OP_WRITE_SAME && @@ -768,6 +745,9 @@ static struct request *attempt_merge(struct request_queue *q, if (req->write_hint != next->write_hint) return NULL; + if (req->ioprio != next->ioprio) + return NULL; + /* * If we are allowed to merge, then append bio list * from next to rq and release next. merge_requests_fn @@ -815,7 +795,7 @@ static struct request *attempt_merge(struct request_queue *q, req->__data_len += blk_rq_bytes(next); - if (req_op(req) != REQ_OP_DISCARD) + if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next); /* @@ -823,10 +803,6 @@ static struct request *attempt_merge(struct request_queue *q, */ blk_account_io_merge(next); - req->ioprio = ioprio_best(req->ioprio, next->ioprio); - if (blk_rq_cpu_valid(next)) - req->cpu = next->cpu; - /* * ownership of bio passed from next to req, return 'next' for * the caller to free @@ -858,16 +834,11 @@ struct request *attempt_front_merge(struct request_queue *q, struct request *rq) int blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next) { - struct elevator_queue *e = q->elevator; struct request *free; - if (!e->uses_mq && e->type->ops.sq.elevator_allow_rq_merge_fn) - if (!e->type->ops.sq.elevator_allow_rq_merge_fn(q, rq, next)) - return 0; - free = attempt_merge(q, rq, next); if (free) { - __blk_put_request(q, free); + blk_put_request(free); return 1; } @@ -886,8 +857,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; - /* must be same device and not a special request */ - if (rq->rq_disk != bio->bi_disk || req_no_special_merge(rq)) + /* must be same device */ + if (rq->rq_disk != bio->bi_disk) return false; /* only merge integrity protected bio into ditto rq */ @@ -906,6 +877,9 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (rq->write_hint != bio->bi_write_hint) return false; + if (rq->ioprio != bio_prio(bio)) + return false; + return true; } diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 3eb169f15842..03a534820271 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -14,9 +14,10 @@ #include "blk.h" #include "blk-mq.h" -static int cpu_to_queue_index(unsigned int nr_queues, const int cpu) +static int cpu_to_queue_index(struct blk_mq_queue_map *qmap, + unsigned int nr_queues, const int cpu) { - return cpu % nr_queues; + return qmap->queue_offset + (cpu % nr_queues); } static int get_first_sibling(unsigned int cpu) @@ -30,10 +31,10 @@ static int get_first_sibling(unsigned int cpu) return cpu; } -int blk_mq_map_queues(struct blk_mq_tag_set *set) +int blk_mq_map_queues(struct blk_mq_queue_map *qmap) { - unsigned int *map = set->mq_map; - unsigned int nr_queues = set->nr_hw_queues; + unsigned int *map = qmap->mq_map; + unsigned int nr_queues = qmap->nr_queues; unsigned int cpu, first_sibling; for_each_possible_cpu(cpu) { @@ -44,11 +45,11 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set) * performace optimizations. */ if (cpu < nr_queues) { - map[cpu] = cpu_to_queue_index(nr_queues, cpu); + map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); } else { first_sibling = get_first_sibling(cpu); if (first_sibling == cpu) - map[cpu] = cpu_to_queue_index(nr_queues, cpu); + map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu); else map[cpu] = map[first_sibling]; } @@ -62,12 +63,12 @@ EXPORT_SYMBOL_GPL(blk_mq_map_queues); * We have no quick way of doing reverse lookups. This is only used at * queue init time, so runtime isn't important. */ -int blk_mq_hw_queue_to_node(unsigned int *mq_map, unsigned int index) +int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) { int i; for_each_possible_cpu(i) { - if (index == mq_map[i]) + if (index == qmap->mq_map[i]) return local_memory_node(cpu_to_node(i)); } diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 10b284a1f18d..90d68760af08 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -23,6 +23,7 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" +#include "blk-rq-qos.h" static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) { @@ -112,10 +113,8 @@ static int queue_pm_only_show(void *data, struct seq_file *m) #define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name static const char *const blk_queue_flag_name[] = { - QUEUE_FLAG_NAME(QUEUED), QUEUE_FLAG_NAME(STOPPED), QUEUE_FLAG_NAME(DYING), - QUEUE_FLAG_NAME(BYPASS), QUEUE_FLAG_NAME(BIDI), QUEUE_FLAG_NAME(NOMERGES), QUEUE_FLAG_NAME(SAME_COMP), @@ -318,7 +317,6 @@ static const char *const cmd_flag_name[] = { static const char *const rqf_name[] = { RQF_NAME(SORTED), RQF_NAME(STARTED), - RQF_NAME(QUEUED), RQF_NAME(SOFTBARRIER), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), @@ -424,15 +422,18 @@ struct show_busy_params { /* * Note: the state of a request may change while this function is in progress, - * e.g. due to a concurrent blk_mq_finish_request() call. + * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to + * keep iterating requests. */ -static void hctx_show_busy_rq(struct request *rq, void *data, bool reserved) +static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved) { const struct show_busy_params *params = data; - if (blk_mq_map_queue(rq->q, rq->mq_ctx->cpu) == params->hctx) + if (rq->mq_hctx == params->hctx) __blk_mq_debugfs_rq_show(params->m, list_entry_rq(&rq->queuelist)); + + return true; } static int hctx_busy_show(void *data, struct seq_file *m) @@ -446,6 +447,21 @@ static int hctx_busy_show(void *data, struct seq_file *m) return 0; } +static const char *const hctx_types[] = { + [HCTX_TYPE_DEFAULT] = "default", + [HCTX_TYPE_READ] = "read", + [HCTX_TYPE_POLL] = "poll", +}; + +static int hctx_type_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES); + seq_printf(m, "%s\n", hctx_types[hctx->type]); + return 0; +} + static int hctx_ctx_map_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -636,36 +652,43 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m) return 0; } -static void *ctx_rq_list_start(struct seq_file *m, loff_t *pos) - __acquires(&ctx->lock) -{ - struct blk_mq_ctx *ctx = m->private; - - spin_lock(&ctx->lock); - return seq_list_start(&ctx->rq_list, *pos); -} - -static void *ctx_rq_list_next(struct seq_file *m, void *v, loff_t *pos) -{ - struct blk_mq_ctx *ctx = m->private; - - return seq_list_next(v, &ctx->rq_list, pos); +#define CTX_RQ_SEQ_OPS(name, type) \ +static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \ + __acquires(&ctx->lock) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + spin_lock(&ctx->lock); \ + return seq_list_start(&ctx->rq_lists[type], *pos); \ +} \ + \ +static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \ + loff_t *pos) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + return seq_list_next(v, &ctx->rq_lists[type], pos); \ +} \ + \ +static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \ + __releases(&ctx->lock) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + spin_unlock(&ctx->lock); \ +} \ + \ +static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \ + .start = ctx_##name##_rq_list_start, \ + .next = ctx_##name##_rq_list_next, \ + .stop = ctx_##name##_rq_list_stop, \ + .show = blk_mq_debugfs_rq_show, \ } -static void ctx_rq_list_stop(struct seq_file *m, void *v) - __releases(&ctx->lock) -{ - struct blk_mq_ctx *ctx = m->private; - - spin_unlock(&ctx->lock); -} +CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); +CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); +CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); -static const struct seq_operations ctx_rq_list_seq_ops = { - .start = ctx_rq_list_start, - .next = ctx_rq_list_next, - .stop = ctx_rq_list_stop, - .show = blk_mq_debugfs_rq_show, -}; static int ctx_dispatched_show(void *data, struct seq_file *m) { struct blk_mq_ctx *ctx = data; @@ -798,11 +821,14 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, + {"type", 0400, hctx_type_show}, {}, }; static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { - {"rq_list", 0400, .seq_ops = &ctx_rq_list_seq_ops}, + {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, + {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, + {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, {"merged", 0600, ctx_merged_show, ctx_merged_write}, {"completed", 0600, ctx_completed_show, ctx_completed_write}, @@ -856,6 +882,15 @@ int blk_mq_debugfs_register(struct request_queue *q) goto err; } + if (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; + + while (rqos) { + blk_mq_debugfs_register_rqos(rqos); + rqos = rqos->next; + } + } + return 0; err: @@ -978,6 +1013,50 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q) q->sched_debugfs_dir = NULL; } +void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) +{ + debugfs_remove_recursive(rqos->debugfs_dir); + rqos->debugfs_dir = NULL; +} + +int blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +{ + struct request_queue *q = rqos->q; + const char *dir_name = rq_qos_id_to_name(rqos->id); + + if (!q->debugfs_dir) + return -ENOENT; + + if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) + return 0; + + if (!q->rqos_debugfs_dir) { + q->rqos_debugfs_dir = debugfs_create_dir("rqos", + q->debugfs_dir); + if (!q->rqos_debugfs_dir) + return -ENOMEM; + } + + rqos->debugfs_dir = debugfs_create_dir(dir_name, + rqos->q->rqos_debugfs_dir); + if (!rqos->debugfs_dir) + return -ENOMEM; + + if (!debugfs_create_files(rqos->debugfs_dir, rqos, + rqos->ops->debugfs_attrs)) + goto err; + return 0; + err: + blk_mq_debugfs_unregister_rqos(rqos); + return -ENOMEM; +} + +void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) +{ + debugfs_remove_recursive(q->rqos_debugfs_dir); + q->rqos_debugfs_dir = NULL; +} + int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a9160be12be0..8c9012a578c1 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -31,6 +31,10 @@ void blk_mq_debugfs_unregister_sched(struct request_queue *q); int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); + +int blk_mq_debugfs_register_rqos(struct rq_qos *rqos); +void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); +void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q); #else static inline int blk_mq_debugfs_register(struct request_queue *q) { @@ -78,6 +82,19 @@ static inline int blk_mq_debugfs_register_sched_hctx(struct request_queue *q, static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { } + +static inline int blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +{ + return 0; +} + +static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) +{ +} + +static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) +{ +} #endif #ifdef CONFIG_BLK_DEBUG_FS_ZONED diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index db644ec624f5..1dce18553984 100644 --- a/block/blk-mq-pci.c +++ b/block/blk-mq-pci.c @@ -31,26 +31,26 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_pci_map_queues(struct blk_mq_tag_set *set, struct pci_dev *pdev, +int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, int offset) { const struct cpumask *mask; unsigned int queue, cpu; - for (queue = 0; queue < set->nr_hw_queues; queue++) { + for (queue = 0; queue < qmap->nr_queues; queue++) { mask = pci_irq_get_affinity(pdev, queue + offset); if (!mask) goto fallback; for_each_cpu(cpu, mask) - set->mq_map[cpu] = queue; + qmap->mq_map[cpu] = qmap->queue_offset + queue; } return 0; fallback: - WARN_ON_ONCE(set->nr_hw_queues > 1); - blk_mq_clear_mq_map(set); + WARN_ON_ONCE(qmap->nr_queues > 1); + blk_mq_clear_mq_map(qmap); return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c index 996167f1de18..45030a81a1ed 100644 --- a/block/blk-mq-rdma.c +++ b/block/blk-mq-rdma.c @@ -29,24 +29,24 @@ * @set->nr_hw_queues, or @dev does not provide an affinity mask for a * vector, we fallback to the naive mapping. */ -int blk_mq_rdma_map_queues(struct blk_mq_tag_set *set, +int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, struct ib_device *dev, int first_vec) { const struct cpumask *mask; unsigned int queue, cpu; - for (queue = 0; queue < set->nr_hw_queues; queue++) { + for (queue = 0; queue < map->nr_queues; queue++) { mask = ib_get_vector_affinity(dev, first_vec + queue); if (!mask) goto fallback; for_each_cpu(cpu, mask) - set->mq_map[cpu] = queue; + map->mq_map[cpu] = map->queue_offset + queue; } return 0; fallback: - return blk_mq_map_queues(set); + return blk_mq_map_queues(map); } EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 29bfe8017a2d..140933e4a7d1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -31,15 +31,22 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); -void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) +void blk_mq_sched_assign_ioc(struct request *rq) { struct request_queue *q = rq->q; - struct io_context *ioc = rq_ioc(bio); + struct io_context *ioc; struct io_cq *icq; - spin_lock_irq(q->queue_lock); + /* + * May not have an IO context if it's a passthrough request + */ + ioc = current->io_context; + if (!ioc) + return; + + spin_lock_irq(&q->queue_lock); icq = ioc_lookup_icq(ioc, q); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (!icq) { icq = ioc_create_icq(ioc, q, GFP_ATOMIC); @@ -54,13 +61,14 @@ void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio) * Mark a hardware queue as needing a restart. For shared queues, maintain * a count of how many hardware queues are marked for restart. */ -static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) +void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) return; set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } +EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { @@ -85,14 +93,13 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) do { struct request *rq; - if (e->type->ops.mq.has_work && - !e->type->ops.mq.has_work(hctx)) + if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; if (!blk_mq_get_dispatch_budget(hctx)) break; - rq = e->type->ops.mq.dispatch_request(hctx); + rq = e->type->ops.dispatch_request(hctx); if (!rq) { blk_mq_put_dispatch_budget(hctx); break; @@ -110,7 +117,7 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - unsigned idx = ctx->index_hw; + unsigned short idx = ctx->index_hw[hctx->type]; if (++idx == hctx->nr_ctx) idx = 0; @@ -163,7 +170,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; - const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request; + const bool has_sched_dispatch = e && e->type->ops.dispatch_request; LIST_HEAD(rq_list); /* RCU or SRCU read lock is needed before checking quiesced flag */ @@ -295,11 +302,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); * too much time checking for merges. */ static bool blk_mq_attempt_merge(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct bio *bio) { + enum hctx_type type = hctx->type; + lockdep_assert_held(&ctx->lock); - if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) { + if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) { ctx->rq_merged++; return true; } @@ -311,19 +321,21 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio) { struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx->cpu); bool ret = false; + enum hctx_type type; - if (e && e->type->ops.mq.bio_merge) { + if (e && e->type->ops.bio_merge) { blk_mq_put_ctx(ctx); - return e->type->ops.mq.bio_merge(hctx, bio); + return e->type->ops.bio_merge(hctx, bio); } + type = hctx->type; if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !list_empty_careful(&ctx->rq_list)) { + !list_empty_careful(&ctx->rq_lists[type])) { /* default per sw-queue merge */ spin_lock(&ctx->lock); - ret = blk_mq_attempt_merge(q, ctx, bio); + ret = blk_mq_attempt_merge(q, hctx, ctx, bio); spin_unlock(&ctx->lock); } @@ -367,7 +379,7 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; /* flush rq in flush machinery need to be dispatched directly */ if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { @@ -380,11 +392,11 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head, if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) goto run; - if (e && e->type->ops.mq.insert_requests) { + if (e && e->type->ops.insert_requests) { LIST_HEAD(list); list_add(&rq->queuelist, &list); - e->type->ops.mq.insert_requests(hctx, &list, at_head); + e->type->ops.insert_requests(hctx, &list, at_head); } else { spin_lock(&ctx->lock); __blk_mq_insert_request(hctx, rq, at_head); @@ -396,27 +408,25 @@ run: blk_mq_run_hw_queue(hctx, async); } -void blk_mq_sched_insert_requests(struct request_queue *q, +void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async) { - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); - struct elevator_queue *e = hctx->queue->elevator; + struct elevator_queue *e; - if (e && e->type->ops.mq.insert_requests) - e->type->ops.mq.insert_requests(hctx, list, false); + e = hctx->queue->elevator; + if (e && e->type->ops.insert_requests) + e->type->ops.insert_requests(hctx, list, false); else { /* * try to issue requests directly if the hw queue isn't * busy in case of 'none' scheduler, and this way may save * us one extra enqueue & dequeue to sw queue. */ - if (!hctx->dispatch_busy && !e && !run_queue_async) { + if (!hctx->dispatch_busy && !e && !run_queue_async) blk_mq_try_issue_list_directly(hctx, list); - if (list_empty(list)) - return; - } - blk_mq_insert_requests(hctx, ctx, list); + else + blk_mq_insert_requests(hctx, ctx, list); } blk_mq_run_hw_queue(hctx, run_queue_async); @@ -489,15 +499,15 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) goto err; } - ret = e->ops.mq.init_sched(q, e); + ret = e->ops.init_sched(q, e); if (ret) goto err; blk_mq_debugfs_register_sched(q); queue_for_each_hw_ctx(q, hctx, i) { - if (e->ops.mq.init_hctx) { - ret = e->ops.mq.init_hctx(hctx, i); + if (e->ops.init_hctx) { + ret = e->ops.init_hctx(hctx, i); if (ret) { eq = q->elevator; blk_mq_exit_sched(q, eq); @@ -523,14 +533,14 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) queue_for_each_hw_ctx(q, hctx, i) { blk_mq_debugfs_unregister_sched_hctx(hctx); - if (e->type->ops.mq.exit_hctx && hctx->sched_data) { - e->type->ops.mq.exit_hctx(hctx, i); + if (e->type->ops.exit_hctx && hctx->sched_data) { + e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } } blk_mq_debugfs_unregister_sched(q); - if (e->type->ops.mq.exit_sched) - e->type->ops.mq.exit_sched(e); + if (e->type->ops.exit_sched) + e->type->ops.exit_sched(e); blk_mq_sched_tags_teardown(q); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 8a9544203173..c7bdb52367ac 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -8,18 +8,19 @@ void blk_mq_sched_free_hctx_data(struct request_queue *q, void (*exit)(struct blk_mq_hw_ctx *)); -void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio); +void blk_mq_sched_assign_ioc(struct request *rq); void blk_mq_sched_request_inserted(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, struct request **merged_request); bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio); bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void blk_mq_sched_insert_request(struct request *rq, bool at_head, bool run_queue, bool async); -void blk_mq_sched_insert_requests(struct request_queue *q, +void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list, bool run_queue_async); @@ -43,8 +44,8 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e && e->type->ops.mq.allow_merge) - return e->type->ops.mq.allow_merge(q, rq, bio); + if (e && e->type->ops.allow_merge) + return e->type->ops.allow_merge(q, rq, bio); return true; } @@ -53,8 +54,8 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { struct elevator_queue *e = rq->q->elevator; - if (e && e->type->ops.mq.completed_request) - e->type->ops.mq.completed_request(rq, now); + if (e && e->type->ops.completed_request) + e->type->ops.completed_request(rq, now); } static inline void blk_mq_sched_started_request(struct request *rq) @@ -62,8 +63,8 @@ static inline void blk_mq_sched_started_request(struct request *rq) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e && e->type->ops.mq.started_request) - e->type->ops.mq.started_request(rq); + if (e && e->type->ops.started_request) + e->type->ops.started_request(rq); } static inline void blk_mq_sched_requeue_request(struct request *rq) @@ -71,16 +72,16 @@ static inline void blk_mq_sched_requeue_request(struct request *rq) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e && e->type->ops.mq.requeue_request) - e->type->ops.mq.requeue_request(rq); + if (e && e->type->ops.requeue_request) + e->type->ops.requeue_request(rq); } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) { struct elevator_queue *e = hctx->queue->elevator; - if (e && e->type->ops.mq.has_work) - return e->type->ops.mq.has_work(hctx); + if (e && e->type->ops.has_work) + return e->type->ops.has_work(hctx); return false; } diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index aafb44224c89..3f9c3f4ac44c 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -15,6 +15,18 @@ static void blk_mq_sysfs_release(struct kobject *kobj) { + struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj); + + free_percpu(ctxs->queue_ctx); + kfree(ctxs); +} + +static void blk_mq_ctx_sysfs_release(struct kobject *kobj) +{ + struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj); + + /* ctx->ctxs won't be released until all ctx are freed */ + kobject_put(&ctx->ctxs->kobj); } static void blk_mq_hw_sysfs_release(struct kobject *kobj) @@ -203,7 +215,7 @@ static struct kobj_type blk_mq_ktype = { static struct kobj_type blk_mq_ctx_ktype = { .sysfs_ops = &blk_mq_sysfs_ops, .default_attrs = default_ctx_attrs, - .release = blk_mq_sysfs_release, + .release = blk_mq_ctx_sysfs_release, }; static struct kobj_type blk_mq_hw_ktype = { @@ -235,7 +247,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) if (!hctx->nr_ctx) return 0; - ret = kobject_add(&hctx->kobj, &q->mq_kobj, "%u", hctx->queue_num); + ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num); if (ret) return ret; @@ -258,8 +270,8 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) queue_for_each_hw_ctx(q, hctx, i) blk_mq_unregister_hctx(hctx); - kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); - kobject_del(&q->mq_kobj); + kobject_uevent(q->mq_kobj, KOBJ_REMOVE); + kobject_del(q->mq_kobj); kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; @@ -279,7 +291,7 @@ void blk_mq_sysfs_deinit(struct request_queue *q) ctx = per_cpu_ptr(q->queue_ctx, cpu); kobject_put(&ctx->kobj); } - kobject_put(&q->mq_kobj); + kobject_put(q->mq_kobj); } void blk_mq_sysfs_init(struct request_queue *q) @@ -287,10 +299,12 @@ void blk_mq_sysfs_init(struct request_queue *q) struct blk_mq_ctx *ctx; int cpu; - kobject_init(&q->mq_kobj, &blk_mq_ktype); + kobject_init(q->mq_kobj, &blk_mq_ktype); for_each_possible_cpu(cpu) { ctx = per_cpu_ptr(q->queue_ctx, cpu); + + kobject_get(q->mq_kobj); kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); } } @@ -303,11 +317,11 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q) WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_lock); - ret = kobject_add(&q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); + ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); if (ret < 0) goto out; - kobject_uevent(&q->mq_kobj, KOBJ_ADD); + kobject_uevent(q->mq_kobj, KOBJ_ADD); queue_for_each_hw_ctx(q, hctx, i) { ret = blk_mq_register_hctx(hctx); @@ -324,8 +338,8 @@ unreg: while (--i >= 0) blk_mq_unregister_hctx(q->queue_hw_ctx[i]); - kobject_uevent(&q->mq_kobj, KOBJ_REMOVE); - kobject_del(&q->mq_kobj); + kobject_uevent(q->mq_kobj, KOBJ_REMOVE); + kobject_del(q->mq_kobj); kobject_put(&dev->kobj); return ret; } @@ -340,7 +354,6 @@ int blk_mq_register_dev(struct device *dev, struct request_queue *q) return ret; } -EXPORT_SYMBOL_GPL(blk_mq_register_dev); void blk_mq_sysfs_unregister(struct request_queue *q) { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index cfda95b85d34..2089c6c62f44 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -110,7 +110,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) struct blk_mq_tags *tags = blk_mq_tags_from_data(data); struct sbitmap_queue *bt; struct sbq_wait_state *ws; - DEFINE_WAIT(wait); + DEFINE_SBQ_WAIT(wait); unsigned int tag_offset; bool drop_ctx; int tag; @@ -154,8 +154,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (tag != -1) break; - prepare_to_wait_exclusive(&ws->wait, &wait, - TASK_UNINTERRUPTIBLE); + sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); tag = __blk_mq_get_tag(data, bt); if (tag != -1) @@ -167,16 +166,17 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) bt_prev = bt; io_schedule(); + sbitmap_finish_wait(bt, ws, &wait); + data->ctx = blk_mq_get_ctx(data->q); - data->hctx = blk_mq_map_queue(data->q, data->ctx->cpu); + data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, + data->ctx->cpu); tags = blk_mq_tags_from_data(data); if (data->flags & BLK_MQ_REQ_RESERVED) bt = &tags->breserved_tags; else bt = &tags->bitmap_tags; - finish_wait(&ws->wait, &wait); - /* * If destination hw queue is changed, fake wake up on * previous queue for compensating the wake up miss, so @@ -191,7 +191,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) if (drop_ctx && data->ctx) blk_mq_put_ctx(data->ctx); - finish_wait(&ws->wait, &wait); + sbitmap_finish_wait(bt, ws, &wait); found_tag: return tag + tag_offset; @@ -235,7 +235,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * test and set the bit before assigning ->rqs[]. */ if (rq && rq->q == hctx->queue) - iter_data->fn(hctx, rq, iter_data->data, reserved); + return iter_data->fn(hctx, rq, iter_data->data, reserved); return true; } @@ -247,7 +247,8 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @fn: Pointer to the function that will be called for each request * associated with @hctx that has been assigned a driver tag. * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) - * where rq is a pointer to a request. + * where rq is a pointer to a request. Return true to continue + * iterating tags, false to stop. * @data: Will be passed as third argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. @@ -288,7 +289,7 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) */ rq = tags->rqs[bitnr]; if (rq && blk_mq_request_started(rq)) - iter_data->fn(rq, iter_data->data, reserved); + return iter_data->fn(rq, iter_data->data, reserved); return true; } @@ -300,7 +301,8 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @data, - * @reserved) where rq is a pointer to a request. + * @reserved) where rq is a pointer to a request. Return true + * to continue iterating tags, false to stop. * @data: Will be passed as second argument to @fn. * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. @@ -325,7 +327,8 @@ static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. + * indicates whether or not @rq is a reserved request. Return + * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. */ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, @@ -342,7 +345,8 @@ static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, * @fn: Pointer to the function that will be called for each started * request. @fn will be called as follows: @fn(rq, @priv, * reserved) where rq is a pointer to a request. 'reserved' - * indicates whether or not @rq is a reserved request. + * indicates whether or not @rq is a reserved request. Return + * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. */ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, @@ -526,16 +530,7 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, */ u32 blk_mq_unique_tag(struct request *rq) { - struct request_queue *q = rq->q; - struct blk_mq_hw_ctx *hctx; - int hwq = 0; - - if (q->mq_ops) { - hctx = blk_mq_map_queue(q, rq->mq_ctx->cpu); - hwq = hctx->queue_num; - } - - return (hwq << BLK_MQ_UNIQUE_TAG_BITS) | + return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); } EXPORT_SYMBOL(blk_mq_unique_tag); diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c index c3afbca11299..370827163835 100644 --- a/block/blk-mq-virtio.c +++ b/block/blk-mq-virtio.c @@ -29,7 +29,7 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, +int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec) { const struct cpumask *mask; @@ -38,17 +38,17 @@ int blk_mq_virtio_map_queues(struct blk_mq_tag_set *set, if (!vdev->config->get_vq_affinity) goto fallback; - for (queue = 0; queue < set->nr_hw_queues; queue++) { + for (queue = 0; queue < qmap->nr_queues; queue++) { mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); if (!mask) goto fallback; for_each_cpu(cpu, mask) - set->mq_map[cpu] = queue; + qmap->mq_map[cpu] = qmap->queue_offset + queue; } return 0; fallback: - return blk_mq_map_queues(set); + return blk_mq_map_queues(qmap); } EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 3f91c6e5b17a..3ba37b9e15e9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -38,7 +38,6 @@ #include "blk-mq-sched.h" #include "blk-rq-qos.h" -static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -75,14 +74,18 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) - sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); + const int bit = ctx->index_hw[hctx->type]; + + if (!sbitmap_test_bit(&hctx->ctx_map, bit)) + sbitmap_set_bit(&hctx->ctx_map, bit); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); + const int bit = ctx->index_hw[hctx->type]; + + sbitmap_clear_bit(&hctx->ctx_map, bit); } struct mq_inflight { @@ -90,33 +93,33 @@ struct mq_inflight { unsigned int *inflight; }; -static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, +static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct mq_inflight *mi = priv; /* - * index[0] counts the specific partition that was asked for. index[1] - * counts the ones that are active on the whole device, so increment - * that if mi->part is indeed a partition, and not a whole device. + * index[0] counts the specific partition that was asked for. */ if (rq->part == mi->part) mi->inflight[0]++; - if (mi->part->partno) - mi->inflight[1]++; + + return true; } -void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) { + unsigned inflight[2]; struct mq_inflight mi = { .part = part, .inflight = inflight, }; inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + + return inflight[0]; } -static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, +static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -124,6 +127,8 @@ static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, if (rq->part == mi->part) mi->inflight[rq_data_dir(rq)]++; + + return true; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, @@ -142,7 +147,7 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } } @@ -177,8 +182,6 @@ void blk_freeze_queue(struct request_queue *q) * exported to drivers as the only user for unfreeze is blk_mq. */ blk_freeze_queue_start(q); - if (!q->mq_ops) - blk_drain_queue(q); blk_mq_freeze_queue_wait(q); } @@ -275,6 +278,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); +/* + * Only need start/end time stamping if we have stats enabled, or using + * an IO scheduler. + */ +static inline bool blk_mq_need_time_stamp(struct request *rq) +{ + return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; +} + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, unsigned int tag, unsigned int op) { @@ -298,8 +310,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; + rq->mq_hctx = data->hctx; rq->rq_flags = rq_flags; - rq->cpu = -1; rq->cmd_flags = op; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; @@ -310,7 +322,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; - rq->start_time_ns = ktime_get_ns(); + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; rq->io_start_time_ns = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -319,27 +334,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->special = NULL; /* tag was already set */ rq->extra_len = 0; - rq->__deadline = 0; + WRITE_ONCE(rq->deadline, 0); - INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; rq->end_io = NULL; rq->end_io_data = NULL; rq->next_rq = NULL; -#ifdef CONFIG_BLK_CGROUP - rq->rl = NULL; -#endif - data->ctx->rq_dispatched[op_is_sync(op)]++; refcount_set(&rq->ref, 1); return rq; } static struct request *blk_mq_get_request(struct request_queue *q, - struct bio *bio, unsigned int op, - struct blk_mq_alloc_data *data) + struct bio *bio, + struct blk_mq_alloc_data *data) { struct elevator_queue *e = q->elevator; struct request *rq; @@ -353,8 +363,9 @@ static struct request *blk_mq_get_request(struct request_queue *q, put_ctx_on_error = true; } if (likely(!data->hctx)) - data->hctx = blk_mq_map_queue(q, data->ctx->cpu); - if (op & REQ_NOWAIT) + data->hctx = blk_mq_map_queue(q, data->cmd_flags, + data->ctx->cpu); + if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; if (e) { @@ -365,9 +376,10 @@ static struct request *blk_mq_get_request(struct request_queue *q, * dispatch list. Don't include reserved tags in the * limiting, as it isn't useful. */ - if (!op_is_flush(op) && e->type->ops.mq.limit_depth && + if (!op_is_flush(data->cmd_flags) && + e->type->ops.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) - e->type->ops.mq.limit_depth(op, data); + e->type->ops.limit_depth(data->cmd_flags, data); } else { blk_mq_tag_busy(data->hctx); } @@ -382,14 +394,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, return NULL; } - rq = blk_mq_rq_ctx_init(data, tag, op); - if (!op_is_flush(op)) { + rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags); + if (!op_is_flush(data->cmd_flags)) { rq->elv.icq = NULL; - if (e && e->type->ops.mq.prepare_request) { - if (e->type->icq_cache && rq_ioc(bio)) - blk_mq_sched_assign_ioc(rq, bio); + if (e && e->type->ops.prepare_request) { + if (e->type->icq_cache) + blk_mq_sched_assign_ioc(rq); - e->type->ops.mq.prepare_request(rq, bio); + e->type->ops.prepare_request(rq, bio); rq->rq_flags |= RQF_ELVPRIV; } } @@ -400,7 +412,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { - struct blk_mq_alloc_data alloc_data = { .flags = flags }; + struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; struct request *rq; int ret; @@ -408,7 +420,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (ret) return ERR_PTR(ret); - rq = blk_mq_get_request(q, NULL, op, &alloc_data); + rq = blk_mq_get_request(q, NULL, &alloc_data); blk_queue_exit(q); if (!rq) @@ -426,7 +438,7 @@ EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { - struct blk_mq_alloc_data alloc_data = { .flags = flags }; + struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; struct request *rq; unsigned int cpu; int ret; @@ -459,7 +471,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - rq = blk_mq_get_request(q, NULL, op, &alloc_data); + rq = blk_mq_get_request(q, NULL, &alloc_data); blk_queue_exit(q); if (!rq) @@ -473,10 +485,11 @@ static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; blk_pm_mark_last_busy(rq); + rq->mq_hctx = NULL; if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -490,11 +503,11 @@ void blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (rq->rq_flags & RQF_ELVPRIV) { - if (e && e->type->ops.mq.finish_request) - e->type->ops.mq.finish_request(rq); + if (e && e->type->ops.finish_request) + e->type->ops.finish_request(rq); if (rq->elv.icq) { put_io_context(rq->elv.icq->ioc); rq->elv.icq = NULL; @@ -510,9 +523,6 @@ void blk_mq_free_request(struct request *rq) rq_qos_done(q, rq); - if (blk_rq_rl(rq)) - blk_put_rl(blk_rq_rl(rq)); - WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); @@ -521,7 +531,10 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { - u64 now = ktime_get_ns(); + u64 now = 0; + + if (blk_mq_need_time_stamp(rq)) + now = ktime_get_ns(); if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); @@ -555,19 +568,19 @@ EXPORT_SYMBOL(blk_mq_end_request); static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; + struct request_queue *q = rq->q; - rq->q->softirq_done_fn(rq); + q->mq_ops->complete(rq); } static void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; + struct request_queue *q = rq->q; bool shared = false; int cpu; - if (!blk_mq_mark_complete(rq)) - return; - + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); /* * Most of single queue controllers, there is only one irq vector * for handling IO completion, and the only irq's affinity is set @@ -577,18 +590,23 @@ static void __blk_mq_complete_request(struct request *rq) * So complete IO reqeust in softirq context in case of single queue * for not degrading IO performance by irqsoff latency. */ - if (rq->q->nr_hw_queues == 1) { + if (q->nr_hw_queues == 1) { __blk_complete_request(rq); return; } - if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { - rq->q->softirq_done_fn(rq); + /* + * For a polled request, always complete locallly, it's pointless + * to redirect the completion. + */ + if ((rq->cmd_flags & REQ_HIPRI) || + !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { + q->mq_ops->complete(rq); return; } cpu = get_cpu(); - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) shared = cpus_share_cache(cpu, ctx->cpu); if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { @@ -597,7 +615,7 @@ static void __blk_mq_complete_request(struct request *rq) rq->csd.flags = 0; smp_call_function_single_async(ctx->cpu, &rq->csd); } else { - rq->q->softirq_done_fn(rq); + q->mq_ops->complete(rq); } put_cpu(); } @@ -630,11 +648,12 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +bool blk_mq_complete_request(struct request *rq) { if (unlikely(blk_should_fake_timeout(rq->q))) - return; + return false; __blk_mq_complete_request(rq); + return true; } EXPORT_SYMBOL(blk_mq_complete_request); @@ -701,7 +720,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - BUG_ON(blk_queued_rq(rq)); + BUG_ON(!list_empty(&rq->queuelist)); blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -786,6 +805,32 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) } EXPORT_SYMBOL(blk_mq_tag_to_rq); +static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, + void *priv, bool reserved) +{ + /* + * If we find a request that is inflight and the queue matches, + * we know the queue is busy. Return false to stop the iteration. + */ + if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) { + bool *busy = priv; + + *busy = true; + return false; + } + + return true; +} + +bool blk_mq_queue_inflight(struct request_queue *q) +{ + bool busy = false; + + blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); + return busy; +} +EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); + static void blk_mq_rq_timed_out(struct request *req, bool reserved) { req->rq_flags |= RQF_TIMED_OUT; @@ -810,7 +855,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) if (rq->rq_flags & RQF_TIMED_OUT) return false; - deadline = blk_rq_deadline(rq); + deadline = READ_ONCE(rq->deadline); if (time_after_eq(jiffies, deadline)) return true; @@ -821,7 +866,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; } -static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, +static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { unsigned long *next = priv; @@ -831,7 +876,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * so we're not unnecessarilly synchronizing across CPUs. */ if (!blk_mq_req_expired(rq, next)) - return; + return true; /* * We have reason to believe the request may be expired. Take a @@ -843,7 +888,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * timeout handler to posting a natural completion. */ if (!refcount_inc_not_zero(&rq->ref)) - return; + return true; /* * The request is now locked and cannot be reallocated underneath the @@ -855,6 +900,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, blk_mq_rq_timed_out(rq, reserved); if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); + + return true; } static void blk_mq_timeout_work(struct work_struct *work) @@ -911,9 +958,10 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, flush_data->list); + list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; @@ -945,12 +993,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + if (!list_empty(&ctx->rq_lists[type])) { + dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); - if (list_empty(&ctx->rq_list)) + if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); @@ -961,7 +1010,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { - unsigned off = start ? start->index_hw : 0; + unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, @@ -985,8 +1034,9 @@ bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_alloc_data data = { .q = rq->q, - .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), + .hctx = rq->mq_hctx, .flags = BLK_MQ_REQ_NOWAIT, + .cmd_flags = rq->cmd_flags, }; bool shared; @@ -1150,7 +1200,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + hctx = rq->mq_hctx; if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) break; @@ -1223,6 +1273,14 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!list_empty(list)) { bool needs_restart; + /* + * If we didn't flush the entire list, we could have told + * the driver there was more coming, but that turned out to + * be a lie. + */ + if (q->mq_ops->commit_rqs) + q->mq_ops->commit_rqs(hctx); + spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1552,15 +1610,16 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; + enum hctx_type type = hctx->type; lockdep_assert_held(&ctx->lock); trace_block_rq_insert(hctx->queue, rq); if (at_head) - list_add(&rq->queuelist, &ctx->rq_list); + list_add(&rq->queuelist, &ctx->rq_lists[type]); else - list_add_tail(&rq->queuelist, &ctx->rq_list); + list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); } void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, @@ -1580,8 +1639,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); list_add_tail(&rq->queuelist, &hctx->dispatch); @@ -1596,6 +1654,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, { struct request *rq; + enum hctx_type type = hctx->type; /* * preemption doesn't flush plug list, so it's possible ctx->cpu is @@ -1607,35 +1666,46 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, } spin_lock(&ctx->lock); - list_splice_tail_init(list, &ctx->rq_list); + list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } -static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); - return !(rqa->mq_ctx < rqb->mq_ctx || - (rqa->mq_ctx == rqb->mq_ctx && - blk_rq_pos(rqa) < blk_rq_pos(rqb))); + if (rqa->mq_ctx < rqb->mq_ctx) + return -1; + else if (rqa->mq_ctx > rqb->mq_ctx) + return 1; + else if (rqa->mq_hctx < rqb->mq_hctx) + return -1; + else if (rqa->mq_hctx > rqb->mq_hctx) + return 1; + + return blk_rq_pos(rqa) > blk_rq_pos(rqb); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { + struct blk_mq_hw_ctx *this_hctx; struct blk_mq_ctx *this_ctx; struct request_queue *this_q; struct request *rq; LIST_HEAD(list); - LIST_HEAD(ctx_list); + LIST_HEAD(rq_list); unsigned int depth; list_splice_init(&plug->mq_list, &list); + plug->rq_count = 0; - list_sort(NULL, &list, plug_ctx_cmp); + if (plug->rq_count > 2 && plug->multiple_queues) + list_sort(NULL, &list, plug_rq_cmp); this_q = NULL; + this_hctx = NULL; this_ctx = NULL; depth = 0; @@ -1643,30 +1713,31 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); BUG_ON(!rq->q); - if (rq->mq_ctx != this_ctx) { - if (this_ctx) { + if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) { + if (this_hctx) { trace_block_unplug(this_q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_q, this_ctx, - &ctx_list, + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &rq_list, from_schedule); } - this_ctx = rq->mq_ctx; this_q = rq->q; + this_ctx = rq->mq_ctx; + this_hctx = rq->mq_hctx; depth = 0; } depth++; - list_add_tail(&rq->queuelist, &ctx_list); + list_add_tail(&rq->queuelist, &rq_list); } /* - * If 'this_ctx' is set, we know we have entries to complete - * on 'ctx_list'. Do those. + * If 'this_hctx' is set, we know we have entries to complete + * on 'rq_list'. Do those. */ - if (this_ctx) { + if (this_hctx) { trace_block_unplug(this_q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, + blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, from_schedule); } } @@ -1675,27 +1746,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { blk_init_request_from_bio(rq, bio); - blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); - blk_account_io_start(rq, true); } -static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) -{ - if (rq->tag != -1) - return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); - - return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); -} - static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie) + blk_qc_t *cookie, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, - .last = true, + .last = last, }; blk_qc_t new_cookie; blk_status_t ret; @@ -1727,77 +1788,74 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; } -static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, +blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, - bool bypass_insert) + bool bypass, bool last) { struct request_queue *q = rq->q; bool run_queue = true; + blk_status_t ret = BLK_STS_RESOURCE; + int srcu_idx; + bool force = false; + hctx_lock(hctx, &srcu_idx); /* - * RCU or SRCU read lock is needed before checking quiesced flag. + * hctx_lock is needed before checking quiesced flag. * - * When queue is stopped or quiesced, ignore 'bypass_insert' from - * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, - * and avoid driver to try to dispatch again. + * When queue is stopped or quiesced, ignore 'bypass', insert + * and return BLK_STS_OK to caller, and avoid driver to try to + * dispatch again. */ - if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { + if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) { run_queue = false; - bypass_insert = false; - goto insert; + bypass = false; + goto out_unlock; } - if (q->elevator && !bypass_insert) - goto insert; + if (unlikely(q->elevator && !bypass)) + goto out_unlock; if (!blk_mq_get_dispatch_budget(hctx)) - goto insert; + goto out_unlock; if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); - goto insert; + goto out_unlock; } - return __blk_mq_issue_directly(hctx, rq, cookie); -insert: - if (bypass_insert) - return BLK_STS_RESOURCE; - - blk_mq_sched_insert_request(rq, false, run_queue, false); - return BLK_STS_OK; -} - -static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, blk_qc_t *cookie) -{ - blk_status_t ret; - int srcu_idx; - - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - - ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - blk_mq_sched_insert_request(rq, false, true, false); - else if (ret != BLK_STS_OK) - blk_mq_end_request(rq, ret); - - hctx_unlock(hctx, srcu_idx); -} - -blk_status_t blk_mq_request_issue_directly(struct request *rq) -{ - blk_status_t ret; - int srcu_idx; - blk_qc_t unused_cookie; - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); - - hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); + /* + * Always add a request that has been through + *.queue_rq() to the hardware dispatch list. + */ + force = true; + ret = __blk_mq_issue_directly(hctx, rq, cookie, last); +out_unlock: hctx_unlock(hctx, srcu_idx); + switch (ret) { + case BLK_STS_OK: + break; + case BLK_STS_DEV_RESOURCE: + case BLK_STS_RESOURCE: + if (force) { + blk_mq_request_bypass_insert(rq, run_queue); + /* + * We have to return BLK_STS_OK for the DM + * to avoid livelock. Otherwise, we return + * the real result to indicate whether the + * request is direct-issued successfully. + */ + ret = bypass ? BLK_STS_OK : ret; + } else if (!bypass) { + blk_mq_sched_insert_request(rq, false, + run_queue, false); + } + break; + default: + if (!bypass) + blk_mq_end_request(rq, ret); + break; + } return ret; } @@ -1805,21 +1863,42 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq) void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { + blk_qc_t unused; + blk_status_t ret = BLK_STS_OK; + while (!list_empty(list)) { - blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - ret = blk_mq_request_issue_directly(rq); - if (ret != BLK_STS_OK) { - if (ret == BLK_STS_RESOURCE || - ret == BLK_STS_DEV_RESOURCE) { - list_add(&rq->queuelist, list); - break; - } - blk_mq_end_request(rq, ret); - } + if (ret == BLK_STS_OK) + ret = blk_mq_try_issue_directly(hctx, rq, &unused, + false, + list_empty(list)); + else + blk_mq_sched_insert_request(rq, false, true, false); + } + + /* + * If we didn't flush the entire list, we could have told + * the driver there was more coming, but that turned out to + * be a lie. + */ + if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs) + hctx->queue->mq_ops->commit_rqs(hctx); +} + +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + list_add_tail(&rq->queuelist, &plug->mq_list); + plug->rq_count++; + if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { + struct request *tmp; + + tmp = list_first_entry(&plug->mq_list, struct request, + queuelist); + if (tmp->q != rq->q) + plug->multiple_queues = true; } } @@ -1827,9 +1906,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { .flags = 0 }; + struct blk_mq_alloc_data data = { .flags = 0, .cmd_flags = bio->bi_opf }; struct request *rq; - unsigned int request_count = 0; struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; @@ -1842,15 +1920,15 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + blk_attempt_plug_merge(q, bio, &same_queue_rq)) return BLK_QC_T_NONE; if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - rq_qos_throttle(q, bio, NULL); + rq_qos_throttle(q, bio); - rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); + rq = blk_mq_get_request(q, bio, &data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) @@ -1872,21 +1950,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) /* bypass scheduler for flush rq */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && q->nr_hw_queues == 1) { + } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { + /* + * Use plugging if we have a ->commit_rqs() hook as well, as + * we know the driver uses bd->last in a smart fashion. + */ + unsigned int request_count = plug->rq_count; struct request *last = NULL; blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - /* - * @request_count may become stale because of schedule - * out, so check the list again. - */ - if (list_empty(&plug->mq_list)) - request_count = 0; - else if (blk_queue_nomerges(q)) - request_count = blk_plug_queued_count(q); - if (!request_count) trace_block_plug(q); else @@ -1898,7 +1972,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); } - list_add_tail(&rq->queuelist, &plug->mq_list); + blk_add_rq_to_plug(plug, rq); } else if (plug && !blk_queue_nomerges(q)) { blk_mq_bio_to_request(rq, bio); @@ -1911,23 +1985,24 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) */ if (list_empty(&plug->mq_list)) same_queue_rq = NULL; - if (same_queue_rq) + if (same_queue_rq) { list_del_init(&same_queue_rq->queuelist); - list_add_tail(&rq->queuelist, &plug->mq_list); + plug->rq_count--; + } + blk_add_rq_to_plug(plug, rq); blk_mq_put_ctx(data.ctx); if (same_queue_rq) { - data.hctx = blk_mq_map_queue(q, - same_queue_rq->mq_ctx->cpu); + data.hctx = same_queue_rq->mq_hctx; blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie); + &cookie, false, true); } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_try_issue_directly(data.hctx, rq, &cookie); + blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); @@ -1985,7 +2060,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags; int node; - node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2041,7 +2116,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, size_t rq_size, left; int node; - node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2121,13 +2196,15 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); + enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); ctx = __blk_mq_get_ctx(hctx->queue, cpu); + type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - list_splice_init(&ctx->rq_list, &tmp); + if (!list_empty(&ctx->rq_lists[type])) { + list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); @@ -2258,24 +2335,30 @@ static int blk_mq_init_hctx(struct request_queue *q, static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { - unsigned int i; + struct blk_mq_tag_set *set = q->tag_set; + unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; + int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); - INIT_LIST_HEAD(&__ctx->rq_list); + for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) + INIT_LIST_HEAD(&__ctx->rq_lists[k]); + __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ - hctx = blk_mq_map_queue(q, i); - if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = local_memory_node(cpu_to_node(i)); + for (j = 0; j < set->nr_maps; j++) { + hctx = blk_mq_map_queue_type(q, j, i); + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) + hctx->numa_node = local_memory_node(cpu_to_node(i)); + } } } @@ -2301,7 +2384,7 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, unsigned int hctx_idx) { - if (set->tags[hctx_idx]) { + if (set->tags && set->tags[hctx_idx]) { blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); blk_mq_free_rq_map(set->tags[hctx_idx]); set->tags[hctx_idx] = NULL; @@ -2310,7 +2393,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, hctx_idx; + unsigned int i, j, hctx_idx; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -2332,7 +2415,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { - hctx_idx = q->mq_map[i]; + hctx_idx = set->map[0].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_rq_map(set, hctx_idx)) { @@ -2342,15 +2425,35 @@ static void blk_mq_map_swqueue(struct request_queue *q) * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ - q->mq_map[i] = 0; + set->map[0].mq_map[i] = 0; } ctx = per_cpu_ptr(q->queue_ctx, i); - hctx = blk_mq_map_queue(q, i); + for (j = 0; j < set->nr_maps; j++) { + if (!set->map[j].nr_queues) + continue; + + hctx = blk_mq_map_queue_type(q, j, i); + + /* + * If the CPU is already set in the mask, then we've + * mapped this one already. This can happen if + * devices share queues across queue maps. + */ + if (cpumask_test_cpu(i, hctx->cpumask)) + continue; + + cpumask_set_cpu(i, hctx->cpumask); + hctx->type = j; + ctx->index_hw[hctx->type] = hctx->nr_ctx; + hctx->ctxs[hctx->nr_ctx++] = ctx; - cpumask_set_cpu(i, hctx->cpumask); - ctx->index_hw = hctx->nr_ctx; - hctx->ctxs[hctx->nr_ctx++] = ctx; + /* + * If the nr_ctx type overflows, we have exceeded the + * amount of sw queues we can support. + */ + BUG_ON(!hctx->nr_ctx); + } } mutex_unlock(&q->sysfs_lock); @@ -2440,8 +2543,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { - q->tag_set = set; - mutex_lock(&set->tag_list_lock); /* @@ -2460,6 +2561,34 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, mutex_unlock(&set->tag_list_lock); } +/* All allocations will be freed in release handler of q->mq_kobj */ +static int blk_mq_alloc_ctxs(struct request_queue *q) +{ + struct blk_mq_ctxs *ctxs; + int cpu; + + ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); + if (!ctxs) + return -ENOMEM; + + ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); + if (!ctxs->queue_ctx) + goto fail; + + for_each_possible_cpu(cpu) { + struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); + ctx->ctxs = ctxs; + } + + q->mq_kobj = &ctxs->kobj; + q->queue_ctx = ctxs->queue_ctx; + + return 0; + fail: + kfree(ctxs); + return -ENOMEM; +} + /* * It is the actual release handler for mq, but we do it from * request queue's release handler for avoiding use-after-free @@ -2478,8 +2607,6 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - q->mq_map = NULL; - kfree(q->queue_hw_ctx); /* @@ -2487,15 +2614,13 @@ void blk_mq_release(struct request_queue *q) * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); - - free_percpu(q->queue_ctx); } struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL); + uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); @@ -2522,6 +2647,7 @@ struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, memset(set, 0, sizeof(*set)); set->ops = ops; set->nr_hw_queues = 1; + set->nr_maps = 1; set->queue_depth = queue_depth; set->numa_node = NUMA_NO_NODE; set->flags = set_flags; @@ -2599,7 +2725,7 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, int node; struct blk_mq_hw_ctx *hctx; - node = blk_mq_hw_queue_to_node(q->mq_map, i); + node = blk_mq_hw_queue_to_node(&set->map[0], i); /* * If the hw queue has been mapped to another numa node, * we need to realloc the hctx. If allocation fails, fallback @@ -2652,6 +2778,19 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, mutex_unlock(&q->sysfs_lock); } +/* + * Maximum number of hardware queues we support. For single sets, we'll never + * have more than the CPUs (software queues). For multiple sets, the tag_set + * user may have set ->nr_hw_queues larger. + */ +static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) +{ + if (set->nr_maps == 1) + return nr_cpu_ids; + + return max(set->nr_hw_queues, nr_cpu_ids); +} + struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q) { @@ -2664,19 +2803,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!q->poll_cb) goto err_exit; - q->queue_ctx = alloc_percpu(struct blk_mq_ctx); - if (!q->queue_ctx) + if (blk_mq_alloc_ctxs(q)) goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)), + q->nr_queues = nr_hw_queues(set); + q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, set->numa_node); if (!q->queue_hw_ctx) - goto err_percpu; - - q->mq_map = set->mq_map; + goto err_sys_init; blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) @@ -2685,12 +2822,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); - q->nr_queues = nr_cpu_ids; + q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); if (!(set->flags & BLK_MQ_F_SG_MERGE)) - queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q); q->sg_reserved_size = INT_MAX; @@ -2699,8 +2839,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); blk_queue_make_request(q, blk_mq_make_request); - if (q->mq_ops->poll) - q->poll_fn = blk_mq_poll; /* * Do this after blk_queue_make_request() overrides it... @@ -2712,9 +2850,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, */ q->poll_nsec = -1; - if (set->ops->complete) - blk_queue_softirq_done(q, set->ops->complete); - blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); @@ -2731,8 +2866,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, err_hctxs: kfree(q->queue_hw_ctx); -err_percpu: - free_percpu(q->queue_ctx); +err_sys_init: + blk_mq_sysfs_deinit(q); err_exit: q->mq_ops = NULL; return ERR_PTR(-ENOMEM); @@ -2801,7 +2936,9 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { - if (set->ops->map_queues) { + if (set->ops->map_queues && !is_kdump_kernel()) { + int i; + /* * transport .map_queues is usually done in the following * way: @@ -2809,18 +2946,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) - * set->mq_map[cpu] = queue; + * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. */ - blk_mq_clear_mq_map(set); + for (i = 0; i < set->nr_maps; i++) + blk_mq_clear_mq_map(&set->map[i]); return set->ops->map_queues(set); - } else - return blk_mq_map_queues(set); + } else { + BUG_ON(set->nr_maps > 1); + return blk_mq_map_queues(&set->map[0]); + } } /* @@ -2831,7 +2971,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { - int ret; + int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); @@ -2854,6 +2994,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->queue_depth = BLK_MQ_MAX_DEPTH; } + if (!set->nr_maps) + set->nr_maps = 1; + else if (set->nr_maps > HCTX_MAX_TYPES) + return -EINVAL; + /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 1 queue and @@ -2861,24 +3006,30 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) */ if (is_kdump_kernel()) { set->nr_hw_queues = 1; + set->nr_maps = 1; set->queue_depth = min(64U, set->queue_depth); } /* - * There is no use for more h/w queues than cpus. + * There is no use for more h/w queues than cpus if we just have + * a single map */ - if (set->nr_hw_queues > nr_cpu_ids) + if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *), + set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) return -ENOMEM; ret = -ENOMEM; - set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map), - GFP_KERNEL, set->numa_node); - if (!set->mq_map) - goto out_free_tags; + for (i = 0; i < set->nr_maps; i++) { + set->map[i].mq_map = kcalloc_node(nr_cpu_ids, + sizeof(set->map[i].mq_map[0]), + GFP_KERNEL, set->numa_node); + if (!set->map[i].mq_map) + goto out_free_mq_map; + set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; + } ret = blk_mq_update_queue_map(set); if (ret) @@ -2894,9 +3045,10 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) return 0; out_free_mq_map: - kfree(set->mq_map); - set->mq_map = NULL; -out_free_tags: + for (i = 0; i < set->nr_maps; i++) { + kfree(set->map[i].mq_map); + set->map[i].mq_map = NULL; + } kfree(set->tags); set->tags = NULL; return ret; @@ -2905,13 +3057,15 @@ EXPORT_SYMBOL(blk_mq_alloc_tag_set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { - int i; + int i, j; - for (i = 0; i < nr_cpu_ids; i++) + for (i = 0; i < nr_hw_queues(set); i++) blk_mq_free_map_and_requests(set, i); - kfree(set->mq_map); - set->mq_map = NULL; + for (j = 0; j < set->nr_maps; j++) { + kfree(set->map[j].mq_map); + set->map[j].mq_map = NULL; + } kfree(set->tags); set->tags = NULL; @@ -3037,7 +3191,7 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, lockdep_assert_held(&set->tag_list_lock); - if (nr_hw_queues > nr_cpu_ids) + if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) return; @@ -3072,7 +3226,7 @@ fallback: pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); set->nr_hw_queues = prev_nr_hw_queues; - blk_mq_map_queues(set); + blk_mq_map_queues(&set->map[0]); goto fallback; } blk_mq_map_swqueue(q); @@ -3179,15 +3333,12 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, return false; /* - * poll_nsec can be: + * If we get here, hybrid polling is enabled. Hence poll_nsec can be: * - * -1: don't ever hybrid sleep * 0: use half of prev avg * >0: use this specific value */ - if (q->poll_nsec == -1) - return false; - else if (q->poll_nsec > 0) + if (q->poll_nsec > 0) nsecs = q->poll_nsec; else nsecs = blk_mq_poll_nsecs(q, hctx, rq); @@ -3224,11 +3375,57 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, return true; } -static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) +static bool blk_mq_poll_hybrid(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) { - struct request_queue *q = hctx->queue; + struct request *rq; + + if (q->poll_nsec == -1) + return false; + + if (!blk_qc_t_is_internal(cookie)) + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + else { + rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); + /* + * With scheduling, if the request has completed, we'll + * get a NULL return here, as we clear the sched tag when + * that happens. The request still remains valid, like always, + * so we should be safe with just the NULL check. + */ + if (!rq) + return false; + } + + return blk_mq_poll_hybrid_sleep(q, hctx, rq); +} + +/** + * blk_poll - poll for IO completions + * @q: the queue + * @cookie: cookie passed back at IO submission time + * @spin: whether to spin for completions + * + * Description: + * Poll for completions on the passed in queue. Returns number of + * completed entries found. If @spin is true, then blk_poll will continue + * looping until at least one completion is found, unless the task is + * otherwise marked running (or we need to reschedule). + */ +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) +{ + struct blk_mq_hw_ctx *hctx; long state; + if (!blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return 0; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + + hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + /* * If we sleep, have the caller restart the poll loop to reset * the state. Like for the other success return cases, the @@ -3236,63 +3433,44 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) * the IO isn't complete, we'll get called again and will go * straight to the busy poll loop. */ - if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) - return true; + if (blk_mq_poll_hybrid(q, hctx, cookie)) + return 1; hctx->poll_considered++; state = current->state; - while (!need_resched()) { + do { int ret; hctx->poll_invoked++; - ret = q->mq_ops->poll(hctx, rq->tag); + ret = q->mq_ops->poll(hctx); if (ret > 0) { hctx->poll_success++; - set_current_state(TASK_RUNNING); - return true; + __set_current_state(TASK_RUNNING); + return ret; } if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); if (current->state == TASK_RUNNING) - return true; - if (ret < 0) + return 1; + if (ret < 0 || !spin) break; cpu_relax(); - } + } while (!need_resched()); __set_current_state(TASK_RUNNING); - return false; + return 0; } +EXPORT_SYMBOL_GPL(blk_poll); -static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +unsigned int blk_mq_rq_cpu(struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct request *rq; - - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return false; - - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; - if (!blk_qc_t_is_internal(cookie)) - rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); - else { - rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); - /* - * With scheduling, if the request has completed, we'll - * get a NULL return here, as we clear the sched tag when - * that happens. The request still remains valid, like always, - * so we should be safe with just the NULL check. - */ - if (!rq) - return false; - } - - return __blk_mq_poll(hctx, rq); + return rq->mq_ctx->cpu; } +EXPORT_SYMBOL(blk_mq_rq_cpu); static int __init blk_mq_init(void) { diff --git a/block/blk-mq.h b/block/blk-mq.h index 9497b47e2526..d943d46b0785 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -7,17 +7,22 @@ struct blk_mq_tag_set; +struct blk_mq_ctxs { + struct kobject kobj; + struct blk_mq_ctx __percpu *queue_ctx; +}; + /** * struct blk_mq_ctx - State for a software queue facing the submitting CPUs */ struct blk_mq_ctx { struct { spinlock_t lock; - struct list_head rq_list; - } ____cacheline_aligned_in_smp; + struct list_head rq_lists[HCTX_MAX_TYPES]; + } ____cacheline_aligned_in_smp; unsigned int cpu; - unsigned int index_hw; + unsigned short index_hw[HCTX_MAX_TYPES]; /* incremented at dispatch time */ unsigned long rq_dispatched[2]; @@ -27,6 +32,7 @@ struct blk_mq_ctx { unsigned long ____cacheline_aligned_in_smp rq_completed[2]; struct request_queue *queue; + struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; @@ -62,20 +68,55 @@ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue); void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, struct list_head *list); -/* Used by blk_insert_cloned_request() to issue request directly */ -blk_status_t blk_mq_request_issue_directly(struct request *rq); +blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, + bool bypass, bool last); void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); /* * CPU -> queue mappings */ -extern int blk_mq_hw_queue_to_node(unsigned int *map, unsigned int); +extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); + +/* + * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue + * @q: request queue + * @type: the hctx type index + * @cpu: CPU + */ +static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, + enum hctx_type type, + unsigned int cpu) +{ + return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; +} +/* + * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue + * @q: request queue + * @flags: request command flags + * @cpu: CPU + */ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - int cpu) + unsigned int flags, + unsigned int cpu) { - return q->queue_hw_ctx[q->mq_map[cpu]]; + enum hctx_type type = HCTX_TYPE_DEFAULT; + + if ((flags & REQ_HIPRI) && + q->tag_set->nr_maps > HCTX_TYPE_POLL && + q->tag_set->map[HCTX_TYPE_POLL].nr_queues && + test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + type = HCTX_TYPE_POLL; + + else if (((flags & REQ_OP_MASK) == REQ_OP_READ) && + q->tag_set->nr_maps > HCTX_TYPE_READ && + q->tag_set->map[HCTX_TYPE_READ].nr_queues) + type = HCTX_TYPE_READ; + + return blk_mq_map_queue_type(q, type, cpu); } /* @@ -126,6 +167,7 @@ struct blk_mq_alloc_data { struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; + unsigned int cmd_flags; /* input & output parameter */ struct blk_mq_ctx *ctx; @@ -150,8 +192,7 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); +unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]); @@ -195,21 +236,18 @@ static inline void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, static inline void blk_mq_put_driver_tag(struct request *rq) { - struct blk_mq_hw_ctx *hctx; - if (rq->tag == -1 || rq->internal_tag == -1) return; - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); - __blk_mq_put_driver_tag(hctx, rq); + __blk_mq_put_driver_tag(rq->mq_hctx, rq); } -static inline void blk_mq_clear_mq_map(struct blk_mq_tag_set *set) +static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; for_each_possible_cpu(cpu) - set->mq_map[cpu] = 0; + qmap->mq_map[cpu] = 0; } #endif diff --git a/block/blk-pm.c b/block/blk-pm.c index f8fdae01bea2..0a028c189897 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -89,12 +89,12 @@ int blk_pre_runtime_suspend(struct request_queue *q) /* Switch q_usage_counter back to per-cpu mode. */ blk_mq_unfreeze_queue(q); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (ret < 0) pm_runtime_mark_last_busy(q->dev); else q->rpm_status = RPM_SUSPENDING; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (ret) blk_clear_pm_only(q); @@ -121,14 +121,14 @@ void blk_post_runtime_suspend(struct request_queue *q, int err) if (!q->dev) return; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (!err) { q->rpm_status = RPM_SUSPENDED; } else { q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (err) blk_clear_pm_only(q); @@ -151,9 +151,9 @@ void blk_pre_runtime_resume(struct request_queue *q) if (!q->dev) return; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); q->rpm_status = RPM_RESUMING; - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } EXPORT_SYMBOL(blk_pre_runtime_resume); @@ -176,7 +176,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err) if (!q->dev) return; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (!err) { q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); @@ -184,7 +184,7 @@ void blk_post_runtime_resume(struct request_queue *q, int err) } else { q->rpm_status = RPM_SUSPENDED; } - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (!err) blk_clear_pm_only(q); @@ -207,10 +207,10 @@ EXPORT_SYMBOL(blk_post_runtime_resume); */ void blk_set_runtime_active(struct request_queue *q) { - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); pm_request_autosuspend(q->dev); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } EXPORT_SYMBOL(blk_set_runtime_active); diff --git a/block/blk-pm.h b/block/blk-pm.h index a8564ea72a41..ea5507d23e75 100644 --- a/block/blk-pm.h +++ b/block/blk-pm.h @@ -21,7 +21,7 @@ static inline void blk_pm_mark_last_busy(struct request *rq) static inline void blk_pm_requeue_request(struct request *rq) { - lockdep_assert_held(rq->q->queue_lock); + lockdep_assert_held(&rq->q->queue_lock); if (rq->q->dev && !(rq->rq_flags & RQF_PM)) rq->q->nr_pending--; @@ -30,7 +30,7 @@ static inline void blk_pm_requeue_request(struct request *rq) static inline void blk_pm_add_request(struct request_queue *q, struct request *rq) { - lockdep_assert_held(q->queue_lock); + lockdep_assert_held(&q->queue_lock); if (q->dev && !(rq->rq_flags & RQF_PM)) q->nr_pending++; @@ -38,7 +38,7 @@ static inline void blk_pm_add_request(struct request_queue *q, static inline void blk_pm_put_request(struct request *rq) { - lockdep_assert_held(rq->q->queue_lock); + lockdep_assert_held(&rq->q->queue_lock); if (rq->q->dev && !(rq->rq_flags & RQF_PM)) --rq->q->nr_pending; diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 0005dfd568dd..d169d7188fa6 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -27,75 +27,67 @@ bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) return atomic_inc_below(&rq_wait->inflight, limit); } -void rq_qos_cleanup(struct request_queue *q, struct bio *bio) +void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio) { - struct rq_qos *rqos; - - for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->cleanup) rqos->ops->cleanup(rqos, bio); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_done(struct request_queue *q, struct request *rq) +void __rq_qos_done(struct rq_qos *rqos, struct request *rq) { - struct rq_qos *rqos; - - for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->done) rqos->ops->done(rqos, rq); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_issue(struct request_queue *q, struct request *rq) +void __rq_qos_issue(struct rq_qos *rqos, struct request *rq) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->issue) rqos->ops->issue(rqos, rq); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_requeue(struct request_queue *q, struct request *rq) +void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->requeue) rqos->ops->requeue(rqos, rq); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_throttle(struct request_queue *q, struct bio *bio, - spinlock_t *lock) +void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->throttle) - rqos->ops->throttle(rqos, bio, lock); - } + rqos->ops->throttle(rqos, bio); + rqos = rqos->next; + } while (rqos); } -void rq_qos_track(struct request_queue *q, struct request *rq, struct bio *bio) +void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->track) rqos->ops->track(rqos, rq, bio); - } + rqos = rqos->next; + } while (rqos); } -void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) { - struct rq_qos *rqos; - - for(rqos = q->rq_qos; rqos; rqos = rqos->next) { + do { if (rqos->ops->done_bio) rqos->ops->done_bio(rqos, bio); - } + rqos = rqos->next; + } while (rqos); } /* @@ -184,8 +176,96 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) rq_depth_calc_max_depth(rqd); } +struct rq_qos_wait_data { + struct wait_queue_entry wq; + struct task_struct *task; + struct rq_wait *rqw; + acquire_inflight_cb_t *cb; + void *private_data; + bool got_token; +}; + +static int rq_qos_wake_function(struct wait_queue_entry *curr, + unsigned int mode, int wake_flags, void *key) +{ + struct rq_qos_wait_data *data = container_of(curr, + struct rq_qos_wait_data, + wq); + + /* + * If we fail to get a budget, return -1 to interrupt the wake up loop + * in __wake_up_common. + */ + if (!data->cb(data->rqw, data->private_data)) + return -1; + + data->got_token = true; + list_del_init(&curr->entry); + wake_up_process(data->task); + return 1; +} + +/** + * rq_qos_wait - throttle on a rqw if we need to + * @private_data - caller provided specific data + * @acquire_inflight_cb - inc the rqw->inflight counter if we can + * @cleanup_cb - the callback to cleanup in case we race with a waker + * + * This provides a uniform place for the rq_qos users to do their throttling. + * Since you can end up with a lot of things sleeping at once, this manages the + * waking up based on the resources available. The acquire_inflight_cb should + * inc the rqw->inflight if we have the ability to do so, or return false if not + * and then we will sleep until the room becomes available. + * + * cleanup_cb is in case that we race with a waker and need to cleanup the + * inflight count accordingly. + */ +void rq_qos_wait(struct rq_wait *rqw, void *private_data, + acquire_inflight_cb_t *acquire_inflight_cb, + cleanup_cb_t *cleanup_cb) +{ + struct rq_qos_wait_data data = { + .wq = { + .func = rq_qos_wake_function, + .entry = LIST_HEAD_INIT(data.wq.entry), + }, + .task = current, + .rqw = rqw, + .cb = acquire_inflight_cb, + .private_data = private_data, + }; + bool has_sleeper; + + has_sleeper = wq_has_sleeper(&rqw->wait); + if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + return; + + prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); + do { + if (data.got_token) + break; + if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + finish_wait(&rqw->wait, &data.wq); + + /* + * We raced with wbt_wake_function() getting a token, + * which means we now have two. Put our local token + * and wake anyone else potentially waiting for one. + */ + if (data.got_token) + cleanup_cb(rqw, private_data); + break; + } + io_schedule(); + has_sleeper = false; + } while (1); + finish_wait(&rqw->wait, &data.wq); +} + void rq_qos_exit(struct request_queue *q) { + blk_mq_debugfs_unregister_queue_rqos(q); + while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 32b02efbfa66..564851889550 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -7,6 +7,10 @@ #include <linux/atomic.h> #include <linux/wait.h> +#include "blk-mq-debugfs.h" + +struct blk_mq_debugfs_attr; + enum rq_qos_id { RQ_QOS_WBT, RQ_QOS_CGROUP, @@ -22,10 +26,13 @@ struct rq_qos { struct request_queue *q; enum rq_qos_id id; struct rq_qos *next; +#ifdef CONFIG_BLK_DEBUG_FS + struct dentry *debugfs_dir; +#endif }; struct rq_qos_ops { - void (*throttle)(struct rq_qos *, struct bio *, spinlock_t *); + void (*throttle)(struct rq_qos *, struct bio *); void (*track)(struct rq_qos *, struct request *, struct bio *); void (*issue)(struct rq_qos *, struct request *); void (*requeue)(struct rq_qos *, struct request *); @@ -33,6 +40,7 @@ struct rq_qos_ops { void (*done_bio)(struct rq_qos *, struct bio *); void (*cleanup)(struct rq_qos *, struct bio *); void (*exit)(struct rq_qos *); + const struct blk_mq_debugfs_attr *debugfs_attrs; }; struct rq_depth { @@ -66,6 +74,17 @@ static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) return rq_qos_id(q, RQ_QOS_CGROUP); } +static inline const char *rq_qos_id_to_name(enum rq_qos_id id) +{ + switch (id) { + case RQ_QOS_WBT: + return "wbt"; + case RQ_QOS_CGROUP: + return "cgroup"; + } + return "unknown"; +} + static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); @@ -76,6 +95,9 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) { rqos->next = q->rq_qos; q->rq_qos = rqos; + + if (rqos->ops->debugfs_attrs) + blk_mq_debugfs_register_rqos(rqos); } static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) @@ -91,19 +113,77 @@ static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) } prev = cur; } + + blk_mq_debugfs_unregister_rqos(rqos); } +typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); +typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); + +void rq_qos_wait(struct rq_wait *rqw, void *private_data, + acquire_inflight_cb_t *acquire_inflight_cb, + cleanup_cb_t *cleanup_cb); bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); void rq_depth_scale_up(struct rq_depth *rqd); void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); bool rq_depth_calc_max_depth(struct rq_depth *rqd); -void rq_qos_cleanup(struct request_queue *, struct bio *); -void rq_qos_done(struct request_queue *, struct request *); -void rq_qos_issue(struct request_queue *, struct request *); -void rq_qos_requeue(struct request_queue *, struct request *); -void rq_qos_done_bio(struct request_queue *q, struct bio *bio); -void rq_qos_throttle(struct request_queue *, struct bio *, spinlock_t *); -void rq_qos_track(struct request_queue *q, struct request *, struct bio *); +void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); +void __rq_qos_done(struct rq_qos *rqos, struct request *rq); +void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); +void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); +void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); +void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); +void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); + +static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_cleanup(q->rq_qos, bio); +} + +static inline void rq_qos_done(struct request_queue *q, struct request *rq) +{ + if (q->rq_qos) + __rq_qos_done(q->rq_qos, rq); +} + +static inline void rq_qos_issue(struct request_queue *q, struct request *rq) +{ + if (q->rq_qos) + __rq_qos_issue(q->rq_qos, rq); +} + +static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) +{ + if (q->rq_qos) + __rq_qos_requeue(q->rq_qos, rq); +} + +static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); +} + +static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) +{ + /* + * BIO_TRACKED lets controllers know that a bio went through the + * normal rq_qos path. + */ + bio_set_flag(bio, BIO_TRACKED); + if (q->rq_qos) + __rq_qos_throttle(q->rq_qos, bio); +} + +static inline void rq_qos_track(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_track(q->rq_qos, rq, bio); +} + void rq_qos_exit(struct request_queue *); + #endif diff --git a/block/blk-settings.c b/block/blk-settings.c index 9c8b62f8c180..3e7038e475ee 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -20,65 +20,12 @@ EXPORT_SYMBOL(blk_max_low_pfn); unsigned long blk_max_pfn; -/** - * blk_queue_prep_rq - set a prepare_request function for queue - * @q: queue - * @pfn: prepare_request function - * - * It's possible for a queue to register a prepare_request callback which - * is invoked before the request is handed to the request_fn. The goal of - * the function is to prepare a request for I/O, it can be used to build a - * cdb from the request data for instance. - * - */ -void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) -{ - q->prep_rq_fn = pfn; -} -EXPORT_SYMBOL(blk_queue_prep_rq); - -/** - * blk_queue_unprep_rq - set an unprepare_request function for queue - * @q: queue - * @ufn: unprepare_request function - * - * It's possible for a queue to register an unprepare_request callback - * which is invoked before the request is finally completed. The goal - * of the function is to deallocate any data that was allocated in the - * prepare_request callback. - * - */ -void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) -{ - q->unprep_rq_fn = ufn; -} -EXPORT_SYMBOL(blk_queue_unprep_rq); - -void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) -{ - q->softirq_done_fn = fn; -} -EXPORT_SYMBOL(blk_queue_softirq_done); - void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { q->rq_timeout = timeout; } EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); -void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn) -{ - WARN_ON_ONCE(q->mq_ops); - q->rq_timed_out_fn = fn; -} -EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out); - -void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn) -{ - q->lld_busy_fn = fn; -} -EXPORT_SYMBOL_GPL(blk_queue_lld_busy); - /** * blk_set_default_limits - reset limits to default values * @lim: the queue_limits structure to reset @@ -168,8 +115,6 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->make_request_fn = mfn; blk_queue_dma_alignment(q, 511); - blk_queue_congestion_threshold(q); - q->nr_batching = BLK_BATCH_REQ; blk_set_default_limits(&q->limits); } @@ -886,16 +831,14 @@ EXPORT_SYMBOL(blk_set_queue_depth); */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { - spin_lock_irq(q->queue_lock); if (wc) - queue_flag_set(QUEUE_FLAG_WC, q); + blk_queue_flag_set(QUEUE_FLAG_WC, q); else - queue_flag_clear(QUEUE_FLAG_WC, q); + blk_queue_flag_clear(QUEUE_FLAG_WC, q); if (fua) - queue_flag_set(QUEUE_FLAG_FUA, q); + blk_queue_flag_set(QUEUE_FLAG_FUA, q); else - queue_flag_clear(QUEUE_FLAG_FUA, q); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_clear(QUEUE_FLAG_FUA, q); wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } diff --git a/block/blk-softirq.c b/block/blk-softirq.c index e47a2f751884..457d9ba3eb20 100644 --- a/block/blk-softirq.c +++ b/block/blk-softirq.c @@ -34,7 +34,7 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h) rq = list_entry(local_list.next, struct request, ipi_list); list_del_init(&rq->ipi_list); - rq->q->softirq_done_fn(rq); + rq->q->mq_ops->complete(rq); } } @@ -98,11 +98,11 @@ static int blk_softirq_cpu_dead(unsigned int cpu) void __blk_complete_request(struct request *req) { struct request_queue *q = req->q; - int cpu, ccpu = q->mq_ops ? req->mq_ctx->cpu : req->cpu; + int cpu, ccpu = req->mq_ctx->cpu; unsigned long flags; bool shared = false; - BUG_ON(!q->softirq_done_fn); + BUG_ON(!q->mq_ops->complete); local_irq_save(flags); cpu = smp_processor_id(); @@ -143,27 +143,6 @@ do_local: local_irq_restore(flags); } -EXPORT_SYMBOL(__blk_complete_request); - -/** - * blk_complete_request - end I/O on a request - * @req: the request being processed - * - * Description: - * Ends all I/O on a request. It does not handle partial completions, - * unless the driver actually implements this in its completion callback - * through requeueing. The actual completion happens out-of-order, - * through a softirq handler. The user must have registered a completion - * callback through blk_queue_softirq_done(). - **/ -void blk_complete_request(struct request *req) -{ - if (unlikely(blk_should_fake_timeout(req->q))) - return; - if (!blk_mark_rq_complete(req)) - __blk_complete_request(req); -} -EXPORT_SYMBOL(blk_complete_request); static __init int blk_softirq_init(void) { diff --git a/block/blk-stat.c b/block/blk-stat.c index 90561af85a62..696a04176e4d 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -130,7 +130,6 @@ blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), return cb; } -EXPORT_SYMBOL_GPL(blk_stat_alloc_callback); void blk_stat_add_callback(struct request_queue *q, struct blk_stat_callback *cb) @@ -151,7 +150,6 @@ void blk_stat_add_callback(struct request_queue *q, blk_queue_flag_set(QUEUE_FLAG_STATS, q); spin_unlock(&q->stats->lock); } -EXPORT_SYMBOL_GPL(blk_stat_add_callback); void blk_stat_remove_callback(struct request_queue *q, struct blk_stat_callback *cb) @@ -164,7 +162,6 @@ void blk_stat_remove_callback(struct request_queue *q, del_timer_sync(&cb->timer); } -EXPORT_SYMBOL_GPL(blk_stat_remove_callback); static void blk_stat_free_callback_rcu(struct rcu_head *head) { @@ -181,7 +178,6 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) if (cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } -EXPORT_SYMBOL_GPL(blk_stat_free_callback); void blk_stat_enable_accounting(struct request_queue *q) { diff --git a/block/blk-stat.h b/block/blk-stat.h index f4a1568e81a4..17b47a86eefb 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -145,6 +145,11 @@ static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb, mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); } +static inline void blk_stat_deactivate(struct blk_stat_callback *cb) +{ + del_timer_sync(&cb->timer); +} + /** * blk_stat_activate_msecs() - Gather block statistics during a time window in * milliseconds. diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 5144707f25ea..590d1ef2f961 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -68,7 +68,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) unsigned long nr; int ret, err; - if (!q->request_fn && !q->mq_ops) + if (!queue_is_mq(q)) return -EINVAL; ret = queue_var_store(&nr, page, count); @@ -78,11 +78,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) if (nr < BLKDEV_MIN_RQ) nr = BLKDEV_MIN_RQ; - if (q->request_fn) - err = blk_update_nr_requests(q, nr); - else - err = blk_mq_update_nr_requests(q, nr); - + err = blk_mq_update_nr_requests(q, nr); if (err) return err; @@ -239,10 +235,10 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) return -EINVAL; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); return ret; } @@ -317,14 +313,12 @@ static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, if (ret < 0) return ret; - spin_lock_irq(q->queue_lock); - queue_flag_clear(QUEUE_FLAG_NOMERGES, q); - queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); if (nm == 2) - queue_flag_set(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); else if (nm) - queue_flag_set(QUEUE_FLAG_NOXMERGES, q); - spin_unlock_irq(q->queue_lock); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); return ret; } @@ -348,18 +342,16 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; - spin_lock_irq(q->queue_lock); if (val == 2) { - queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 1) { - queue_flag_set(QUEUE_FLAG_SAME_COMP, q); - queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } else if (val == 0) { - queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); - queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); } - spin_unlock_irq(q->queue_lock); #endif return ret; } @@ -407,7 +399,8 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, unsigned long poll_on; ssize_t ret; - if (!q->mq_ops || !q->mq_ops->poll) + if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL || + !q->tag_set->map[HCTX_TYPE_POLL].nr_queues) return -EINVAL; ret = queue_var_store(&poll_on, page, count); @@ -422,6 +415,26 @@ static ssize_t queue_poll_store(struct request_queue *q, const char *page, return ret; } +static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout)); +} + +static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned int val; + int err; + + err = kstrtou32(page, 10, &val); + if (err || val == 0) + return -EINVAL; + + blk_queue_rq_timeout(q, msecs_to_jiffies(val)); + + return count; +} + static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) { if (!wbt_rq_qos(q)) @@ -460,20 +473,14 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, * ends up either enabling or disabling wbt completely. We can't * have IO inflight if that happens. */ - if (q->mq_ops) { - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - } else - blk_queue_bypass_start(q); + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); wbt_set_min_lat(q, val); wbt_update_limits(q); - if (q->mq_ops) { - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - } else - blk_queue_bypass_end(q); + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); return count; } @@ -696,6 +703,12 @@ static struct queue_sysfs_entry queue_dax_entry = { .show = queue_dax_show, }; +static struct queue_sysfs_entry queue_io_timeout_entry = { + .attr = {.name = "io_timeout", .mode = 0644 }, + .show = queue_io_timeout_show, + .store = queue_io_timeout_store, +}; + static struct queue_sysfs_entry queue_wb_lat_entry = { .attr = {.name = "wbt_lat_usec", .mode = 0644 }, .show = queue_wb_lat_show, @@ -745,6 +758,7 @@ static struct attribute *default_attrs[] = { &queue_dax_entry.attr, &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, + &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &throtl_sample_time_entry.attr, #endif @@ -844,24 +858,14 @@ static void __blk_release_queue(struct work_struct *work) blk_free_queue_stats(q->stats); - blk_exit_rl(q, &q->root_rl); - - if (q->queue_tags) - __blk_queue_free_tags(q); - blk_queue_free_zone_bitmaps(q); - if (!q->mq_ops) { - if (q->exit_rq_fn) - q->exit_rq_fn(q, q->fq->flush_rq); - blk_free_flush_queue(q->fq); - } else { + if (queue_is_mq(q)) blk_mq_release(q); - } blk_trace_shutdown(q); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_debugfs_unregister(q); bioset_exit(&q->bio_split); @@ -906,7 +910,7 @@ int blk_register_queue(struct gendisk *disk) WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags), "%s is registering an already registered queue\n", kobject_name(&dev->kobj)); - queue_flag_set_unlocked(QUEUE_FLAG_REGISTERED, q); + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); /* * SCSI probing may synchronously create and destroy a lot of @@ -918,9 +922,8 @@ int blk_register_queue(struct gendisk *disk) * request_queues for non-existent devices never get registered. */ if (!blk_queue_init_done(q)) { - queue_flag_set_unlocked(QUEUE_FLAG_INIT_DONE, q); + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); percpu_ref_switch_to_percpu(&q->q_usage_counter); - blk_queue_bypass_end(q); } ret = blk_trace_init_sysfs(dev); @@ -936,7 +939,7 @@ int blk_register_queue(struct gendisk *disk) goto unlock; } - if (q->mq_ops) { + if (queue_is_mq(q)) { __blk_mq_register_dev(dev, q); blk_mq_debugfs_register(q); } @@ -947,7 +950,7 @@ int blk_register_queue(struct gendisk *disk) blk_throtl_register_queue(q); - if (q->request_fn || (q->mq_ops && q->elevator)) { + if (q->elevator) { ret = elv_register_queue(q); if (ret) { mutex_unlock(&q->sysfs_lock); @@ -996,7 +999,7 @@ void blk_unregister_queue(struct gendisk *disk) * Remove the sysfs attributes before unregistering the queue data * structures that can be modified through sysfs. */ - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_unregister_dev(disk_to_dev(disk), q); mutex_unlock(&q->sysfs_lock); @@ -1005,7 +1008,7 @@ void blk_unregister_queue(struct gendisk *disk) blk_trace_remove_sysfs(disk_to_dev(disk)); mutex_lock(&q->sysfs_lock); - if (q->request_fn || (q->mq_ops && q->elevator)) + if (q->elevator) elv_unregister_queue(q); mutex_unlock(&q->sysfs_lock); diff --git a/block/blk-tag.c b/block/blk-tag.c deleted file mode 100644 index fbc153aef166..000000000000 --- a/block/blk-tag.c +++ /dev/null @@ -1,378 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions related to tagged command queuing - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/slab.h> - -#include "blk.h" - -/** - * blk_queue_find_tag - find a request by its tag and queue - * @q: The request queue for the device - * @tag: The tag of the request - * - * Notes: - * Should be used when a device returns a tag and you want to match - * it with a request. - * - * no locks need be held. - **/ -struct request *blk_queue_find_tag(struct request_queue *q, int tag) -{ - return blk_map_queue_find_tag(q->queue_tags, tag); -} -EXPORT_SYMBOL(blk_queue_find_tag); - -/** - * blk_free_tags - release a given set of tag maintenance info - * @bqt: the tag map to free - * - * Drop the reference count on @bqt and frees it when the last reference - * is dropped. - */ -void blk_free_tags(struct blk_queue_tag *bqt) -{ - if (atomic_dec_and_test(&bqt->refcnt)) { - BUG_ON(find_first_bit(bqt->tag_map, bqt->max_depth) < - bqt->max_depth); - - kfree(bqt->tag_index); - bqt->tag_index = NULL; - - kfree(bqt->tag_map); - bqt->tag_map = NULL; - - kfree(bqt); - } -} -EXPORT_SYMBOL(blk_free_tags); - -/** - * __blk_queue_free_tags - release tag maintenance info - * @q: the request queue for the device - * - * Notes: - * blk_cleanup_queue() will take care of calling this function, if tagging - * has been used. So there's no need to call this directly. - **/ -void __blk_queue_free_tags(struct request_queue *q) -{ - struct blk_queue_tag *bqt = q->queue_tags; - - if (!bqt) - return; - - blk_free_tags(bqt); - - q->queue_tags = NULL; - queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); -} - -/** - * blk_queue_free_tags - release tag maintenance info - * @q: the request queue for the device - * - * Notes: - * This is used to disable tagged queuing to a device, yet leave - * queue in function. - **/ -void blk_queue_free_tags(struct request_queue *q) -{ - queue_flag_clear_unlocked(QUEUE_FLAG_QUEUED, q); -} -EXPORT_SYMBOL(blk_queue_free_tags); - -static int -init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) -{ - struct request **tag_index; - unsigned long *tag_map; - int nr_ulongs; - - if (q && depth > q->nr_requests * 2) { - depth = q->nr_requests * 2; - printk(KERN_ERR "%s: adjusted depth to %d\n", - __func__, depth); - } - - tag_index = kcalloc(depth, sizeof(struct request *), GFP_ATOMIC); - if (!tag_index) - goto fail; - - nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; - tag_map = kcalloc(nr_ulongs, sizeof(unsigned long), GFP_ATOMIC); - if (!tag_map) - goto fail; - - tags->real_max_depth = depth; - tags->max_depth = depth; - tags->tag_index = tag_index; - tags->tag_map = tag_map; - - return 0; -fail: - kfree(tag_index); - return -ENOMEM; -} - -static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, - int depth, int alloc_policy) -{ - struct blk_queue_tag *tags; - - tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); - if (!tags) - goto fail; - - if (init_tag_map(q, tags, depth)) - goto fail; - - atomic_set(&tags->refcnt, 1); - tags->alloc_policy = alloc_policy; - tags->next_tag = 0; - return tags; -fail: - kfree(tags); - return NULL; -} - -/** - * blk_init_tags - initialize the tag info for an external tag map - * @depth: the maximum queue depth supported - * @alloc_policy: tag allocation policy - **/ -struct blk_queue_tag *blk_init_tags(int depth, int alloc_policy) -{ - return __blk_queue_init_tags(NULL, depth, alloc_policy); -} -EXPORT_SYMBOL(blk_init_tags); - -/** - * blk_queue_init_tags - initialize the queue tag info - * @q: the request queue for the device - * @depth: the maximum queue depth supported - * @tags: the tag to use - * @alloc_policy: tag allocation policy - * - * Queue lock must be held here if the function is called to resize an - * existing map. - **/ -int blk_queue_init_tags(struct request_queue *q, int depth, - struct blk_queue_tag *tags, int alloc_policy) -{ - int rc; - - BUG_ON(tags && q->queue_tags && tags != q->queue_tags); - - if (!tags && !q->queue_tags) { - tags = __blk_queue_init_tags(q, depth, alloc_policy); - - if (!tags) - return -ENOMEM; - - } else if (q->queue_tags) { - rc = blk_queue_resize_tags(q, depth); - if (rc) - return rc; - queue_flag_set(QUEUE_FLAG_QUEUED, q); - return 0; - } else - atomic_inc(&tags->refcnt); - - /* - * assign it, all done - */ - q->queue_tags = tags; - queue_flag_set_unlocked(QUEUE_FLAG_QUEUED, q); - return 0; -} -EXPORT_SYMBOL(blk_queue_init_tags); - -/** - * blk_queue_resize_tags - change the queueing depth - * @q: the request queue for the device - * @new_depth: the new max command queueing depth - * - * Notes: - * Must be called with the queue lock held. - **/ -int blk_queue_resize_tags(struct request_queue *q, int new_depth) -{ - struct blk_queue_tag *bqt = q->queue_tags; - struct request **tag_index; - unsigned long *tag_map; - int max_depth, nr_ulongs; - - if (!bqt) - return -ENXIO; - - /* - * if we already have large enough real_max_depth. just - * adjust max_depth. *NOTE* as requests with tag value - * between new_depth and real_max_depth can be in-flight, tag - * map can not be shrunk blindly here. - */ - if (new_depth <= bqt->real_max_depth) { - bqt->max_depth = new_depth; - return 0; - } - - /* - * Currently cannot replace a shared tag map with a new - * one, so error out if this is the case - */ - if (atomic_read(&bqt->refcnt) != 1) - return -EBUSY; - - /* - * save the old state info, so we can copy it back - */ - tag_index = bqt->tag_index; - tag_map = bqt->tag_map; - max_depth = bqt->real_max_depth; - - if (init_tag_map(q, bqt, new_depth)) - return -ENOMEM; - - memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); - nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; - memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); - - kfree(tag_index); - kfree(tag_map); - return 0; -} -EXPORT_SYMBOL(blk_queue_resize_tags); - -/** - * blk_queue_end_tag - end tag operations for a request - * @q: the request queue for the device - * @rq: the request that has completed - * - * Description: - * Typically called when end_that_request_first() returns %0, meaning - * all transfers have been done for a request. It's important to call - * this function before end_that_request_last(), as that will put the - * request back on the free list thus corrupting the internal tag list. - **/ -void blk_queue_end_tag(struct request_queue *q, struct request *rq) -{ - struct blk_queue_tag *bqt = q->queue_tags; - unsigned tag = rq->tag; /* negative tags invalid */ - - lockdep_assert_held(q->queue_lock); - - BUG_ON(tag >= bqt->real_max_depth); - - list_del_init(&rq->queuelist); - rq->rq_flags &= ~RQF_QUEUED; - rq->tag = -1; - rq->internal_tag = -1; - - if (unlikely(bqt->tag_index[tag] == NULL)) - printk(KERN_ERR "%s: tag %d is missing\n", - __func__, tag); - - bqt->tag_index[tag] = NULL; - - if (unlikely(!test_bit(tag, bqt->tag_map))) { - printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", - __func__, tag); - return; - } - /* - * The tag_map bit acts as a lock for tag_index[bit], so we need - * unlock memory barrier semantics. - */ - clear_bit_unlock(tag, bqt->tag_map); -} - -/** - * blk_queue_start_tag - find a free tag and assign it - * @q: the request queue for the device - * @rq: the block request that needs tagging - * - * Description: - * This can either be used as a stand-alone helper, or possibly be - * assigned as the queue &prep_rq_fn (in which case &struct request - * automagically gets a tag assigned). Note that this function - * assumes that any type of request can be queued! if this is not - * true for your device, you must check the request type before - * calling this function. The request will also be removed from - * the request queue, so it's the drivers responsibility to readd - * it if it should need to be restarted for some reason. - **/ -int blk_queue_start_tag(struct request_queue *q, struct request *rq) -{ - struct blk_queue_tag *bqt = q->queue_tags; - unsigned max_depth; - int tag; - - lockdep_assert_held(q->queue_lock); - - if (unlikely((rq->rq_flags & RQF_QUEUED))) { - printk(KERN_ERR - "%s: request %p for device [%s] already tagged %d", - __func__, rq, - rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); - BUG(); - } - - /* - * Protect against shared tag maps, as we may not have exclusive - * access to the tag map. - * - * We reserve a few tags just for sync IO, since we don't want - * to starve sync IO on behalf of flooding async IO. - */ - max_depth = bqt->max_depth; - if (!rq_is_sync(rq) && max_depth > 1) { - switch (max_depth) { - case 2: - max_depth = 1; - break; - case 3: - max_depth = 2; - break; - default: - max_depth -= 2; - } - if (q->in_flight[BLK_RW_ASYNC] > max_depth) - return 1; - } - - do { - if (bqt->alloc_policy == BLK_TAG_ALLOC_FIFO) { - tag = find_first_zero_bit(bqt->tag_map, max_depth); - if (tag >= max_depth) - return 1; - } else { - int start = bqt->next_tag; - int size = min_t(int, bqt->max_depth, max_depth + start); - tag = find_next_zero_bit(bqt->tag_map, size, start); - if (tag >= size && start + size > bqt->max_depth) { - size = start + size - bqt->max_depth; - tag = find_first_zero_bit(bqt->tag_map, size); - } - if (tag >= size) - return 1; - } - - } while (test_and_set_bit_lock(tag, bqt->tag_map)); - /* - * We need lock ordering semantics given by test_and_set_bit_lock. - * See blk_queue_end_tag for details. - */ - - bqt->next_tag = (tag + 1) % bqt->max_depth; - rq->rq_flags |= RQF_QUEUED; - rq->tag = tag; - bqt->tag_index[tag] = rq; - blk_start_request(rq); - return 0; -} -EXPORT_SYMBOL(blk_queue_start_tag); diff --git a/block/blk-throttle.c b/block/blk-throttle.c index db1a3a2ae006..1b97a73d2fb1 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -1243,7 +1243,7 @@ static void throtl_pending_timer_fn(struct timer_list *t) bool dispatched; int ret; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); if (throtl_can_upgrade(td, NULL)) throtl_upgrade_state(td); @@ -1266,9 +1266,9 @@ again: break; /* this dispatch windows is still open, relax and repeat */ - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); cpu_relax(); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); } if (!dispatched) @@ -1290,7 +1290,7 @@ again: queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); } /** @@ -1314,11 +1314,11 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) bio_list_init(&bio_list_on_stack); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); for (rw = READ; rw <= WRITE; rw++) while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) bio_list_add(&bio_list_on_stack, bio); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); @@ -2115,16 +2115,6 @@ static inline void throtl_update_latency_buckets(struct throtl_data *td) } #endif -static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio) -{ -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW - /* fallback to root_blkg if we fail to get a blkg ref */ - if (bio->bi_css && (bio_associate_blkg(bio, tg_to_blkg(tg)) == -ENODEV)) - bio_associate_blkg(bio, bio->bi_disk->queue->root_blkg); - bio_issue_init(&bio->bi_issue, bio_sectors(bio)); -#endif -} - bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, struct bio *bio) { @@ -2141,14 +2131,10 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw]) goto out; - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); throtl_update_latency_buckets(td); - if (unlikely(blk_queue_bypass(q))) - goto out_unlock; - - blk_throtl_assoc_bio(tg, bio); blk_throtl_update_idletime(tg); sq = &tg->service_queue; @@ -2227,7 +2213,7 @@ again: } out_unlock: - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); out: bio_set_flag(bio, BIO_THROTTLED); @@ -2348,7 +2334,7 @@ static void tg_drain_bios(struct throtl_service_queue *parent_sq) * Dispatch all currently throttled bios on @q through ->make_request_fn(). */ void blk_throtl_drain(struct request_queue *q) - __releases(q->queue_lock) __acquires(q->queue_lock) + __releases(&q->queue_lock) __acquires(&q->queue_lock) { struct throtl_data *td = q->td; struct blkcg_gq *blkg; @@ -2356,7 +2342,6 @@ void blk_throtl_drain(struct request_queue *q) struct bio *bio; int rw; - queue_lockdep_assert_held(q); rcu_read_lock(); /* @@ -2372,7 +2357,7 @@ void blk_throtl_drain(struct request_queue *q) tg_drain_bios(&td->service_queue); rcu_read_unlock(); - spin_unlock_irq(q->queue_lock); + spin_unlock_irq(&q->queue_lock); /* all bios now should be in td->service_queue, issue them */ for (rw = READ; rw <= WRITE; rw++) @@ -2380,7 +2365,7 @@ void blk_throtl_drain(struct request_queue *q) NULL))) generic_make_request(bio); - spin_lock_irq(q->queue_lock); + spin_lock_irq(&q->queue_lock); } int blk_throtl_init(struct request_queue *q) @@ -2460,7 +2445,7 @@ void blk_throtl_register_queue(struct request_queue *q) td->throtl_slice = DFL_THROTL_SLICE_HD; #endif - td->track_bio_latency = !queue_is_rq_based(q); + td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); } diff --git a/block/blk-timeout.c b/block/blk-timeout.c index f2cfd56e1606..124c26128bf6 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -68,80 +68,6 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, #endif /* CONFIG_FAIL_IO_TIMEOUT */ -/* - * blk_delete_timer - Delete/cancel timer for a given function. - * @req: request that we are canceling timer for - * - */ -void blk_delete_timer(struct request *req) -{ - list_del_init(&req->timeout_list); -} - -static void blk_rq_timed_out(struct request *req) -{ - struct request_queue *q = req->q; - enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; - - if (q->rq_timed_out_fn) - ret = q->rq_timed_out_fn(req); - switch (ret) { - case BLK_EH_RESET_TIMER: - blk_add_timer(req); - blk_clear_rq_complete(req); - break; - case BLK_EH_DONE: - /* - * LLD handles this for now but in the future - * we can send a request msg to abort the command - * and we can move more of the generic scsi eh code to - * the blk layer. - */ - break; - default: - printk(KERN_ERR "block: bad eh return: %d\n", ret); - break; - } -} - -static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout, - unsigned int *next_set) -{ - const unsigned long deadline = blk_rq_deadline(rq); - - if (time_after_eq(jiffies, deadline)) { - list_del_init(&rq->timeout_list); - - /* - * Check if we raced with end io completion - */ - if (!blk_mark_rq_complete(rq)) - blk_rq_timed_out(rq); - } else if (!*next_set || time_after(*next_timeout, deadline)) { - *next_timeout = deadline; - *next_set = 1; - } -} - -void blk_timeout_work(struct work_struct *work) -{ - struct request_queue *q = - container_of(work, struct request_queue, timeout_work); - unsigned long flags, next = 0; - struct request *rq, *tmp; - int next_set = 0; - - spin_lock_irqsave(q->queue_lock, flags); - - list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) - blk_rq_check_expired(rq, &next, &next_set); - - if (next_set) - mod_timer(&q->timeout, round_jiffies_up(next)); - - spin_unlock_irqrestore(q->queue_lock, flags); -} - /** * blk_abort_request -- Request request recovery for the specified command * @req: pointer to the request of interest @@ -149,24 +75,17 @@ void blk_timeout_work(struct work_struct *work) * This function requests that the block layer start recovery for the * request by deleting the timer and calling the q's timeout function. * LLDDs who implement their own error recovery MAY ignore the timeout - * event if they generated blk_abort_req. Must hold queue lock. + * event if they generated blk_abort_request. */ void blk_abort_request(struct request *req) { - if (req->q->mq_ops) { - /* - * All we need to ensure is that timeout scan takes place - * immediately and that scan sees the new timeout value. - * No need for fancy synchronizations. - */ - blk_rq_set_deadline(req, jiffies); - kblockd_schedule_work(&req->q->timeout_work); - } else { - if (blk_mark_rq_complete(req)) - return; - blk_delete_timer(req); - blk_rq_timed_out(req); - } + /* + * All we need to ensure is that timeout scan takes place + * immediately and that scan sees the new timeout value. + * No need for fancy synchronizations. + */ + WRITE_ONCE(req->deadline, jiffies); + kblockd_schedule_work(&req->q->timeout_work); } EXPORT_SYMBOL_GPL(blk_abort_request); @@ -194,15 +113,6 @@ void blk_add_timer(struct request *req) struct request_queue *q = req->q; unsigned long expiry; - if (!q->mq_ops) - lockdep_assert_held(q->queue_lock); - - /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ - if (!q->mq_ops && !q->rq_timed_out_fn) - return; - - BUG_ON(!list_empty(&req->timeout_list)); - /* * Some LLDs, like scsi, peek at the timeout to prevent a * command from being retried forever. @@ -211,21 +121,16 @@ void blk_add_timer(struct request *req) req->timeout = q->rq_timeout; req->rq_flags &= ~RQF_TIMED_OUT; - blk_rq_set_deadline(req, jiffies + req->timeout); - /* - * Only the non-mq case needs to add the request to a protected list. - * For the mq case we simply scan the tag map. - */ - if (!q->mq_ops) - list_add_tail(&req->timeout_list, &req->q->timeout_list); + expiry = jiffies + req->timeout; + WRITE_ONCE(req->deadline, expiry); /* * If the timer isn't already pending or this timeout is earlier * than an existing one, modify the timer. Round up to next nearest * second. */ - expiry = blk_rq_timeout(round_jiffies_up(blk_rq_deadline(req))); + expiry = blk_rq_timeout(round_jiffies_up(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 8ac93fcbaa2e..f0c56649775f 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -489,31 +489,21 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) } struct wbt_wait_data { - struct wait_queue_entry wq; - struct task_struct *task; struct rq_wb *rwb; - struct rq_wait *rqw; + enum wbt_flags wb_acct; unsigned long rw; - bool got_token; }; -static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, - int wake_flags, void *key) +static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) { - struct wbt_wait_data *data = container_of(curr, struct wbt_wait_data, - wq); - - /* - * If we fail to get a budget, return -1 to interrupt the wake up - * loop in __wake_up_common. - */ - if (!rq_wait_inc_below(data->rqw, get_limit(data->rwb, data->rw))) - return -1; + struct wbt_wait_data *data = private_data; + return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw)); +} - data->got_token = true; - list_del_init(&curr->entry); - wake_up_process(data->task); - return 1; +static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) +{ + struct wbt_wait_data *data = private_data; + wbt_rqw_done(data->rwb, rqw, data->wb_acct); } /* @@ -521,57 +511,16 @@ static int wbt_wake_function(struct wait_queue_entry *curr, unsigned int mode, * the timer to kick off queuing again. */ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, - unsigned long rw, spinlock_t *lock) - __releases(lock) - __acquires(lock) + unsigned long rw) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { - .wq = { - .func = wbt_wake_function, - .entry = LIST_HEAD_INIT(data.wq.entry), - }, - .task = current, .rwb = rwb, - .rqw = rqw, + .wb_acct = wb_acct, .rw = rw, }; - bool has_sleeper; - - has_sleeper = wq_has_sleeper(&rqw->wait); - if (!has_sleeper && rq_wait_inc_below(rqw, get_limit(rwb, rw))) - return; - prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); - do { - if (data.got_token) - break; - - if (!has_sleeper && - rq_wait_inc_below(rqw, get_limit(rwb, rw))) { - finish_wait(&rqw->wait, &data.wq); - - /* - * We raced with wbt_wake_function() getting a token, - * which means we now have two. Put our local token - * and wake anyone else potentially waiting for one. - */ - if (data.got_token) - wbt_rqw_done(rwb, rqw, wb_acct); - break; - } - - if (lock) { - spin_unlock_irq(lock); - io_schedule(); - spin_lock_irq(lock); - } else - io_schedule(); - - has_sleeper = false; - } while (1); - - finish_wait(&rqw->wait, &data.wq); + rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) @@ -624,7 +573,7 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) * in an irq held spinlock, if it holds one when calling this function. * If we do sleep, we'll release and re-grab it. */ -static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) +static void wbt_wait(struct rq_qos *rqos, struct bio *bio) { struct rq_wb *rwb = RQWB(rqos); enum wbt_flags flags; @@ -636,7 +585,7 @@ static void wbt_wait(struct rq_qos *rqos, struct bio *bio, spinlock_t *lock) return; } - __wbt_wait(rwb, flags, bio->bi_opf, lock); + __wbt_wait(rwb, flags, bio->bi_opf); if (!blk_stat_is_active(rwb->cb)) rwb_arm_timer(rwb); @@ -709,8 +658,7 @@ void wbt_enable_default(struct request_queue *q) if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) return; - if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) || - (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ))) + if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) wbt_init(q); } EXPORT_SYMBOL_GPL(wbt_enable_default); @@ -760,11 +708,100 @@ void wbt_disable_default(struct request_queue *q) if (!rqos) return; rwb = RQWB(rqos); - if (rwb->enable_state == WBT_STATE_ON_DEFAULT) + if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { + blk_stat_deactivate(rwb->cb); rwb->wb_normal = 0; + } } EXPORT_SYMBOL_GPL(wbt_disable_default); +#ifdef CONFIG_BLK_DEBUG_FS +static int wbt_curr_win_nsec_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%llu\n", rwb->cur_win_nsec); + return 0; +} + +static int wbt_enabled_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%d\n", rwb->enable_state); + return 0; +} + +static int wbt_id_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + + seq_printf(m, "%u\n", rqos->id); + return 0; +} + +static int wbt_inflight_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + int i; + + for (i = 0; i < WBT_NUM_RWQ; i++) + seq_printf(m, "%d: inflight %d\n", i, + atomic_read(&rwb->rq_wait[i].inflight)); + return 0; +} + +static int wbt_min_lat_nsec_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%lu\n", rwb->min_lat_nsec); + return 0; +} + +static int wbt_unknown_cnt_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%u\n", rwb->unknown_cnt); + return 0; +} + +static int wbt_normal_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%u\n", rwb->wb_normal); + return 0; +} + +static int wbt_background_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%u\n", rwb->wb_background); + return 0; +} + +static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { + {"curr_win_nsec", 0400, wbt_curr_win_nsec_show}, + {"enabled", 0400, wbt_enabled_show}, + {"id", 0400, wbt_id_show}, + {"inflight", 0400, wbt_inflight_show}, + {"min_lat_nsec", 0400, wbt_min_lat_nsec_show}, + {"unknown_cnt", 0400, wbt_unknown_cnt_show}, + {"wb_normal", 0400, wbt_normal_show}, + {"wb_background", 0400, wbt_background_show}, + {}, +}; +#endif static struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, @@ -774,6 +811,9 @@ static struct rq_qos_ops wbt_rqos_ops = { .done = wbt_done, .cleanup = wbt_cleanup, .exit = wbt_exit, +#ifdef CONFIG_BLK_DEBUG_FS + .debugfs_attrs = wbt_debugfs_attrs, +#endif }; int wbt_init(struct request_queue *q) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 13ba2011a306..2d98803faec2 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -378,7 +378,7 @@ static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones) struct page *page; int order; - for (order = get_order(size); order > 0; order--) { + for (order = get_order(size); order >= 0; order--) { page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order); if (page) { *nr_zones = min_t(unsigned int, *nr_zones, @@ -421,7 +421,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk) * BIO based queues do not use a scheduler so only q->nr_zones * needs to be updated so that the sysfs exposed value is correct. */ - if (!queue_is_rq_based(q)) { + if (!queue_is_mq(q)) { q->nr_zones = nr_zones; return 0; } diff --git a/block/blk.h b/block/blk.h index a1841b8ff129..848278c52030 100644 --- a/block/blk.h +++ b/block/blk.h @@ -7,12 +7,6 @@ #include <xen/xen.h> #include "blk-mq.h" -/* Amount of time in which a process may batch requests */ -#define BLK_BATCH_TIME (HZ/50UL) - -/* Number of requests a "batching" process may submit */ -#define BLK_BATCH_REQ 32 - /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) @@ -38,85 +32,13 @@ struct blk_flush_queue { }; extern struct kmem_cache *blk_requestq_cachep; -extern struct kmem_cache *request_cachep; extern struct kobj_type blk_queue_ktype; extern struct ida blk_queue_ida; -/* - * @q->queue_lock is set while a queue is being initialized. Since we know - * that no other threads access the queue object before @q->queue_lock has - * been set, it is safe to manipulate queue flags without holding the - * queue_lock if @q->queue_lock == NULL. See also blk_alloc_queue_node() and - * blk_init_allocated_queue(). - */ -static inline void queue_lockdep_assert_held(struct request_queue *q) -{ - if (q->queue_lock) - lockdep_assert_held(q->queue_lock); -} - -static inline void queue_flag_set_unlocked(unsigned int flag, - struct request_queue *q) -{ - if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) && - kref_read(&q->kobj.kref)) - lockdep_assert_held(q->queue_lock); - __set_bit(flag, &q->queue_flags); -} - -static inline void queue_flag_clear_unlocked(unsigned int flag, - struct request_queue *q) -{ - if (test_bit(QUEUE_FLAG_INIT_DONE, &q->queue_flags) && - kref_read(&q->kobj.kref)) - lockdep_assert_held(q->queue_lock); - __clear_bit(flag, &q->queue_flags); -} - -static inline int queue_flag_test_and_clear(unsigned int flag, - struct request_queue *q) -{ - queue_lockdep_assert_held(q); - - if (test_bit(flag, &q->queue_flags)) { - __clear_bit(flag, &q->queue_flags); - return 1; - } - - return 0; -} - -static inline int queue_flag_test_and_set(unsigned int flag, - struct request_queue *q) +static inline struct blk_flush_queue * +blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) { - queue_lockdep_assert_held(q); - - if (!test_bit(flag, &q->queue_flags)) { - __set_bit(flag, &q->queue_flags); - return 0; - } - - return 1; -} - -static inline void queue_flag_set(unsigned int flag, struct request_queue *q) -{ - queue_lockdep_assert_held(q); - __set_bit(flag, &q->queue_flags); -} - -static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) -{ - queue_lockdep_assert_held(q); - __clear_bit(flag, &q->queue_flags); -} - -static inline struct blk_flush_queue *blk_get_flush_queue( - struct request_queue *q, struct blk_mq_ctx *ctx) -{ - if (q->mq_ops) - return blk_mq_map_queue(q, ctx->cpu)->fq; - return q->fq; + return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx->cpu)->fq; } static inline void __blk_get_queue(struct request_queue *q) @@ -128,15 +50,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, int node, int cmd_size, gfp_t flags); void blk_free_flush_queue(struct blk_flush_queue *q); -int blk_init_rl(struct request_list *rl, struct request_queue *q, - gfp_t gfp_mask); -void blk_exit_rl(struct request_queue *q, struct request_list *rl); void blk_exit_queue(struct request_queue *q); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); -void blk_queue_bypass_start(struct request_queue *q); -void blk_queue_bypass_end(struct request_queue *q); -void __blk_queue_free_tags(struct request_queue *q); void blk_freeze_queue(struct request_queue *q); static inline void blk_queue_enter_live(struct request_queue *q) @@ -169,7 +85,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, static inline bool __bvec_gap_to_prev(struct request_queue *q, struct bio_vec *bprv, unsigned int offset) { - return offset || + return (offset & queue_virt_boundary(q)) || ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); } @@ -235,11 +151,8 @@ static inline bool bio_integrity_endio(struct bio *bio) } #endif /* CONFIG_BLK_DEV_INTEGRITY */ -void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -void blk_delete_timer(struct request *); - bool bio_attempt_front_merge(struct request_queue *q, struct request *req, struct bio *bio); @@ -248,58 +161,19 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req, bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int *request_count, struct request **same_queue_rq); -unsigned int blk_plug_queued_count(struct request_queue *q); void blk_account_io_start(struct request *req, bool new_io); void blk_account_io_completion(struct request *req, unsigned int bytes); void blk_account_io_done(struct request *req, u64 now); /* - * EH timer and IO completion will both attempt to 'grab' the request, make - * sure that only one of them succeeds. Steal the bottom bit of the - * __deadline field for this. - */ -static inline int blk_mark_rq_complete(struct request *rq) -{ - return test_and_set_bit(0, &rq->__deadline); -} - -static inline void blk_clear_rq_complete(struct request *rq) -{ - clear_bit(0, &rq->__deadline); -} - -static inline bool blk_rq_is_complete(struct request *rq) -{ - return test_bit(0, &rq->__deadline); -} - -/* * Internal elevator interface */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) void blk_insert_flush(struct request *rq); -static inline void elv_activate_rq(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (e->type->ops.sq.elevator_activate_req_fn) - e->type->ops.sq.elevator_activate_req_fn(q, rq); -} - -static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (e->type->ops.sq.elevator_deactivate_req_fn) - e->type->ops.sq.elevator_deactivate_req_fn(q, rq); -} - -int elevator_init(struct request_queue *); int elevator_init_mq(struct request_queue *q); int elevator_switch_mq(struct request_queue *q, struct elevator_type *new_e); @@ -334,31 +208,8 @@ void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); -void blk_queue_congestion_threshold(struct request_queue *q); - int blk_dev_init(void); - -/* - * Return the threshold (number of used requests) at which the queue is - * considered to be congested. It include a little hysteresis to keep the - * context switch rate down. - */ -static inline int queue_congestion_on_threshold(struct request_queue *q) -{ - return q->nr_congestion_on; -} - -/* - * The threshold at which a queue is considered to be uncongested - */ -static inline int queue_congestion_off_threshold(struct request_queue *q) -{ - return q->nr_congestion_off; -} - -extern int blk_update_nr_requests(struct request_queue *, unsigned int); - /* * Contribute to IO statistics IFF: * @@ -381,18 +232,13 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req) } /* - * Steal a bit from this field for legacy IO path atomic IO marking. Note that - * setting the deadline clears the bottom bit, potentially clearing the - * completed bit. The user has to be OK with this (current ones are fine). + * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size + * is defined as 'unsigned int', meantime it has to aligned to with logical + * block size which is the minimum accepted unit by hardware. */ -static inline void blk_rq_set_deadline(struct request *rq, unsigned long time) -{ - rq->__deadline = time & ~0x1UL; -} - -static inline unsigned long blk_rq_deadline(struct request *rq) +static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) { - return rq->__deadline & ~0x1UL; + return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9; } /* @@ -407,22 +253,6 @@ void ioc_clear_queue(struct request_queue *q); int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); /** - * rq_ioc - determine io_context for request allocation - * @bio: request being allocated is for this bio (can be %NULL) - * - * Determine io_context to use for request allocation for @bio. May return - * %NULL if %current->io_context doesn't exist. - */ -static inline struct io_context *rq_ioc(struct bio *bio) -{ -#ifdef CONFIG_BLK_CGROUP - if (bio && bio->bi_ioc) - return bio->bi_ioc; -#endif - return current->io_context; -} - -/** * create_io_context - try to create task->io_context * @gfp_mask: allocation mask * @node: allocation node @@ -480,8 +310,6 @@ static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) } #endif /* CONFIG_BOUNCE */ -extern void blk_drain_queue(struct request_queue *q); - #ifdef CONFIG_BLK_CGROUP_IOLATENCY extern int blk_iolatency_init(struct request_queue *q); #else diff --git a/block/bounce.c b/block/bounce.c index 36869afc258c..ffb9e9ecfa7e 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -248,6 +248,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, return NULL; bio->bi_disk = bio_src->bi_disk; bio->bi_opf = bio_src->bi_opf; + bio->bi_ioprio = bio_src->bi_ioprio; bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -276,7 +277,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, } } - bio_clone_blkcg_association(bio, bio_src); + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); return bio; } diff --git a/block/bsg-lib.c b/block/bsg-lib.c index f3501cdaf1a6..192129856342 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -21,7 +21,7 @@ * */ #include <linux/slab.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/delay.h> #include <linux/scatterlist.h> #include <linux/bsg-lib.h> @@ -31,6 +31,12 @@ #define uptr64(val) ((void __user *)(uintptr_t)(val)) +struct bsg_set { + struct blk_mq_tag_set tag_set; + bsg_job_fn *job_fn; + bsg_timeout_fn *timeout_fn; +}; + static int bsg_transport_check_proto(struct sg_io_v4 *hdr) { if (hdr->protocol != BSG_PROTOCOL_SCSI || @@ -129,7 +135,7 @@ static void bsg_teardown_job(struct kref *kref) kfree(job->request_payload.sg_list); kfree(job->reply_payload.sg_list); - blk_end_request_all(rq, BLK_STS_OK); + blk_mq_end_request(rq, BLK_STS_OK); } void bsg_job_put(struct bsg_job *job) @@ -157,15 +163,15 @@ void bsg_job_done(struct bsg_job *job, int result, { job->result = result; job->reply_payload_rcv_len = reply_payload_rcv_len; - blk_complete_request(blk_mq_rq_from_pdu(job)); + blk_mq_complete_request(blk_mq_rq_from_pdu(job)); } EXPORT_SYMBOL_GPL(bsg_job_done); /** - * bsg_softirq_done - softirq done routine for destroying the bsg requests + * bsg_complete - softirq done routine for destroying the bsg requests * @rq: BSG request that holds the job to be destroyed */ -static void bsg_softirq_done(struct request *rq) +static void bsg_complete(struct request *rq) { struct bsg_job *job = blk_mq_rq_to_pdu(rq); @@ -224,54 +230,48 @@ failjob_rls_job: } /** - * bsg_request_fn - generic handler for bsg requests - * @q: request queue to manage + * bsg_queue_rq - generic handler for bsg requests + * @hctx: hardware queue + * @bd: queue data * * On error the create_bsg_job function should return a -Exyz error value * that will be set to ->result. * * Drivers/subsys should pass this to the queue init function. */ -static void bsg_request_fn(struct request_queue *q) - __releases(q->queue_lock) - __acquires(q->queue_lock) +static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request_queue *q = hctx->queue; struct device *dev = q->queuedata; - struct request *req; + struct request *req = bd->rq; + struct bsg_set *bset = + container_of(q->tag_set, struct bsg_set, tag_set); int ret; + blk_mq_start_request(req); + if (!get_device(dev)) - return; - - while (1) { - req = blk_fetch_request(q); - if (!req) - break; - spin_unlock_irq(q->queue_lock); - - if (!bsg_prepare_job(dev, req)) { - blk_end_request_all(req, BLK_STS_OK); - spin_lock_irq(q->queue_lock); - continue; - } - - ret = q->bsg_job_fn(blk_mq_rq_to_pdu(req)); - spin_lock_irq(q->queue_lock); - if (ret) - break; - } + return BLK_STS_IOERR; + + if (!bsg_prepare_job(dev, req)) + return BLK_STS_IOERR; + + ret = bset->job_fn(blk_mq_rq_to_pdu(req)); + if (ret) + return BLK_STS_IOERR; - spin_unlock_irq(q->queue_lock); put_device(dev); - spin_lock_irq(q->queue_lock); + return BLK_STS_OK; } /* called right after the request is allocated for the request_queue */ -static int bsg_init_rq(struct request_queue *q, struct request *req, gfp_t gfp) +static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) { struct bsg_job *job = blk_mq_rq_to_pdu(req); - job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, gfp); + job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); if (!job->reply) return -ENOMEM; return 0; @@ -289,13 +289,47 @@ static void bsg_initialize_rq(struct request *req) job->dd_data = job + 1; } -static void bsg_exit_rq(struct request_queue *q, struct request *req) +static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx) { struct bsg_job *job = blk_mq_rq_to_pdu(req); kfree(job->reply); } +void bsg_remove_queue(struct request_queue *q) +{ + if (q) { + struct bsg_set *bset = + container_of(q->tag_set, struct bsg_set, tag_set); + + bsg_unregister_queue(q); + blk_cleanup_queue(q); + blk_mq_free_tag_set(&bset->tag_set); + kfree(bset); + } +} +EXPORT_SYMBOL_GPL(bsg_remove_queue); + +static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved) +{ + struct bsg_set *bset = + container_of(rq->q->tag_set, struct bsg_set, tag_set); + + if (!bset->timeout_fn) + return BLK_EH_DONE; + return bset->timeout_fn(rq); +} + +static const struct blk_mq_ops bsg_mq_ops = { + .queue_rq = bsg_queue_rq, + .init_request = bsg_init_rq, + .exit_request = bsg_exit_rq, + .initialize_rq_fn = bsg_initialize_rq, + .complete = bsg_complete, + .timeout = bsg_timeout, +}; + /** * bsg_setup_queue - Create and add the bsg hooks so we can receive requests * @dev: device to attach bsg device to @@ -304,28 +338,38 @@ static void bsg_exit_rq(struct request_queue *q, struct request *req) * @dd_job_size: size of LLD data needed for each job */ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, - bsg_job_fn *job_fn, int dd_job_size) + bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size) { + struct bsg_set *bset; + struct blk_mq_tag_set *set; struct request_queue *q; - int ret; + int ret = -ENOMEM; - q = blk_alloc_queue(GFP_KERNEL); - if (!q) + bset = kzalloc(sizeof(*bset), GFP_KERNEL); + if (!bset) return ERR_PTR(-ENOMEM); - q->cmd_size = sizeof(struct bsg_job) + dd_job_size; - q->init_rq_fn = bsg_init_rq; - q->exit_rq_fn = bsg_exit_rq; - q->initialize_rq_fn = bsg_initialize_rq; - q->request_fn = bsg_request_fn; - ret = blk_init_allocated_queue(q); - if (ret) - goto out_cleanup_queue; + bset->job_fn = job_fn; + bset->timeout_fn = timeout; + + set = &bset->tag_set; + set->ops = &bsg_mq_ops, + set->nr_hw_queues = 1; + set->queue_depth = 128; + set->numa_node = NUMA_NO_NODE; + set->cmd_size = sizeof(struct bsg_job) + dd_job_size; + set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING; + if (blk_mq_alloc_tag_set(set)) + goto out_tag_set; + + q = blk_mq_init_queue(set); + if (IS_ERR(q)) { + ret = PTR_ERR(q); + goto out_queue; + } q->queuedata = dev; - q->bsg_job_fn = job_fn; blk_queue_flag_set(QUEUE_FLAG_BIDI, q); - blk_queue_softirq_done(q, bsg_softirq_done); blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); @@ -338,6 +382,10 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, return q; out_cleanup_queue: blk_cleanup_queue(q); +out_queue: + blk_mq_free_tag_set(set); +out_tag_set: + kfree(bset); return ERR_PTR(ret); } EXPORT_SYMBOL_GPL(bsg_setup_queue); diff --git a/block/bsg.c b/block/bsg.c index 9a442c23a715..44f6028b9567 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -471,7 +471,7 @@ int bsg_register_queue(struct request_queue *q, struct device *parent, /* * we need a proper transport to send commands, not a stacked device */ - if (!queue_is_rq_based(q)) + if (!queue_is_mq(q)) return 0; bcd = &q->bsg_dev; diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c deleted file mode 100644 index ed41aa978c4a..000000000000 --- a/block/cfq-iosched.c +++ /dev/null @@ -1,4916 +0,0 @@ -/* - * CFQ, or complete fairness queueing, disk scheduler. - * - * Based on ideas from a previously unfinished io - * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. - * - * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> - */ -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/sched/clock.h> -#include <linux/blkdev.h> -#include <linux/elevator.h> -#include <linux/ktime.h> -#include <linux/rbtree.h> -#include <linux/ioprio.h> -#include <linux/blktrace_api.h> -#include <linux/blk-cgroup.h> -#include "blk.h" -#include "blk-wbt.h" - -/* - * tunables - */ -/* max queue in one round of service */ -static const int cfq_quantum = 8; -static const u64 cfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; -/* maximum backwards seek, in KiB */ -static const int cfq_back_max = 16 * 1024; -/* penalty of a backwards seek */ -static const int cfq_back_penalty = 2; -static const u64 cfq_slice_sync = NSEC_PER_SEC / 10; -static u64 cfq_slice_async = NSEC_PER_SEC / 25; -static const int cfq_slice_async_rq = 2; -static u64 cfq_slice_idle = NSEC_PER_SEC / 125; -static u64 cfq_group_idle = NSEC_PER_SEC / 125; -static const u64 cfq_target_latency = (u64)NSEC_PER_SEC * 3/10; /* 300 ms */ -static const int cfq_hist_divisor = 4; - -/* - * offset from end of queue service tree for idle class - */ -#define CFQ_IDLE_DELAY (NSEC_PER_SEC / 5) -/* offset from end of group service tree under time slice mode */ -#define CFQ_SLICE_MODE_GROUP_DELAY (NSEC_PER_SEC / 5) -/* offset from end of group service under IOPS mode */ -#define CFQ_IOPS_MODE_GROUP_DELAY (HZ / 5) - -/* - * below this threshold, we consider thinktime immediate - */ -#define CFQ_MIN_TT (2 * NSEC_PER_SEC / HZ) - -#define CFQ_SLICE_SCALE (5) -#define CFQ_HW_QUEUE_MIN (5) -#define CFQ_SERVICE_SHIFT 12 - -#define CFQQ_SEEK_THR (sector_t)(8 * 100) -#define CFQQ_CLOSE_THR (sector_t)(8 * 1024) -#define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -#define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) - -#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq) -#define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elv.priv[0]) -#define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elv.priv[1]) - -static struct kmem_cache *cfq_pool; - -#define CFQ_PRIO_LISTS IOPRIO_BE_NR -#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) - -#define sample_valid(samples) ((samples) > 80) -#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) - -/* blkio-related constants */ -#define CFQ_WEIGHT_LEGACY_MIN 10 -#define CFQ_WEIGHT_LEGACY_DFL 500 -#define CFQ_WEIGHT_LEGACY_MAX 1000 - -struct cfq_ttime { - u64 last_end_request; - - u64 ttime_total; - u64 ttime_mean; - unsigned long ttime_samples; -}; - -/* - * Most of our rbtree usage is for sorting with min extraction, so - * if we cache the leftmost node we don't have to walk down the tree - * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should - * move this into the elevator for the rq sorting as well. - */ -struct cfq_rb_root { - struct rb_root_cached rb; - struct rb_node *rb_rightmost; - unsigned count; - u64 min_vdisktime; - struct cfq_ttime ttime; -}; -#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT_CACHED, \ - .rb_rightmost = NULL, \ - .ttime = {.last_end_request = ktime_get_ns(),},} - -/* - * Per process-grouping structure - */ -struct cfq_queue { - /* reference count */ - int ref; - /* various state flags, see below */ - unsigned int flags; - /* parent cfq_data */ - struct cfq_data *cfqd; - /* service_tree member */ - struct rb_node rb_node; - /* service_tree key */ - u64 rb_key; - /* prio tree member */ - struct rb_node p_node; - /* prio tree root we belong to, if any */ - struct rb_root *p_root; - /* sorted list of pending requests */ - struct rb_root sort_list; - /* if fifo isn't expired, next request to serve */ - struct request *next_rq; - /* requests queued in sort_list */ - int queued[2]; - /* currently allocated requests */ - int allocated[2]; - /* fifo list of requests in sort_list */ - struct list_head fifo; - - /* time when queue got scheduled in to dispatch first request. */ - u64 dispatch_start; - u64 allocated_slice; - u64 slice_dispatch; - /* time when first request from queue completed and slice started. */ - u64 slice_start; - u64 slice_end; - s64 slice_resid; - - /* pending priority requests */ - int prio_pending; - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; - - /* io prio of this group */ - unsigned short ioprio, org_ioprio; - unsigned short ioprio_class, org_ioprio_class; - - pid_t pid; - - u32 seek_history; - sector_t last_request_pos; - - struct cfq_rb_root *service_tree; - struct cfq_queue *new_cfqq; - struct cfq_group *cfqg; - /* Number of sectors dispatched from queue in single dispatch round */ - unsigned long nr_sectors; -}; - -/* - * First index in the service_trees. - * IDLE is handled separately, so it has negative index - */ -enum wl_class_t { - BE_WORKLOAD = 0, - RT_WORKLOAD = 1, - IDLE_WORKLOAD = 2, - CFQ_PRIO_NR, -}; - -/* - * Second index in the service_trees. - */ -enum wl_type_t { - ASYNC_WORKLOAD = 0, - SYNC_NOIDLE_WORKLOAD = 1, - SYNC_WORKLOAD = 2 -}; - -struct cfqg_stats { -#ifdef CONFIG_CFQ_GROUP_IOSCHED - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ - struct blkg_rwstat service_time; - /* total time spent waiting in scheduler queue in ns */ - struct blkg_rwstat wait_time; - /* number of IOs queued up */ - struct blkg_rwstat queued; - /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; -#ifdef CONFIG_DEBUG_BLK_CGROUP - /* time not charged to this cgroup */ - struct blkg_stat unaccounted_time; - /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; - /* count of samples taken for average */ - struct blkg_stat avg_queue_size_samples; - /* how many times this group has been removed from service tree */ - struct blkg_stat dequeue; - /* total time spent waiting for it to be assigned a timeslice. */ - struct blkg_stat group_wait_time; - /* time spent idling for this blkcg_gq */ - struct blkg_stat idle_time; - /* total time with empty current active q with other requests queued */ - struct blkg_stat empty_time; - /* fields after this shouldn't be cleared on stat reset */ - u64 start_group_wait_time; - u64 start_idle_time; - u64 start_empty_time; - uint16_t flags; -#endif /* CONFIG_DEBUG_BLK_CGROUP */ -#endif /* CONFIG_CFQ_GROUP_IOSCHED */ -}; - -/* Per-cgroup data */ -struct cfq_group_data { - /* must be the first member */ - struct blkcg_policy_data cpd; - - unsigned int weight; - unsigned int leaf_weight; -}; - -/* This is per cgroup per device grouping structure */ -struct cfq_group { - /* must be the first member */ - struct blkg_policy_data pd; - - /* group service_tree member */ - struct rb_node rb_node; - - /* group service_tree key */ - u64 vdisktime; - - /* - * The number of active cfqgs and sum of their weights under this - * cfqg. This covers this cfqg's leaf_weight and all children's - * weights, but does not cover weights of further descendants. - * - * If a cfqg is on the service tree, it's active. An active cfqg - * also activates its parent and contributes to the children_weight - * of the parent. - */ - int nr_active; - unsigned int children_weight; - - /* - * vfraction is the fraction of vdisktime that the tasks in this - * cfqg are entitled to. This is determined by compounding the - * ratios walking up from this cfqg to the root. - * - * It is in fixed point w/ CFQ_SERVICE_SHIFT and the sum of all - * vfractions on a service tree is approximately 1. The sum may - * deviate a bit due to rounding errors and fluctuations caused by - * cfqgs entering and leaving the service tree. - */ - unsigned int vfraction; - - /* - * There are two weights - (internal) weight is the weight of this - * cfqg against the sibling cfqgs. leaf_weight is the wight of - * this cfqg against the child cfqgs. For the root cfqg, both - * weights are kept in sync for backward compatibility. - */ - unsigned int weight; - unsigned int new_weight; - unsigned int dev_weight; - - unsigned int leaf_weight; - unsigned int new_leaf_weight; - unsigned int dev_leaf_weight; - - /* number of cfqq currently on this group */ - int nr_cfqq; - - /* - * Per group busy queues average. Useful for workload slice calc. We - * create the array for each prio class but at run time it is used - * only for RT and BE class and slot for IDLE class remains unused. - * This is primarily done to avoid confusion and a gcc warning. - */ - unsigned int busy_queues_avg[CFQ_PRIO_NR]; - /* - * rr lists of queues with requests. We maintain service trees for - * RT and BE classes. These trees are subdivided in subclasses - * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE - * class there is no subclassification and all the cfq queues go on - * a single tree service_tree_idle. - * Counts are embedded in the cfq_rb_root - */ - struct cfq_rb_root service_trees[2][3]; - struct cfq_rb_root service_tree_idle; - - u64 saved_wl_slice; - enum wl_type_t saved_wl_type; - enum wl_class_t saved_wl_class; - - /* number of requests that are on the dispatch list or inside driver */ - int dispatched; - struct cfq_ttime ttime; - struct cfqg_stats stats; /* stats for this cfqg */ - - /* async queue for each priority case */ - struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; - struct cfq_queue *async_idle_cfqq; - -}; - -struct cfq_io_cq { - struct io_cq icq; /* must be the first member */ - struct cfq_queue *cfqq[2]; - struct cfq_ttime ttime; - int ioprio; /* the current ioprio */ -#ifdef CONFIG_CFQ_GROUP_IOSCHED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ -#endif -}; - -/* - * Per block device queue structure - */ -struct cfq_data { - struct request_queue *queue; - /* Root service tree for cfq_groups */ - struct cfq_rb_root grp_service_tree; - struct cfq_group *root_group; - - /* - * The priority currently being served - */ - enum wl_class_t serving_wl_class; - enum wl_type_t serving_wl_type; - u64 workload_expires; - struct cfq_group *serving_group; - - /* - * Each priority tree is sorted by next_request position. These - * trees are used when determining if two or more queues are - * interleaving requests (see cfq_close_cooperator). - */ - struct rb_root prio_trees[CFQ_PRIO_LISTS]; - - unsigned int busy_queues; - unsigned int busy_sync_queues; - - int rq_in_driver; - int rq_in_flight[2]; - - /* - * queue-depth detection - */ - int rq_queued; - int hw_tag; - /* - * hw_tag can be - * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) - * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) - * 0 => no NCQ - */ - int hw_tag_est_depth; - unsigned int hw_tag_samples; - - /* - * idle window management - */ - struct hrtimer idle_slice_timer; - struct work_struct unplug_work; - - struct cfq_queue *active_queue; - struct cfq_io_cq *active_cic; - - sector_t last_position; - - /* - * tunables, see top of file - */ - unsigned int cfq_quantum; - unsigned int cfq_back_penalty; - unsigned int cfq_back_max; - unsigned int cfq_slice_async_rq; - unsigned int cfq_latency; - u64 cfq_fifo_expire[2]; - u64 cfq_slice[2]; - u64 cfq_slice_idle; - u64 cfq_group_idle; - u64 cfq_target_latency; - - /* - * Fallback dummy cfqq for extreme OOM conditions - */ - struct cfq_queue oom_cfqq; - - u64 last_delayed_sync; -}; - -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); -static void cfq_put_queue(struct cfq_queue *cfqq); - -static struct cfq_rb_root *st_for(struct cfq_group *cfqg, - enum wl_class_t class, - enum wl_type_t type) -{ - if (!cfqg) - return NULL; - - if (class == IDLE_WORKLOAD) - return &cfqg->service_tree_idle; - - return &cfqg->service_trees[class][type]; -} - -enum cfqq_state_flags { - CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ - CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ - CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ - CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ - CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ - CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ - CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ - CFQ_CFQQ_FLAG_sync, /* synchronous queue */ - CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ - CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ - CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ - CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ -}; - -#define CFQ_CFQQ_FNS(name) \ -static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ -{ \ - (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ -} \ -static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ -{ \ - (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ -} \ -static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ -{ \ - return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ -} - -CFQ_CFQQ_FNS(on_rr); -CFQ_CFQQ_FNS(wait_request); -CFQ_CFQQ_FNS(must_dispatch); -CFQ_CFQQ_FNS(must_alloc_slice); -CFQ_CFQQ_FNS(fifo_expire); -CFQ_CFQQ_FNS(idle_window); -CFQ_CFQQ_FNS(prio_changed); -CFQ_CFQQ_FNS(slice_new); -CFQ_CFQQ_FNS(sync); -CFQ_CFQQ_FNS(coop); -CFQ_CFQQ_FNS(split_coop); -CFQ_CFQQ_FNS(deep); -CFQ_CFQQ_FNS(wait_busy); -#undef CFQ_CFQQ_FNS - -#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP) - -/* cfqg stats flags */ -enum cfqg_stats_flags { - CFQG_stats_waiting = 0, - CFQG_stats_idling, - CFQG_stats_empty, -}; - -#define CFQG_FLAG_FNS(name) \ -static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \ -{ \ - stats->flags |= (1 << CFQG_stats_##name); \ -} \ -static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \ -{ \ - stats->flags &= ~(1 << CFQG_stats_##name); \ -} \ -static inline int cfqg_stats_##name(struct cfqg_stats *stats) \ -{ \ - return (stats->flags & (1 << CFQG_stats_##name)) != 0; \ -} \ - -CFQG_FLAG_FNS(waiting) -CFQG_FLAG_FNS(idling) -CFQG_FLAG_FNS(empty) -#undef CFQG_FLAG_FNS - -/* This should be called with the queue_lock held. */ -static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats) -{ - u64 now; - - if (!cfqg_stats_waiting(stats)) - return; - - now = ktime_get_ns(); - if (now > stats->start_group_wait_time) - blkg_stat_add(&stats->group_wait_time, - now - stats->start_group_wait_time); - cfqg_stats_clear_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - if (cfqg_stats_waiting(stats)) - return; - if (cfqg == curr_cfqg) - return; - stats->start_group_wait_time = ktime_get_ns(); - cfqg_stats_mark_waiting(stats); -} - -/* This should be called with the queue_lock held. */ -static void cfqg_stats_end_empty_time(struct cfqg_stats *stats) -{ - u64 now; - - if (!cfqg_stats_empty(stats)) - return; - - now = ktime_get_ns(); - if (now > stats->start_empty_time) - blkg_stat_add(&stats->empty_time, - now - stats->start_empty_time); - cfqg_stats_clear_empty(stats); -} - -static void cfqg_stats_update_dequeue(struct cfq_group *cfqg) -{ - blkg_stat_add(&cfqg->stats.dequeue, 1); -} - -static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - if (blkg_rwstat_total(&stats->queued)) - return; - - /* - * group is already marked empty. This can happen if cfqq got new - * request in parent group and moved to this group while being added - * to service tree. Just ignore the event and move on. - */ - if (cfqg_stats_empty(stats)) - return; - - stats->start_empty_time = ktime_get_ns(); - cfqg_stats_mark_empty(stats); -} - -static void cfqg_stats_update_idle_time(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - if (cfqg_stats_idling(stats)) { - u64 now = ktime_get_ns(); - - if (now > stats->start_idle_time) - blkg_stat_add(&stats->idle_time, - now - stats->start_idle_time); - cfqg_stats_clear_idling(stats); - } -} - -static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - BUG_ON(cfqg_stats_idling(stats)); - - stats->start_idle_time = ktime_get_ns(); - cfqg_stats_mark_idling(stats); -} - -static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) -{ - struct cfqg_stats *stats = &cfqg->stats; - - blkg_stat_add(&stats->avg_queue_size_sum, - blkg_rwstat_total(&stats->queued)); - blkg_stat_add(&stats->avg_queue_size_samples, 1); - cfqg_stats_update_group_wait_time(stats); -} - -#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ - -static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { } -static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { } -static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { } -static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { } -static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { } -static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { } -static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { } - -#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */ - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - -static inline struct cfq_group *pd_to_cfqg(struct blkg_policy_data *pd) -{ - return pd ? container_of(pd, struct cfq_group, pd) : NULL; -} - -static struct cfq_group_data -*cpd_to_cfqgd(struct blkcg_policy_data *cpd) -{ - return cpd ? container_of(cpd, struct cfq_group_data, cpd) : NULL; -} - -static inline struct blkcg_gq *cfqg_to_blkg(struct cfq_group *cfqg) -{ - return pd_to_blkg(&cfqg->pd); -} - -static struct blkcg_policy blkcg_policy_cfq; - -static inline struct cfq_group *blkg_to_cfqg(struct blkcg_gq *blkg) -{ - return pd_to_cfqg(blkg_to_pd(blkg, &blkcg_policy_cfq)); -} - -static struct cfq_group_data *blkcg_to_cfqgd(struct blkcg *blkcg) -{ - return cpd_to_cfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_cfq)); -} - -static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) -{ - struct blkcg_gq *pblkg = cfqg_to_blkg(cfqg)->parent; - - return pblkg ? blkg_to_cfqg(pblkg) : NULL; -} - -static inline bool cfqg_is_descendant(struct cfq_group *cfqg, - struct cfq_group *ancestor) -{ - return cgroup_is_descendant(cfqg_to_blkg(cfqg)->blkcg->css.cgroup, - cfqg_to_blkg(ancestor)->blkcg->css.cgroup); -} - -static inline void cfqg_get(struct cfq_group *cfqg) -{ - return blkg_get(cfqg_to_blkg(cfqg)); -} - -static inline void cfqg_put(struct cfq_group *cfqg) -{ - return blkg_put(cfqg_to_blkg(cfqg)); -} - -#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) do { \ - blk_add_cgroup_trace_msg((cfqd)->queue, \ - cfqg_to_blkg((cfqq)->cfqg)->blkcg, \ - "cfq%d%c%c " fmt, (cfqq)->pid, \ - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ - ##args); \ -} while (0) - -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do { \ - blk_add_cgroup_trace_msg((cfqd)->queue, \ - cfqg_to_blkg(cfqg)->blkcg, fmt, ##args); \ -} while (0) - -static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, - unsigned int op) -{ - blkg_rwstat_add(&cfqg->stats.queued, op, 1); - cfqg_stats_end_empty_time(&cfqg->stats); - cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg); -} - -static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, - uint64_t time, unsigned long unaccounted_time) -{ - blkg_stat_add(&cfqg->stats.time, time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time); -#endif -} - -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, - unsigned int op) -{ - blkg_rwstat_add(&cfqg->stats.queued, op, -1); -} - -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, - unsigned int op) -{ - blkg_rwstat_add(&cfqg->stats.merged, op, 1); -} - -static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - u64 start_time_ns, - u64 io_start_time_ns, - unsigned int op) -{ - struct cfqg_stats *stats = &cfqg->stats; - u64 now = ktime_get_ns(); - - if (now > io_start_time_ns) - blkg_rwstat_add(&stats->service_time, op, - now - io_start_time_ns); - if (io_start_time_ns > start_time_ns) - blkg_rwstat_add(&stats->wait_time, op, - io_start_time_ns - start_time_ns); -} - -/* @stats = 0 */ -static void cfqg_stats_reset(struct cfqg_stats *stats) -{ - /* queued stats shouldn't be cleared */ - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); - blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_reset(&stats->unaccounted_time); - blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); - blkg_stat_reset(&stats->dequeue); - blkg_stat_reset(&stats->group_wait_time); - blkg_stat_reset(&stats->idle_time); - blkg_stat_reset(&stats->empty_time); -#endif -} - -/* @to += @from */ -static void cfqg_stats_add_aux(struct cfqg_stats *to, struct cfqg_stats *from) -{ - /* queued stats shouldn't be cleared */ - blkg_rwstat_add_aux(&to->merged, &from->merged); - blkg_rwstat_add_aux(&to->service_time, &from->service_time); - blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); - blkg_stat_add_aux(&from->time, &from->time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_add_aux(&to->avg_queue_size_samples, &from->avg_queue_size_samples); - blkg_stat_add_aux(&to->dequeue, &from->dequeue); - blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); - blkg_stat_add_aux(&to->idle_time, &from->idle_time); - blkg_stat_add_aux(&to->empty_time, &from->empty_time); -#endif -} - -/* - * Transfer @cfqg's stats to its parent's aux counts so that the ancestors' - * recursive stats can still account for the amount used by this cfqg after - * it's gone. - */ -static void cfqg_stats_xfer_dead(struct cfq_group *cfqg) -{ - struct cfq_group *parent = cfqg_parent(cfqg); - - lockdep_assert_held(cfqg_to_blkg(cfqg)->q->queue_lock); - - if (unlikely(!parent)) - return; - - cfqg_stats_add_aux(&parent->stats, &cfqg->stats); - cfqg_stats_reset(&cfqg->stats); -} - -#else /* CONFIG_CFQ_GROUP_IOSCHED */ - -static inline struct cfq_group *cfqg_parent(struct cfq_group *cfqg) { return NULL; } -static inline bool cfqg_is_descendant(struct cfq_group *cfqg, - struct cfq_group *ancestor) -{ - return true; -} -static inline void cfqg_get(struct cfq_group *cfqg) { } -static inline void cfqg_put(struct cfq_group *cfqg) { } - -#define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "cfq%d%c%c " fmt, (cfqq)->pid, \ - cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ - cfqq_type((cfqq)) == SYNC_NOIDLE_WORKLOAD ? 'N' : ' ',\ - ##args) -#define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0) - -static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg, - struct cfq_group *curr_cfqg, unsigned int op) { } -static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg, - uint64_t time, unsigned long unaccounted_time) { } -static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, - unsigned int op) { } -static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, - unsigned int op) { } -static inline void cfqg_stats_update_completion(struct cfq_group *cfqg, - u64 start_time_ns, - u64 io_start_time_ns, - unsigned int op) { } - -#endif /* CONFIG_CFQ_GROUP_IOSCHED */ - -#define cfq_log(cfqd, fmt, args...) \ - blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) - -/* Traverses through cfq group service trees */ -#define for_each_cfqg_st(cfqg, i, j, st) \ - for (i = 0; i <= IDLE_WORKLOAD; i++) \ - for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ - : &cfqg->service_tree_idle; \ - (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ - (i == IDLE_WORKLOAD && j == 0); \ - j++, st = i < IDLE_WORKLOAD ? \ - &cfqg->service_trees[i][j]: NULL) \ - -static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd, - struct cfq_ttime *ttime, bool group_idle) -{ - u64 slice; - if (!sample_valid(ttime->ttime_samples)) - return false; - if (group_idle) - slice = cfqd->cfq_group_idle; - else - slice = cfqd->cfq_slice_idle; - return ttime->ttime_mean > slice; -} - -static inline bool iops_mode(struct cfq_data *cfqd) -{ - /* - * If we are not idling on queues and it is a NCQ drive, parallel - * execution of requests is on and measuring time is not possible - * in most of the cases until and unless we drive shallower queue - * depths and that becomes a performance bottleneck. In such cases - * switch to start providing fairness in terms of number of IOs. - */ - if (!cfqd->cfq_slice_idle && cfqd->hw_tag) - return true; - else - return false; -} - -static inline enum wl_class_t cfqq_class(struct cfq_queue *cfqq) -{ - if (cfq_class_idle(cfqq)) - return IDLE_WORKLOAD; - if (cfq_class_rt(cfqq)) - return RT_WORKLOAD; - return BE_WORKLOAD; -} - - -static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) -{ - if (!cfq_cfqq_sync(cfqq)) - return ASYNC_WORKLOAD; - if (!cfq_cfqq_idle_window(cfqq)) - return SYNC_NOIDLE_WORKLOAD; - return SYNC_WORKLOAD; -} - -static inline int cfq_group_busy_queues_wl(enum wl_class_t wl_class, - struct cfq_data *cfqd, - struct cfq_group *cfqg) -{ - if (wl_class == IDLE_WORKLOAD) - return cfqg->service_tree_idle.count; - - return cfqg->service_trees[wl_class][ASYNC_WORKLOAD].count + - cfqg->service_trees[wl_class][SYNC_NOIDLE_WORKLOAD].count + - cfqg->service_trees[wl_class][SYNC_WORKLOAD].count; -} - -static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, - struct cfq_group *cfqg) -{ - return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count + - cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; -} - -static void cfq_dispatch_insert(struct request_queue *, struct request *); -static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync, - struct cfq_io_cq *cic, struct bio *bio); - -static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq) -{ - /* cic->icq is the first member, %NULL will convert to %NULL */ - return container_of(icq, struct cfq_io_cq, icq); -} - -static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd, - struct io_context *ioc) -{ - if (ioc) - return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue)); - return NULL; -} - -static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync) -{ - return cic->cfqq[is_sync]; -} - -static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq, - bool is_sync) -{ - cic->cfqq[is_sync] = cfqq; -} - -static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic) -{ - return cic->icq.q->elevator->elevator_data; -} - -/* - * scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing - */ -static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) -{ - if (cfqd->busy_queues) { - cfq_log(cfqd, "schedule dispatch"); - kblockd_schedule_work(&cfqd->unplug_work); - } -} - -/* - * Scale schedule slice based on io priority. Use the sync time slice only - * if a queue is marked sync and has sync io queued. A sync queue with async - * io only, should not get full sync slice length. - */ -static inline u64 cfq_prio_slice(struct cfq_data *cfqd, bool sync, - unsigned short prio) -{ - u64 base_slice = cfqd->cfq_slice[sync]; - u64 slice = div_u64(base_slice, CFQ_SLICE_SCALE); - - WARN_ON(prio >= IOPRIO_BE_NR); - - return base_slice + (slice * (4 - prio)); -} - -static inline u64 -cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); -} - -/** - * cfqg_scale_charge - scale disk time charge according to cfqg weight - * @charge: disk time being charged - * @vfraction: vfraction of the cfqg, fixed point w/ CFQ_SERVICE_SHIFT - * - * Scale @charge according to @vfraction, which is in range (0, 1]. The - * scaling is inversely proportional. - * - * scaled = charge / vfraction - * - * The result is also in fixed point w/ CFQ_SERVICE_SHIFT. - */ -static inline u64 cfqg_scale_charge(u64 charge, - unsigned int vfraction) -{ - u64 c = charge << CFQ_SERVICE_SHIFT; /* make it fixed point */ - - /* charge / vfraction */ - c <<= CFQ_SERVICE_SHIFT; - return div_u64(c, vfraction); -} - -static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) -{ - s64 delta = (s64)(vdisktime - min_vdisktime); - if (delta > 0) - min_vdisktime = vdisktime; - - return min_vdisktime; -} - -static void update_min_vdisktime(struct cfq_rb_root *st) -{ - if (!RB_EMPTY_ROOT(&st->rb.rb_root)) { - struct cfq_group *cfqg = rb_entry_cfqg(st->rb.rb_leftmost); - - st->min_vdisktime = max_vdisktime(st->min_vdisktime, - cfqg->vdisktime); - } -} - -/* - * get averaged number of queues of RT/BE priority. - * average is updated, with a formula that gives more weight to higher numbers, - * to quickly follows sudden increases and decrease slowly - */ - -static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, - struct cfq_group *cfqg, bool rt) -{ - unsigned min_q, max_q; - unsigned mult = cfq_hist_divisor - 1; - unsigned round = cfq_hist_divisor / 2; - unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); - - min_q = min(cfqg->busy_queues_avg[rt], busy); - max_q = max(cfqg->busy_queues_avg[rt], busy); - cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / - cfq_hist_divisor; - return cfqg->busy_queues_avg[rt]; -} - -static inline u64 -cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) -{ - return cfqd->cfq_target_latency * cfqg->vfraction >> CFQ_SERVICE_SHIFT; -} - -static inline u64 -cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - u64 slice = cfq_prio_to_slice(cfqd, cfqq); - if (cfqd->cfq_latency) { - /* - * interested queues (we consider only the ones with the same - * priority class in the cfq group) - */ - unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, - cfq_class_rt(cfqq)); - u64 sync_slice = cfqd->cfq_slice[1]; - u64 expect_latency = sync_slice * iq; - u64 group_slice = cfq_group_slice(cfqd, cfqq->cfqg); - - if (expect_latency > group_slice) { - u64 base_low_slice = 2 * cfqd->cfq_slice_idle; - u64 low_slice; - - /* scale low_slice according to IO priority - * and sync vs async */ - low_slice = div64_u64(base_low_slice*slice, sync_slice); - low_slice = min(slice, low_slice); - /* the adapted slice value is scaled to fit all iqs - * into the target latency */ - slice = div64_u64(slice*group_slice, expect_latency); - slice = max(slice, low_slice); - } - } - return slice; -} - -static inline void -cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - u64 slice = cfq_scaled_cfqq_slice(cfqd, cfqq); - u64 now = ktime_get_ns(); - - cfqq->slice_start = now; - cfqq->slice_end = now + slice; - cfqq->allocated_slice = slice; - cfq_log_cfqq(cfqd, cfqq, "set_slice=%llu", cfqq->slice_end - now); -} - -/* - * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end - * isn't valid until the first request from the dispatch is activated - * and the slice time set. - */ -static inline bool cfq_slice_used(struct cfq_queue *cfqq) -{ - if (cfq_cfqq_slice_new(cfqq)) - return false; - if (ktime_get_ns() < cfqq->slice_end) - return false; - - return true; -} - -/* - * Lifted from AS - choose which of rq1 and rq2 that is best served now. - * We choose the request that is closest to the head right now. Distance - * behind the head is penalized and only allowed to a certain extent. - */ -static struct request * -cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) -{ - sector_t s1, s2, d1 = 0, d2 = 0; - unsigned long back_max; -#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ - unsigned wrap = 0; /* bit mask: requests behind the disk head? */ - - if (rq1 == NULL || rq1 == rq2) - return rq2; - if (rq2 == NULL) - return rq1; - - if (rq_is_sync(rq1) != rq_is_sync(rq2)) - return rq_is_sync(rq1) ? rq1 : rq2; - - if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO) - return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2; - - s1 = blk_rq_pos(rq1); - s2 = blk_rq_pos(rq2); - - /* - * by definition, 1KiB is 2 sectors - */ - back_max = cfqd->cfq_back_max * 2; - - /* - * Strict one way elevator _except_ in the case where we allow - * short backward seeks which are biased as twice the cost of a - * similar forward seek. - */ - if (s1 >= last) - d1 = s1 - last; - else if (s1 + back_max >= last) - d1 = (last - s1) * cfqd->cfq_back_penalty; - else - wrap |= CFQ_RQ1_WRAP; - - if (s2 >= last) - d2 = s2 - last; - else if (s2 + back_max >= last) - d2 = (last - s2) * cfqd->cfq_back_penalty; - else - wrap |= CFQ_RQ2_WRAP; - - /* Found required data */ - - /* - * By doing switch() on the bit mask "wrap" we avoid having to - * check two variables for all permutations: --> faster! - */ - switch (wrap) { - case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ - if (d1 < d2) - return rq1; - else if (d2 < d1) - return rq2; - else { - if (s1 >= s2) - return rq1; - else - return rq2; - } - - case CFQ_RQ2_WRAP: - return rq1; - case CFQ_RQ1_WRAP: - return rq2; - case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ - default: - /* - * Since both rqs are wrapped, - * start with the one that's further behind head - * (--> only *one* back seek required), - * since back seek takes more time than forward. - */ - if (s1 <= s2) - return rq1; - else - return rq2; - } -} - -static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) -{ - /* Service tree is empty */ - if (!root->count) - return NULL; - - return rb_entry(rb_first_cached(&root->rb), struct cfq_queue, rb_node); -} - -static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) -{ - return rb_entry_cfqg(rb_first_cached(&root->rb)); -} - -static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) -{ - if (root->rb_rightmost == n) - root->rb_rightmost = rb_prev(n); - - rb_erase_cached(n, &root->rb); - RB_CLEAR_NODE(n); - - --root->count; -} - -/* - * would be nice to take fifo expire time into account as well - */ -static struct request * -cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *last) -{ - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); - struct request *next = NULL, *prev = NULL; - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - - if (rbprev) - prev = rb_entry_rq(rbprev); - - if (rbnext) - next = rb_entry_rq(rbnext); - else { - rbnext = rb_first(&cfqq->sort_list); - if (rbnext && rbnext != &last->rb_node) - next = rb_entry_rq(rbnext); - } - - return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); -} - -static u64 cfq_slice_offset(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - /* - * just an approximation, should be ok. - */ - return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - - cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); -} - -static inline s64 -cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) -{ - return cfqg->vdisktime - st->min_vdisktime; -} - -static void -__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) -{ - struct rb_node **node = &st->rb.rb_root.rb_node; - struct rb_node *parent = NULL; - struct cfq_group *__cfqg; - s64 key = cfqg_key(st, cfqg); - bool leftmost = true, rightmost = true; - - while (*node != NULL) { - parent = *node; - __cfqg = rb_entry_cfqg(parent); - - if (key < cfqg_key(st, __cfqg)) { - node = &parent->rb_left; - rightmost = false; - } else { - node = &parent->rb_right; - leftmost = false; - } - } - - if (rightmost) - st->rb_rightmost = &cfqg->rb_node; - - rb_link_node(&cfqg->rb_node, parent, node); - rb_insert_color_cached(&cfqg->rb_node, &st->rb, leftmost); -} - -/* - * This has to be called only on activation of cfqg - */ -static void -cfq_update_group_weight(struct cfq_group *cfqg) -{ - if (cfqg->new_weight) { - cfqg->weight = cfqg->new_weight; - cfqg->new_weight = 0; - } -} - -static void -cfq_update_group_leaf_weight(struct cfq_group *cfqg) -{ - BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - - if (cfqg->new_leaf_weight) { - cfqg->leaf_weight = cfqg->new_leaf_weight; - cfqg->new_leaf_weight = 0; - } -} - -static void -cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) -{ - unsigned int vfr = 1 << CFQ_SERVICE_SHIFT; /* start with 1 */ - struct cfq_group *pos = cfqg; - struct cfq_group *parent; - bool propagate; - - /* add to the service tree */ - BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node)); - - /* - * Update leaf_weight. We cannot update weight at this point - * because cfqg might already have been activated and is - * contributing its current weight to the parent's child_weight. - */ - cfq_update_group_leaf_weight(cfqg); - __cfq_group_service_tree_add(st, cfqg); - - /* - * Activate @cfqg and calculate the portion of vfraction @cfqg is - * entitled to. vfraction is calculated by walking the tree - * towards the root calculating the fraction it has at each level. - * The compounded ratio is how much vfraction @cfqg owns. - * - * Start with the proportion tasks in this cfqg has against active - * children cfqgs - its leaf_weight against children_weight. - */ - propagate = !pos->nr_active++; - pos->children_weight += pos->leaf_weight; - vfr = vfr * pos->leaf_weight / pos->children_weight; - - /* - * Compound ->weight walking up the tree. Both activation and - * vfraction calculation are done in the same loop. Propagation - * stops once an already activated node is met. vfraction - * calculation should always continue to the root. - */ - while ((parent = cfqg_parent(pos))) { - if (propagate) { - cfq_update_group_weight(pos); - propagate = !parent->nr_active++; - parent->children_weight += pos->weight; - } - vfr = vfr * pos->weight / parent->children_weight; - pos = parent; - } - - cfqg->vfraction = max_t(unsigned, vfr, 1); -} - -static inline u64 cfq_get_cfqg_vdisktime_delay(struct cfq_data *cfqd) -{ - if (!iops_mode(cfqd)) - return CFQ_SLICE_MODE_GROUP_DELAY; - else - return CFQ_IOPS_MODE_GROUP_DELAY; -} - -static void -cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg) -{ - struct cfq_rb_root *st = &cfqd->grp_service_tree; - struct cfq_group *__cfqg; - struct rb_node *n; - - cfqg->nr_cfqq++; - if (!RB_EMPTY_NODE(&cfqg->rb_node)) - return; - - /* - * Currently put the group at the end. Later implement something - * so that groups get lesser vtime based on their weights, so that - * if group does not loose all if it was not continuously backlogged. - */ - n = st->rb_rightmost; - if (n) { - __cfqg = rb_entry_cfqg(n); - cfqg->vdisktime = __cfqg->vdisktime + - cfq_get_cfqg_vdisktime_delay(cfqd); - } else - cfqg->vdisktime = st->min_vdisktime; - cfq_group_service_tree_add(st, cfqg); -} - -static void -cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg) -{ - struct cfq_group *pos = cfqg; - bool propagate; - - /* - * Undo activation from cfq_group_service_tree_add(). Deactivate - * @cfqg and propagate deactivation upwards. - */ - propagate = !--pos->nr_active; - pos->children_weight -= pos->leaf_weight; - - while (propagate) { - struct cfq_group *parent = cfqg_parent(pos); - - /* @pos has 0 nr_active at this point */ - WARN_ON_ONCE(pos->children_weight); - pos->vfraction = 0; - - if (!parent) - break; - - propagate = !--parent->nr_active; - parent->children_weight -= pos->weight; - pos = parent; - } - - /* remove from the service tree */ - if (!RB_EMPTY_NODE(&cfqg->rb_node)) - cfq_rb_erase(&cfqg->rb_node, st); -} - -static void -cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg) -{ - struct cfq_rb_root *st = &cfqd->grp_service_tree; - - BUG_ON(cfqg->nr_cfqq < 1); - cfqg->nr_cfqq--; - - /* If there are other cfq queues under this group, don't delete it */ - if (cfqg->nr_cfqq) - return; - - cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); - cfq_group_service_tree_del(st, cfqg); - cfqg->saved_wl_slice = 0; - cfqg_stats_update_dequeue(cfqg); -} - -static inline u64 cfq_cfqq_slice_usage(struct cfq_queue *cfqq, - u64 *unaccounted_time) -{ - u64 slice_used; - u64 now = ktime_get_ns(); - - /* - * Queue got expired before even a single request completed or - * got expired immediately after first request completion. - */ - if (!cfqq->slice_start || cfqq->slice_start == now) { - /* - * Also charge the seek time incurred to the group, otherwise - * if there are mutiple queues in the group, each can dispatch - * a single request on seeky media and cause lots of seek time - * and group will never know it. - */ - slice_used = max_t(u64, (now - cfqq->dispatch_start), - jiffies_to_nsecs(1)); - } else { - slice_used = now - cfqq->slice_start; - if (slice_used > cfqq->allocated_slice) { - *unaccounted_time = slice_used - cfqq->allocated_slice; - slice_used = cfqq->allocated_slice; - } - if (cfqq->slice_start > cfqq->dispatch_start) - *unaccounted_time += cfqq->slice_start - - cfqq->dispatch_start; - } - - return slice_used; -} - -static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, - struct cfq_queue *cfqq) -{ - struct cfq_rb_root *st = &cfqd->grp_service_tree; - u64 used_sl, charge, unaccounted_sl = 0; - int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) - - cfqg->service_tree_idle.count; - unsigned int vfr; - u64 now = ktime_get_ns(); - - BUG_ON(nr_sync < 0); - used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl); - - if (iops_mode(cfqd)) - charge = cfqq->slice_dispatch; - else if (!cfq_cfqq_sync(cfqq) && !nr_sync) - charge = cfqq->allocated_slice; - - /* - * Can't update vdisktime while on service tree and cfqg->vfraction - * is valid only while on it. Cache vfr, leave the service tree, - * update vdisktime and go back on. The re-addition to the tree - * will also update the weights as necessary. - */ - vfr = cfqg->vfraction; - cfq_group_service_tree_del(st, cfqg); - cfqg->vdisktime += cfqg_scale_charge(charge, vfr); - cfq_group_service_tree_add(st, cfqg); - - /* This group is being expired. Save the context */ - if (cfqd->workload_expires > now) { - cfqg->saved_wl_slice = cfqd->workload_expires - now; - cfqg->saved_wl_type = cfqd->serving_wl_type; - cfqg->saved_wl_class = cfqd->serving_wl_class; - } else - cfqg->saved_wl_slice = 0; - - cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, - st->min_vdisktime); - cfq_log_cfqq(cfqq->cfqd, cfqq, - "sl_used=%llu disp=%llu charge=%llu iops=%u sect=%lu", - used_sl, cfqq->slice_dispatch, charge, - iops_mode(cfqd), cfqq->nr_sectors); - cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl); - cfqg_stats_set_start_empty_time(cfqg); -} - -/** - * cfq_init_cfqg_base - initialize base part of a cfq_group - * @cfqg: cfq_group to initialize - * - * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED - * is enabled or not. - */ -static void cfq_init_cfqg_base(struct cfq_group *cfqg) -{ - struct cfq_rb_root *st; - int i, j; - - for_each_cfqg_st(cfqg, i, j, st) - *st = CFQ_RB_ROOT; - RB_CLEAR_NODE(&cfqg->rb_node); - - cfqg->ttime.last_end_request = ktime_get_ns(); -} - -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, - bool on_dfl, bool reset_dev, bool is_leaf_weight); - -static void cfqg_stats_exit(struct cfqg_stats *stats) -{ - blkg_rwstat_exit(&stats->merged); - blkg_rwstat_exit(&stats->service_time); - blkg_rwstat_exit(&stats->wait_time); - blkg_rwstat_exit(&stats->queued); - blkg_stat_exit(&stats->time); -#ifdef CONFIG_DEBUG_BLK_CGROUP - blkg_stat_exit(&stats->unaccounted_time); - blkg_stat_exit(&stats->avg_queue_size_sum); - blkg_stat_exit(&stats->avg_queue_size_samples); - blkg_stat_exit(&stats->dequeue); - blkg_stat_exit(&stats->group_wait_time); - blkg_stat_exit(&stats->idle_time); - blkg_stat_exit(&stats->empty_time); -#endif -} - -static int cfqg_stats_init(struct cfqg_stats *stats, gfp_t gfp) -{ - if (blkg_rwstat_init(&stats->merged, gfp) || - blkg_rwstat_init(&stats->service_time, gfp) || - blkg_rwstat_init(&stats->wait_time, gfp) || - blkg_rwstat_init(&stats->queued, gfp) || - blkg_stat_init(&stats->time, gfp)) - goto err; - -#ifdef CONFIG_DEBUG_BLK_CGROUP - if (blkg_stat_init(&stats->unaccounted_time, gfp) || - blkg_stat_init(&stats->avg_queue_size_sum, gfp) || - blkg_stat_init(&stats->avg_queue_size_samples, gfp) || - blkg_stat_init(&stats->dequeue, gfp) || - blkg_stat_init(&stats->group_wait_time, gfp) || - blkg_stat_init(&stats->idle_time, gfp) || - blkg_stat_init(&stats->empty_time, gfp)) - goto err; -#endif - return 0; -err: - cfqg_stats_exit(stats); - return -ENOMEM; -} - -static struct blkcg_policy_data *cfq_cpd_alloc(gfp_t gfp) -{ - struct cfq_group_data *cgd; - - cgd = kzalloc(sizeof(*cgd), gfp); - if (!cgd) - return NULL; - return &cgd->cpd; -} - -static void cfq_cpd_init(struct blkcg_policy_data *cpd) -{ - struct cfq_group_data *cgd = cpd_to_cfqgd(cpd); - unsigned int weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? - CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; - - if (cpd_to_blkcg(cpd) == &blkcg_root) - weight *= 2; - - cgd->weight = weight; - cgd->leaf_weight = weight; -} - -static void cfq_cpd_free(struct blkcg_policy_data *cpd) -{ - kfree(cpd_to_cfqgd(cpd)); -} - -static void cfq_cpd_bind(struct blkcg_policy_data *cpd) -{ - struct blkcg *blkcg = cpd_to_blkcg(cpd); - bool on_dfl = cgroup_subsys_on_dfl(io_cgrp_subsys); - unsigned int weight = on_dfl ? CGROUP_WEIGHT_DFL : CFQ_WEIGHT_LEGACY_DFL; - - if (blkcg == &blkcg_root) - weight *= 2; - - WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, false)); - WARN_ON_ONCE(__cfq_set_weight(&blkcg->css, weight, on_dfl, true, true)); -} - -static struct blkg_policy_data *cfq_pd_alloc(gfp_t gfp, int node) -{ - struct cfq_group *cfqg; - - cfqg = kzalloc_node(sizeof(*cfqg), gfp, node); - if (!cfqg) - return NULL; - - cfq_init_cfqg_base(cfqg); - if (cfqg_stats_init(&cfqg->stats, gfp)) { - kfree(cfqg); - return NULL; - } - - return &cfqg->pd; -} - -static void cfq_pd_init(struct blkg_policy_data *pd) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - struct cfq_group_data *cgd = blkcg_to_cfqgd(pd->blkg->blkcg); - - cfqg->weight = cgd->weight; - cfqg->leaf_weight = cgd->leaf_weight; -} - -static void cfq_pd_offline(struct blkg_policy_data *pd) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - int i; - - for (i = 0; i < IOPRIO_BE_NR; i++) { - if (cfqg->async_cfqq[0][i]) { - cfq_put_queue(cfqg->async_cfqq[0][i]); - cfqg->async_cfqq[0][i] = NULL; - } - if (cfqg->async_cfqq[1][i]) { - cfq_put_queue(cfqg->async_cfqq[1][i]); - cfqg->async_cfqq[1][i] = NULL; - } - } - - if (cfqg->async_idle_cfqq) { - cfq_put_queue(cfqg->async_idle_cfqq); - cfqg->async_idle_cfqq = NULL; - } - - /* - * @blkg is going offline and will be ignored by - * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so - * that they don't get lost. If IOs complete after this point, the - * stats for them will be lost. Oh well... - */ - cfqg_stats_xfer_dead(cfqg); -} - -static void cfq_pd_free(struct blkg_policy_data *pd) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - - cfqg_stats_exit(&cfqg->stats); - return kfree(cfqg); -} - -static void cfq_pd_reset_stats(struct blkg_policy_data *pd) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - - cfqg_stats_reset(&cfqg->stats); -} - -static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) -{ - struct blkcg_gq *blkg; - - blkg = blkg_lookup(blkcg, cfqd->queue); - if (likely(blkg)) - return blkg_to_cfqg(blkg); - return NULL; -} - -static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) -{ - cfqq->cfqg = cfqg; - /* cfqq reference on cfqg */ - cfqg_get(cfqg); -} - -static u64 cfqg_prfill_weight_device(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - - if (!cfqg->dev_weight) - return 0; - return __blkg_prfill_u64(sf, pd, cfqg->dev_weight); -} - -static int cfqg_print_weight_device(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_weight_device, &blkcg_policy_cfq, - 0, false); - return 0; -} - -static u64 cfqg_prfill_leaf_weight_device(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - - if (!cfqg->dev_leaf_weight) - return 0; - return __blkg_prfill_u64(sf, pd, cfqg->dev_leaf_weight); -} - -static int cfqg_print_leaf_weight_device(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_leaf_weight_device, &blkcg_policy_cfq, - 0, false); - return 0; -} - -static int cfq_print_weight(struct seq_file *sf, void *v) -{ - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); - unsigned int val = 0; - - if (cgd) - val = cgd->weight; - - seq_printf(sf, "%u\n", val); - return 0; -} - -static int cfq_print_leaf_weight(struct seq_file *sf, void *v) -{ - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); - unsigned int val = 0; - - if (cgd) - val = cgd->leaf_weight; - - seq_printf(sf, "%u\n", val); - return 0; -} - -static ssize_t __cfqg_set_weight_device(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off, - bool on_dfl, bool is_leaf_weight) -{ - unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; - unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; - struct blkcg *blkcg = css_to_blkcg(of_css(of)); - struct blkg_conf_ctx ctx; - struct cfq_group *cfqg; - struct cfq_group_data *cfqgd; - int ret; - u64 v; - - ret = blkg_conf_prep(blkcg, &blkcg_policy_cfq, buf, &ctx); - if (ret) - return ret; - - if (sscanf(ctx.body, "%llu", &v) == 1) { - /* require "default" on dfl */ - ret = -ERANGE; - if (!v && on_dfl) - goto out_finish; - } else if (!strcmp(strim(ctx.body), "default")) { - v = 0; - } else { - ret = -EINVAL; - goto out_finish; - } - - cfqg = blkg_to_cfqg(ctx.blkg); - cfqgd = blkcg_to_cfqgd(blkcg); - - ret = -ERANGE; - if (!v || (v >= min && v <= max)) { - if (!is_leaf_weight) { - cfqg->dev_weight = v; - cfqg->new_weight = v ?: cfqgd->weight; - } else { - cfqg->dev_leaf_weight = v; - cfqg->new_leaf_weight = v ?: cfqgd->leaf_weight; - } - ret = 0; - } -out_finish: - blkg_conf_finish(&ctx); - return ret ?: nbytes; -} - -static ssize_t cfqg_set_weight_device(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cfqg_set_weight_device(of, buf, nbytes, off, false, false); -} - -static ssize_t cfqg_set_leaf_weight_device(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - return __cfqg_set_weight_device(of, buf, nbytes, off, false, true); -} - -static int __cfq_set_weight(struct cgroup_subsys_state *css, u64 val, - bool on_dfl, bool reset_dev, bool is_leaf_weight) -{ - unsigned int min = on_dfl ? CGROUP_WEIGHT_MIN : CFQ_WEIGHT_LEGACY_MIN; - unsigned int max = on_dfl ? CGROUP_WEIGHT_MAX : CFQ_WEIGHT_LEGACY_MAX; - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; - struct cfq_group_data *cfqgd; - int ret = 0; - - if (val < min || val > max) - return -ERANGE; - - spin_lock_irq(&blkcg->lock); - cfqgd = blkcg_to_cfqgd(blkcg); - if (!cfqgd) { - ret = -EINVAL; - goto out; - } - - if (!is_leaf_weight) - cfqgd->weight = val; - else - cfqgd->leaf_weight = val; - - hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { - struct cfq_group *cfqg = blkg_to_cfqg(blkg); - - if (!cfqg) - continue; - - if (!is_leaf_weight) { - if (reset_dev) - cfqg->dev_weight = 0; - if (!cfqg->dev_weight) - cfqg->new_weight = cfqgd->weight; - } else { - if (reset_dev) - cfqg->dev_leaf_weight = 0; - if (!cfqg->dev_leaf_weight) - cfqg->new_leaf_weight = cfqgd->leaf_weight; - } - } - -out: - spin_unlock_irq(&blkcg->lock); - return ret; -} - -static int cfq_set_weight(struct cgroup_subsys_state *css, struct cftype *cft, - u64 val) -{ - return __cfq_set_weight(css, val, false, false, false); -} - -static int cfq_set_leaf_weight(struct cgroup_subsys_state *css, - struct cftype *cft, u64 val) -{ - return __cfq_set_weight(css, val, false, false, true); -} - -static int cfqg_print_stat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, - &blkcg_policy_cfq, seq_cft(sf)->private, false); - return 0; -} - -static int cfqg_print_rwstat(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, - &blkcg_policy_cfq, seq_cft(sf)->private, true); - return 0; -} - -static u64 cfqg_prfill_stat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_cfq, off); - return __blkg_prfill_u64(sf, pd, sum); -} - -static u64 cfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), - &blkcg_policy_cfq, off); - return __blkg_prfill_rwstat(sf, pd, &sum); -} - -static int cfqg_print_stat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_stat_recursive, &blkcg_policy_cfq, - seq_cft(sf)->private, false); - return 0; -} - -static int cfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_rwstat_recursive, &blkcg_policy_cfq, - seq_cft(sf)->private, true); - return 0; -} - -static u64 cfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, - int off) -{ - u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); - - return __blkg_prfill_u64(sf, pd, sum >> 9); -} - -static int cfqg_print_stat_sectors(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_sectors, &blkcg_policy_cfq, 0, false); - return 0; -} - -static u64 cfqg_prfill_sectors_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, - offsetof(struct blkcg_gq, stat_bytes)); - u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + - atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); - - return __blkg_prfill_u64(sf, pd, sum >> 9); -} - -static int cfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_sectors_recursive, &blkcg_policy_cfq, 0, - false); - return 0; -} - -#ifdef CONFIG_DEBUG_BLK_CGROUP -static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, - struct blkg_policy_data *pd, int off) -{ - struct cfq_group *cfqg = pd_to_cfqg(pd); - u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples); - u64 v = 0; - - if (samples) { - v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum); - v = div64_u64(v, samples); - } - __blkg_prfill_u64(sf, pd, v); - return 0; -} - -/* print avg_queue_size */ -static int cfqg_print_avg_queue_size(struct seq_file *sf, void *v) -{ - blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), - cfqg_prfill_avg_queue_size, &blkcg_policy_cfq, - 0, false); - return 0; -} -#endif /* CONFIG_DEBUG_BLK_CGROUP */ - -static struct cftype cfq_blkcg_legacy_files[] = { - /* on root, weight is mapped to leaf_weight */ - { - .name = "weight_device", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cfqg_print_leaf_weight_device, - .write = cfqg_set_leaf_weight_device, - }, - { - .name = "weight", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cfq_print_leaf_weight, - .write_u64 = cfq_set_leaf_weight, - }, - - /* no such mapping necessary for !roots */ - { - .name = "weight_device", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cfqg_print_weight_device, - .write = cfqg_set_weight_device, - }, - { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cfq_print_weight, - .write_u64 = cfq_set_weight, - }, - - { - .name = "leaf_weight_device", - .seq_show = cfqg_print_leaf_weight_device, - .write = cfqg_set_leaf_weight_device, - }, - { - .name = "leaf_weight", - .seq_show = cfq_print_leaf_weight, - .write_u64 = cfq_set_leaf_weight, - }, - - /* statistics, covers only the tasks in the cfqg */ - { - .name = "time", - .private = offsetof(struct cfq_group, stats.time), - .seq_show = cfqg_print_stat, - }, - { - .name = "sectors", - .seq_show = cfqg_print_stat_sectors, - }, - { - .name = "io_service_bytes", - .private = (unsigned long)&blkcg_policy_cfq, - .seq_show = blkg_print_stat_bytes, - }, - { - .name = "io_serviced", - .private = (unsigned long)&blkcg_policy_cfq, - .seq_show = blkg_print_stat_ios, - }, - { - .name = "io_service_time", - .private = offsetof(struct cfq_group, stats.service_time), - .seq_show = cfqg_print_rwstat, - }, - { - .name = "io_wait_time", - .private = offsetof(struct cfq_group, stats.wait_time), - .seq_show = cfqg_print_rwstat, - }, - { - .name = "io_merged", - .private = offsetof(struct cfq_group, stats.merged), - .seq_show = cfqg_print_rwstat, - }, - { - .name = "io_queued", - .private = offsetof(struct cfq_group, stats.queued), - .seq_show = cfqg_print_rwstat, - }, - - /* the same statictics which cover the cfqg and its descendants */ - { - .name = "time_recursive", - .private = offsetof(struct cfq_group, stats.time), - .seq_show = cfqg_print_stat_recursive, - }, - { - .name = "sectors_recursive", - .seq_show = cfqg_print_stat_sectors_recursive, - }, - { - .name = "io_service_bytes_recursive", - .private = (unsigned long)&blkcg_policy_cfq, - .seq_show = blkg_print_stat_bytes_recursive, - }, - { - .name = "io_serviced_recursive", - .private = (unsigned long)&blkcg_policy_cfq, - .seq_show = blkg_print_stat_ios_recursive, - }, - { - .name = "io_service_time_recursive", - .private = offsetof(struct cfq_group, stats.service_time), - .seq_show = cfqg_print_rwstat_recursive, - }, - { - .name = "io_wait_time_recursive", - .private = offsetof(struct cfq_group, stats.wait_time), - .seq_show = cfqg_print_rwstat_recursive, - }, - { - .name = "io_merged_recursive", - .private = offsetof(struct cfq_group, stats.merged), - .seq_show = cfqg_print_rwstat_recursive, - }, - { - .name = "io_queued_recursive", - .private = offsetof(struct cfq_group, stats.queued), - .seq_show = cfqg_print_rwstat_recursive, - }, -#ifdef CONFIG_DEBUG_BLK_CGROUP - { - .name = "avg_queue_size", - .seq_show = cfqg_print_avg_queue_size, - }, - { - .name = "group_wait_time", - .private = offsetof(struct cfq_group, stats.group_wait_time), - .seq_show = cfqg_print_stat, - }, - { - .name = "idle_time", - .private = offsetof(struct cfq_group, stats.idle_time), - .seq_show = cfqg_print_stat, - }, - { - .name = "empty_time", - .private = offsetof(struct cfq_group, stats.empty_time), - .seq_show = cfqg_print_stat, - }, - { - .name = "dequeue", - .private = offsetof(struct cfq_group, stats.dequeue), - .seq_show = cfqg_print_stat, - }, - { - .name = "unaccounted_time", - .private = offsetof(struct cfq_group, stats.unaccounted_time), - .seq_show = cfqg_print_stat, - }, -#endif /* CONFIG_DEBUG_BLK_CGROUP */ - { } /* terminate */ -}; - -static int cfq_print_weight_on_dfl(struct seq_file *sf, void *v) -{ - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct cfq_group_data *cgd = blkcg_to_cfqgd(blkcg); - - seq_printf(sf, "default %u\n", cgd->weight); - blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device, - &blkcg_policy_cfq, 0, false); - return 0; -} - -static ssize_t cfq_set_weight_on_dfl(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - char *endp; - int ret; - u64 v; - - buf = strim(buf); - - /* "WEIGHT" or "default WEIGHT" sets the default weight */ - v = simple_strtoull(buf, &endp, 0); - if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { - ret = __cfq_set_weight(of_css(of), v, true, false, false); - return ret ?: nbytes; - } - - /* "MAJ:MIN WEIGHT" */ - return __cfqg_set_weight_device(of, buf, nbytes, off, true, false); -} - -static struct cftype cfq_blkcg_files[] = { - { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cfq_print_weight_on_dfl, - .write = cfq_set_weight_on_dfl, - }, - { } /* terminate */ -}; - -#else /* GROUP_IOSCHED */ -static struct cfq_group *cfq_lookup_cfqg(struct cfq_data *cfqd, - struct blkcg *blkcg) -{ - return cfqd->root_group; -} - -static inline void -cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { - cfqq->cfqg = cfqg; -} - -#endif /* GROUP_IOSCHED */ - -/* - * The cfqd->service_trees holds all pending cfq_queue's that have - * requests waiting to be processed. It is sorted in the order that - * we will service the queues. - */ -static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, - bool add_front) -{ - struct rb_node **p, *parent; - struct cfq_queue *__cfqq; - u64 rb_key; - struct cfq_rb_root *st; - bool leftmost = true; - int new_cfqq = 1; - u64 now = ktime_get_ns(); - - st = st_for(cfqq->cfqg, cfqq_class(cfqq), cfqq_type(cfqq)); - if (cfq_class_idle(cfqq)) { - rb_key = CFQ_IDLE_DELAY; - parent = st->rb_rightmost; - if (parent && parent != &cfqq->rb_node) { - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - rb_key += __cfqq->rb_key; - } else - rb_key += now; - } else if (!add_front) { - /* - * Get our rb key offset. Subtract any residual slice - * value carried from last service. A negative resid - * count indicates slice overrun, and this should position - * the next service time further away in the tree. - */ - rb_key = cfq_slice_offset(cfqd, cfqq) + now; - rb_key -= cfqq->slice_resid; - cfqq->slice_resid = 0; - } else { - rb_key = -NSEC_PER_SEC; - __cfqq = cfq_rb_first(st); - rb_key += __cfqq ? __cfqq->rb_key : now; - } - - if (!RB_EMPTY_NODE(&cfqq->rb_node)) { - new_cfqq = 0; - /* - * same position, nothing more to do - */ - if (rb_key == cfqq->rb_key && cfqq->service_tree == st) - return; - - cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); - cfqq->service_tree = NULL; - } - - parent = NULL; - cfqq->service_tree = st; - p = &st->rb.rb_root.rb_node; - while (*p) { - parent = *p; - __cfqq = rb_entry(parent, struct cfq_queue, rb_node); - - /* - * sort by key, that represents service time. - */ - if (rb_key < __cfqq->rb_key) - p = &parent->rb_left; - else { - p = &parent->rb_right; - leftmost = false; - } - } - - cfqq->rb_key = rb_key; - rb_link_node(&cfqq->rb_node, parent, p); - rb_insert_color_cached(&cfqq->rb_node, &st->rb, leftmost); - st->count++; - if (add_front || !new_cfqq) - return; - cfq_group_notify_queue_add(cfqd, cfqq->cfqg); -} - -static struct cfq_queue * -cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, - sector_t sector, struct rb_node **ret_parent, - struct rb_node ***rb_link) -{ - struct rb_node **p, *parent; - struct cfq_queue *cfqq = NULL; - - parent = NULL; - p = &root->rb_node; - while (*p) { - struct rb_node **n; - - parent = *p; - cfqq = rb_entry(parent, struct cfq_queue, p_node); - - /* - * Sort strictly based on sector. Smallest to the left, - * largest to the right. - */ - if (sector > blk_rq_pos(cfqq->next_rq)) - n = &(*p)->rb_right; - else if (sector < blk_rq_pos(cfqq->next_rq)) - n = &(*p)->rb_left; - else - break; - p = n; - cfqq = NULL; - } - - *ret_parent = parent; - if (rb_link) - *rb_link = p; - return cfqq; -} - -static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - struct rb_node **p, *parent; - struct cfq_queue *__cfqq; - - if (cfqq->p_root) { - rb_erase(&cfqq->p_node, cfqq->p_root); - cfqq->p_root = NULL; - } - - if (cfq_class_idle(cfqq)) - return; - if (!cfqq->next_rq) - return; - - cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; - __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, - blk_rq_pos(cfqq->next_rq), &parent, &p); - if (!__cfqq) { - rb_link_node(&cfqq->p_node, parent, p); - rb_insert_color(&cfqq->p_node, cfqq->p_root); - } else - cfqq->p_root = NULL; -} - -/* - * Update cfqq's position in the service tree. - */ -static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - /* - * Resorting requires the cfqq to be on the RR list already. - */ - if (cfq_cfqq_on_rr(cfqq)) { - cfq_service_tree_add(cfqd, cfqq, 0); - cfq_prio_tree_add(cfqd, cfqq); - } -} - -/* - * add to busy list of queues for service, trying to be fair in ordering - * the pending list according to last request service - */ -static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); - BUG_ON(cfq_cfqq_on_rr(cfqq)); - cfq_mark_cfqq_on_rr(cfqq); - cfqd->busy_queues++; - if (cfq_cfqq_sync(cfqq)) - cfqd->busy_sync_queues++; - - cfq_resort_rr_list(cfqd, cfqq); -} - -/* - * Called when the cfqq no longer has requests pending, remove it from - * the service tree. - */ -static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); - BUG_ON(!cfq_cfqq_on_rr(cfqq)); - cfq_clear_cfqq_on_rr(cfqq); - - if (!RB_EMPTY_NODE(&cfqq->rb_node)) { - cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); - cfqq->service_tree = NULL; - } - if (cfqq->p_root) { - rb_erase(&cfqq->p_node, cfqq->p_root); - cfqq->p_root = NULL; - } - - cfq_group_notify_queue_del(cfqd, cfqq->cfqg); - BUG_ON(!cfqd->busy_queues); - cfqd->busy_queues--; - if (cfq_cfqq_sync(cfqq)) - cfqd->busy_sync_queues--; -} - -/* - * rb tree support functions - */ -static void cfq_del_rq_rb(struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - const int sync = rq_is_sync(rq); - - BUG_ON(!cfqq->queued[sync]); - cfqq->queued[sync]--; - - elv_rb_del(&cfqq->sort_list, rq); - - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { - /* - * Queue will be deleted from service tree when we actually - * expire it later. Right now just remove it from prio tree - * as it is empty. - */ - if (cfqq->p_root) { - rb_erase(&cfqq->p_node, cfqq->p_root); - cfqq->p_root = NULL; - } - } -} - -static void cfq_add_rq_rb(struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; - struct request *prev; - - cfqq->queued[rq_is_sync(rq)]++; - - elv_rb_add(&cfqq->sort_list, rq); - - if (!cfq_cfqq_on_rr(cfqq)) - cfq_add_cfqq_rr(cfqd, cfqq); - - /* - * check if this request is a better next-serve candidate - */ - prev = cfqq->next_rq; - cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); - - /* - * adjust priority tree position, if ->next_rq changes - */ - if (prev != cfqq->next_rq) - cfq_prio_tree_add(cfqd, cfqq); - - BUG_ON(!cfqq->next_rq); -} - -static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) -{ - elv_rb_del(&cfqq->sort_list, rq); - cfqq->queued[rq_is_sync(rq)]--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); - cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group, - rq->cmd_flags); -} - -static struct request * -cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) -{ - struct task_struct *tsk = current; - struct cfq_io_cq *cic; - struct cfq_queue *cfqq; - - cic = cfq_cic_lookup(cfqd, tsk->io_context); - if (!cic) - return NULL; - - cfqq = cic_to_cfqq(cic, op_is_sync(bio->bi_opf)); - if (cfqq) - return elv_rb_find(&cfqq->sort_list, bio_end_sector(bio)); - - return NULL; -} - -static void cfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - cfqd->rq_in_driver++; - cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", - cfqd->rq_in_driver); - - cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -} - -static void cfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - - WARN_ON(!cfqd->rq_in_driver); - cfqd->rq_in_driver--; - cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", - cfqd->rq_in_driver); -} - -static void cfq_remove_request(struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - - if (cfqq->next_rq == rq) - cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); - - list_del_init(&rq->queuelist); - cfq_del_rq_rb(rq); - - cfqq->cfqd->rq_queued--; - cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags); - if (rq->cmd_flags & REQ_PRIO) { - WARN_ON(!cfqq->prio_pending); - cfqq->prio_pending--; - } -} - -static enum elv_merge cfq_merge(struct request_queue *q, struct request **req, - struct bio *bio) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct request *__rq; - - __rq = cfq_find_rq_fmerge(cfqd, bio); - if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - - return ELEVATOR_NO_MERGE; -} - -static void cfq_merged_request(struct request_queue *q, struct request *req, - enum elv_merge type) -{ - if (type == ELEVATOR_FRONT_MERGE) { - struct cfq_queue *cfqq = RQ_CFQQ(req); - - cfq_reposition_rq_rb(cfqq, req); - } -} - -static void cfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) -{ - cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_opf); -} - -static void -cfq_merged_requests(struct request_queue *q, struct request *rq, - struct request *next) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = q->elevator->elevator_data; - - /* - * reposition in fifo if next is older than rq - */ - if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && - next->fifo_time < rq->fifo_time && - cfqq == RQ_CFQQ(next)) { - list_move(&rq->queuelist, &next->queuelist); - rq->fifo_time = next->fifo_time; - } - - if (cfqq->next_rq == next) - cfqq->next_rq = rq; - cfq_remove_request(next); - cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags); - - cfqq = RQ_CFQQ(next); - /* - * all requests of this queue are merged to other queues, delete it - * from the service tree. If it's the active_queue, - * cfq_dispatch_requests() will choose to expire it or do idle - */ - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) && - cfqq != cfqd->active_queue) - cfq_del_cfqq_rr(cfqd, cfqq); -} - -static int cfq_allow_bio_merge(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - bool is_sync = op_is_sync(bio->bi_opf); - struct cfq_io_cq *cic; - struct cfq_queue *cfqq; - - /* - * Disallow merge of a sync bio into an async request. - */ - if (is_sync && !rq_is_sync(rq)) - return false; - - /* - * Lookup the cfqq that this bio will be queued with and allow - * merge only if rq is queued there. - */ - cic = cfq_cic_lookup(cfqd, current->io_context); - if (!cic) - return false; - - cfqq = cic_to_cfqq(cic, is_sync); - return cfqq == RQ_CFQQ(rq); -} - -static int cfq_allow_rq_merge(struct request_queue *q, struct request *rq, - struct request *next) -{ - return RQ_CFQQ(rq) == RQ_CFQQ(next); -} - -static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - hrtimer_try_to_cancel(&cfqd->idle_slice_timer); - cfqg_stats_update_idle_time(cfqq->cfqg); -} - -static void __cfq_set_active_queue(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "set_active wl_class:%d wl_type:%d", - cfqd->serving_wl_class, cfqd->serving_wl_type); - cfqg_stats_update_avg_queue_size(cfqq->cfqg); - cfqq->slice_start = 0; - cfqq->dispatch_start = ktime_get_ns(); - cfqq->allocated_slice = 0; - cfqq->slice_end = 0; - cfqq->slice_dispatch = 0; - cfqq->nr_sectors = 0; - - cfq_clear_cfqq_wait_request(cfqq); - cfq_clear_cfqq_must_dispatch(cfqq); - cfq_clear_cfqq_must_alloc_slice(cfqq); - cfq_clear_cfqq_fifo_expire(cfqq); - cfq_mark_cfqq_slice_new(cfqq); - - cfq_del_timer(cfqd, cfqq); - } - - cfqd->active_queue = cfqq; -} - -/* - * current cfqq expired its slice (or was too idle), select new one - */ -static void -__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, - bool timed_out) -{ - cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); - - if (cfq_cfqq_wait_request(cfqq)) - cfq_del_timer(cfqd, cfqq); - - cfq_clear_cfqq_wait_request(cfqq); - cfq_clear_cfqq_wait_busy(cfqq); - - /* - * If this cfqq is shared between multiple processes, check to - * make sure that those processes are still issuing I/Os within - * the mean seek distance. If not, it may be time to break the - * queues apart again. - */ - if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) - cfq_mark_cfqq_split_coop(cfqq); - - /* - * store what was left of this slice, if the queue idled/timed out - */ - if (timed_out) { - if (cfq_cfqq_slice_new(cfqq)) - cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq); - else - cfqq->slice_resid = cfqq->slice_end - ktime_get_ns(); - cfq_log_cfqq(cfqd, cfqq, "resid=%lld", cfqq->slice_resid); - } - - cfq_group_served(cfqd, cfqq->cfqg, cfqq); - - if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) - cfq_del_cfqq_rr(cfqd, cfqq); - - cfq_resort_rr_list(cfqd, cfqq); - - if (cfqq == cfqd->active_queue) - cfqd->active_queue = NULL; - - if (cfqd->active_cic) { - put_io_context(cfqd->active_cic->icq.ioc); - cfqd->active_cic = NULL; - } -} - -static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) -{ - struct cfq_queue *cfqq = cfqd->active_queue; - - if (cfqq) - __cfq_slice_expired(cfqd, cfqq, timed_out); -} - -/* - * Get next queue for service. Unless we have a queue preemption, - * we'll simply select the first cfqq in the service tree. - */ -static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) -{ - struct cfq_rb_root *st = st_for(cfqd->serving_group, - cfqd->serving_wl_class, cfqd->serving_wl_type); - - if (!cfqd->rq_queued) - return NULL; - - /* There is nothing to dispatch */ - if (!st) - return NULL; - if (RB_EMPTY_ROOT(&st->rb.rb_root)) - return NULL; - return cfq_rb_first(st); -} - -static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) -{ - struct cfq_group *cfqg; - struct cfq_queue *cfqq; - int i, j; - struct cfq_rb_root *st; - - if (!cfqd->rq_queued) - return NULL; - - cfqg = cfq_get_next_cfqg(cfqd); - if (!cfqg) - return NULL; - - for_each_cfqg_st(cfqg, i, j, st) { - cfqq = cfq_rb_first(st); - if (cfqq) - return cfqq; - } - return NULL; -} - -/* - * Get and set a new active queue for service. - */ -static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - if (!cfqq) - cfqq = cfq_get_next_queue(cfqd); - - __cfq_set_active_queue(cfqd, cfqq); - return cfqq; -} - -static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, - struct request *rq) -{ - if (blk_rq_pos(rq) >= cfqd->last_position) - return blk_rq_pos(rq) - cfqd->last_position; - else - return cfqd->last_position - blk_rq_pos(rq); -} - -static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq) -{ - return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; -} - -static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq) -{ - struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; - struct rb_node *parent, *node; - struct cfq_queue *__cfqq; - sector_t sector = cfqd->last_position; - - if (RB_EMPTY_ROOT(root)) - return NULL; - - /* - * First, if we find a request starting at the end of the last - * request, choose it. - */ - __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); - if (__cfqq) - return __cfqq; - - /* - * If the exact sector wasn't found, the parent of the NULL leaf - * will contain the closest sector. - */ - __cfqq = rb_entry(parent, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) - return __cfqq; - - if (blk_rq_pos(__cfqq->next_rq) < sector) - node = rb_next(&__cfqq->p_node); - else - node = rb_prev(&__cfqq->p_node); - if (!node) - return NULL; - - __cfqq = rb_entry(node, struct cfq_queue, p_node); - if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) - return __cfqq; - - return NULL; -} - -/* - * cfqd - obvious - * cur_cfqq - passed in so that we don't decide that the current queue is - * closely cooperating with itself. - * - * So, basically we're assuming that that cur_cfqq has dispatched at least - * one request, and that cfqd->last_position reflects a position on the disk - * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid - * assumption. - */ -static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, - struct cfq_queue *cur_cfqq) -{ - struct cfq_queue *cfqq; - - if (cfq_class_idle(cur_cfqq)) - return NULL; - if (!cfq_cfqq_sync(cur_cfqq)) - return NULL; - if (CFQQ_SEEKY(cur_cfqq)) - return NULL; - - /* - * Don't search priority tree if it's the only queue in the group. - */ - if (cur_cfqq->cfqg->nr_cfqq == 1) - return NULL; - - /* - * We should notice if some of the queues are cooperating, eg - * working closely on the same area of the disk. In that case, - * we can group them together and don't waste time idling. - */ - cfqq = cfqq_close(cfqd, cur_cfqq); - if (!cfqq) - return NULL; - - /* If new queue belongs to different cfq_group, don't choose it */ - if (cur_cfqq->cfqg != cfqq->cfqg) - return NULL; - - /* - * It only makes sense to merge sync queues. - */ - if (!cfq_cfqq_sync(cfqq)) - return NULL; - if (CFQQ_SEEKY(cfqq)) - return NULL; - - /* - * Do not merge queues of different priority classes - */ - if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) - return NULL; - - return cfqq; -} - -/* - * Determine whether we should enforce idle window for this queue. - */ - -static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - enum wl_class_t wl_class = cfqq_class(cfqq); - struct cfq_rb_root *st = cfqq->service_tree; - - BUG_ON(!st); - BUG_ON(!st->count); - - if (!cfqd->cfq_slice_idle) - return false; - - /* We never do for idle class queues. */ - if (wl_class == IDLE_WORKLOAD) - return false; - - /* We do for queues that were marked with idle window flag. */ - if (cfq_cfqq_idle_window(cfqq) && - !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) - return true; - - /* - * Otherwise, we do only if they are the last ones - * in their service tree. - */ - if (st->count == 1 && cfq_cfqq_sync(cfqq) && - !cfq_io_thinktime_big(cfqd, &st->ttime, false)) - return true; - cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count); - return false; -} - -static void cfq_arm_slice_timer(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq = cfqd->active_queue; - struct cfq_rb_root *st = cfqq->service_tree; - struct cfq_io_cq *cic; - u64 sl, group_idle = 0; - u64 now = ktime_get_ns(); - - /* - * SSD device without seek penalty, disable idling. But only do so - * for devices that support queuing, otherwise we still have a problem - * with sync vs async workloads. - */ - if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag && - !cfqd->cfq_group_idle) - return; - - WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); - WARN_ON(cfq_cfqq_slice_new(cfqq)); - - /* - * idle is disabled, either manually or by past process history - */ - if (!cfq_should_idle(cfqd, cfqq)) { - /* no queue idling. Check for group idling */ - if (cfqd->cfq_group_idle) - group_idle = cfqd->cfq_group_idle; - else - return; - } - - /* - * still active requests from this queue, don't idle - */ - if (cfqq->dispatched) - return; - - /* - * task has exited, don't wait - */ - cic = cfqd->active_cic; - if (!cic || !atomic_read(&cic->icq.ioc->active_ref)) - return; - - /* - * If our average think time is larger than the remaining time - * slice, then don't idle. This avoids overrunning the allotted - * time slice. - */ - if (sample_valid(cic->ttime.ttime_samples) && - (cfqq->slice_end - now < cic->ttime.ttime_mean)) { - cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%llu", - cic->ttime.ttime_mean); - return; - } - - /* - * There are other queues in the group or this is the only group and - * it has too big thinktime, don't do group idle. - */ - if (group_idle && - (cfqq->cfqg->nr_cfqq > 1 || - cfq_io_thinktime_big(cfqd, &st->ttime, true))) - return; - - cfq_mark_cfqq_wait_request(cfqq); - - if (group_idle) - sl = cfqd->cfq_group_idle; - else - sl = cfqd->cfq_slice_idle; - - hrtimer_start(&cfqd->idle_slice_timer, ns_to_ktime(sl), - HRTIMER_MODE_REL); - cfqg_stats_set_start_idle_time(cfqq->cfqg); - cfq_log_cfqq(cfqd, cfqq, "arm_idle: %llu group_idle: %d", sl, - group_idle ? 1 : 0); -} - -/* - * Move request from internal lists to the request queue dispatch list. - */ -static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq = RQ_CFQQ(rq); - - cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); - - cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); - cfq_remove_request(rq); - cfqq->dispatched++; - (RQ_CFQG(rq))->dispatched++; - elv_dispatch_sort(q, rq); - - cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; - cfqq->nr_sectors += blk_rq_sectors(rq); -} - -/* - * return expired entry, or NULL to just start from scratch in rbtree - */ -static struct request *cfq_check_fifo(struct cfq_queue *cfqq) -{ - struct request *rq = NULL; - - if (cfq_cfqq_fifo_expire(cfqq)) - return NULL; - - cfq_mark_cfqq_fifo_expire(cfqq); - - if (list_empty(&cfqq->fifo)) - return NULL; - - rq = rq_entry_fifo(cfqq->fifo.next); - if (ktime_get_ns() < rq->fifo_time) - rq = NULL; - - return rq; -} - -static inline int -cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - const int base_rq = cfqd->cfq_slice_async_rq; - - WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); - - return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio); -} - -/* - * Must be called with the queue_lock held. - */ -static int cfqq_process_refs(struct cfq_queue *cfqq) -{ - int process_refs, io_refs; - - io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; - process_refs = cfqq->ref - io_refs; - BUG_ON(process_refs < 0); - return process_refs; -} - -static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) -{ - int process_refs, new_process_refs; - struct cfq_queue *__cfqq; - - /* - * If there are no process references on the new_cfqq, then it is - * unsafe to follow the ->new_cfqq chain as other cfqq's in the - * chain may have dropped their last reference (not just their - * last process reference). - */ - if (!cfqq_process_refs(new_cfqq)) - return; - - /* Avoid a circular list and skip interim queue merges */ - while ((__cfqq = new_cfqq->new_cfqq)) { - if (__cfqq == cfqq) - return; - new_cfqq = __cfqq; - } - - process_refs = cfqq_process_refs(cfqq); - new_process_refs = cfqq_process_refs(new_cfqq); - /* - * If the process for the cfqq has gone away, there is no - * sense in merging the queues. - */ - if (process_refs == 0 || new_process_refs == 0) - return; - - /* - * Merge in the direction of the lesser amount of work. - */ - if (new_process_refs >= process_refs) { - cfqq->new_cfqq = new_cfqq; - new_cfqq->ref += process_refs; - } else { - new_cfqq->new_cfqq = cfqq; - cfqq->ref += new_process_refs; - } -} - -static enum wl_type_t cfq_choose_wl_type(struct cfq_data *cfqd, - struct cfq_group *cfqg, enum wl_class_t wl_class) -{ - struct cfq_queue *queue; - int i; - bool key_valid = false; - u64 lowest_key = 0; - enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; - - for (i = 0; i <= SYNC_WORKLOAD; ++i) { - /* select the one with lowest rb_key */ - queue = cfq_rb_first(st_for(cfqg, wl_class, i)); - if (queue && - (!key_valid || queue->rb_key < lowest_key)) { - lowest_key = queue->rb_key; - cur_best = i; - key_valid = true; - } - } - - return cur_best; -} - -static void -choose_wl_class_and_type(struct cfq_data *cfqd, struct cfq_group *cfqg) -{ - u64 slice; - unsigned count; - struct cfq_rb_root *st; - u64 group_slice; - enum wl_class_t original_class = cfqd->serving_wl_class; - u64 now = ktime_get_ns(); - - /* Choose next priority. RT > BE > IDLE */ - if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) - cfqd->serving_wl_class = RT_WORKLOAD; - else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) - cfqd->serving_wl_class = BE_WORKLOAD; - else { - cfqd->serving_wl_class = IDLE_WORKLOAD; - cfqd->workload_expires = now + jiffies_to_nsecs(1); - return; - } - - if (original_class != cfqd->serving_wl_class) - goto new_workload; - - /* - * For RT and BE, we have to choose also the type - * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload - * expiration time - */ - st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); - count = st->count; - - /* - * check workload expiration, and that we still have other queues ready - */ - if (count && !(now > cfqd->workload_expires)) - return; - -new_workload: - /* otherwise select new workload type */ - cfqd->serving_wl_type = cfq_choose_wl_type(cfqd, cfqg, - cfqd->serving_wl_class); - st = st_for(cfqg, cfqd->serving_wl_class, cfqd->serving_wl_type); - count = st->count; - - /* - * the workload slice is computed as a fraction of target latency - * proportional to the number of queues in that workload, over - * all the queues in the same priority class - */ - group_slice = cfq_group_slice(cfqd, cfqg); - - slice = div_u64(group_slice * count, - max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_wl_class], - cfq_group_busy_queues_wl(cfqd->serving_wl_class, cfqd, - cfqg))); - - if (cfqd->serving_wl_type == ASYNC_WORKLOAD) { - u64 tmp; - - /* - * Async queues are currently system wide. Just taking - * proportion of queues with-in same group will lead to higher - * async ratio system wide as generally root group is going - * to have higher weight. A more accurate thing would be to - * calculate system wide asnc/sync ratio. - */ - tmp = cfqd->cfq_target_latency * - cfqg_busy_async_queues(cfqd, cfqg); - tmp = div_u64(tmp, cfqd->busy_queues); - slice = min_t(u64, slice, tmp); - - /* async workload slice is scaled down according to - * the sync/async slice ratio. */ - slice = div64_u64(slice*cfqd->cfq_slice[0], cfqd->cfq_slice[1]); - } else - /* sync workload slice is at least 2 * cfq_slice_idle */ - slice = max(slice, 2 * cfqd->cfq_slice_idle); - - slice = max_t(u64, slice, CFQ_MIN_TT); - cfq_log(cfqd, "workload slice:%llu", slice); - cfqd->workload_expires = now + slice; -} - -static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) -{ - struct cfq_rb_root *st = &cfqd->grp_service_tree; - struct cfq_group *cfqg; - - if (RB_EMPTY_ROOT(&st->rb.rb_root)) - return NULL; - cfqg = cfq_rb_first_group(st); - update_min_vdisktime(st); - return cfqg; -} - -static void cfq_choose_cfqg(struct cfq_data *cfqd) -{ - struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); - u64 now = ktime_get_ns(); - - cfqd->serving_group = cfqg; - - /* Restore the workload type data */ - if (cfqg->saved_wl_slice) { - cfqd->workload_expires = now + cfqg->saved_wl_slice; - cfqd->serving_wl_type = cfqg->saved_wl_type; - cfqd->serving_wl_class = cfqg->saved_wl_class; - } else - cfqd->workload_expires = now - 1; - - choose_wl_class_and_type(cfqd, cfqg); -} - -/* - * Select a queue for service. If we have a current active queue, - * check whether to continue servicing it, or retrieve and set a new one. - */ -static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq, *new_cfqq = NULL; - u64 now = ktime_get_ns(); - - cfqq = cfqd->active_queue; - if (!cfqq) - goto new_queue; - - if (!cfqd->rq_queued) - return NULL; - - /* - * We were waiting for group to get backlogged. Expire the queue - */ - if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) - goto expire; - - /* - * The active queue has run out of time, expire it and select new. - */ - if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { - /* - * If slice had not expired at the completion of last request - * we might not have turned on wait_busy flag. Don't expire - * the queue yet. Allow the group to get backlogged. - * - * The very fact that we have used the slice, that means we - * have been idling all along on this queue and it should be - * ok to wait for this request to complete. - */ - if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) - && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { - cfqq = NULL; - goto keep_queue; - } else - goto check_group_idle; - } - - /* - * The active queue has requests and isn't expired, allow it to - * dispatch. - */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - goto keep_queue; - - /* - * If another queue has a request waiting within our mean seek - * distance, let it run. The expire code will check for close - * cooperators and put the close queue at the front of the service - * tree. If possible, merge the expiring queue with the new cfqq. - */ - new_cfqq = cfq_close_cooperator(cfqd, cfqq); - if (new_cfqq) { - if (!cfqq->new_cfqq) - cfq_setup_merge(cfqq, new_cfqq); - goto expire; - } - - /* - * No requests pending. If the active queue still has requests in - * flight or is idling for a new request, allow either of these - * conditions to happen (or time out) before selecting a new queue. - */ - if (hrtimer_active(&cfqd->idle_slice_timer)) { - cfqq = NULL; - goto keep_queue; - } - - /* - * This is a deep seek queue, but the device is much faster than - * the queue can deliver, don't idle - **/ - if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) && - (cfq_cfqq_slice_new(cfqq) || - (cfqq->slice_end - now > now - cfqq->slice_start))) { - cfq_clear_cfqq_deep(cfqq); - cfq_clear_cfqq_idle_window(cfqq); - } - - if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { - cfqq = NULL; - goto keep_queue; - } - - /* - * If group idle is enabled and there are requests dispatched from - * this group, wait for requests to complete. - */ -check_group_idle: - if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 && - cfqq->cfqg->dispatched && - !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) { - cfqq = NULL; - goto keep_queue; - } - -expire: - cfq_slice_expired(cfqd, 0); -new_queue: - /* - * Current queue expired. Check if we have to switch to a new - * service tree - */ - if (!new_cfqq) - cfq_choose_cfqg(cfqd); - - cfqq = cfq_set_active_queue(cfqd, new_cfqq); -keep_queue: - return cfqq; -} - -static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) -{ - int dispatched = 0; - - while (cfqq->next_rq) { - cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); - dispatched++; - } - - BUG_ON(!list_empty(&cfqq->fifo)); - - /* By default cfqq is not expired if it is empty. Do it explicitly */ - __cfq_slice_expired(cfqq->cfqd, cfqq, 0); - return dispatched; -} - -/* - * Drain our current requests. Used for barriers and when switching - * io schedulers on-the-fly. - */ -static int cfq_forced_dispatch(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq; - int dispatched = 0; - - /* Expire the timeslice of the current active queue first */ - cfq_slice_expired(cfqd, 0); - while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) { - __cfq_set_active_queue(cfqd, cfqq); - dispatched += __cfq_forced_dispatch_cfqq(cfqq); - } - - BUG_ON(cfqd->busy_queues); - - cfq_log(cfqd, "forced_dispatch=%d", dispatched); - return dispatched; -} - -static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, - struct cfq_queue *cfqq) -{ - u64 now = ktime_get_ns(); - - /* the queue hasn't finished any request, can't estimate */ - if (cfq_cfqq_slice_new(cfqq)) - return true; - if (now + cfqd->cfq_slice_idle * cfqq->dispatched > cfqq->slice_end) - return true; - - return false; -} - -static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - unsigned int max_dispatch; - - if (cfq_cfqq_must_dispatch(cfqq)) - return true; - - /* - * Drain async requests before we start sync IO - */ - if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) - return false; - - /* - * If this is an async queue and we have sync IO in flight, let it wait - */ - if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) - return false; - - max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); - if (cfq_class_idle(cfqq)) - max_dispatch = 1; - - /* - * Does this cfqq already have too much IO in flight? - */ - if (cfqq->dispatched >= max_dispatch) { - bool promote_sync = false; - /* - * idle queue must always only have a single IO in flight - */ - if (cfq_class_idle(cfqq)) - return false; - - /* - * If there is only one sync queue - * we can ignore async queue here and give the sync - * queue no dispatch limit. The reason is a sync queue can - * preempt async queue, limiting the sync queue doesn't make - * sense. This is useful for aiostress test. - */ - if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1) - promote_sync = true; - - /* - * We have other queues, don't allow more IO from this one - */ - if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) && - !promote_sync) - return false; - - /* - * Sole queue user, no limit - */ - if (cfqd->busy_queues == 1 || promote_sync) - max_dispatch = -1; - else - /* - * Normally we start throttling cfqq when cfq_quantum/2 - * requests have been dispatched. But we can drive - * deeper queue depths at the beginning of slice - * subjected to upper limit of cfq_quantum. - * */ - max_dispatch = cfqd->cfq_quantum; - } - - /* - * Async queues must wait a bit before being allowed dispatch. - * We also ramp up the dispatch depth gradually for async IO, - * based on the last sync IO we serviced - */ - if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { - u64 last_sync = ktime_get_ns() - cfqd->last_delayed_sync; - unsigned int depth; - - depth = div64_u64(last_sync, cfqd->cfq_slice[1]); - if (!depth && !cfqq->dispatched) - depth = 1; - if (depth < max_dispatch) - max_dispatch = depth; - } - - /* - * If we're below the current max, allow a dispatch - */ - return cfqq->dispatched < max_dispatch; -} - -/* - * Dispatch a request from cfqq, moving them to the request queue - * dispatch list. - */ -static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - struct request *rq; - - BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); - - rq = cfq_check_fifo(cfqq); - if (rq) - cfq_mark_cfqq_must_dispatch(cfqq); - - if (!cfq_may_dispatch(cfqd, cfqq)) - return false; - - /* - * follow expired path, else get first next available - */ - if (!rq) - rq = cfqq->next_rq; - else - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); - - /* - * insert request into driver dispatch list - */ - cfq_dispatch_insert(cfqd->queue, rq); - - if (!cfqd->active_cic) { - struct cfq_io_cq *cic = RQ_CIC(rq); - - atomic_long_inc(&cic->icq.ioc->refcount); - cfqd->active_cic = cic; - } - - return true; -} - -/* - * Find the cfqq that we need to service and move a request from that to the - * dispatch list - */ -static int cfq_dispatch_requests(struct request_queue *q, int force) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq; - - if (!cfqd->busy_queues) - return 0; - - if (unlikely(force)) - return cfq_forced_dispatch(cfqd); - - cfqq = cfq_select_queue(cfqd); - if (!cfqq) - return 0; - - /* - * Dispatch a request from this cfqq, if it is allowed - */ - if (!cfq_dispatch_request(cfqd, cfqq)) - return 0; - - cfqq->slice_dispatch++; - cfq_clear_cfqq_must_dispatch(cfqq); - - /* - * expire an async queue immediately if it has used up its slice. idle - * queue always expire after 1 dispatch round. - */ - if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && - cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || - cfq_class_idle(cfqq))) { - cfqq->slice_end = ktime_get_ns() + 1; - cfq_slice_expired(cfqd, 0); - } - - cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); - return 1; -} - -/* - * task holds one reference to the queue, dropped when task exits. each rq - * in-flight on this queue also holds a reference, dropped when rq is freed. - * - * Each cfq queue took a reference on the parent group. Drop it now. - * queue lock must be held here. - */ -static void cfq_put_queue(struct cfq_queue *cfqq) -{ - struct cfq_data *cfqd = cfqq->cfqd; - struct cfq_group *cfqg; - - BUG_ON(cfqq->ref <= 0); - - cfqq->ref--; - if (cfqq->ref) - return; - - cfq_log_cfqq(cfqd, cfqq, "put_queue"); - BUG_ON(rb_first(&cfqq->sort_list)); - BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); - cfqg = cfqq->cfqg; - - if (unlikely(cfqd->active_queue == cfqq)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); - } - - BUG_ON(cfq_cfqq_on_rr(cfqq)); - kmem_cache_free(cfq_pool, cfqq); - cfqg_put(cfqg); -} - -static void cfq_put_cooperator(struct cfq_queue *cfqq) -{ - struct cfq_queue *__cfqq, *next; - - /* - * If this queue was scheduled to merge with another queue, be - * sure to drop the reference taken on that queue (and others in - * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. - */ - __cfqq = cfqq->new_cfqq; - while (__cfqq) { - if (__cfqq == cfqq) { - WARN(1, "cfqq->new_cfqq loop detected\n"); - break; - } - next = __cfqq->new_cfqq; - cfq_put_queue(__cfqq); - __cfqq = next; - } -} - -static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - if (unlikely(cfqq == cfqd->active_queue)) { - __cfq_slice_expired(cfqd, cfqq, 0); - cfq_schedule_dispatch(cfqd); - } - - cfq_put_cooperator(cfqq); - - cfq_put_queue(cfqq); -} - -static void cfq_init_icq(struct io_cq *icq) -{ - struct cfq_io_cq *cic = icq_to_cic(icq); - - cic->ttime.last_end_request = ktime_get_ns(); -} - -static void cfq_exit_icq(struct io_cq *icq) -{ - struct cfq_io_cq *cic = icq_to_cic(icq); - struct cfq_data *cfqd = cic_to_cfqd(cic); - - if (cic_to_cfqq(cic, false)) { - cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, false)); - cic_set_cfqq(cic, NULL, false); - } - - if (cic_to_cfqq(cic, true)) { - cfq_exit_cfqq(cfqd, cic_to_cfqq(cic, true)); - cic_set_cfqq(cic, NULL, true); - } -} - -static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic) -{ - struct task_struct *tsk = current; - int ioprio_class; - - if (!cfq_cfqq_prio_changed(cfqq)) - return; - - ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); - switch (ioprio_class) { - default: - printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); - /* fall through */ - case IOPRIO_CLASS_NONE: - /* - * no prio set, inherit CPU scheduling settings - */ - cfqq->ioprio = task_nice_ioprio(tsk); - cfqq->ioprio_class = task_nice_ioclass(tsk); - break; - case IOPRIO_CLASS_RT: - cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - cfqq->ioprio_class = IOPRIO_CLASS_RT; - break; - case IOPRIO_CLASS_BE: - cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - cfqq->ioprio_class = IOPRIO_CLASS_BE; - break; - case IOPRIO_CLASS_IDLE: - cfqq->ioprio_class = IOPRIO_CLASS_IDLE; - cfqq->ioprio = 7; - cfq_clear_cfqq_idle_window(cfqq); - break; - } - - /* - * keep track of original prio settings in case we have to temporarily - * elevate the priority of this queue - */ - cfqq->org_ioprio = cfqq->ioprio; - cfqq->org_ioprio_class = cfqq->ioprio_class; - cfq_clear_cfqq_prio_changed(cfqq); -} - -static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio) -{ - int ioprio = cic->icq.ioc->ioprio; - struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *cfqq; - - /* - * Check whether ioprio has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. - */ - if (unlikely(!cfqd) || likely(cic->ioprio == ioprio)) - return; - - cfqq = cic_to_cfqq(cic, false); - if (cfqq) { - cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio); - cic_set_cfqq(cic, cfqq, false); - } - - cfqq = cic_to_cfqq(cic, true); - if (cfqq) - cfq_mark_cfqq_prio_changed(cfqq); - - cic->ioprio = ioprio; -} - -static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, - pid_t pid, bool is_sync) -{ - RB_CLEAR_NODE(&cfqq->rb_node); - RB_CLEAR_NODE(&cfqq->p_node); - INIT_LIST_HEAD(&cfqq->fifo); - - cfqq->ref = 0; - cfqq->cfqd = cfqd; - - cfq_mark_cfqq_prio_changed(cfqq); - - if (is_sync) { - if (!cfq_class_idle(cfqq)) - cfq_mark_cfqq_idle_window(cfqq); - cfq_mark_cfqq_sync(cfqq); - } - cfqq->pid = pid; -} - -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) -{ - struct cfq_data *cfqd = cic_to_cfqd(cic); - struct cfq_queue *cfqq; - uint64_t serial_nr; - - rcu_read_lock(); - serial_nr = bio_blkcg(bio)->css.serial_nr; - rcu_read_unlock(); - - /* - * Check whether blkcg has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. - */ - if (unlikely(!cfqd) || likely(cic->blkcg_serial_nr == serial_nr)) - return; - - /* - * Drop reference to queues. New queues will be assigned in new - * group upon arrival of fresh requests. - */ - cfqq = cic_to_cfqq(cic, false); - if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, false); - cfq_put_queue(cfqq); - } - - cfqq = cic_to_cfqq(cic, true); - if (cfqq) { - cfq_log_cfqq(cfqd, cfqq, "changed cgroup"); - cic_set_cfqq(cic, NULL, true); - cfq_put_queue(cfqq); - } - - cic->blkcg_serial_nr = serial_nr; -} -#else -static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) -{ -} -#endif /* CONFIG_CFQ_GROUP_IOSCHED */ - -static struct cfq_queue ** -cfq_async_queue_prio(struct cfq_group *cfqg, int ioprio_class, int ioprio) -{ - switch (ioprio_class) { - case IOPRIO_CLASS_RT: - return &cfqg->async_cfqq[0][ioprio]; - case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; - /* fall through */ - case IOPRIO_CLASS_BE: - return &cfqg->async_cfqq[1][ioprio]; - case IOPRIO_CLASS_IDLE: - return &cfqg->async_idle_cfqq; - default: - BUG(); - } -} - -static struct cfq_queue * -cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic, - struct bio *bio) -{ - int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio); - int ioprio = IOPRIO_PRIO_DATA(cic->ioprio); - struct cfq_queue **async_cfqq = NULL; - struct cfq_queue *cfqq; - struct cfq_group *cfqg; - - rcu_read_lock(); - cfqg = cfq_lookup_cfqg(cfqd, bio_blkcg(bio)); - if (!cfqg) { - cfqq = &cfqd->oom_cfqq; - goto out; - } - - if (!is_sync) { - if (!ioprio_valid(cic->ioprio)) { - struct task_struct *tsk = current; - ioprio = task_nice_ioprio(tsk); - ioprio_class = task_nice_ioclass(tsk); - } - async_cfqq = cfq_async_queue_prio(cfqg, ioprio_class, ioprio); - cfqq = *async_cfqq; - if (cfqq) - goto out; - } - - cfqq = kmem_cache_alloc_node(cfq_pool, - GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, - cfqd->queue->node); - if (!cfqq) { - cfqq = &cfqd->oom_cfqq; - goto out; - } - - /* cfq_init_cfqq() assumes cfqq->ioprio_class is initialized. */ - cfqq->ioprio_class = IOPRIO_CLASS_NONE; - cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); - cfq_init_prio_data(cfqq, cic); - cfq_link_cfqq_cfqg(cfqq, cfqg); - cfq_log_cfqq(cfqd, cfqq, "alloced"); - - if (async_cfqq) { - /* a new async queue is created, pin and remember */ - cfqq->ref++; - *async_cfqq = cfqq; - } -out: - cfqq->ref++; - rcu_read_unlock(); - return cfqq; -} - -static void -__cfq_update_io_thinktime(struct cfq_ttime *ttime, u64 slice_idle) -{ - u64 elapsed = ktime_get_ns() - ttime->last_end_request; - elapsed = min(elapsed, 2UL * slice_idle); - - ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; - ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); - ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, - ttime->ttime_samples); -} - -static void -cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_io_cq *cic) -{ - if (cfq_cfqq_sync(cfqq)) { - __cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle); - __cfq_update_io_thinktime(&cfqq->service_tree->ttime, - cfqd->cfq_slice_idle); - } -#ifdef CONFIG_CFQ_GROUP_IOSCHED - __cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle); -#endif -} - -static void -cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq) -{ - sector_t sdist = 0; - sector_t n_sec = blk_rq_sectors(rq); - if (cfqq->last_request_pos) { - if (cfqq->last_request_pos < blk_rq_pos(rq)) - sdist = blk_rq_pos(rq) - cfqq->last_request_pos; - else - sdist = cfqq->last_request_pos - blk_rq_pos(rq); - } - - cfqq->seek_history <<= 1; - if (blk_queue_nonrot(cfqd->queue)) - cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); - else - cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); -} - -static inline bool req_noidle(struct request *req) -{ - return req_op(req) == REQ_OP_WRITE && - (req->cmd_flags & (REQ_SYNC | REQ_IDLE)) == REQ_SYNC; -} - -/* - * Disable idle window if the process thinks too long or seeks so much that - * it doesn't matter - */ -static void -cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct cfq_io_cq *cic) -{ - int old_idle, enable_idle; - - /* - * Don't idle for async or idle io prio class - */ - if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) - return; - - enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); - - if (cfqq->queued[0] + cfqq->queued[1] >= 4) - cfq_mark_cfqq_deep(cfqq); - - if (cfqq->next_rq && req_noidle(cfqq->next_rq)) - enable_idle = 0; - else if (!atomic_read(&cic->icq.ioc->active_ref) || - !cfqd->cfq_slice_idle || - (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) - enable_idle = 0; - else if (sample_valid(cic->ttime.ttime_samples)) { - if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle) - enable_idle = 0; - else - enable_idle = 1; - } - - if (old_idle != enable_idle) { - cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); - if (enable_idle) - cfq_mark_cfqq_idle_window(cfqq); - else - cfq_clear_cfqq_idle_window(cfqq); - } -} - -/* - * Check if new_cfqq should preempt the currently active queue. Return 0 for - * no or if we aren't sure, a 1 will cause a preempt. - */ -static bool -cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, - struct request *rq) -{ - struct cfq_queue *cfqq; - - cfqq = cfqd->active_queue; - if (!cfqq) - return false; - - if (cfq_class_idle(new_cfqq)) - return false; - - if (cfq_class_idle(cfqq)) - return true; - - /* - * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. - */ - if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) - return false; - - /* - * if the new request is sync, but the currently running queue is - * not, let the sync request have priority. - */ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) - return true; - - /* - * Treat ancestors of current cgroup the same way as current cgroup. - * For anybody else we disallow preemption to guarantee service - * fairness among cgroups. - */ - if (!cfqg_is_descendant(cfqq->cfqg, new_cfqq->cfqg)) - return false; - - if (cfq_slice_used(cfqq)) - return true; - - /* - * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. - */ - if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) - return true; - - WARN_ON_ONCE(cfqq->ioprio_class != new_cfqq->ioprio_class); - /* Allow preemption only if we are idling on sync-noidle tree */ - if (cfqd->serving_wl_type == SYNC_NOIDLE_WORKLOAD && - cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && - RB_EMPTY_ROOT(&cfqq->sort_list)) - return true; - - /* - * So both queues are sync. Let the new request get disk time if - * it's a metadata request and the current queue is doing regular IO. - */ - if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending) - return true; - - /* An idle queue should not be idle now for some reason */ - if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq)) - return true; - - if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) - return false; - - /* - * if this request is as-good as one we would expect from the - * current cfqq, let it preempt - */ - if (cfq_rq_close(cfqd, cfqq, rq)) - return true; - - return false; -} - -/* - * cfqq preempts the active queue. if we allowed preempt with no slice left, - * let it have half of its nominal slice. - */ -static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - enum wl_type_t old_type = cfqq_type(cfqd->active_queue); - - cfq_log_cfqq(cfqd, cfqq, "preempt"); - cfq_slice_expired(cfqd, 1); - - /* - * workload type is changed, don't save slice, otherwise preempt - * doesn't happen - */ - if (old_type != cfqq_type(cfqq)) - cfqq->cfqg->saved_wl_slice = 0; - - /* - * Put the new queue at the front of the of the current list, - * so we know that it will be selected next. - */ - BUG_ON(!cfq_cfqq_on_rr(cfqq)); - - cfq_service_tree_add(cfqd, cfqq, 1); - - cfqq->slice_end = 0; - cfq_mark_cfqq_slice_new(cfqq); -} - -/* - * Called when a new fs request (rq) is added (to cfqq). Check if there's - * something we should do about it - */ -static void -cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, - struct request *rq) -{ - struct cfq_io_cq *cic = RQ_CIC(rq); - - cfqd->rq_queued++; - if (rq->cmd_flags & REQ_PRIO) - cfqq->prio_pending++; - - cfq_update_io_thinktime(cfqd, cfqq, cic); - cfq_update_io_seektime(cfqd, cfqq, rq); - cfq_update_idle_window(cfqd, cfqq, cic); - - cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - - if (cfqq == cfqd->active_queue) { - /* - * Remember that we saw a request from this process, but - * don't start queuing just yet. Otherwise we risk seeing lots - * of tiny requests, because we disrupt the normal plugging - * and merging. If the request is already larger than a single - * page, let it rip immediately. For that case we assume that - * merging is already done. Ditto for a busy system that - * has other work pending, don't risk delaying until the - * idle timer unplug to continue working. - */ - if (cfq_cfqq_wait_request(cfqq)) { - if (blk_rq_bytes(rq) > PAGE_SIZE || - cfqd->busy_queues > 1) { - cfq_del_timer(cfqd, cfqq); - cfq_clear_cfqq_wait_request(cfqq); - __blk_run_queue(cfqd->queue); - } else { - cfqg_stats_update_idle_time(cfqq->cfqg); - cfq_mark_cfqq_must_dispatch(cfqq); - } - } - } else if (cfq_should_preempt(cfqd, cfqq, rq)) { - /* - * not the active queue - expire current slice if it is - * idle and has expired it's mean thinktime or this new queue - * has some old slice time left and is of higher priority or - * this new queue is RT and the current one is BE - */ - cfq_preempt_queue(cfqd, cfqq); - __blk_run_queue(cfqd->queue); - } -} - -static void cfq_insert_request(struct request_queue *q, struct request *rq) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_queue *cfqq = RQ_CFQQ(rq); - - cfq_log_cfqq(cfqd, cfqq, "insert_request"); - cfq_init_prio_data(cfqq, RQ_CIC(rq)); - - rq->fifo_time = ktime_get_ns() + cfqd->cfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &cfqq->fifo); - cfq_add_rq_rb(rq); - cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group, - rq->cmd_flags); - cfq_rq_enqueued(cfqd, cfqq, rq); -} - -/* - * Update hw_tag based on peak queue depth over 50 samples under - * sufficient load. - */ -static void cfq_update_hw_tag(struct cfq_data *cfqd) -{ - struct cfq_queue *cfqq = cfqd->active_queue; - - if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) - cfqd->hw_tag_est_depth = cfqd->rq_in_driver; - - if (cfqd->hw_tag == 1) - return; - - if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && - cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) - return; - - /* - * If active queue hasn't enough requests and can idle, cfq might not - * dispatch sufficient requests to hardware. Don't zero hw_tag in this - * case - */ - if (cfqq && cfq_cfqq_idle_window(cfqq) && - cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < - CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) - return; - - if (cfqd->hw_tag_samples++ < 50) - return; - - if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) - cfqd->hw_tag = 1; - else - cfqd->hw_tag = 0; -} - -static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) -{ - struct cfq_io_cq *cic = cfqd->active_cic; - u64 now = ktime_get_ns(); - - /* If the queue already has requests, don't wait */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - return false; - - /* If there are other queues in the group, don't wait */ - if (cfqq->cfqg->nr_cfqq > 1) - return false; - - /* the only queue in the group, but think time is big */ - if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) - return false; - - if (cfq_slice_used(cfqq)) - return true; - - /* if slice left is less than think time, wait busy */ - if (cic && sample_valid(cic->ttime.ttime_samples) - && (cfqq->slice_end - now < cic->ttime.ttime_mean)) - return true; - - /* - * If think times is less than a jiffy than ttime_mean=0 and above - * will not be true. It might happen that slice has not expired yet - * but will expire soon (4-5 ns) during select_queue(). To cover the - * case where think time is less than a jiffy, mark the queue wait - * busy if only 1 jiffy is left in the slice. - */ - if (cfqq->slice_end - now <= jiffies_to_nsecs(1)) - return true; - - return false; -} - -static void cfq_completed_request(struct request_queue *q, struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - struct cfq_data *cfqd = cfqq->cfqd; - const int sync = rq_is_sync(rq); - u64 now = ktime_get_ns(); - - cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", req_noidle(rq)); - - cfq_update_hw_tag(cfqd); - - WARN_ON(!cfqd->rq_in_driver); - WARN_ON(!cfqq->dispatched); - cfqd->rq_in_driver--; - cfqq->dispatched--; - (RQ_CFQG(rq))->dispatched--; - cfqg_stats_update_completion(cfqq->cfqg, rq->start_time_ns, - rq->io_start_time_ns, rq->cmd_flags); - - cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; - - if (sync) { - struct cfq_rb_root *st; - - RQ_CIC(rq)->ttime.last_end_request = now; - - if (cfq_cfqq_on_rr(cfqq)) - st = cfqq->service_tree; - else - st = st_for(cfqq->cfqg, cfqq_class(cfqq), - cfqq_type(cfqq)); - - st->ttime.last_end_request = now; - if (rq->start_time_ns + cfqd->cfq_fifo_expire[1] <= now) - cfqd->last_delayed_sync = now; - } - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - cfqq->cfqg->ttime.last_end_request = now; -#endif - - /* - * If this is the active queue, check if it needs to be expired, - * or if we want to idle in case it has no pending requests. - */ - if (cfqd->active_queue == cfqq) { - const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); - - if (cfq_cfqq_slice_new(cfqq)) { - cfq_set_prio_slice(cfqd, cfqq); - cfq_clear_cfqq_slice_new(cfqq); - } - - /* - * Should we wait for next request to come in before we expire - * the queue. - */ - if (cfq_should_wait_busy(cfqd, cfqq)) { - u64 extend_sl = cfqd->cfq_slice_idle; - if (!cfqd->cfq_slice_idle) - extend_sl = cfqd->cfq_group_idle; - cfqq->slice_end = now + extend_sl; - cfq_mark_cfqq_wait_busy(cfqq); - cfq_log_cfqq(cfqd, cfqq, "will busy wait"); - } - - /* - * Idling is not enabled on: - * - expired queues - * - idle-priority queues - * - async queues - * - queues with still some requests queued - * - when there is a close cooperator - */ - if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) - cfq_slice_expired(cfqd, 1); - else if (sync && cfqq_empty && - !cfq_close_cooperator(cfqd, cfqq)) { - cfq_arm_slice_timer(cfqd); - } - } - - if (!cfqd->rq_in_driver) - cfq_schedule_dispatch(cfqd); -} - -static void cfqq_boost_on_prio(struct cfq_queue *cfqq, unsigned int op) -{ - /* - * If REQ_PRIO is set, boost class and prio level, if it's below - * BE/NORM. If prio is not set, restore the potentially boosted - * class/prio level. - */ - if (!(op & REQ_PRIO)) { - cfqq->ioprio_class = cfqq->org_ioprio_class; - cfqq->ioprio = cfqq->org_ioprio; - } else { - if (cfq_class_idle(cfqq)) - cfqq->ioprio_class = IOPRIO_CLASS_BE; - if (cfqq->ioprio > IOPRIO_NORM) - cfqq->ioprio = IOPRIO_NORM; - } -} - -static inline int __cfq_may_queue(struct cfq_queue *cfqq) -{ - if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { - cfq_mark_cfqq_must_alloc_slice(cfqq); - return ELV_MQUEUE_MUST; - } - - return ELV_MQUEUE_MAY; -} - -static int cfq_may_queue(struct request_queue *q, unsigned int op) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; - struct cfq_io_cq *cic; - struct cfq_queue *cfqq; - - /* - * don't force setup of a queue from here, as a call to may_queue - * does not necessarily imply that a request actually will be queued. - * so just lookup a possibly existing queue, or return 'may queue' - * if that fails - */ - cic = cfq_cic_lookup(cfqd, tsk->io_context); - if (!cic) - return ELV_MQUEUE_MAY; - - cfqq = cic_to_cfqq(cic, op_is_sync(op)); - if (cfqq) { - cfq_init_prio_data(cfqq, cic); - cfqq_boost_on_prio(cfqq, op); - - return __cfq_may_queue(cfqq); - } - - return ELV_MQUEUE_MAY; -} - -/* - * queue lock held here - */ -static void cfq_put_request(struct request *rq) -{ - struct cfq_queue *cfqq = RQ_CFQQ(rq); - - if (cfqq) { - const int rw = rq_data_dir(rq); - - BUG_ON(!cfqq->allocated[rw]); - cfqq->allocated[rw]--; - - /* Put down rq reference on cfqg */ - cfqg_put(RQ_CFQG(rq)); - rq->elv.priv[0] = NULL; - rq->elv.priv[1] = NULL; - - cfq_put_queue(cfqq); - } -} - -static struct cfq_queue * -cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic, - struct cfq_queue *cfqq) -{ - cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); - cic_set_cfqq(cic, cfqq->new_cfqq, 1); - cfq_mark_cfqq_coop(cfqq->new_cfqq); - cfq_put_queue(cfqq); - return cic_to_cfqq(cic, 1); -} - -/* - * Returns NULL if a new cfqq should be allocated, or the old cfqq if this - * was the last process referring to said cfqq. - */ -static struct cfq_queue * -split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq) -{ - if (cfqq_process_refs(cfqq) == 1) { - cfqq->pid = current->pid; - cfq_clear_cfqq_coop(cfqq); - cfq_clear_cfqq_split_coop(cfqq); - return cfqq; - } - - cic_set_cfqq(cic, NULL, 1); - - cfq_put_cooperator(cfqq); - - cfq_put_queue(cfqq); - return NULL; -} -/* - * Allocate cfq data structures associated with this request. - */ -static int -cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio, - gfp_t gfp_mask) -{ - struct cfq_data *cfqd = q->elevator->elevator_data; - struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq); - const int rw = rq_data_dir(rq); - const bool is_sync = rq_is_sync(rq); - struct cfq_queue *cfqq; - - spin_lock_irq(q->queue_lock); - - check_ioprio_changed(cic, bio); - check_blkcg_changed(cic, bio); -new_queue: - cfqq = cic_to_cfqq(cic, is_sync); - if (!cfqq || cfqq == &cfqd->oom_cfqq) { - if (cfqq) - cfq_put_queue(cfqq); - cfqq = cfq_get_queue(cfqd, is_sync, cic, bio); - cic_set_cfqq(cic, cfqq, is_sync); - } else { - /* - * If the queue was seeky for too long, break it apart. - */ - if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { - cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); - cfqq = split_cfqq(cic, cfqq); - if (!cfqq) - goto new_queue; - } - - /* - * Check to see if this queue is scheduled to merge with - * another, closely cooperating queue. The merging of - * queues happens here as it must be done in process context. - * The reference on new_cfqq was taken in merge_cfqqs. - */ - if (cfqq->new_cfqq) - cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); - } - - cfqq->allocated[rw]++; - - cfqq->ref++; - cfqg_get(cfqq->cfqg); - rq->elv.priv[0] = cfqq; - rq->elv.priv[1] = cfqq->cfqg; - spin_unlock_irq(q->queue_lock); - - return 0; -} - -static void cfq_kick_queue(struct work_struct *work) -{ - struct cfq_data *cfqd = - container_of(work, struct cfq_data, unplug_work); - struct request_queue *q = cfqd->queue; - - spin_lock_irq(q->queue_lock); - __blk_run_queue(cfqd->queue); - spin_unlock_irq(q->queue_lock); -} - -/* - * Timer running if the active_queue is currently idling inside its time slice - */ -static enum hrtimer_restart cfq_idle_slice_timer(struct hrtimer *timer) -{ - struct cfq_data *cfqd = container_of(timer, struct cfq_data, - idle_slice_timer); - struct cfq_queue *cfqq; - unsigned long flags; - int timed_out = 1; - - cfq_log(cfqd, "idle timer fired"); - - spin_lock_irqsave(cfqd->queue->queue_lock, flags); - - cfqq = cfqd->active_queue; - if (cfqq) { - timed_out = 0; - - /* - * We saw a request before the queue expired, let it through - */ - if (cfq_cfqq_must_dispatch(cfqq)) - goto out_kick; - - /* - * expired - */ - if (cfq_slice_used(cfqq)) - goto expire; - - /* - * only expire and reinvoke request handler, if there are - * other queues with pending requests - */ - if (!cfqd->busy_queues) - goto out_cont; - - /* - * not expired and it has a request pending, let it dispatch - */ - if (!RB_EMPTY_ROOT(&cfqq->sort_list)) - goto out_kick; - - /* - * Queue depth flag is reset only when the idle didn't succeed - */ - cfq_clear_cfqq_deep(cfqq); - } -expire: - cfq_slice_expired(cfqd, timed_out); -out_kick: - cfq_schedule_dispatch(cfqd); -out_cont: - spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); - return HRTIMER_NORESTART; -} - -static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) -{ - hrtimer_cancel(&cfqd->idle_slice_timer); - cancel_work_sync(&cfqd->unplug_work); -} - -static void cfq_exit_queue(struct elevator_queue *e) -{ - struct cfq_data *cfqd = e->elevator_data; - struct request_queue *q = cfqd->queue; - - cfq_shutdown_timer_wq(cfqd); - - spin_lock_irq(q->queue_lock); - - if (cfqd->active_queue) - __cfq_slice_expired(cfqd, cfqd->active_queue, 0); - - spin_unlock_irq(q->queue_lock); - - cfq_shutdown_timer_wq(cfqd); - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - blkcg_deactivate_policy(q, &blkcg_policy_cfq); -#else - kfree(cfqd->root_group); -#endif - kfree(cfqd); -} - -static int cfq_init_queue(struct request_queue *q, struct elevator_type *e) -{ - struct cfq_data *cfqd; - struct blkcg_gq *blkg __maybe_unused; - int i, ret; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - - cfqd = kzalloc_node(sizeof(*cfqd), GFP_KERNEL, q->node); - if (!cfqd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } - eq->elevator_data = cfqd; - - cfqd->queue = q; - spin_lock_irq(q->queue_lock); - q->elevator = eq; - spin_unlock_irq(q->queue_lock); - - /* Init root service tree */ - cfqd->grp_service_tree = CFQ_RB_ROOT; - - /* Init root group and prefer root group over other groups by default */ -#ifdef CONFIG_CFQ_GROUP_IOSCHED - ret = blkcg_activate_policy(q, &blkcg_policy_cfq); - if (ret) - goto out_free; - - cfqd->root_group = blkg_to_cfqg(q->root_blkg); -#else - ret = -ENOMEM; - cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group), - GFP_KERNEL, cfqd->queue->node); - if (!cfqd->root_group) - goto out_free; - - cfq_init_cfqg_base(cfqd->root_group); - cfqd->root_group->weight = 2 * CFQ_WEIGHT_LEGACY_DFL; - cfqd->root_group->leaf_weight = 2 * CFQ_WEIGHT_LEGACY_DFL; -#endif - - /* - * Not strictly needed (since RB_ROOT just clears the node and we - * zeroed cfqd on alloc), but better be safe in case someone decides - * to add magic to the rb code - */ - for (i = 0; i < CFQ_PRIO_LISTS; i++) - cfqd->prio_trees[i] = RB_ROOT; - - /* - * Our fallback cfqq if cfq_get_queue() runs into OOM issues. - * Grab a permanent reference to it, so that the normal code flow - * will not attempt to free it. oom_cfqq is linked to root_group - * but shouldn't hold a reference as it'll never be unlinked. Lose - * the reference from linking right away. - */ - cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); - cfqd->oom_cfqq.ref++; - - spin_lock_irq(q->queue_lock); - cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group); - cfqg_put(cfqd->root_group); - spin_unlock_irq(q->queue_lock); - - hrtimer_init(&cfqd->idle_slice_timer, CLOCK_MONOTONIC, - HRTIMER_MODE_REL); - cfqd->idle_slice_timer.function = cfq_idle_slice_timer; - - INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); - - cfqd->cfq_quantum = cfq_quantum; - cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; - cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; - cfqd->cfq_back_max = cfq_back_max; - cfqd->cfq_back_penalty = cfq_back_penalty; - cfqd->cfq_slice[0] = cfq_slice_async; - cfqd->cfq_slice[1] = cfq_slice_sync; - cfqd->cfq_target_latency = cfq_target_latency; - cfqd->cfq_slice_async_rq = cfq_slice_async_rq; - cfqd->cfq_slice_idle = cfq_slice_idle; - cfqd->cfq_group_idle = cfq_group_idle; - cfqd->cfq_latency = 1; - cfqd->hw_tag = -1; - /* - * we optimistically start assuming sync ops weren't delayed in last - * second, in order to have larger depth for async operations. - */ - cfqd->last_delayed_sync = ktime_get_ns() - NSEC_PER_SEC; - return 0; - -out_free: - kfree(cfqd); - kobject_put(&eq->kobj); - return ret; -} - -static void cfq_registered_queue(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - struct cfq_data *cfqd = e->elevator_data; - - /* - * Default to IOPS mode with no idling for SSDs - */ - if (blk_queue_nonrot(q)) - cfqd->cfq_slice_idle = 0; - wbt_disable_default(q); -} - -/* - * sysfs parts below --> - */ -static ssize_t -cfq_var_show(unsigned int var, char *page) -{ - return sprintf(page, "%u\n", var); -} - -static void -cfq_var_store(unsigned int *var, const char *page) -{ - char *p = (char *) page; - - *var = simple_strtoul(p, &p, 10); -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct cfq_data *cfqd = e->elevator_data; \ - u64 __data = __VAR; \ - if (__CONV) \ - __data = div_u64(__data, NSEC_PER_MSEC); \ - return cfq_var_show(__data, (page)); \ -} -SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); -SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); -SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); -SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); -SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); -SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); -SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1); -SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); -SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); -SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); -SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); -SHOW_FUNCTION(cfq_target_latency_show, cfqd->cfq_target_latency, 1); -#undef SHOW_FUNCTION - -#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct cfq_data *cfqd = e->elevator_data; \ - u64 __data = __VAR; \ - __data = div_u64(__data, NSEC_PER_USEC); \ - return cfq_var_show(__data, (page)); \ -} -USEC_SHOW_FUNCTION(cfq_slice_idle_us_show, cfqd->cfq_slice_idle); -USEC_SHOW_FUNCTION(cfq_group_idle_us_show, cfqd->cfq_group_idle); -USEC_SHOW_FUNCTION(cfq_slice_sync_us_show, cfqd->cfq_slice[1]); -USEC_SHOW_FUNCTION(cfq_slice_async_us_show, cfqd->cfq_slice[0]); -USEC_SHOW_FUNCTION(cfq_target_latency_us_show, cfqd->cfq_target_latency); -#undef USEC_SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct cfq_data *cfqd = e->elevator_data; \ - unsigned int __data, __min = (MIN), __max = (MAX); \ - \ - cfq_var_store(&__data, (page)); \ - if (__data < __min) \ - __data = __min; \ - else if (__data > __max) \ - __data = __max; \ - if (__CONV) \ - *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ - else \ - *(__PTR) = __data; \ - return count; \ -} -STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); -STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, - UINT_MAX, 1); -STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, - UINT_MAX, 1); -STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); -STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, - UINT_MAX, 0); -STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); -STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); -STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, - UINT_MAX, 0); -STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); -STORE_FUNCTION(cfq_target_latency_store, &cfqd->cfq_target_latency, 1, UINT_MAX, 1); -#undef STORE_FUNCTION - -#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct cfq_data *cfqd = e->elevator_data; \ - unsigned int __data, __min = (MIN), __max = (MAX); \ - \ - cfq_var_store(&__data, (page)); \ - if (__data < __min) \ - __data = __min; \ - else if (__data > __max) \ - __data = __max; \ - *(__PTR) = (u64)__data * NSEC_PER_USEC; \ - return count; \ -} -USEC_STORE_FUNCTION(cfq_slice_idle_us_store, &cfqd->cfq_slice_idle, 0, UINT_MAX); -USEC_STORE_FUNCTION(cfq_group_idle_us_store, &cfqd->cfq_group_idle, 0, UINT_MAX); -USEC_STORE_FUNCTION(cfq_slice_sync_us_store, &cfqd->cfq_slice[1], 1, UINT_MAX); -USEC_STORE_FUNCTION(cfq_slice_async_us_store, &cfqd->cfq_slice[0], 1, UINT_MAX); -USEC_STORE_FUNCTION(cfq_target_latency_us_store, &cfqd->cfq_target_latency, 1, UINT_MAX); -#undef USEC_STORE_FUNCTION - -#define CFQ_ATTR(name) \ - __ATTR(name, 0644, cfq_##name##_show, cfq_##name##_store) - -static struct elv_fs_entry cfq_attrs[] = { - CFQ_ATTR(quantum), - CFQ_ATTR(fifo_expire_sync), - CFQ_ATTR(fifo_expire_async), - CFQ_ATTR(back_seek_max), - CFQ_ATTR(back_seek_penalty), - CFQ_ATTR(slice_sync), - CFQ_ATTR(slice_sync_us), - CFQ_ATTR(slice_async), - CFQ_ATTR(slice_async_us), - CFQ_ATTR(slice_async_rq), - CFQ_ATTR(slice_idle), - CFQ_ATTR(slice_idle_us), - CFQ_ATTR(group_idle), - CFQ_ATTR(group_idle_us), - CFQ_ATTR(low_latency), - CFQ_ATTR(target_latency), - CFQ_ATTR(target_latency_us), - __ATTR_NULL -}; - -static struct elevator_type iosched_cfq = { - .ops.sq = { - .elevator_merge_fn = cfq_merge, - .elevator_merged_fn = cfq_merged_request, - .elevator_merge_req_fn = cfq_merged_requests, - .elevator_allow_bio_merge_fn = cfq_allow_bio_merge, - .elevator_allow_rq_merge_fn = cfq_allow_rq_merge, - .elevator_bio_merged_fn = cfq_bio_merged, - .elevator_dispatch_fn = cfq_dispatch_requests, - .elevator_add_req_fn = cfq_insert_request, - .elevator_activate_req_fn = cfq_activate_request, - .elevator_deactivate_req_fn = cfq_deactivate_request, - .elevator_completed_req_fn = cfq_completed_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_init_icq_fn = cfq_init_icq, - .elevator_exit_icq_fn = cfq_exit_icq, - .elevator_set_req_fn = cfq_set_request, - .elevator_put_req_fn = cfq_put_request, - .elevator_may_queue_fn = cfq_may_queue, - .elevator_init_fn = cfq_init_queue, - .elevator_exit_fn = cfq_exit_queue, - .elevator_registered_fn = cfq_registered_queue, - }, - .icq_size = sizeof(struct cfq_io_cq), - .icq_align = __alignof__(struct cfq_io_cq), - .elevator_attrs = cfq_attrs, - .elevator_name = "cfq", - .elevator_owner = THIS_MODULE, -}; - -#ifdef CONFIG_CFQ_GROUP_IOSCHED -static struct blkcg_policy blkcg_policy_cfq = { - .dfl_cftypes = cfq_blkcg_files, - .legacy_cftypes = cfq_blkcg_legacy_files, - - .cpd_alloc_fn = cfq_cpd_alloc, - .cpd_init_fn = cfq_cpd_init, - .cpd_free_fn = cfq_cpd_free, - .cpd_bind_fn = cfq_cpd_bind, - - .pd_alloc_fn = cfq_pd_alloc, - .pd_init_fn = cfq_pd_init, - .pd_offline_fn = cfq_pd_offline, - .pd_free_fn = cfq_pd_free, - .pd_reset_stats_fn = cfq_pd_reset_stats, -}; -#endif - -static int __init cfq_init(void) -{ - int ret; - -#ifdef CONFIG_CFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_cfq); - if (ret) - return ret; -#else - cfq_group_idle = 0; -#endif - - ret = -ENOMEM; - cfq_pool = KMEM_CACHE(cfq_queue, 0); - if (!cfq_pool) - goto err_pol_unreg; - - ret = elv_register(&iosched_cfq); - if (ret) - goto err_free_pool; - - return 0; - -err_free_pool: - kmem_cache_destroy(cfq_pool); -err_pol_unreg: -#ifdef CONFIG_CFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_cfq); -#endif - return ret; -} - -static void __exit cfq_exit(void) -{ -#ifdef CONFIG_CFQ_GROUP_IOSCHED - blkcg_policy_unregister(&blkcg_policy_cfq); -#endif - elv_unregister(&iosched_cfq); - kmem_cache_destroy(cfq_pool); -} - -module_init(cfq_init); -module_exit(cfq_exit); - -MODULE_AUTHOR("Jens Axboe"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c deleted file mode 100644 index ef2f1f09e9b3..000000000000 --- a/block/deadline-iosched.c +++ /dev/null @@ -1,560 +0,0 @@ -/* - * Deadline i/o scheduler. - * - * Copyright (C) 2002 Jens Axboe <axboe@kernel.dk> - */ -#include <linux/kernel.h> -#include <linux/fs.h> -#include <linux/blkdev.h> -#include <linux/elevator.h> -#include <linux/bio.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/compiler.h> -#include <linux/rbtree.h> - -/* - * See Documentation/block/deadline-iosched.txt - */ -static const int read_expire = HZ / 2; /* max time before a read is submitted. */ -static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ -static const int writes_starved = 2; /* max times reads can starve a write */ -static const int fifo_batch = 16; /* # of sequential requests treated as one - by the above parameters. For throughput. */ - -struct deadline_data { - /* - * run time data - */ - - /* - * requests (deadline_rq s) are present on both sort_list and fifo_list - */ - struct rb_root sort_list[2]; - struct list_head fifo_list[2]; - - /* - * next in sort order. read, write or both are NULL - */ - struct request *next_rq[2]; - unsigned int batching; /* number of sequential requests made */ - unsigned int starved; /* times reads have starved writes */ - - /* - * settings that change how the i/o scheduler behaves - */ - int fifo_expire[2]; - int fifo_batch; - int writes_starved; - int front_merges; -}; - -static inline struct rb_root * -deadline_rb_root(struct deadline_data *dd, struct request *rq) -{ - return &dd->sort_list[rq_data_dir(rq)]; -} - -/* - * get the request after `rq' in sector-sorted order - */ -static inline struct request * -deadline_latter_request(struct request *rq) -{ - struct rb_node *node = rb_next(&rq->rb_node); - - if (node) - return rb_entry_rq(node); - - return NULL; -} - -static void -deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) -{ - struct rb_root *root = deadline_rb_root(dd, rq); - - elv_rb_add(root, rq); -} - -static inline void -deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) -{ - const int data_dir = rq_data_dir(rq); - - if (dd->next_rq[data_dir] == rq) - dd->next_rq[data_dir] = deadline_latter_request(rq); - - elv_rb_del(deadline_rb_root(dd, rq), rq); -} - -/* - * add rq to rbtree and fifo - */ -static void -deadline_add_request(struct request_queue *q, struct request *rq) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const int data_dir = rq_data_dir(rq); - - /* - * This may be a requeue of a write request that has locked its - * target zone. If it is the case, this releases the zone lock. - */ - blk_req_zone_write_unlock(rq); - - deadline_add_rq_rb(dd, rq); - - /* - * set expire time and add to fifo list - */ - rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); -} - -/* - * remove rq from rbtree and fifo. - */ -static void deadline_remove_request(struct request_queue *q, struct request *rq) -{ - struct deadline_data *dd = q->elevator->elevator_data; - - rq_fifo_clear(rq); - deadline_del_rq_rb(dd, rq); -} - -static enum elv_merge -deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) -{ - struct deadline_data *dd = q->elevator->elevator_data; - struct request *__rq; - - /* - * check for front merge - */ - if (dd->front_merges) { - sector_t sector = bio_end_sector(bio); - - __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); - if (__rq) { - BUG_ON(sector != blk_rq_pos(__rq)); - - if (elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } - } - } - - return ELEVATOR_NO_MERGE; -} - -static void deadline_merged_request(struct request_queue *q, - struct request *req, enum elv_merge type) -{ - struct deadline_data *dd = q->elevator->elevator_data; - - /* - * if the merge was a front merge, we need to reposition request - */ - if (type == ELEVATOR_FRONT_MERGE) { - elv_rb_del(deadline_rb_root(dd, req), req); - deadline_add_rq_rb(dd, req); - } -} - -static void -deadline_merged_requests(struct request_queue *q, struct request *req, - struct request *next) -{ - /* - * if next expires before rq, assign its expire time to rq - * and move into next position (next will be deleted) in fifo - */ - if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { - if (time_before((unsigned long)next->fifo_time, - (unsigned long)req->fifo_time)) { - list_move(&req->queuelist, &next->queuelist); - req->fifo_time = next->fifo_time; - } - } - - /* - * kill knowledge of next, this one is a goner - */ - deadline_remove_request(q, next); -} - -/* - * move request from sort list to dispatch queue. - */ -static inline void -deadline_move_to_dispatch(struct deadline_data *dd, struct request *rq) -{ - struct request_queue *q = rq->q; - - /* - * For a zoned block device, write requests must write lock their - * target zone. - */ - blk_req_zone_write_lock(rq); - - deadline_remove_request(q, rq); - elv_dispatch_add_tail(q, rq); -} - -/* - * move an entry to dispatch queue - */ -static void -deadline_move_request(struct deadline_data *dd, struct request *rq) -{ - const int data_dir = rq_data_dir(rq); - - dd->next_rq[READ] = NULL; - dd->next_rq[WRITE] = NULL; - dd->next_rq[data_dir] = deadline_latter_request(rq); - - /* - * take it off the sort and fifo list, move - * to dispatch queue - */ - deadline_move_to_dispatch(dd, rq); -} - -/* - * deadline_check_fifo returns 0 if there are no expired requests on the fifo, - * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) - */ -static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) -{ - struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); - - /* - * rq is expired! - */ - if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) - return 1; - - return 0; -} - -/* - * For the specified data direction, return the next request to dispatch using - * arrival ordered lists. - */ -static struct request * -deadline_fifo_request(struct deadline_data *dd, int data_dir) -{ - struct request *rq; - - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - - if (list_empty(&dd->fifo_list[data_dir])) - return NULL; - - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. - */ - list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { - if (blk_req_can_dispatch_to_zone(rq)) - return rq; - } - - return NULL; -} - -/* - * For the specified data direction, return the next request to dispatch using - * sector position sorted lists. - */ -static struct request * -deadline_next_request(struct deadline_data *dd, int data_dir) -{ - struct request *rq; - - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - - rq = dd->next_rq[data_dir]; - if (!rq) - return NULL; - - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) - return rq; - - /* - * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. - */ - while (rq) { - if (blk_req_can_dispatch_to_zone(rq)) - return rq; - rq = deadline_latter_request(rq); - } - - return NULL; -} - -/* - * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc - */ -static int deadline_dispatch_requests(struct request_queue *q, int force) -{ - struct deadline_data *dd = q->elevator->elevator_data; - const int reads = !list_empty(&dd->fifo_list[READ]); - const int writes = !list_empty(&dd->fifo_list[WRITE]); - struct request *rq, *next_rq; - int data_dir; - - /* - * batches are currently reads XOR writes - */ - rq = deadline_next_request(dd, WRITE); - if (!rq) - rq = deadline_next_request(dd, READ); - - if (rq && dd->batching < dd->fifo_batch) - /* we have a next request are still entitled to batch */ - goto dispatch_request; - - /* - * at this point we are not running a batch. select the appropriate - * data direction (read / write) - */ - - if (reads) { - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); - - if (deadline_fifo_request(dd, WRITE) && - (dd->starved++ >= dd->writes_starved)) - goto dispatch_writes; - - data_dir = READ; - - goto dispatch_find_request; - } - - /* - * there are either no reads or writes have been starved - */ - - if (writes) { -dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); - - dd->starved = 0; - - data_dir = WRITE; - - goto dispatch_find_request; - } - - return 0; - -dispatch_find_request: - /* - * we are not running a batch, find best request for selected data_dir - */ - next_rq = deadline_next_request(dd, data_dir); - if (deadline_check_fifo(dd, data_dir) || !next_rq) { - /* - * A deadline has expired, the last request was in the other - * direction, or we have run out of higher-sectored requests. - * Start again from the request with the earliest expiry time. - */ - rq = deadline_fifo_request(dd, data_dir); - } else { - /* - * The last req was the same dir and we have a next request in - * sort order. No expired requests so continue on from here. - */ - rq = next_rq; - } - - /* - * For a zoned block device, if we only have writes queued and none of - * them can be dispatched, rq will be NULL. - */ - if (!rq) - return 0; - - dd->batching = 0; - -dispatch_request: - /* - * rq is the selected appropriate request. - */ - dd->batching++; - deadline_move_request(dd, rq); - - return 1; -} - -/* - * For zoned block devices, write unlock the target zone of completed - * write requests. - */ -static void -deadline_completed_request(struct request_queue *q, struct request *rq) -{ - blk_req_zone_write_unlock(rq); -} - -static void deadline_exit_queue(struct elevator_queue *e) -{ - struct deadline_data *dd = e->elevator_data; - - BUG_ON(!list_empty(&dd->fifo_list[READ])); - BUG_ON(!list_empty(&dd->fifo_list[WRITE])); - - kfree(dd); -} - -/* - * initialize elevator private data (deadline_data). - */ -static int deadline_init_queue(struct request_queue *q, struct elevator_type *e) -{ - struct deadline_data *dd; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - - dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); - if (!dd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } - eq->elevator_data = dd; - - INIT_LIST_HEAD(&dd->fifo_list[READ]); - INIT_LIST_HEAD(&dd->fifo_list[WRITE]); - dd->sort_list[READ] = RB_ROOT; - dd->sort_list[WRITE] = RB_ROOT; - dd->fifo_expire[READ] = read_expire; - dd->fifo_expire[WRITE] = write_expire; - dd->writes_starved = writes_starved; - dd->front_merges = 1; - dd->fifo_batch = fifo_batch; - - spin_lock_irq(q->queue_lock); - q->elevator = eq; - spin_unlock_irq(q->queue_lock); - return 0; -} - -/* - * sysfs parts below - */ - -static ssize_t -deadline_var_show(int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static void -deadline_var_store(int *var, const char *page) -{ - char *p = (char *) page; - - *var = simple_strtol(p, &p, 10); -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -{ \ - struct deadline_data *dd = e->elevator_data; \ - int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return deadline_var_show(__data, (page)); \ -} -SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); -SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); -SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); -SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); -SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); -#undef SHOW_FUNCTION - -#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -{ \ - struct deadline_data *dd = e->elevator_data; \ - int __data; \ - deadline_var_store(&__data, (page)); \ - if (__data < (MIN)) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ - return count; \ -} -STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); -STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); -STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); -#undef STORE_FUNCTION - -#define DD_ATTR(name) \ - __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) - -static struct elv_fs_entry deadline_attrs[] = { - DD_ATTR(read_expire), - DD_ATTR(write_expire), - DD_ATTR(writes_starved), - DD_ATTR(front_merges), - DD_ATTR(fifo_batch), - __ATTR_NULL -}; - -static struct elevator_type iosched_deadline = { - .ops.sq = { - .elevator_merge_fn = deadline_merge, - .elevator_merged_fn = deadline_merged_request, - .elevator_merge_req_fn = deadline_merged_requests, - .elevator_dispatch_fn = deadline_dispatch_requests, - .elevator_completed_req_fn = deadline_completed_request, - .elevator_add_req_fn = deadline_add_request, - .elevator_former_req_fn = elv_rb_former_request, - .elevator_latter_req_fn = elv_rb_latter_request, - .elevator_init_fn = deadline_init_queue, - .elevator_exit_fn = deadline_exit_queue, - }, - - .elevator_attrs = deadline_attrs, - .elevator_name = "deadline", - .elevator_owner = THIS_MODULE, -}; - -static int __init deadline_init(void) -{ - return elv_register(&iosched_deadline); -} - -static void __exit deadline_exit(void) -{ - elv_unregister(&iosched_deadline); -} - -module_init(deadline_init); -module_exit(deadline_exit); - -MODULE_AUTHOR("Jens Axboe"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("deadline IO scheduler"); diff --git a/block/elevator.c b/block/elevator.c index 8fdcd64ae12e..f05e90d4e695 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -61,10 +61,8 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if (e->uses_mq && e->type->ops.mq.allow_merge) - return e->type->ops.mq.allow_merge(q, rq, bio); - else if (!e->uses_mq && e->type->ops.sq.elevator_allow_bio_merge_fn) - return e->type->ops.sq.elevator_allow_bio_merge_fn(q, rq, bio); + if (e->type->ops.allow_merge) + return e->type->ops.allow_merge(q, rq, bio); return 1; } @@ -95,14 +93,14 @@ static bool elevator_match(const struct elevator_type *e, const char *name) } /* - * Return scheduler with name 'name' and with matching 'mq capability + * Return scheduler with name 'name' */ -static struct elevator_type *elevator_find(const char *name, bool mq) +static struct elevator_type *elevator_find(const char *name) { struct elevator_type *e; list_for_each_entry(e, &elv_list, list) { - if (elevator_match(e, name) && (mq == e->uses_mq)) + if (elevator_match(e, name)) return e; } @@ -121,12 +119,12 @@ static struct elevator_type *elevator_get(struct request_queue *q, spin_lock(&elv_list_lock); - e = elevator_find(name, q->mq_ops != NULL); + e = elevator_find(name); if (!e && try_loading) { spin_unlock(&elv_list_lock); request_module("%s-iosched", name); spin_lock(&elv_list_lock); - e = elevator_find(name, q->mq_ops != NULL); + e = elevator_find(name); } if (e && !try_module_get(e->elevator_owner)) @@ -150,26 +148,6 @@ static int __init elevator_setup(char *str) __setup("elevator=", elevator_setup); -/* called during boot to load the elevator chosen by the elevator param */ -void __init load_default_elevator_module(void) -{ - struct elevator_type *e; - - if (!chosen_elevator[0]) - return; - - /* - * Boot parameter is deprecated, we haven't supported that for MQ. - * Only look for non-mq schedulers from here. - */ - spin_lock(&elv_list_lock); - e = elevator_find(chosen_elevator, false); - spin_unlock(&elv_list_lock); - - if (!e) - request_module("%s-iosched", chosen_elevator); -} - static struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, @@ -185,7 +163,6 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); hash_init(eq->hash); - eq->uses_mq = e->uses_mq; return eq; } @@ -200,54 +177,11 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -/* - * Use the default elevator specified by config boot param for non-mq devices, - * or by config option. Don't try to load modules as we could be running off - * async and request_module() isn't allowed from async. - */ -int elevator_init(struct request_queue *q) -{ - struct elevator_type *e = NULL; - int err = 0; - - /* - * q->sysfs_lock must be held to provide mutual exclusion between - * elevator_switch() and here. - */ - mutex_lock(&q->sysfs_lock); - if (unlikely(q->elevator)) - goto out_unlock; - - if (*chosen_elevator) { - e = elevator_get(q, chosen_elevator, false); - if (!e) - printk(KERN_ERR "I/O scheduler %s not found\n", - chosen_elevator); - } - - if (!e) - e = elevator_get(q, CONFIG_DEFAULT_IOSCHED, false); - if (!e) { - printk(KERN_ERR - "Default I/O scheduler not found. Using noop.\n"); - e = elevator_get(q, "noop", false); - } - - err = e->ops.sq.elevator_init_fn(q, e); - if (err) - elevator_put(e); -out_unlock: - mutex_unlock(&q->sysfs_lock); - return err; -} - void elevator_exit(struct request_queue *q, struct elevator_queue *e) { mutex_lock(&e->sysfs_lock); - if (e->uses_mq && e->type->ops.mq.exit_sched) + if (e->type->ops.exit_sched) blk_mq_exit_sched(q, e); - else if (!e->uses_mq && e->type->ops.sq.elevator_exit_fn) - e->type->ops.sq.elevator_exit_fn(e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); @@ -356,68 +290,6 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector) } EXPORT_SYMBOL(elv_rb_find); -/* - * Insert rq into dispatch queue of q. Queue lock must be held on - * entry. rq is sort instead into the dispatch queue. To be used by - * specific elevators. - */ -void elv_dispatch_sort(struct request_queue *q, struct request *rq) -{ - sector_t boundary; - struct list_head *entry; - - if (q->last_merge == rq) - q->last_merge = NULL; - - elv_rqhash_del(q, rq); - - q->nr_sorted--; - - boundary = q->end_sector; - list_for_each_prev(entry, &q->queue_head) { - struct request *pos = list_entry_rq(entry); - - if (req_op(rq) != req_op(pos)) - break; - if (rq_data_dir(rq) != rq_data_dir(pos)) - break; - if (pos->rq_flags & (RQF_STARTED | RQF_SOFTBARRIER)) - break; - if (blk_rq_pos(rq) >= boundary) { - if (blk_rq_pos(pos) < boundary) - continue; - } else { - if (blk_rq_pos(pos) >= boundary) - break; - } - if (blk_rq_pos(rq) >= blk_rq_pos(pos)) - break; - } - - list_add(&rq->queuelist, entry); -} -EXPORT_SYMBOL(elv_dispatch_sort); - -/* - * Insert rq into dispatch queue of q. Queue lock must be held on - * entry. rq is added to the back of the dispatch queue. To be used by - * specific elevators. - */ -void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) -{ - if (q->last_merge == rq) - q->last_merge = NULL; - - elv_rqhash_del(q, rq); - - q->nr_sorted--; - - q->end_sector = rq_end_sector(rq); - q->boundary_rq = rq; - list_add_tail(&rq->queuelist, &q->queue_head); -} -EXPORT_SYMBOL(elv_dispatch_add_tail); - enum elv_merge elv_merge(struct request_queue *q, struct request **req, struct bio *bio) { @@ -457,10 +329,8 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, return ELEVATOR_BACK_MERGE; } - if (e->uses_mq && e->type->ops.mq.request_merge) - return e->type->ops.mq.request_merge(q, req, bio); - else if (!e->uses_mq && e->type->ops.sq.elevator_merge_fn) - return e->type->ops.sq.elevator_merge_fn(q, req, bio); + if (e->type->ops.request_merge) + return e->type->ops.request_merge(q, req, bio); return ELEVATOR_NO_MERGE; } @@ -511,10 +381,8 @@ void elv_merged_request(struct request_queue *q, struct request *rq, { struct elevator_queue *e = q->elevator; - if (e->uses_mq && e->type->ops.mq.request_merged) - e->type->ops.mq.request_merged(q, rq, type); - else if (!e->uses_mq && e->type->ops.sq.elevator_merged_fn) - e->type->ops.sq.elevator_merged_fn(q, rq, type); + if (e->type->ops.request_merged) + e->type->ops.request_merged(q, rq, type); if (type == ELEVATOR_BACK_MERGE) elv_rqhash_reposition(q, rq); @@ -526,176 +394,20 @@ void elv_merge_requests(struct request_queue *q, struct request *rq, struct request *next) { struct elevator_queue *e = q->elevator; - bool next_sorted = false; - - if (e->uses_mq && e->type->ops.mq.requests_merged) - e->type->ops.mq.requests_merged(q, rq, next); - else if (e->type->ops.sq.elevator_merge_req_fn) { - next_sorted = (__force bool)(next->rq_flags & RQF_SORTED); - if (next_sorted) - e->type->ops.sq.elevator_merge_req_fn(q, rq, next); - } - elv_rqhash_reposition(q, rq); - - if (next_sorted) { - elv_rqhash_del(q, next); - q->nr_sorted--; - } + if (e->type->ops.requests_merged) + e->type->ops.requests_merged(q, rq, next); + elv_rqhash_reposition(q, rq); q->last_merge = rq; } -void elv_bio_merged(struct request_queue *q, struct request *rq, - struct bio *bio) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return; - - if (e->type->ops.sq.elevator_bio_merged_fn) - e->type->ops.sq.elevator_bio_merged_fn(q, rq, bio); -} - -void elv_requeue_request(struct request_queue *q, struct request *rq) -{ - /* - * it already went through dequeue, we need to decrement the - * in_flight count again - */ - if (blk_account_rq(rq)) { - q->in_flight[rq_is_sync(rq)]--; - if (rq->rq_flags & RQF_SORTED) - elv_deactivate_rq(q, rq); - } - - rq->rq_flags &= ~RQF_STARTED; - - blk_pm_requeue_request(rq); - - __elv_add_request(q, rq, ELEVATOR_INSERT_REQUEUE); -} - -void elv_drain_elevator(struct request_queue *q) -{ - struct elevator_queue *e = q->elevator; - static int printed; - - if (WARN_ON_ONCE(e->uses_mq)) - return; - - lockdep_assert_held(q->queue_lock); - - while (e->type->ops.sq.elevator_dispatch_fn(q, 1)) - ; - if (q->nr_sorted && !blk_queue_is_zoned(q) && printed++ < 10 ) { - printk(KERN_ERR "%s: forced dispatching is broken " - "(nr_sorted=%u), please report this\n", - q->elevator->type->elevator_name, q->nr_sorted); - } -} - -void __elv_add_request(struct request_queue *q, struct request *rq, int where) -{ - trace_block_rq_insert(q, rq); - - blk_pm_add_request(q, rq); - - rq->q = q; - - if (rq->rq_flags & RQF_SOFTBARRIER) { - /* barriers are scheduling boundary, update end_sector */ - if (!blk_rq_is_passthrough(rq)) { - q->end_sector = rq_end_sector(rq); - q->boundary_rq = rq; - } - } else if (!(rq->rq_flags & RQF_ELVPRIV) && - (where == ELEVATOR_INSERT_SORT || - where == ELEVATOR_INSERT_SORT_MERGE)) - where = ELEVATOR_INSERT_BACK; - - switch (where) { - case ELEVATOR_INSERT_REQUEUE: - case ELEVATOR_INSERT_FRONT: - rq->rq_flags |= RQF_SOFTBARRIER; - list_add(&rq->queuelist, &q->queue_head); - break; - - case ELEVATOR_INSERT_BACK: - rq->rq_flags |= RQF_SOFTBARRIER; - elv_drain_elevator(q); - list_add_tail(&rq->queuelist, &q->queue_head); - /* - * We kick the queue here for the following reasons. - * - The elevator might have returned NULL previously - * to delay requests and returned them now. As the - * queue wasn't empty before this request, ll_rw_blk - * won't run the queue on return, resulting in hang. - * - Usually, back inserted requests won't be merged - * with anything. There's no point in delaying queue - * processing. - */ - __blk_run_queue(q); - break; - - case ELEVATOR_INSERT_SORT_MERGE: - /* - * If we succeed in merging this request with one in the - * queue already, we are done - rq has now been freed, - * so no need to do anything further. - */ - if (elv_attempt_insert_merge(q, rq)) - break; - /* fall through */ - case ELEVATOR_INSERT_SORT: - BUG_ON(blk_rq_is_passthrough(rq)); - rq->rq_flags |= RQF_SORTED; - q->nr_sorted++; - if (rq_mergeable(rq)) { - elv_rqhash_add(q, rq); - if (!q->last_merge) - q->last_merge = rq; - } - - /* - * Some ioscheds (cfq) run q->request_fn directly, so - * rq cannot be accessed after calling - * elevator_add_req_fn. - */ - q->elevator->type->ops.sq.elevator_add_req_fn(q, rq); - break; - - case ELEVATOR_INSERT_FLUSH: - rq->rq_flags |= RQF_SOFTBARRIER; - blk_insert_flush(rq); - break; - default: - printk(KERN_ERR "%s: bad insertion point %d\n", - __func__, where); - BUG(); - } -} -EXPORT_SYMBOL(__elv_add_request); - -void elv_add_request(struct request_queue *q, struct request *rq, int where) -{ - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - __elv_add_request(q, rq, where); - spin_unlock_irqrestore(q->queue_lock, flags); -} -EXPORT_SYMBOL(elv_add_request); - struct request *elv_latter_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->uses_mq && e->type->ops.mq.next_request) - return e->type->ops.mq.next_request(q, rq); - else if (!e->uses_mq && e->type->ops.sq.elevator_latter_req_fn) - return e->type->ops.sq.elevator_latter_req_fn(q, rq); + if (e->type->ops.next_request) + return e->type->ops.next_request(q, rq); return NULL; } @@ -704,66 +416,10 @@ struct request *elv_former_request(struct request_queue *q, struct request *rq) { struct elevator_queue *e = q->elevator; - if (e->uses_mq && e->type->ops.mq.former_request) - return e->type->ops.mq.former_request(q, rq); - if (!e->uses_mq && e->type->ops.sq.elevator_former_req_fn) - return e->type->ops.sq.elevator_former_req_fn(q, rq); - return NULL; -} - -int elv_set_request(struct request_queue *q, struct request *rq, - struct bio *bio, gfp_t gfp_mask) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return 0; - - if (e->type->ops.sq.elevator_set_req_fn) - return e->type->ops.sq.elevator_set_req_fn(q, rq, bio, gfp_mask); - return 0; -} - -void elv_put_request(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return; - - if (e->type->ops.sq.elevator_put_req_fn) - e->type->ops.sq.elevator_put_req_fn(rq); -} - -int elv_may_queue(struct request_queue *q, unsigned int op) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return 0; - - if (e->type->ops.sq.elevator_may_queue_fn) - return e->type->ops.sq.elevator_may_queue_fn(q, op); - - return ELV_MQUEUE_MAY; -} - -void elv_completed_request(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (WARN_ON_ONCE(e->uses_mq)) - return; + if (e->type->ops.former_request) + return e->type->ops.former_request(q, rq); - /* - * request is released from the driver, io must be done - */ - if (blk_account_rq(rq)) { - q->in_flight[rq_is_sync(rq)]--; - if ((rq->rq_flags & RQF_SORTED) && - e->type->ops.sq.elevator_completed_req_fn) - e->type->ops.sq.elevator_completed_req_fn(q, rq); - } + return NULL; } #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) @@ -832,8 +488,6 @@ int elv_register_queue(struct request_queue *q) } kobject_uevent(&e->kobj, KOBJ_ADD); e->registered = 1; - if (!e->uses_mq && e->type->ops.sq.elevator_registered_fn) - e->type->ops.sq.elevator_registered_fn(q); } return error; } @@ -873,7 +527,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name, e->uses_mq)) { + if (elevator_find(e->elevator_name)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; @@ -881,12 +535,6 @@ int elv_register(struct elevator_type *e) list_add_tail(&e->list, &elv_list); spin_unlock(&elv_list_lock); - /* print pretty message */ - if (elevator_match(e, chosen_elevator) || - (!*chosen_elevator && - elevator_match(e, CONFIG_DEFAULT_IOSCHED))) - def = " (default)"; - printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def); return 0; @@ -989,71 +637,17 @@ out_unlock: */ static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - struct elevator_queue *old = q->elevator; - bool old_registered = false; int err; lockdep_assert_held(&q->sysfs_lock); - if (q->mq_ops) { - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - - err = elevator_switch_mq(q, new_e); - - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - - return err; - } - - /* - * Turn on BYPASS and drain all requests w/ elevator private data. - * Block layer doesn't call into a quiesced elevator - all requests - * are directly put on the dispatch list without elevator data - * using INSERT_BACK. All requests have SOFTBARRIER set and no - * merge happens either. - */ - if (old) { - old_registered = old->registered; - - blk_queue_bypass_start(q); - - /* unregister and clear all auxiliary data of the old elevator */ - if (old_registered) - elv_unregister_queue(q); - - ioc_clear_queue(q); - } + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); - /* allocate, init and register new elevator */ - err = new_e->ops.sq.elevator_init_fn(q, new_e); - if (err) - goto fail_init; - - err = elv_register_queue(q); - if (err) - goto fail_register; - - /* done, kill the old one and finish */ - if (old) { - elevator_exit(q, old); - blk_queue_bypass_end(q); - } + err = elevator_switch_mq(q, new_e); - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); - - return 0; - -fail_register: - elevator_exit(q, q->elevator); -fail_init: - /* switch failed, restore and re-register old elevator */ - if (old) { - q->elevator = old; - elv_register_queue(q); - blk_queue_bypass_end(q); - } + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); return err; } @@ -1073,7 +667,7 @@ static int __elevator_change(struct request_queue *q, const char *name) /* * Special case for mq, turn off scheduling */ - if (q->mq_ops && !strncmp(name, "none", 4)) + if (!strncmp(name, "none", 4)) return elevator_switch(q, NULL); strlcpy(elevator_name, name, sizeof(elevator_name)); @@ -1091,8 +685,7 @@ static int __elevator_change(struct request_queue *q, const char *name) static inline bool elv_support_iosched(struct request_queue *q) { - if (q->mq_ops && q->tag_set && (q->tag_set->flags & - BLK_MQ_F_NO_SCHED)) + if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)) return false; return true; } @@ -1102,7 +695,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, { int ret; - if (!(q->mq_ops || q->request_fn) || !elv_support_iosched(q)) + if (!queue_is_mq(q) || !elv_support_iosched(q)) return count; ret = __elevator_change(q, name); @@ -1117,10 +710,9 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) struct elevator_queue *e = q->elevator; struct elevator_type *elv = NULL; struct elevator_type *__e; - bool uses_mq = q->mq_ops != NULL; int len = 0; - if (!queue_is_rq_based(q)) + if (!queue_is_mq(q)) return sprintf(name, "none\n"); if (!q->elevator) @@ -1130,19 +722,16 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name) spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { - if (elv && elevator_match(elv, __e->elevator_name) && - (__e->uses_mq == uses_mq)) { + if (elv && elevator_match(elv, __e->elevator_name)) { len += sprintf(name+len, "[%s] ", elv->elevator_name); continue; } - if (__e->uses_mq && q->mq_ops && elv_support_iosched(q)) - len += sprintf(name+len, "%s ", __e->elevator_name); - else if (!__e->uses_mq && !q->mq_ops) + if (elv_support_iosched(q)) len += sprintf(name+len, "%s ", __e->elevator_name); } spin_unlock(&elv_list_lock); - if (q->mq_ops && q->elevator) + if (q->elevator) len += sprintf(name+len, "none"); len += sprintf(len+name, "\n"); diff --git a/block/genhd.c b/block/genhd.c index cff6bdf27226..1dd8fd6613b8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -47,51 +47,64 @@ static void disk_release_events(struct gendisk *disk); void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (queue_is_mq(q)) return; - atomic_inc(&part->in_flight[rw]); + part_stat_local_inc(part, in_flight[rw]); if (part->partno) - atomic_inc(&part_to_disk(part)->part0.in_flight[rw]); + part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]); } void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw) { - if (q->mq_ops) + if (queue_is_mq(q)) return; - atomic_dec(&part->in_flight[rw]); + part_stat_local_dec(part, in_flight[rw]); if (part->partno) - atomic_dec(&part_to_disk(part)->part0.in_flight[rw]); + part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]); } -void part_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part) { - if (q->mq_ops) { - blk_mq_in_flight(q, part, inflight); - return; + int cpu; + unsigned int inflight; + + if (queue_is_mq(q)) { + return blk_mq_in_flight(q, part); } - inflight[0] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); - if (part->partno) { - part = &part_to_disk(part)->part0; - inflight[1] = atomic_read(&part->in_flight[0]) + - atomic_read(&part->in_flight[1]); + inflight = 0; + for_each_possible_cpu(cpu) { + inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + + part_stat_local_read_cpu(part, in_flight[1], cpu); } + if ((int)inflight < 0) + inflight = 0; + + return inflight; } void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, unsigned int inflight[2]) { - if (q->mq_ops) { + int cpu; + + if (queue_is_mq(q)) { blk_mq_in_flight_rw(q, part, inflight); return; } - inflight[0] = atomic_read(&part->in_flight[0]); - inflight[1] = atomic_read(&part->in_flight[1]); + inflight[0] = 0; + inflight[1] = 0; + for_each_possible_cpu(cpu) { + inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); + inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); + } + if ((int)inflight[0] < 0) + inflight[0] = 0; + if ((int)inflight[1] < 0) + inflight[1] = 0; } struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) @@ -1325,8 +1338,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) struct disk_part_iter piter; struct hd_struct *hd; char buf[BDEVNAME_SIZE]; - unsigned int inflight[2]; - int cpu; + unsigned int inflight; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1338,10 +1350,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); while ((hd = disk_part_iter_next(&piter))) { - cpu = part_stat_lock(); - part_round_stats(gp->queue, cpu, hd); - part_stat_unlock(); - part_in_flight(gp->queue, hd, inflight); + inflight = part_in_flight(gp->queue, hd); seq_printf(seqf, "%4d %7d %s " "%lu %lu %lu %u " "%lu %lu %lu %u " @@ -1357,7 +1366,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) part_stat_read(hd, merges[STAT_WRITE]), part_stat_read(hd, sectors[STAT_WRITE]), (unsigned int)part_stat_read_msecs(hd, STAT_WRITE), - inflight[0], + inflight, jiffies_to_msecs(part_stat_read(hd, io_ticks)), jiffies_to_msecs(part_stat_read(hd, time_in_queue)), part_stat_read(hd, ios[STAT_DISCARD]), diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index eccac01a10b6..ec6a04e01bc1 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -195,7 +195,7 @@ struct kyber_hctx_data { unsigned int batching; struct kyber_ctx_queue *kcqs; struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; - wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS]; + struct sbq_wait domain_wait[KYBER_NUM_DOMAINS]; struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; atomic_t wait_index[KYBER_NUM_DOMAINS]; }; @@ -501,10 +501,11 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) for (i = 0; i < KYBER_NUM_DOMAINS; i++) { INIT_LIST_HEAD(&khd->rqs[i]); - init_waitqueue_func_entry(&khd->domain_wait[i], + khd->domain_wait[i].sbq = NULL; + init_waitqueue_func_entry(&khd->domain_wait[i].wait, kyber_domain_wake); - khd->domain_wait[i].private = hctx; - INIT_LIST_HEAD(&khd->domain_wait[i].entry); + khd->domain_wait[i].wait.private = hctx; + INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry); atomic_set(&khd->wait_index[i], 0); } @@ -576,7 +577,7 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) { struct kyber_hctx_data *khd = hctx->sched_data; struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); - struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw]; + struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); struct list_head *rq_list = &kcq->rq_list[sched_domain]; bool merged; @@ -602,7 +603,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, list_for_each_entry_safe(rq, next, rq_list, queuelist) { unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); - struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw]; + struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]]; struct list_head *head = &kcq->rq_list[sched_domain]; spin_lock(&kcq->lock); @@ -611,7 +612,7 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, else list_move_tail(&rq->queuelist, head); sbitmap_set_bit(&khd->kcq_map[sched_domain], - rq->mq_ctx->index_hw); + rq->mq_ctx->index_hw[hctx->type]); blk_mq_sched_request_inserted(rq); spin_unlock(&kcq->lock); } @@ -698,12 +699,13 @@ static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd, flush_busy_kcq, &data); } -static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, +static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags, void *key) { - struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private); + struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private); + struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait); - list_del_init(&wait->entry); + sbitmap_del_wait_queue(wait); blk_mq_run_hw_queue(hctx, true); return 1; } @@ -714,7 +716,7 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, { unsigned int sched_domain = khd->cur_domain; struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; - wait_queue_entry_t *wait = &khd->domain_wait[sched_domain]; + struct sbq_wait *wait = &khd->domain_wait[sched_domain]; struct sbq_wait_state *ws; int nr; @@ -725,11 +727,11 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, * run when one becomes available. Note that this is serialized on * khd->lock, but we still need to be careful about the waker. */ - if (nr < 0 && list_empty_careful(&wait->entry)) { + if (nr < 0 && list_empty_careful(&wait->wait.entry)) { ws = sbq_wait_ptr(domain_tokens, &khd->wait_index[sched_domain]); khd->domain_ws[sched_domain] = ws; - add_wait_queue(&ws->wait, wait); + sbitmap_add_wait_queue(domain_tokens, ws, wait); /* * Try again in case a token was freed before we got on the wait @@ -745,10 +747,10 @@ static int kyber_get_domain_token(struct kyber_queue_data *kqd, * between the !list_empty_careful() check and us grabbing the lock, but * list_del_init() is okay with that. */ - if (nr >= 0 && !list_empty_careful(&wait->entry)) { + if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) { ws = khd->domain_ws[sched_domain]; spin_lock_irq(&ws->wait.lock); - list_del_init(&wait->entry); + sbitmap_del_wait_queue(wait); spin_unlock_irq(&ws->wait.lock); } @@ -951,7 +953,7 @@ static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \ { \ struct blk_mq_hw_ctx *hctx = data; \ struct kyber_hctx_data *khd = hctx->sched_data; \ - wait_queue_entry_t *wait = &khd->domain_wait[domain]; \ + wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \ \ seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ return 0; \ @@ -1017,7 +1019,7 @@ static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = { #endif static struct elevator_type kyber_sched = { - .ops.mq = { + .ops = { .init_sched = kyber_init_sched, .exit_sched = kyber_exit_sched, .init_hctx = kyber_init_hctx, @@ -1032,7 +1034,6 @@ static struct elevator_type kyber_sched = { .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, }, - .uses_mq = true, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = kyber_queue_debugfs_attrs, .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs, diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 099a9e05854c..14288f864e94 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -373,9 +373,16 @@ done: /* * One confusing aspect here is that we get called for a specific - * hardware queue, but we return a request that may not be for a + * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared * state for all hardware queues, in terms of sorting, FIFOs, etc. + * + * For a zoned block device, __dd_dispatch_request() may return NULL + * if all the queued write requests are directed at zones that are already + * locked due to on-going write requests. In this case, make sure to mark + * the queue as needing a restart to ensure that the queue is run again + * and the pending writes dispatched once the target zones for the ongoing + * write requests are unlocked in dd_finish_request(). */ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { @@ -384,6 +391,9 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) spin_lock(&dd->lock); rq = __dd_dispatch_request(dd); + if (!rq && blk_queue_is_zoned(hctx->queue) && + !list_empty(&dd->fifo_list[WRITE])) + blk_mq_sched_mark_restart_hctx(hctx); spin_unlock(&dd->lock); return rq; @@ -761,7 +771,7 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { #endif static struct elevator_type mq_deadline = { - .ops.mq = { + .ops = { .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, @@ -777,7 +787,6 @@ static struct elevator_type mq_deadline = { .exit_sched = dd_exit_queue, }, - .uses_mq = true, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = deadline_queue_debugfs_attrs, #endif diff --git a/block/noop-iosched.c b/block/noop-iosched.c deleted file mode 100644 index 2d1b15d89b45..000000000000 --- a/block/noop-iosched.c +++ /dev/null @@ -1,124 +0,0 @@ -/* - * elevator noop - */ -#include <linux/blkdev.h> -#include <linux/elevator.h> -#include <linux/bio.h> -#include <linux/module.h> -#include <linux/slab.h> -#include <linux/init.h> - -struct noop_data { - struct list_head queue; -}; - -static void noop_merged_requests(struct request_queue *q, struct request *rq, - struct request *next) -{ - list_del_init(&next->queuelist); -} - -static int noop_dispatch(struct request_queue *q, int force) -{ - struct noop_data *nd = q->elevator->elevator_data; - struct request *rq; - - rq = list_first_entry_or_null(&nd->queue, struct request, queuelist); - if (rq) { - list_del_init(&rq->queuelist); - elv_dispatch_sort(q, rq); - return 1; - } - return 0; -} - -static void noop_add_request(struct request_queue *q, struct request *rq) -{ - struct noop_data *nd = q->elevator->elevator_data; - - list_add_tail(&rq->queuelist, &nd->queue); -} - -static struct request * -noop_former_request(struct request_queue *q, struct request *rq) -{ - struct noop_data *nd = q->elevator->elevator_data; - - if (rq->queuelist.prev == &nd->queue) - return NULL; - return list_prev_entry(rq, queuelist); -} - -static struct request * -noop_latter_request(struct request_queue *q, struct request *rq) -{ - struct noop_data *nd = q->elevator->elevator_data; - - if (rq->queuelist.next == &nd->queue) - return NULL; - return list_next_entry(rq, queuelist); -} - -static int noop_init_queue(struct request_queue *q, struct elevator_type *e) -{ - struct noop_data *nd; - struct elevator_queue *eq; - - eq = elevator_alloc(q, e); - if (!eq) - return -ENOMEM; - - nd = kmalloc_node(sizeof(*nd), GFP_KERNEL, q->node); - if (!nd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } - eq->elevator_data = nd; - - INIT_LIST_HEAD(&nd->queue); - - spin_lock_irq(q->queue_lock); - q->elevator = eq; - spin_unlock_irq(q->queue_lock); - return 0; -} - -static void noop_exit_queue(struct elevator_queue *e) -{ - struct noop_data *nd = e->elevator_data; - - BUG_ON(!list_empty(&nd->queue)); - kfree(nd); -} - -static struct elevator_type elevator_noop = { - .ops.sq = { - .elevator_merge_req_fn = noop_merged_requests, - .elevator_dispatch_fn = noop_dispatch, - .elevator_add_req_fn = noop_add_request, - .elevator_former_req_fn = noop_former_request, - .elevator_latter_req_fn = noop_latter_request, - .elevator_init_fn = noop_init_queue, - .elevator_exit_fn = noop_exit_queue, - }, - .elevator_name = "noop", - .elevator_owner = THIS_MODULE, -}; - -static int __init noop_init(void) -{ - return elv_register(&elevator_noop); -} - -static void __exit noop_exit(void) -{ - elv_unregister(&elevator_noop); -} - -module_init(noop_init); -module_exit(noop_exit); - - -MODULE_AUTHOR("Jens Axboe"); -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("No-op IO scheduler"); diff --git a/block/partition-generic.c b/block/partition-generic.c index d3d14e81fb12..8e596a8dff32 100644 --- a/block/partition-generic.c +++ b/block/partition-generic.c @@ -120,13 +120,9 @@ ssize_t part_stat_show(struct device *dev, { struct hd_struct *p = dev_to_part(dev); struct request_queue *q = part_to_disk(p)->queue; - unsigned int inflight[2]; - int cpu; + unsigned int inflight; - cpu = part_stat_lock(); - part_round_stats(q, cpu, p); - part_stat_unlock(); - part_in_flight(q, p, inflight); + inflight = part_in_flight(q, p); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -141,7 +137,7 @@ ssize_t part_stat_show(struct device *dev, part_stat_read(p, merges[STAT_WRITE]), (unsigned long long)part_stat_read(p, sectors[STAT_WRITE]), (unsigned int)part_stat_read_msecs(p, STAT_WRITE), - inflight[0], + inflight, jiffies_to_msecs(part_stat_read(p, io_ticks)), jiffies_to_msecs(part_stat_read(p, time_in_queue)), part_stat_read(p, ios[STAT_DISCARD]), @@ -249,9 +245,10 @@ struct device_type part_type = { .uevent = part_uevent, }; -static void delete_partition_rcu_cb(struct rcu_head *head) +static void delete_partition_work_fn(struct work_struct *work) { - struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); + struct hd_struct *part = container_of(to_rcu_work(work), struct hd_struct, + rcu_work); part->start_sect = 0; part->nr_sects = 0; @@ -262,7 +259,8 @@ static void delete_partition_rcu_cb(struct rcu_head *head) void __delete_partition(struct percpu_ref *ref) { struct hd_struct *part = container_of(ref, struct hd_struct, ref); - call_rcu(&part->rcu_head, delete_partition_rcu_cb); + INIT_RCU_WORK(&part->rcu_work, delete_partition_work_fn); + queue_rcu_work(system_wq, &part->rcu_work); } /* |