author     Jens Axboe <axboe@fb.com>   2014-05-21 14:01:15 -0600
committer  Jens Axboe <axboe@fb.com>   2014-05-21 14:01:15 -0600
commit     484b4061e6683e0e6a09c7455f80781128dc8a6b (patch)
tree       614cc76c04c6a7e17278839d28b3e26608a63f82 /block
parent     e814e71ba4a6e1d7509b0f4b1928365ea650cace (diff)
blk-mq: save memory by freeing requests on unused hardware queues
Depending on the topology of the machine and the number of queues exposed by a device, we can end up in a situation where some of the hardware queues are unused (as in, they don't map to any software queues). For this case, free up the memory used by the request map, as we will not use it. This can be a substantial amount of memory, depending on the number of queues vs CPUs and the queue depth of the device.

Signed-off-by: Jens Axboe <axboe@fb.com>
Diffstat (limited to 'block')
-rw-r--r--   block/blk-mq.c   157
1 file changed, 105 insertions(+), 52 deletions(-)
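As a rough illustration of the idea in the commit message, here is a standalone sketch in plain C. It is not the blk-mq API; every name in it (hw_queue, remap_hw_queues, rq_map, rq_size) is made up for illustration. The point it demonstrates: after software queues are mapped to hardware queues, a hardware queue with no software queues mapped to it has its preallocated request map freed, and a queue that becomes used again (e.g. after CPU hotplug) gets one allocated on demand.

#include <stdlib.h>

/* Illustrative stand-in for a hardware queue and its request map. */
struct hw_queue {
	unsigned int nr_ctx;       /* number of software queues mapped here */
	unsigned int queue_depth;  /* requests preallocated when the queue is in use */
	size_t rq_size;            /* per-request allocation size */
	void *rq_map;              /* stand-in for the preallocated request map */
};

/*
 * After (re)mapping software queues to hardware queues: free the request
 * map of any hardware queue that ended up unused, and allocate one on
 * demand for a queue that became used again.
 */
static void remap_hw_queues(struct hw_queue *hctxs, unsigned int nr_hw_queues)
{
	unsigned int i;

	for (i = 0; i < nr_hw_queues; i++) {
		struct hw_queue *hctx = &hctxs[i];

		if (!hctx->nr_ctx) {
			free(hctx->rq_map);   /* unused: give the memory back */
			hctx->rq_map = NULL;
		} else if (!hctx->rq_map) {
			hctx->rq_map = calloc(hctx->queue_depth, hctx->rq_size);
		}
	}
}

With, say, 32 hardware queues of depth 256 and only a handful actually mapped to CPUs, the request maps of the unmapped queues are what this patch reclaims.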
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 5a3683fc5bd..103aa1dbc00 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -597,8 +597,16 @@ static void blk_mq_rq_timer(unsigned long data)
unsigned long next = 0;
int i, next_set = 0;
- queue_for_each_hw_ctx(q, hctx, i)
+ queue_for_each_hw_ctx(q, hctx, i) {
+ /*
+ * If no software queues are currently mapped to this
+ * hardware queue, there's nothing to check
+ */
+ if (!hctx->nr_ctx || !hctx->tags)
+ continue;
+
blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);
+ }
if (next_set) {
next = blk_rq_timeout(round_jiffies_up(next));
@@ -1196,53 +1204,6 @@ void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
}
EXPORT_SYMBOL(blk_mq_free_single_hw_queue);
-static int blk_mq_hctx_notify(void *data, unsigned long action,
- unsigned int cpu)
-{
- struct blk_mq_hw_ctx *hctx = data;
- struct request_queue *q = hctx->queue;
- struct blk_mq_ctx *ctx;
- LIST_HEAD(tmp);
-
- if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
- return NOTIFY_OK;
-
- /*
- * Move ctx entries to new CPU, if this one is going away.
- */
- ctx = __blk_mq_get_ctx(q, cpu);
-
- spin_lock(&ctx->lock);
- if (!list_empty(&ctx->rq_list)) {
- list_splice_init(&ctx->rq_list, &tmp);
- blk_mq_hctx_clear_pending(hctx, ctx);
- }
- spin_unlock(&ctx->lock);
-
- if (list_empty(&tmp))
- return NOTIFY_OK;
-
- ctx = blk_mq_get_ctx(q);
- spin_lock(&ctx->lock);
-
- while (!list_empty(&tmp)) {
- struct request *rq;
-
- rq = list_first_entry(&tmp, struct request, queuelist);
- rq->mq_ctx = ctx;
- list_move_tail(&rq->queuelist, &ctx->rq_list);
- }
-
- hctx = q->mq_ops->map_queue(q, ctx->cpu);
- blk_mq_hctx_mark_pending(hctx, ctx);
-
- spin_unlock(&ctx->lock);
-
- blk_mq_run_hw_queue(hctx, true);
- blk_mq_put_ctx(ctx);
- return NOTIFY_OK;
-}
-
static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
struct blk_mq_tags *tags, unsigned int hctx_idx)
{
@@ -1384,6 +1345,77 @@ static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
return 0;
}
+static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
+{
+ struct request_queue *q = hctx->queue;
+ struct blk_mq_ctx *ctx;
+ LIST_HEAD(tmp);
+
+ /*
+ * Move ctx entries to new CPU, if this one is going away.
+ */
+ ctx = __blk_mq_get_ctx(q, cpu);
+
+ spin_lock(&ctx->lock);
+ if (!list_empty(&ctx->rq_list)) {
+ list_splice_init(&ctx->rq_list, &tmp);
+ blk_mq_hctx_clear_pending(hctx, ctx);
+ }
+ spin_unlock(&ctx->lock);
+
+ if (list_empty(&tmp))
+ return NOTIFY_OK;
+
+ ctx = blk_mq_get_ctx(q);
+ spin_lock(&ctx->lock);
+
+ while (!list_empty(&tmp)) {
+ struct request *rq;
+
+ rq = list_first_entry(&tmp, struct request, queuelist);
+ rq->mq_ctx = ctx;
+ list_move_tail(&rq->queuelist, &ctx->rq_list);
+ }
+
+ hctx = q->mq_ops->map_queue(q, ctx->cpu);
+ blk_mq_hctx_mark_pending(hctx, ctx);
+
+ spin_unlock(&ctx->lock);
+
+ blk_mq_run_hw_queue(hctx, true);
+ blk_mq_put_ctx(ctx);
+ return NOTIFY_OK;
+}
+
+static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
+{
+ struct request_queue *q = hctx->queue;
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ if (set->tags[hctx->queue_num])
+ return NOTIFY_OK;
+
+ set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
+ if (!set->tags[hctx->queue_num])
+ return NOTIFY_STOP;
+
+ hctx->tags = set->tags[hctx->queue_num];
+ return NOTIFY_OK;
+}
+
+static int blk_mq_hctx_notify(void *data, unsigned long action,
+ unsigned int cpu)
+{
+ struct blk_mq_hw_ctx *hctx = data;
+
+ if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
+ return blk_mq_hctx_cpu_offline(hctx, cpu);
+ else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
+ return blk_mq_hctx_cpu_online(hctx, cpu);
+
+ return NOTIFY_OK;
+}
+
static int blk_mq_init_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set)
{
@@ -1513,6 +1545,24 @@ static void blk_mq_map_swqueue(struct request_queue *q)
}
queue_for_each_hw_ctx(q, hctx, i) {
+ /*
+ * If no software queues are mapped to this hardware queue,
+ * disable it and free the request entries
+ */
+ if (!hctx->nr_ctx) {
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ if (set->tags[i]) {
+ blk_mq_free_rq_map(set, set->tags[i], i);
+ set->tags[i] = NULL;
+ hctx->tags = NULL;
+ }
+ continue;
+ }
+
+ /*
+ * Initialize batch round-robin counts
+ */
hctx->next_cpu = cpumask_first(hctx->cpumask);
hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
}
@@ -1645,14 +1695,14 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
if (blk_mq_init_hw_queues(q, set))
goto err_flush_rq;
- blk_mq_map_swqueue(q);
-
mutex_lock(&all_q_mutex);
list_add_tail(&q->all_q_node, &all_q_list);
mutex_unlock(&all_q_mutex);
blk_mq_add_queue_tag_set(set, q);
+ blk_mq_map_swqueue(q);
+
return q;
err_flush_rq:
@@ -1790,8 +1840,11 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i;
- for (i = 0; i < set->nr_hw_queues; i++)
- blk_mq_free_rq_map(set, set->tags[i], i);
+ for (i = 0; i < set->nr_hw_queues; i++) {
+ if (set->tags[i])
+ blk_mq_free_rq_map(set, set->tags[i], i);
+ }
+
kfree(set->tags);
}
EXPORT_SYMBOL(blk_mq_free_tag_set);