From 3e453522593d74a87cf68a38e14aa36ebca1dbcd Mon Sep 17 00:00:00 2001 From: Xiao Ni Date: Wed, 22 Feb 2023 11:59:16 +0800 Subject: md: Free resources in __md_stop If md_run() fails after ->active_io is initialized, then percpu_ref_exit is called in error path. However, later md_free_disk will call percpu_ref_exit again which leads to a panic because of null pointer dereference. It can also trigger this bug when resources are initialized but are freed in error path, then will be freed again in md_free_disk. BUG: kernel NULL pointer dereference, address: 0000000000000038 Oops: 0000 [#1] PREEMPT SMP Workqueue: md_misc mddev_delayed_delete RIP: 0010:free_percpu+0x110/0x630 Call Trace: __percpu_ref_exit+0x44/0x70 percpu_ref_exit+0x16/0x90 md_free_disk+0x2f/0x80 disk_release+0x101/0x180 device_release+0x84/0x110 kobject_put+0x12a/0x380 kobject_put+0x160/0x380 mddev_delayed_delete+0x19/0x30 process_one_work+0x269/0x680 worker_thread+0x266/0x640 kthread+0x151/0x1b0 ret_from_fork+0x1f/0x30 For creating raid device, md raid calls do_md_run->md_run, dm raid calls md_run. We alloc those memory in md_run. For stopping raid device, md raid calls do_md_stop->__md_stop, dm raid calls md_stop->__md_stop. So we can free those memory resources in __md_stop. 
Fixes: 72adae23a72c ("md: Change active_io to percpu") Reported-and-tested-by: Yu Kuai Signed-off-by: Xiao Ni Signed-off-by: Song Liu --- drivers/md/md.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index 927a43db5dfb..f5480778e2f7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6256,6 +6256,11 @@ static void __md_stop(struct mddev *mddev) mddev->to_remove = &md_redundancy_group; module_put(pers->owner); clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); + + percpu_ref_exit(&mddev->writes_pending); + percpu_ref_exit(&mddev->active_io); + bioset_exit(&mddev->bio_set); + bioset_exit(&mddev->sync_set); } void md_stop(struct mddev *mddev) @@ -6265,10 +6270,6 @@ void md_stop(struct mddev *mddev) */ __md_stop_writes(mddev); __md_stop(mddev); - percpu_ref_exit(&mddev->writes_pending); - percpu_ref_exit(&mddev->active_io); - bioset_exit(&mddev->bio_set); - bioset_exit(&mddev->sync_set); } EXPORT_SYMBOL_GPL(md_stop); @@ -7839,11 +7840,6 @@ static void md_free_disk(struct gendisk *disk) { struct mddev *mddev = disk->private_data; - percpu_ref_exit(&mddev->writes_pending); - percpu_ref_exit(&mddev->active_io); - bioset_exit(&mddev->bio_set); - bioset_exit(&mddev->sync_set); - mddev_free(mddev); } -- cgit v1.2.3 From 3bc57292278a0b6ac4656cad94c14f2453344b57 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 6 Mar 2023 09:36:25 +1100 Subject: md: avoid signed overflow in slot_store() slot_store() uses kstrtouint() to get a slot number, but stores the result in an "int" variable (by casting a pointer). This can result in a negative slot number if the unsigned int value is very large. A negative number means that the slot is empty, but setting a negative slot number this way will not remove the device from the array. I don't think this is a serious problem, but it could cause confusion and it is best to fix it. 
Reported-by: Dan Carpenter Signed-off-by: NeilBrown Signed-off-by: Song Liu --- drivers/md/md.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'drivers') diff --git a/drivers/md/md.c b/drivers/md/md.c index f5480778e2f7..39e49e5d7182 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -3128,6 +3128,9 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) err = kstrtouint(buf, 10, (unsigned int *)&slot); if (err < 0) return err; + if (slot < 0) + /* overflow */ + return -ENOSPC; } if (rdev->mddev->pers && slot == -1) { /* Setting 'slot' on an active array requires also -- cgit v1.2.3 From 9b0cb770f5d7b1ff40bea7ca385438ee94570eec Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Tue, 14 Mar 2023 11:21:54 -0700 Subject: loop: Fix use-after-free issues do_req_filebacked() calls blk_mq_complete_request() synchronously or asynchronously when using asynchronous I/O unless memory allocation fails. Hence, modify loop_handle_cmd() such that it does not dereference 'cmd' nor 'rq' after do_req_filebacked() finished unless we are sure that the request has not yet been completed. 
This patch fixes the following kernel crash: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000054 Call trace: css_put.42938+0x1c/0x1ac loop_process_work+0xc8c/0xfd4 loop_rootcg_workfn+0x24/0x34 process_one_work+0x244/0x558 worker_thread+0x400/0x8fc kthread+0x16c/0x1e0 ret_from_fork+0x10/0x20 Cc: Christoph Hellwig Cc: Ming Lei Cc: Jan Kara Cc: Johannes Weiner Cc: Dan Schatzberg Fixes: c74d40e8b5e2 ("loop: charge i/o to mem and blk cg") Fixes: bc07c10a3603 ("block: loop: support DIO & AIO") Signed-off-by: Bart Van Assche Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230314182155.80625-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- drivers/block/loop.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'drivers') diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 839373451c2b..28eb59fd71ca 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1859,35 +1859,44 @@ static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx, static void loop_handle_cmd(struct loop_cmd *cmd) { + struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css; + struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css; struct request *rq = blk_mq_rq_from_pdu(cmd); const bool write = op_is_write(req_op(rq)); struct loop_device *lo = rq->q->queuedata; int ret = 0; struct mem_cgroup *old_memcg = NULL; + const bool use_aio = cmd->use_aio; if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) { ret = -EIO; goto failed; } - if (cmd->blkcg_css) - kthread_associate_blkcg(cmd->blkcg_css); - if (cmd->memcg_css) + if (cmd_blkcg_css) + kthread_associate_blkcg(cmd_blkcg_css); + if (cmd_memcg_css) old_memcg = set_active_memcg( - mem_cgroup_from_css(cmd->memcg_css)); + mem_cgroup_from_css(cmd_memcg_css)); + /* + * do_req_filebacked() may call blk_mq_complete_request() synchronously + * or asynchronously if using aio. 
Hence, do not touch 'cmd' after + * do_req_filebacked() has returned unless we are sure that 'cmd' has + * not yet been completed. + */ ret = do_req_filebacked(lo, rq); - if (cmd->blkcg_css) + if (cmd_blkcg_css) kthread_associate_blkcg(NULL); - if (cmd->memcg_css) { + if (cmd_memcg_css) { set_active_memcg(old_memcg); - css_put(cmd->memcg_css); + css_put(cmd_memcg_css); } failed: /* complete non-aio request */ - if (!cmd->use_aio || ret) { + if (!use_aio || ret) { if (ret == -EOPNOTSUPP) cmd->ret = ret; else -- cgit v1.2.3 From 63f886597085f346276e3b3c8974de0100d65f32 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 14 Mar 2023 13:11:05 +0900 Subject: block: null_blk: Fix handling of fake timeout request When injecting a fake timeout into the null_blk driver using fail_io_timeout, the request timeout handler does not execute blk_mq_complete_request(), so the complete callback is never executed for a timedout request. The null_blk driver also has a driver-specific fake timeout mechanism which does not have this problem. Fix the problem with fail_io_timeout by using the same mechanism as null_blk internal timeout feature, using the fake_timeout field of null_blk commands. 
Reported-by: Akinobu Mita Fixes: de3510e52b0a ("null_blk: fix command timeout completion handling") Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Link: https://lore.kernel.org/r/20230314041106.19173-2-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'drivers') diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 4c601ca9552a..7d95ad203c97 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1413,8 +1413,7 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd) case NULL_IRQ_SOFTIRQ: switch (cmd->nq->dev->queue_mode) { case NULL_Q_MQ: - if (likely(!blk_should_fake_timeout(cmd->rq->q))) - blk_mq_complete_request(cmd->rq); + blk_mq_complete_request(cmd->rq); break; case NULL_Q_BIO: /* @@ -1675,7 +1674,8 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, cmd->rq = bd->rq; cmd->error = BLK_STS_OK; cmd->nq = nq; - cmd->fake_timeout = should_timeout_request(bd->rq); + cmd->fake_timeout = should_timeout_request(bd->rq) || + blk_should_fake_timeout(bd->rq->q); blk_mq_start_request(bd->rq); -- cgit v1.2.3 From b6402014cab0481bdfd1ffff3e1dad714e8e1205 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Tue, 14 Mar 2023 13:11:06 +0900 Subject: block: null_blk: cleanup null_queue_rq() Use a local struct request pointer variable to avoid having to dereference struct blk_mq_queue_data multiple times. While at it, also fix the function argument indentation and remove a useless "else" after a return. 
Signed-off-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Pankaj Raghav Link: https://lore.kernel.org/r/20230314041106.19173-2-damien.lemoal@opensource.wdc.com Signed-off-by: Jens Axboe --- drivers/block/null_blk/main.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'drivers') diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 7d95ad203c97..9e6b032c8ecc 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1657,12 +1657,13 @@ static enum blk_eh_timer_return null_timeout_rq(struct request *rq) } static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) + const struct blk_mq_queue_data *bd) { - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); + struct request *rq = bd->rq; + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); struct nullb_queue *nq = hctx->driver_data; - sector_t nr_sectors = blk_rq_sectors(bd->rq); - sector_t sector = blk_rq_pos(bd->rq); + sector_t nr_sectors = blk_rq_sectors(rq); + sector_t sector = blk_rq_pos(rq); const bool is_poll = hctx->type == HCTX_TYPE_POLL; might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); @@ -1671,15 +1672,15 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); cmd->timer.function = null_cmd_timer_expired; } - cmd->rq = bd->rq; + cmd->rq = rq; cmd->error = BLK_STS_OK; cmd->nq = nq; - cmd->fake_timeout = should_timeout_request(bd->rq) || - blk_should_fake_timeout(bd->rq->q); + cmd->fake_timeout = should_timeout_request(rq) || + blk_should_fake_timeout(rq->q); - blk_mq_start_request(bd->rq); + blk_mq_start_request(rq); - if (should_requeue_request(bd->rq)) { + if (should_requeue_request(rq)) { /* * Alternate between hitting the core BUSY path, and the * driver driven requeue path @@ -1687,22 +1688,20 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx, nq->requeue_selection++; if 
(nq->requeue_selection & 1) return BLK_STS_RESOURCE; - else { - blk_mq_requeue_request(bd->rq, true); - return BLK_STS_OK; - } + blk_mq_requeue_request(rq, true); + return BLK_STS_OK; } if (is_poll) { spin_lock(&nq->poll_lock); - list_add_tail(&bd->rq->queuelist, &nq->poll_list); + list_add_tail(&rq->queuelist, &nq->poll_list); spin_unlock(&nq->poll_lock); return BLK_STS_OK; } if (cmd->fake_timeout) return BLK_STS_OK; - return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq)); + return null_handle_cmd(cmd, sector, nr_sectors, req_op(rq)); } static void cleanup_queue(struct nullb_queue *nq) -- cgit v1.2.3 From 37f0dc2ec78af0c3f35dd05578763de059f6fe77 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sat, 4 Mar 2023 07:13:45 +0800 Subject: nvme: fix handling single range discard request When investigating one customer report on warning in nvme_setup_discard, we observed the controller(nvme/tcp) actually exposes queue_max_discard_segments(req->q) == 1. Obviously the current code can't handle this situation, since contiguity merge like normal RW request is taken. Fix the issue by building range from request sector/nr_sectors directly. 
Fixes: b35ba01ea697 ("nvme: support ranged discard requests") Signed-off-by: Ming Lei Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index c2730b116dc6..d4be525f8100 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -781,16 +781,26 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req, range = page_address(ns->ctrl->discard_page); } - __rq_for_each_bio(bio, req) { - u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); - u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; - - if (n < segments) { - range[n].cattr = cpu_to_le32(0); - range[n].nlb = cpu_to_le32(nlb); - range[n].slba = cpu_to_le64(slba); + if (queue_max_discard_segments(req->q) == 1) { + u64 slba = nvme_sect_to_lba(ns, blk_rq_pos(req)); + u32 nlb = blk_rq_sectors(req) >> (ns->lba_shift - 9); + + range[0].cattr = cpu_to_le32(0); + range[0].nlb = cpu_to_le32(nlb); + range[0].slba = cpu_to_le64(slba); + n = 1; + } else { + __rq_for_each_bio(bio, req) { + u64 slba = nvme_sect_to_lba(ns, bio->bi_iter.bi_sector); + u32 nlb = bio->bi_iter.bi_size >> ns->lba_shift; + + if (n < segments) { + range[n].cattr = cpu_to_le32(0); + range[n].nlb = cpu_to_le32(nlb); + range[n].slba = cpu_to_le64(slba); + } + n++; } - n++; } if (WARN_ON_ONCE(n != segments)) { -- cgit v1.2.3 From a61d265533b7fe0026a02a49916aa564ffe38e4c Mon Sep 17 00:00:00 2001 From: Irvin Cote Date: Wed, 8 Mar 2023 18:05:08 -0300 Subject: nvme-pci: fixing memory leak in probe teardown path In case the nvme_probe teardown path is triggered the ctrl ref count does not reach 0 thus creating a memory leak upon failure of nvme_probe. 
Signed-off-by: Irvin Cote Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 1 + 1 file changed, 1 insertion(+) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5b95c94ee40f..e77a8a873b1a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3073,6 +3073,7 @@ out_dev_unmap: nvme_dev_unmap(dev); out_uninit_ctrl: nvme_uninit_ctrl(&dev->ctrl); + nvme_put_ctrl(&dev->ctrl); return result; } -- cgit v1.2.3 From 9630d80655bfe7e62e4aff2889dc4eae7ceeb887 Mon Sep 17 00:00:00 2001 From: Elmer Miroslav Mosher Golovin Date: Wed, 8 Mar 2023 19:19:29 +0300 Subject: nvme-pci: add NVME_QUIRK_BOGUS_NID for Netac NV3000 Added a quirk to fix the Netac NV3000 SSD reporting duplicate NGUIDs. Cc: Signed-off-by: Elmer Miroslav Mosher Golovin Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index e77a8a873b1a..8a536d5300a4 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3416,6 +3416,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x2646, 0x501E), /* KINGSTON OM3PGP4xxxxQ OS21011 NVMe SSD */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x1f40, 0x1202), /* Netac Technologies Co. NV3000 NVMe SSD */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1f40, 0x5236), /* Netac Technologies Co. NV7000 NVMe SSD */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1e4B, 0x1001), /* MAXIO MAP1001 */ -- cgit v1.2.3 From b65d44fa0fe072c91bf41cd8756baa2b4c77eff2 Mon Sep 17 00:00:00 2001 From: Philipp Geulen Date: Mon, 13 Mar 2023 11:11:50 +0100 Subject: nvme-pci: add NVME_QUIRK_BOGUS_NID for Lexar NM620 Added a quirk to fix Lexar NM620 1TB SSD reporting duplicate NGUIDs. 
Signed-off-by: Philipp Geulen Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8a536d5300a4..b615906263f3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3438,6 +3438,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, + { PCI_DEVICE(0x1d97, 0x1d97), /* Lexar NM620 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2269), /* Lexar NM760 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(PCI_VENDOR_ID_AMAZON, 0x0061), -- cgit v1.2.3 From a3406352c54fbc476f4f6b98159c3ea1c7dbb6fc Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 13 Mar 2023 10:56:22 +0200 Subject: nvme-tcp: fix opcode reporting in the timeout handler For non in-capsule writes we reuse the request pdu space for a h2cdata pdu in order to avoid over allocating space (either preallocate or dynamically upon receiving an r2t pdu). However if the request times out the core expects to find the opcode in the start of the request, which we overwrite. In order to prevent that, without sacrificing additional 24 bytes per request, we just use the tail of the command pdu space instead (last 24 bytes from the 72 bytes command pdu). That should make the command opcode always available, and we get away from allocating more space. If in the future we would need the last 24 bytes of the nvme command available we would need to allocate a dedicated space for it in the request, but until then we can avoid doing so. 
Reported-by: Akinobu Mita Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Tested-by: Akinobu Mita Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 7723a4989524..2e174fad57d7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -208,6 +208,18 @@ static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue) return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0; } +static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req) +{ + return req->pdu; +} + +static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req) +{ + /* use the pdu space in the back for the data pdu */ + return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) - + sizeof(struct nvme_tcp_data_pdu); +} + static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req) { if (nvme_is_fabrics(req->req.cmd)) @@ -614,7 +626,7 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue, static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req) { - struct nvme_tcp_data_pdu *data = req->pdu; + struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req); struct nvme_tcp_queue *queue = req->queue; struct request *rq = blk_mq_rq_from_pdu(req); u32 h2cdata_sent = req->pdu_len; @@ -1038,7 +1050,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) { struct nvme_tcp_queue *queue = req->queue; - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); bool inline_data = nvme_tcp_has_inline_data(req); u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) + hdgst - req->offset; @@ -1077,7 +1089,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) { 
struct nvme_tcp_queue *queue = req->queue; - struct nvme_tcp_data_pdu *pdu = req->pdu; + struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req); u8 hdgst = nvme_tcp_hdgst_len(queue); int len = sizeof(*pdu) - req->offset + hdgst; int ret; @@ -2284,7 +2296,7 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl; - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype; int qid = nvme_tcp_queue_id(req->queue); @@ -2323,7 +2335,7 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue, struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); struct nvme_command *c = &pdu->cmd; c->common.flags |= NVME_CMD_SGL_METABUF; @@ -2343,7 +2355,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns, struct request *rq) { struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq); - struct nvme_tcp_cmd_pdu *pdu = req->pdu; + struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req); struct nvme_tcp_queue *queue = req->queue; u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0; blk_status_t ret; -- cgit v1.2.3 From 7e87965d3807ab1f518ef2365f91d5ba6b0c5abe Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Mon, 13 Mar 2023 10:56:23 +0200 Subject: nvme-tcp: add nvme-tcp pdu size build protection Make sure that we don't somehow mess up the wire structures in the spec. 
Signed-off-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'drivers') diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 2e174fad57d7..42c0598c31f2 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2694,6 +2694,15 @@ static struct nvmf_transport_ops nvme_tcp_transport = { static int __init nvme_tcp_init_module(void) { + BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8); + BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72); + BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24); + BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128); + BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128); + BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24); + nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); if (!nvme_tcp_wq) -- cgit v1.2.3 From 6173a77b7e9d3e202bdb9897b23f2a8afe7bf286 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 6 Mar 2023 10:13:13 +0900 Subject: nvmet: avoid potential UAF in nvmet_req_complete() An nvme target ->queue_response() operation implementation may free the request passed as argument. Such implementation potentially could result in a use after free of the request pointer when percpu_ref_put() is called in nvmet_req_complete(). Avoid such problem by using a local variable to save the sq pointer before calling __nvmet_req_complete(), thus avoiding dereferencing the req pointer after that function call. 
Fixes: a07b4970f464 ("nvmet: add a generic NVMe target") Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index f66ed13d7c11..3935165048e7 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -756,8 +756,10 @@ static void __nvmet_req_complete(struct nvmet_req *req, u16 status) void nvmet_req_complete(struct nvmet_req *req, u16 status) { + struct nvmet_sq *sq = req->sq; + __nvmet_req_complete(req, status); - percpu_ref_put(&req->sq->ref); + percpu_ref_put(&sq->ref); } EXPORT_SYMBOL_GPL(nvmet_req_complete); -- cgit v1.2.3 From 6030363199e3a6341afb467ddddbed56640cbf6a Mon Sep 17 00:00:00 2001 From: Liang He Date: Wed, 15 Mar 2023 14:20:32 +0800 Subject: block: sunvdc: add check for mdesc_grab() returning NULL In vdc_port_probe(), we should check the return value of mdesc_grab() as it may return NULL, which can cause potential NPD bug. 
Fixes: 43fdf27470b2 ("[SPARC64]: Abstract out mdesc accesses for better MD update handling.") Signed-off-by: Liang He Link: https://lore.kernel.org/r/20230315062032.1741692-1-windhl@126.com [axboe: style cleanup] Signed-off-by: Jens Axboe --- drivers/block/sunvdc.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index fb855da971ee..9fa821fa76b0 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -972,6 +972,8 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) print_version(); hp = mdesc_grab(); + if (!hp) + return -ENODEV; err = -ENODEV; if ((vdev->dev_no << PARTITION_SHIFT) & ~(u64)MINORMASK) { -- cgit v1.2.3 From 5f27571382ca42daa3e3d40d1b252bf18c2b61d2 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 23 Feb 2023 17:12:26 +0800 Subject: block: count 'ios' and 'sectors' when io is done for bio-based device While using iostat for raid, I observed very strange 'await' occasionally, and it turns out this is because 'ios' and 'sectors' are counted in bdev_start_io_acct(), while 'nsecs' is counted in bdev_end_io_acct(). I'm not sure why they are counted like that but I think this behaviour is obviously wrong because user will get wrong disk stats. Fix the problem by counting 'ios' and 'sectors' when io is done, like what rq-based device does. 
Fixes: 394ffa503bc4 ("blk: introduce generic io stat accounting help function") Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230223091226.1135678-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk-core.c | 16 ++++++---------- drivers/md/dm.c | 6 +++--- drivers/nvme/host/multipath.c | 8 ++++---- include/linux/blkdev.h | 5 ++--- 4 files changed, 15 insertions(+), 20 deletions(-) (limited to 'drivers') diff --git a/block/blk-core.c b/block/blk-core.c index 9e5e0277a4d9..42926e6cb83c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -959,16 +959,11 @@ again: } } -unsigned long bdev_start_io_acct(struct block_device *bdev, - unsigned int sectors, enum req_op op, +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time) { - const int sgrp = op_stat_group(op); - part_stat_lock(); update_io_ticks(bdev, start_time, false); - part_stat_inc(bdev, ios[sgrp]); - part_stat_add(bdev, sectors[sgrp], sectors); part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); @@ -984,13 +979,12 @@ EXPORT_SYMBOL(bdev_start_io_acct); */ unsigned long bio_start_io_acct(struct bio *bio) { - return bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio), - bio_op(bio), jiffies); + return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies); } EXPORT_SYMBOL_GPL(bio_start_io_acct); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, - unsigned long start_time) + unsigned int sectors, unsigned long start_time) { const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); @@ -998,6 +992,8 @@ void bdev_end_io_acct(struct block_device *bdev, enum req_op op, part_stat_lock(); update_io_ticks(bdev, now, true); + part_stat_inc(bdev, ios[sgrp]); + part_stat_add(bdev, sectors[sgrp], sectors); part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); @@ -1007,7 +1003,7 @@ 
EXPORT_SYMBOL(bdev_end_io_acct); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, struct block_device *orig_bdev) { - bdev_end_io_acct(orig_bdev, bio_op(bio), start_time); + bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time); } EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index eace45a18d45..f5cc330bb549 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -512,10 +512,10 @@ static void dm_io_acct(struct dm_io *io, bool end) sectors = io->sectors; if (!end) - bdev_start_io_acct(bio->bi_bdev, sectors, bio_op(bio), - start_time); + bdev_start_io_acct(bio->bi_bdev, bio_op(bio), start_time); else - bdev_end_io_acct(bio->bi_bdev, bio_op(bio), start_time); + bdev_end_io_acct(bio->bi_bdev, bio_op(bio), sectors, + start_time); if (static_branch_unlikely(&stats_enabled) && unlikely(dm_stats_used(&md->stats))) { diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index fc39d01e7b63..9171452e2f6d 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -123,9 +123,8 @@ void nvme_mpath_start_request(struct request *rq) return; nvme_req(rq)->flags |= NVME_MPATH_IO_STATS; - nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, - blk_rq_bytes(rq) >> SECTOR_SHIFT, - req_op(rq), jiffies); + nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0, req_op(rq), + jiffies); } EXPORT_SYMBOL_GPL(nvme_mpath_start_request); @@ -136,7 +135,8 @@ void nvme_mpath_end_request(struct request *rq) if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS)) return; bdev_end_io_acct(ns->head->disk->part0, req_op(rq), - nvme_req(rq)->start_time); + blk_rq_bytes(rq) >> SECTOR_SHIFT, + nvme_req(rq)->start_time); } void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d1aee08f8c18..941304f17492 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1446,11 +1446,10 @@ static inline void 
blk_wake_io_task(struct task_struct *waiter) wake_up_process(waiter); } -unsigned long bdev_start_io_acct(struct block_device *bdev, - unsigned int sectors, enum req_op op, +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, unsigned long start_time); void bdev_end_io_acct(struct block_device *bdev, enum req_op op, - unsigned long start_time); + unsigned int sectors, unsigned long start_time); unsigned long bio_start_io_acct(struct bio *bio); void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, -- cgit v1.2.3 From 6c0f5898836c05c6d850a750ed7940ba29e4e6c5 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 13 Mar 2023 13:29:17 -0700 Subject: md: select BLOCK_LEGACY_AUTOLOAD When BLOCK_LEGACY_AUTOLOAD is not enabled, mdadm is not able to activate new arrays unless "CREATE names=yes" appears in mdadm.conf. As this is a regression we need to always enable BLOCK_LEGACY_AUTOLOAD when MD is selected - at least until mdadm is updated and the updates are widely available. Cc: stable@vger.kernel.org # v5.18+ Fixes: fbdee71bb5d8 ("block: deprecate autoloading based on dev_t") Signed-off-by: NeilBrown Signed-off-by: Song Liu --- drivers/md/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'drivers') diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 998a5cfdbc4e..662d219c39bf 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -16,6 +16,10 @@ if MD config BLK_DEV_MD tristate "RAID support" select BLOCK_HOLDER_DEPRECATED if SYSFS + # BLOCK_LEGACY_AUTOLOAD requirement should be removed + # after relevant mdadm enhancements - to make "names=yes" + # the default - are widely available. + select BLOCK_LEGACY_AUTOLOAD help This driver lets you combine several hard disk partitions into one logical block device. This can be used to simply append one -- cgit v1.2.3