diff options
Diffstat (limited to 'drivers/nvme/host/pci.c')
-rw-r--r-- | drivers/nvme/host/pci.c | 181 |
1 files changed, 113 insertions, 68 deletions
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 5e52034ab010..26a5fd05fe88 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -43,6 +43,7 @@ #include <linux/types.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <asm/unaligned.h> +#include <linux/sed-opal.h> #include "nvme.h" @@ -50,7 +51,7 @@ #define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) - + /* * We handle AEN commands ourselves and don't even let the * block layer know about them. @@ -141,6 +142,7 @@ struct nvme_queue { * allocated to store the PRP list. */ struct nvme_iod { + struct nvme_request req; struct nvme_queue *nvmeq; int aborted; int npages; /* In the PRP list. 0 means small pool in use */ @@ -302,14 +304,14 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, static __le64 **iod_list(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - return (__le64 **)(iod->sg + req->nr_phys_segments); + return (__le64 **)(iod->sg + blk_rq_nr_phys_segments(req)); } -static int nvme_init_iod(struct request *rq, unsigned size, - struct nvme_dev *dev) +static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) { struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); - int nseg = rq->nr_phys_segments; + int nseg = blk_rq_nr_phys_segments(rq); + unsigned int size = blk_rq_payload_bytes(rq); if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); @@ -324,11 +326,11 @@ static int nvme_init_iod(struct request *rq, unsigned size, iod->nents = 0; iod->length = size; - if (!(rq->cmd_flags & REQ_DONTPREP)) { + if (!(rq->rq_flags & RQF_DONTPREP)) { rq->retries = 0; - rq->cmd_flags |= REQ_DONTPREP; + rq->rq_flags |= RQF_DONTPREP; } - return 0; + return BLK_MQ_RQ_QUEUE_OK; } static void nvme_free_iod(struct nvme_dev *dev, struct request *req) @@ -339,8 +341,6 @@ static void nvme_free_iod(struct nvme_dev *dev, struct request *req) __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; - nvme_cleanup_cmd(req); - if (iod->npages == 0) dma_pool_free(dev->prp_small_pool, list[0], prp_dma); for (i = 0; i < iod->npages; i++) { @@ -421,12 +421,11 @@ static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) } #endif -static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, - int total_len) +static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; - int length = total_len; + int length = blk_rq_payload_bytes(req); struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); @@ -502,7 +501,7 @@ static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, } static int nvme_map_data(struct nvme_dev *dev, struct request *req, - unsigned size, struct nvme_command *cmnd) + struct nvme_command *cmnd) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct request_queue *q = req->q; @@ -510,7 +509,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_TO_DEVICE : DMA_FROM_DEVICE; int ret = BLK_MQ_RQ_QUEUE_ERROR; - sg_init_table(iod->sg, req->nr_phys_segments); + sg_init_table(iod->sg, blk_rq_nr_phys_segments(req)); iod->nents = blk_rq_map_sg(q, req, iod->sg); if (!iod->nents) goto out; @@ -520,7 +519,7 @@ static int nvme_map_data(struct nvme_dev *dev, struct request *req, DMA_ATTR_NO_WARN)) goto out; - if (!nvme_setup_prps(dev, req, size)) + if (!nvme_setup_prps(dev, req)) goto out_unmap; ret = BLK_MQ_RQ_QUEUE_ERROR; @@ -566,6 +565,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) } } + nvme_cleanup_cmd(req); nvme_free_iod(dev, req); } @@ -580,7 +580,6 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_command cmnd; - unsigned map_len; int ret = BLK_MQ_RQ_QUEUE_OK; /* @@ -590,45 +589,42 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, */ if (ns && ns->ms && !blk_integrity_rq(req)) { if (!(ns->pi_type && ns->ms == 8) && - req->cmd_type != REQ_TYPE_DRV_PRIV) { + !blk_rq_is_passthrough(req)) { blk_mq_end_request(req, -EFAULT); return BLK_MQ_RQ_QUEUE_OK; } } - map_len = nvme_map_len(req); - ret = nvme_init_iod(req, map_len, dev); - if (ret) + ret = nvme_setup_cmd(ns, req, &cmnd); + if (ret != BLK_MQ_RQ_QUEUE_OK) return ret; - ret = nvme_setup_cmd(ns, req, &cmnd); - if (ret) - goto out; + ret = nvme_init_iod(req, dev); + if (ret != BLK_MQ_RQ_QUEUE_OK) + goto out_free_cmd; - if (req->nr_phys_segments) - ret = nvme_map_data(dev, req, map_len, &cmnd); + if (blk_rq_nr_phys_segments(req)) + ret = nvme_map_data(dev, req, &cmnd); - if (ret) - goto out; + if (ret != BLK_MQ_RQ_QUEUE_OK) + goto out_cleanup_iod; - cmnd.common.command_id = req->tag; blk_mq_start_request(req); spin_lock_irq(&nvmeq->q_lock); if (unlikely(nvmeq->cq_vector < 0)) { - if (ns && !test_bit(NVME_NS_DEAD, &ns->flags)) - ret = BLK_MQ_RQ_QUEUE_BUSY; - else - ret = BLK_MQ_RQ_QUEUE_ERROR; + ret = BLK_MQ_RQ_QUEUE_ERROR; spin_unlock_irq(&nvmeq->q_lock); - goto out; + goto out_cleanup_iod; } __nvme_submit_cmd(nvmeq, &cmnd); nvme_process_cq(nvmeq); spin_unlock_irq(&nvmeq->q_lock); return BLK_MQ_RQ_QUEUE_OK; -out: +out_cleanup_iod: nvme_free_iod(dev, req); +out_free_cmd: + nvme_cleanup_cmd(req); return ret; } @@ -647,7 +643,7 @@ static void nvme_complete_rq(struct request *req) return; } - if (req->cmd_type == REQ_TYPE_DRV_PRIV) + if (blk_rq_is_passthrough(req)) error = req->errors; else error = nvme_error_status(req->errors); @@ -703,23 +699,16 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) */ if (unlikely(nvmeq->qid == 0 && cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { - nvme_complete_async_event(&nvmeq->dev->ctrl, &cqe); + nvme_complete_async_event(&nvmeq->dev->ctrl, + cqe.status, &cqe.result); continue; } req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); - if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special) - memcpy(req->special, &cqe, sizeof(cqe)); + nvme_req(req)->result = cqe.result; blk_mq_complete_request(req, le16_to_cpu(cqe.status) >> 1); - } - /* If the controller ignores the cq head doorbell and continuously - * writes to the queue, it is theoretically possible to wrap around - * the queue twice and mistakenly return IRQ_NONE. Linux only - * requires that 0.1% of your interrupts are handled, so this isn't - * a big problem. - */ if (head == nvmeq->cq_head && phase == nvmeq->cq_phase) return; @@ -904,12 +893,11 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) return BLK_EH_HANDLED; } - iod->aborted = 1; - if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } + iod->aborted = 1; memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; @@ -1050,9 +1038,10 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth) + int depth, int node) { - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); + struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, + node); if (!nvmeq) return NULL; @@ -1187,6 +1176,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); + dev->admin_tagset.flags = BLK_MQ_F_NO_SCHED; dev->admin_tagset.driver_data = dev; if (blk_mq_alloc_tag_set(&dev->admin_tagset)) @@ -1228,7 +1218,8 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) nvmeq = dev->queues[0]; if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, + dev_to_node(dev->dev)); if (!nvmeq) return -ENOMEM; } @@ -1281,6 +1272,24 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts) return true; } +static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) +{ + /* Read a config register to help see what died. */ + u16 pci_status; + int result; + + result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS, + &pci_status); + if (result == PCIBIOS_SUCCESSFUL) + dev_warn(dev->dev, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n", + csts, pci_status); + else + dev_warn(dev->dev, + "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n", + csts, result); +} + static void nvme_watchdog_timer(unsigned long data) { struct nvme_dev *dev = (struct nvme_dev *)data; @@ -1289,9 +1298,7 @@ static void nvme_watchdog_timer(unsigned long data) /* Skip controllers under certain specific conditions. */ if (nvme_should_reset(dev, csts)) { if (!nvme_reset(dev)) - dev_warn(dev->dev, - "Failed status: 0x%x, reset controller.\n", - csts); + nvme_warn_reset(dev, csts); return; } @@ -1304,7 +1311,9 @@ static int nvme_create_io_queues(struct nvme_dev *dev) int ret = 0; for (i = dev->queue_count; i <= dev->max_qid; i++) { - if (!nvme_alloc_queue(dev, i, dev->q_depth)) { + /* vector == qid - 1, match nvme_create_queue */ + if (!nvme_alloc_queue(dev, i, dev->q_depth, + pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { ret = -ENOMEM; break; } @@ -1332,7 +1341,7 @@ static ssize_t nvme_cmb_show(struct device *dev, { struct nvme_dev *ndev = to_nvme_dev(dev_get_drvdata(dev)); - return snprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", + return scnprintf(buf, PAGE_SIZE, "cmbloc : x%08x\ncmbsz : x%08x\n", ndev->cmbloc, ndev->cmbsz); } static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); @@ -1666,21 +1675,34 @@ static void nvme_pci_disable(struct nvme_dev *dev) static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { int i, queues; - u32 csts = -1; + bool dead = true; + struct pci_dev *pdev = to_pci_dev(dev->dev); del_timer_sync(&dev->watchdog_timer); mutex_lock(&dev->shutdown_lock); - if (pci_is_enabled(to_pci_dev(dev->dev))) { - nvme_stop_queues(&dev->ctrl); - csts = readl(dev->bar + NVME_REG_CSTS); + if (pci_is_enabled(pdev)) { + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + if (dev->ctrl.state == NVME_CTRL_LIVE) + nvme_start_freeze(&dev->ctrl); + dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) || + pdev->error_state != pci_channel_io_normal); } + /* + * Give the controller a chance to complete all entered requests if + * doing a safe shutdown. + */ + if (!dead && shutdown) + nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT); + nvme_stop_queues(&dev->ctrl); + queues = dev->online_queues - 1; for (i = dev->queue_count - 1; i > 0; i--) nvme_suspend_queue(dev->queues[i]); - if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { + if (dead) { /* A device might become IO incapable very soon during * probe, before the admin queue is configured. Thus, * queue_count can be 0 here. @@ -1695,6 +1717,14 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); + + /* + * The driver will not be starting up queues again if shutting down so + * must flush all entered requests to their failed completion to avoid + * deadlocking blk-mq hot-cpu notifier. + */ + if (shutdown) + nvme_start_queues(&dev->ctrl); mutex_unlock(&dev->shutdown_lock); } @@ -1731,6 +1761,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) if (dev->ctrl.admin_q) blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); + free_opal_dev(dev->ctrl.opal_dev); kfree(dev); } @@ -1747,6 +1778,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status) static void nvme_reset_work(struct work_struct *work) { struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL); int result = -ENODEV; if (WARN_ON(dev->ctrl.state == NVME_CTRL_RESETTING)) @@ -1779,6 +1811,17 @@ static void nvme_reset_work(struct work_struct *work) if (result) goto out; + if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) { + if (!dev->ctrl.opal_dev) + dev->ctrl.opal_dev = + init_opal_dev(&dev->ctrl, &nvme_sec_submit); + else if (was_suspend) + opal_unlock_from_suspend(dev->ctrl.opal_dev); + } else { + free_opal_dev(dev->ctrl.opal_dev); + dev->ctrl.opal_dev = NULL; + } + result = nvme_setup_io_queues(dev); if (result) goto out; @@ -1804,7 +1847,9 @@ static void nvme_reset_work(struct work_struct *work) nvme_remove_namespaces(&dev->ctrl); } else { nvme_start_queues(&dev->ctrl); + nvme_wait_freeze(&dev->ctrl); nvme_dev_add(dev); + nvme_unfreeze(&dev->ctrl); } if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) { @@ -1892,10 +1937,10 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!dev->bar) goto release; - return 0; + return 0; release: - pci_release_mem_regions(pdev); - return -ENODEV; + pci_release_mem_regions(pdev); + return -ENODEV; } static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) @@ -1983,8 +2028,10 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); - if (!pci_device_is_present(pdev)) + if (!pci_device_is_present(pdev)) { nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD); + nvme_dev_disable(dev, false); + } flush_work(&dev->reset_work); nvme_uninit_ctrl(&dev->ctrl); @@ -2085,9 +2132,6 @@ static const struct pci_error_handlers nvme_err_handler = { .reset_notify = nvme_reset_notify, }; -/* Move to pci_ids.h later */ -#define PCI_CLASS_STORAGE_EXPRESS 0x010802 - static const struct pci_device_id nvme_id_table[] = { { PCI_VDEVICE(INTEL, 0x0953), .driver_data = NVME_QUIRK_STRIPE_SIZE | @@ -2106,6 +2150,7 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DELAY_BEFORE_CHK_RDY, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, { 0, } }; MODULE_DEVICE_TABLE(pci, nvme_id_table); |