diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-17 16:57:47 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2019-09-17 16:57:47 -0700 |
commit | 7ad67ca5534ee7c958559c4ad610f05c4578e361 (patch) | |
tree | dc6b6a8a6b70b5f25b07bcdc06d8e77e705f6822 /drivers/nvme/host | |
parent | 5260c2b863ef1152445ce93476c95d8c8a727eef (diff) | |
parent | 9c7eddf1b080f98fed1aadb74fe784f29bf77a08 (diff) |
Merge tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
- Two NVMe pull requests:
- ana log parse fix from Anton
- nvme quirks support for Apple devices from Ben
- fix missing bio completion tracing for multipath stack devices
from Hannes and Mikhail
- IP TOS settings for nvme rdma and tcp transports from Israel
- rq_dma_dir cleanups from Israel
- tracing for Get LBA Status command from Minwoo
- Some nvme-tcp cleanups from Minwoo, Potnuri and Myself
- Some consolidation between the fabrics transports for handling
the CAP register
- reset race with ns scanning fix for fabrics (move fabrics
commands to a dedicated request queue with a different lifetime
from the admin request queue)."
- controller reset and namespace scan races fixes
- nvme discovery log change uevent support
- naming improvements from Keith
- multiple discovery controllers reject fix from James
- some regular cleanups from various people
- Series fixing (and re-fixing) null_blk debug printing and nr_devices
checks (André)
- A few pull requests from Song, with fixes from Andy, Guoqing,
Guilherme, Neil, Nigel, and Yufen.
- REQ_OP_ZONE_RESET_ALL support (Chaitanya)
- Bio merge handling unification (Christoph)
- Pick default elevator correctly for devices with special needs
(Damien)
- Block stats fixes (Hou)
- Timeout and support devices nbd fixes (Mike)
- Series fixing races around elevator switching and device add/remove
(Ming)
- sed-opal cleanups (Revanth)
- Per device weight support for BFQ (Fam)
- Support for blk-iocost, a new model that can properly account cost of
IO workloads. (Tejun)
- blk-cgroup writeback fixes (Tejun)
- paride queue init fixes (zhengbin)
- blk_set_runtime_active() cleanup (Stanley)
- Block segment mapping optimizations (Bart)
- lightnvm fixes (Hans/Minwoo/YueHaibing)
- Various little fixes and cleanups
* tag 'for-5.4/block-2019-09-16' of git://git.kernel.dk/linux-block: (186 commits)
null_blk: format pr_* logs with pr_fmt
null_blk: match the type of parameter nr_devices
null_blk: do not fail the module load with zero devices
block: also check RQF_STATS in blk_mq_need_time_stamp()
block: make rq sector size accessible for block stats
bfq: Fix bfq linkage error
raid5: use bio_end_sector in r5_next_bio
raid5: remove STRIPE_OPS_REQ_PENDING
md: add feature flag MD_FEATURE_RAID0_LAYOUT
md/raid0: avoid RAID0 data corruption due to layout confusion.
raid5: don't set STRIPE_HANDLE to stripe which is in batch list
raid5: don't increment read_errors on EILSEQ return
nvmet: fix a wrong error status returned in error log page
nvme: send discovery log page change events to userspace
nvme: add uevent variables for controller devices
nvme: enable aen regardless of the presence of I/O queues
nvme-fabrics: allow discovery subsystems accept a kato
nvmet: Use PTR_ERR_OR_ZERO() in nvmet_init_discovery()
nvme: Remove redundant assignment of cq vector
nvme: Assign subsys instance from first ctrl
...
Diffstat (limited to 'drivers/nvme/host')
-rw-r--r-- | drivers/nvme/host/Kconfig | 1 | ||||
-rw-r--r-- | drivers/nvme/host/core.c | 201 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.c | 38 | ||||
-rw-r--r-- | drivers/nvme/host/fabrics.h | 3 | ||||
-rw-r--r-- | drivers/nvme/host/fc.c | 73 | ||||
-rw-r--r-- | drivers/nvme/host/lightnvm.c | 45 | ||||
-rw-r--r-- | drivers/nvme/host/multipath.c | 8 | ||||
-rw-r--r-- | drivers/nvme/host/nvme.h | 36 | ||||
-rw-r--r-- | drivers/nvme/host/pci.c | 102 | ||||
-rw-r--r-- | drivers/nvme/host/rdma.c | 61 | ||||
-rw-r--r-- | drivers/nvme/host/tcp.c | 144 | ||||
-rw-r--r-- | drivers/nvme/host/trace.c | 18 |
12 files changed, 500 insertions, 230 deletions
diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index ec43ac9199e2..2b36f052bfb9 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -64,6 +64,7 @@ config NVME_TCP depends on INET depends on BLK_DEV_NVME select NVME_FABRICS + select CRYPTO_CRC32C help This provides support for the NVMe over Fabrics protocol using the TCP transport. This allows you to use remote block devices diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index d3d6b7bd6903..1ede1763a5ee 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -22,12 +22,12 @@ #include <linux/pm_qos.h> #include <asm/unaligned.h> -#define CREATE_TRACE_POINTS -#include "trace.h" - #include "nvme.h" #include "fabrics.h" +#define CREATE_TRACE_POINTS +#include "trace.h" + #define NVME_MINORS (1U << MINORBITS) unsigned int admin_timeout = 60; @@ -81,7 +81,6 @@ EXPORT_SYMBOL_GPL(nvme_reset_wq); struct workqueue_struct *nvme_delete_wq; EXPORT_SYMBOL_GPL(nvme_delete_wq); -static DEFINE_IDA(nvme_subsystems_ida); static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); @@ -197,9 +196,9 @@ static inline bool nvme_ns_has_pi(struct nvme_ns *ns) return ns->pi_type && ns->ms == sizeof(struct t10_pi_tuple); } -static blk_status_t nvme_error_status(struct request *req) +static blk_status_t nvme_error_status(u16 status) { - switch (nvme_req(req)->status & 0x7ff) { + switch (status & 0x7ff) { case NVME_SC_SUCCESS: return BLK_STS_OK; case NVME_SC_CAP_EXCEEDED: @@ -226,6 +225,8 @@ static blk_status_t nvme_error_status(struct request *req) return BLK_STS_PROTECTION; case NVME_SC_RESERVATION_CONFLICT: return BLK_STS_NEXUS; + case NVME_SC_HOST_PATH_ERROR: + return BLK_STS_TRANSPORT; default: return BLK_STS_IOERR; } @@ -260,7 +261,7 @@ static void nvme_retry_req(struct request *req) void nvme_complete_rq(struct request *req) { - blk_status_t status = nvme_error_status(req); + blk_status_t status = nvme_error_status(nvme_req(req)->status); trace_nvme_complete_rq(req); @@ -279,6 +280,8 @@ void nvme_complete_rq(struct request *req) return; } } + + nvme_trace_bio_complete(req, status); blk_mq_end_request(req, status); } EXPORT_SYMBOL_GPL(nvme_complete_rq); @@ -288,8 +291,12 @@ bool nvme_cancel_request(struct request *req, void *data, bool reserved) dev_dbg_ratelimited(((struct nvme_ctrl *) data)->device, "Cancelling I/O %d", req->tag); - nvme_req(req)->status = NVME_SC_ABORT_REQ; - blk_mq_complete_request_sync(req); + /* don't abort one completed request */ + if (blk_mq_request_completed(req)) + return true; + + nvme_req(req)->status = NVME_SC_HOST_PATH_ERROR; + blk_mq_complete_request(req); return true; } EXPORT_SYMBOL_GPL(nvme_cancel_request); @@ -1088,10 +1095,9 @@ static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *n NVME_IDENTIFY_DATA_SIZE); } -static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, - unsigned nsid) +static int nvme_identify_ns(struct nvme_ctrl *ctrl, + unsigned nsid, struct nvme_id_ns **id) { - struct nvme_id_ns *id; struct nvme_command c = { }; int error; @@ -1100,18 +1106,17 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl, c.identify.nsid = cpu_to_le32(nsid); c.identify.cns = NVME_ID_CNS_NS; - id = kmalloc(sizeof(*id), GFP_KERNEL); - if (!id) - return NULL; + *id = kmalloc(sizeof(**id), GFP_KERNEL); + if (!*id) + return -ENOMEM; - error = nvme_submit_sync_cmd(ctrl->admin_q, &c, id, sizeof(*id)); + error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); - kfree(id); - return NULL; + kfree(*id); } - return id; + return error; } static int nvme_features(struct nvme_ctrl *dev, u8 op, unsigned int fid, @@ -1180,7 +1185,8 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) EXPORT_SYMBOL_GPL(nvme_set_queue_count); #define NVME_AEN_SUPPORTED \ - (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | NVME_AEN_CFG_ANA_CHANGE) + (NVME_AEN_CFG_NS_ATTR | NVME_AEN_CFG_FW_ACT | \ + NVME_AEN_CFG_ANA_CHANGE | NVME_AEN_CFG_DISC_CHANGE) static void nvme_enable_aen(struct nvme_ctrl *ctrl) { @@ -1195,6 +1201,8 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl) if (status) dev_warn(ctrl->device, "Failed to configure AEN (cfg %x)\n", supported_aens); + + queue_work(nvme_wq, &ctrl->async_event_work); } static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) @@ -1594,9 +1602,11 @@ static void nvme_config_write_zeroes(struct gendisk *disk, struct nvme_ns *ns) blk_queue_max_write_zeroes_sectors(disk->queue, max_sectors); } -static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, +static int nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, struct nvme_id_ns *id, struct nvme_ns_ids *ids) { + int ret = 0; + memset(ids, 0, sizeof(*ids)); if (ctrl->vs >= NVME_VS(1, 1, 0)) @@ -1607,10 +1617,12 @@ static void nvme_report_ns_ids(struct nvme_ctrl *ctrl, unsigned int nsid, /* Don't treat error as fatal we potentially * already have a NGUID or EUI-64 */ - if (nvme_identify_ns_descs(ctrl, nsid, ids)) + ret = nvme_identify_ns_descs(ctrl, nsid, ids); + if (ret) dev_warn(ctrl->device, - "%s: Identify Descriptors failed\n", __func__); + "Identify Descriptors failed (%d)\n", ret); } + return ret; } static bool nvme_ns_ids_valid(struct nvme_ns_ids *ids) @@ -1738,25 +1750,37 @@ static int nvme_revalidate_disk(struct gendisk *disk) return -ENODEV; } - id = nvme_identify_ns(ctrl, ns->head->ns_id); - if (!id) - return -ENODEV; + ret = nvme_identify_ns(ctrl, ns->head->ns_id, &id); + if (ret) + goto out; if (id->ncap == 0) { ret = -ENODEV; - goto out; + goto free_id; } __nvme_revalidate_disk(disk, id); - nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + ret = nvme_report_ns_ids(ctrl, ns->head->ns_id, id, &ids); + if (ret) + goto free_id; + if (!nvme_ns_ids_equal(&ns->head->ids, &ids)) { dev_err(ctrl->device, "identifiers changed for nsid %d\n", ns->head->ns_id); ret = -ENODEV; } -out: +free_id: kfree(id); +out: + /* + * Only fail the function if we got a fatal error back from the + * device, otherwise ignore the error and just move on. + */ + if (ret == -ENOMEM || (ret > 0 && !(ret & NVME_SC_DNR))) + ret = 0; + else if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -1952,7 +1976,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) * bits', but doing so may cause the device to complete commands to the * admin queue ... and we don't know what memory that might be pointing at! */ -int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +int nvme_disable_ctrl(struct nvme_ctrl *ctrl) { int ret; @@ -1966,20 +1990,27 @@ int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY) msleep(NVME_QUIRK_DELAY_AMOUNT); - return nvme_wait_ready(ctrl, cap, false); + return nvme_wait_ready(ctrl, ctrl->cap, false); } EXPORT_SYMBOL_GPL(nvme_disable_ctrl); -int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +int nvme_enable_ctrl(struct nvme_ctrl *ctrl) { /* * Default to a 4K page size, with the intention to update this * path in the future to accomodate architectures with differing * kernel and IO page sizes. */ - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + unsigned dev_page_min, page_shift = 12; int ret; + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); + if (ret) { + dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); + return ret; + } + dev_page_min = NVME_CAP_MPSMIN(ctrl->cap) + 12; + if (page_shift < dev_page_min) { dev_err(ctrl->device, "Minimum device page size %u too large for host (%u)\n", @@ -1998,7 +2029,7 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); if (ret) return ret; - return nvme_wait_ready(ctrl, cap, true); + return nvme_wait_ready(ctrl, ctrl->cap, true); } EXPORT_SYMBOL_GPL(nvme_enable_ctrl); @@ -2332,7 +2363,8 @@ static void nvme_release_subsystem(struct device *dev) struct nvme_subsystem *subsys = container_of(dev, struct nvme_subsystem, dev); - ida_simple_remove(&nvme_subsystems_ida, subsys->instance); + if (subsys->instance >= 0) + ida_simple_remove(&nvme_instance_ida, subsys->instance); kfree(subsys); } @@ -2361,6 +2393,17 @@ static struct nvme_subsystem *__nvme_find_get_subsystem(const char *subsysnqn) lockdep_assert_held(&nvme_subsystems_lock); + /* + * Fail matches for discovery subsystems. This results + * in each discovery controller bound to a unique subsystem. + * This avoids issues with validating controller values + * that can only be true when there is a single unique subsystem. + * There may be multiple and completely independent entities + * that provide discovery controllers. + */ + if (!strcmp(subsysnqn, NVME_DISC_SUBSYS_NAME)) + return NULL; + list_for_each_entry(subsys, &nvme_subsystems, entry) { if (strcmp(subsys->subnqn, subsysnqn)) continue; @@ -2461,12 +2504,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); if (!subsys) return -ENOMEM; - ret = ida_simple_get(&nvme_subsystems_ida, 0, 0, GFP_KERNEL); - if (ret < 0) { - kfree(subsys); - return ret; - } - subsys->instance = ret; + + subsys->instance = -1; mutex_init(&subsys->lock); kref_init(&subsys->ref); INIT_LIST_HEAD(&subsys->ctrls); @@ -2485,7 +2524,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) subsys->dev.class = nvme_subsys_class; subsys->dev.release = nvme_release_subsystem; subsys->dev.groups = nvme_subsys_attrs_groups; - dev_set_name(&subsys->dev, "nvme-subsys%d", subsys->instance); + dev_set_name(&subsys->dev, "nvme-subsys%d", ctrl->instance); device_initialize(&subsys->dev); mutex_lock(&nvme_subsystems_lock); @@ -2517,6 +2556,8 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) goto out_put_subsystem; } + if (!found) + subsys->instance = ctrl->instance; ctrl->subsys = subsys; list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); mutex_unlock(&nvme_subsystems_lock); @@ -2574,7 +2615,6 @@ static int nvme_get_effects_log(struct nvme_ctrl *ctrl) int nvme_init_identify(struct nvme_ctrl *ctrl) { struct nvme_id_ctrl *id; - u64 cap; int ret, page_shift; u32 max_hw_sectors; bool prev_apst_enabled; @@ -2584,16 +2624,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl) dev_err(ctrl->device, "Reading VS failed (%d)\n", ret); return ret; } - - ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); - if (ret) { - dev_err(ctrl->device, "Reading CAP failed (%d)\n", ret); - return ret; - } - page_shift = NVME_CAP_MPSMIN(cap) + 12; + page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; + ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); if (ctrl->vs >= NVME_VS(1, 1, 0)) - ctrl->subsystem = NVME_CAP_NSSRC(cap); + ctrl->subsystem = NVME_CAP_NSSRC(ctrl->cap); ret = nvme_identify_ctrl(ctrl, &id); if (ret) { @@ -3184,7 +3219,9 @@ static struct nvme_ns_head *nvme_alloc_ns_head(struct nvme_ctrl *ctrl, head->ns_id = nsid; kref_init(&head->ref); - nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + ret = nvme_report_ns_ids(ctrl, nsid, id, &head->ids); + if (ret) + goto out_cleanup_srcu; ret = __nvme_check_ids(ctrl->subsys, head); if (ret) { @@ -3209,6 +3246,8 @@ out_ida_remove: out_free_head: kfree(head); out: + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ERR_PTR(ret); } @@ -3232,7 +3271,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, } else { struct nvme_ns_ids ids; - nvme_report_ns_ids(ctrl, nsid, id, &ids); + ret = nvme_report_ns_ids(ctrl, nsid, id, &ids); + if (ret) + goto out_unlock; + if (!nvme_ns_ids_equal(&head->ids, &ids)) { dev_err(ctrl->device, "IDs don't match for shared namespace %d\n", @@ -3247,6 +3289,8 @@ static int nvme_init_ns_head(struct nvme_ns *ns, unsigned nsid, out_unlock: mutex_unlock(&ctrl->subsys->lock); + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -3338,11 +3382,9 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); - id = nvme_identify_ns(ctrl, nsid); - if (!id) { - ret = -EIO; + ret = nvme_identify_ns(ctrl, nsid, &id); + if (ret) goto out_free_queue; - } if (id->ncap == 0) { ret = -EINVAL; @@ -3404,6 +3446,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) blk_cleanup_queue(ns->queue); out_free_ns: kfree(ns); + if (ret > 0) + ret = blk_status_to_errno(nvme_error_status(ret)); return ret; } @@ -3617,6 +3661,33 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_remove_namespaces); +static int nvme_class_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct nvme_ctrl *ctrl = + container_of(dev, struct nvme_ctrl, ctrl_device); + struct nvmf_ctrl_options *opts = ctrl->opts; + int ret; + + ret = add_uevent_var(env, "NVME_TRTYPE=%s", ctrl->ops->name); + if (ret) + return ret; + + if (opts) { + ret = add_uevent_var(env, "NVME_TRADDR=%s", opts->traddr); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_TRSVCID=%s", + opts->trsvcid ?: "none"); + if (ret) + return ret; + + ret = add_uevent_var(env, "NVME_HOST_TRADDR=%s", + opts->host_traddr ?: "none"); + } + return ret; +} + static void nvme_aen_uevent(struct nvme_ctrl *ctrl) { char *envp[2] = { NULL, NULL }; @@ -3723,6 +3794,9 @@ static void nvme_handle_aen_notice(struct nvme_ctrl *ctrl, u32 result) queue_work(nvme_wq, &ctrl->ana_work); break; #endif + case NVME_AER_NOTICE_DISC_CHANGED: + ctrl->aen_result = result; + break; default: dev_warn(ctrl->device, "async event result %08x\n", result); } @@ -3769,10 +3843,10 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl) if (ctrl->kato) nvme_start_keep_alive(ctrl); + nvme_enable_aen(ctrl); + if (ctrl->queue_count > 1) { nvme_queue_scan(ctrl); - nvme_enable_aen(ctrl); - queue_work(nvme_wq, &ctrl->async_event_work); nvme_start_queues(ctrl); } } @@ -3792,7 +3866,9 @@ static void nvme_free_ctrl(struct device *dev) container_of(dev, struct nvme_ctrl, ctrl_device); struct nvme_subsystem *subsys = ctrl->subsys; - ida_simple_remove(&nvme_instance_ida, ctrl->instance); + if (subsys && ctrl->instance != subsys->instance) + ida_simple_remove(&nvme_instance_ida, ctrl->instance); + kfree(ctrl->effects); nvme_mpath_uninit(ctrl); __free_page(ctrl->discard_page); @@ -3992,6 +4068,9 @@ void nvme_sync_queues(struct nvme_ctrl *ctrl) list_for_each_entry(ns, &ctrl->namespaces, list) blk_sync_queue(ns->queue); up_read(&ctrl->namespaces_rwsem); + + if (ctrl->admin_q) + blk_sync_queue(ctrl->admin_q); } EXPORT_SYMBOL_GPL(nvme_sync_queues); @@ -4050,6 +4129,7 @@ static int __init nvme_core_init(void) result = PTR_ERR(nvme_class); goto unregister_chrdev; } + nvme_class->dev_uevent = nvme_class_uevent; nvme_subsys_class = class_create(THIS_MODULE, "nvme-subsystem"); if (IS_ERR(nvme_subsys_class)) { @@ -4074,7 +4154,6 @@ out: static void __exit nvme_core_exit(void) { - ida_destroy(&nvme_subsystems_ida); class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 1994d5b42f94..74b8818ac9a1 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -150,7 +150,7 @@ int nvmf_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) cmd.prop_get.fctype = nvme_fabrics_type_property_get; cmd.prop_get.offset = cpu_to_le32(off); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, NVME_QID_ANY, 0, 0, false); if (ret >= 0) @@ -197,7 +197,7 @@ int nvmf_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) cmd.prop_get.attrib = 1; cmd.prop_get.offset = cpu_to_le32(off); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, NULL, 0, 0, NVME_QID_ANY, 0, 0, false); if (ret >= 0) @@ -243,7 +243,7 @@ int nvmf_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) cmd.prop_set.offset = cpu_to_le32(off); cmd.prop_set.value = cpu_to_le64(val); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, NULL, NULL, 0, 0, NVME_QID_ANY, 0, 0, false); if (unlikely(ret)) dev_err(ctrl->device, @@ -381,8 +381,8 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) * Set keep-alive timeout in seconds granularity (ms * 1000) * and add a grace period for controller kato enforcement */ - cmd.connect.kato = ctrl->opts->discovery_nqn ? 0 : - cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000); + cmd.connect.kato = ctrl->kato ? + cpu_to_le32((ctrl->kato + NVME_KATO_GRACE) * 1000) : 0; if (ctrl->opts->disable_sqflow) cmd.connect.cattr |= NVME_CONNECT_DISABLE_SQFLOW; @@ -396,7 +396,7 @@ int nvmf_connect_admin_queue(struct nvme_ctrl *ctrl) strncpy(data->subsysnqn, ctrl->opts->subsysnqn, NVMF_NQN_SIZE); strncpy(data->hostnqn, ctrl->opts->host->nqn, NVMF_NQN_SIZE); - ret = __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, &res, + ret = __nvme_submit_sync_cmd(ctrl->fabrics_q, &cmd, &res, data, sizeof(*data), 0, NVME_QID_ANY, 1, BLK_MQ_REQ_RESERVED | BLK_MQ_REQ_NOWAIT, false); if (ret) { @@ -611,6 +611,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_DATA_DIGEST, "data_digest" }, { NVMF_OPT_NR_WRITE_QUEUES, "nr_write_queues=%d" }, { NVMF_OPT_NR_POLL_QUEUES, "nr_poll_queues=%d" }, + { NVMF_OPT_TOS, "tos=%d" }, { NVMF_OPT_ERR, NULL } }; @@ -632,6 +633,7 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, opts->duplicate_connect = false; opts->hdr_digest = false; opts->data_digest = false; + opts->tos = -1; /* < 0 == use transport default */ options = o = kstrdup(buf, GFP_KERNEL); if (!options) @@ -738,13 +740,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, pr_warn("keep_alive_tmo 0 won't execute keep alives!!!\n"); } opts->kato = token; - - if (opts->discovery_nqn && opts->kato) { - pr_err("Discovery controllers cannot accept KATO != 0\n"); - ret = -EINVAL; - goto out; - } - break; case NVMF_OPT_CTRL_LOSS_TMO: if (match_int(args, &token)) { @@ -856,6 +851,22 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->nr_poll_queues = token; break; + case NVMF_OPT_TOS: + if (match_int(args, &token)) { + ret = -EINVAL; + goto out; + } + if (token < 0) { + pr_err("Invalid type of service %d\n", token); + ret = -EINVAL; + goto out; + } + if (token > 255) { + pr_warn("Clamping type of service to 255\n"); + token = 255; + } + opts->tos = token; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -865,7 +876,6 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } if (opts->discovery_nqn) { - opts->kato = 0; opts->nr_io_queues = 0; opts->nr_write_queues = 0; opts->nr_poll_queues = 0; diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 3044d8b99a24..93f08d77c896 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -55,6 +55,7 @@ enum { NVMF_OPT_DATA_DIGEST = 1 << 16, NVMF_OPT_NR_WRITE_QUEUES = 1 << 17, NVMF_OPT_NR_POLL_QUEUES = 1 << 18, + NVMF_OPT_TOS = 1 << 19, }; /** @@ -87,6 +88,7 @@ enum { * @data_digest: generate/verify data digest (TCP) * @nr_write_queues: number of queues for write I/O * @nr_poll_queues: number of queues for polling I/O + * @tos: type of service */ struct nvmf_ctrl_options { unsigned mask; @@ -108,6 +110,7 @@ struct nvmf_ctrl_options { bool data_digest; unsigned int nr_write_queues; unsigned int nr_poll_queues; + int tos; }; /* diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 232d8094091b..265f89e11d8b 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -1608,9 +1608,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) sizeof(op->rsp_iu), DMA_FROM_DEVICE); if (opstate == FCPOP_STATE_ABORTED) - status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - else if (freq->status) - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + else if (freq->status) { + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to lldd error %d\n", + ctrl->cnum, freq->status); + } /* * For the linux implementation, if we have an unsuccesful @@ -1637,8 +1641,13 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) * no payload in the CQE by the transport. */ if (freq->transferred_length != - be32_to_cpu(op->cmd_iu.data_len)) { - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + be32_to_cpu(op->cmd_iu.data_len)) { + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to bad transfer " + "length: %d vs expected %d\n", + ctrl->cnum, freq->transferred_length, + be32_to_cpu(op->cmd_iu.data_len)); goto done; } result.u64 = 0; @@ -1655,7 +1664,17 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) freq->transferred_length || op->rsp_iu.status_code || sqe->common.command_id != cqe->command_id)) { - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to bad NVMe_ERSP: " + "iu len %d, xfr len %d vs %d, status code " + "%d, cmdid %d vs %d\n", + ctrl->cnum, be16_to_cpu(op->rsp_iu.iu_len), + be32_to_cpu(op->rsp_iu.xfrd_len), + freq->transferred_length, + op->rsp_iu.status_code, + sqe->common.command_id, + cqe->command_id); goto done; } result = cqe->result; @@ -1663,7 +1682,11 @@ nvme_fc_fcpio_done(struct nvmefc_fcp_req *req) break; default: - status = cpu_to_le16(NVME_SC_INTERNAL << 1); + status = cpu_to_le16(NVME_SC_HOST_PATH_ERROR << 1); + dev_info(ctrl->ctrl.device, + "NVME-FC{%d}: io failed due to odd NVMe_xRSP iu " + "len %d\n", + ctrl->cnum, freq->rcv_rsplen); goto done; } @@ -2006,6 +2029,7 @@ nvme_fc_ctrl_free(struct kref *ref) blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(&ctrl->admin_tag_set); kfree(ctrl->queues); @@ -2107,7 +2131,6 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, struct nvme_fc_fcp_op *op) { struct nvmefc_fcp_req *freq = &op->fcp_req; - enum dma_data_direction dir; int ret; freq->sg_cnt = 0; @@ -2124,9 +2147,8 @@ nvme_fc_map_data(struct nvme_fc_ctrl *ctrl, struct request *rq, op->nents = blk_rq_map_sg(rq->q, rq, freq->sg_table.sgl); WARN_ON(op->nents > blk_rq_nr_phys_segments(rq)); - dir = (rq_data_dir(rq) == WRITE) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; freq->sg_cnt = fc_dma_map_sg(ctrl->lport->dev, freq->sg_table.sgl, - op->nents, dir); + op->nents, rq_dma_dir(rq)); if (unlikely(freq->sg_cnt <= 0)) { sg_free_table_chained(&freq->sg_table, SG_CHUNK_SIZE); freq->sg_cnt = 0; @@ -2149,8 +2171,7 @@ nvme_fc_unmap_data(struct nvme_fc_ctrl *ctrl, struct request *rq, return; fc_dma_unmap_sg(ctrl->lport->dev, freq->sg_table.sgl, op->nents, - ((rq_data_dir(rq) == WRITE) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE)); + rq_dma_dir(rq)); nvme_cleanup_cmd(rq); @@ -2633,8 +2654,6 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) if (ret) goto out_delete_hw_queue; - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); - ret = nvmf_connect_admin_queue(&ctrl->ctrl); if (ret) goto out_disconnect_admin_queue; @@ -2648,23 +2667,15 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) * prior connection values */ - ret = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->ctrl.cap); - if (ret) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_disconnect_admin_queue; - } - - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + ret = nvme_enable_ctrl(&ctrl->ctrl); if (ret) goto out_disconnect_admin_queue; ctrl->ctrl.max_hw_sectors = (ctrl->lport->ops->max_sgl_segments - 1) << (PAGE_SHIFT - 9); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + ret = nvme_init_identify(&ctrl->ctrl); if (ret) goto out_disconnect_admin_queue; @@ -2774,6 +2785,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) nvme_stop_queues(&ctrl->ctrl); blk_mq_tagset_busy_iter(&ctrl->tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->tag_set); } /* @@ -2796,6 +2808,7 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) blk_mq_quiesce_queue(ctrl->ctrl.admin_q); blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, nvme_fc_terminate_exchange, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set); /* kill the aens as they are a separate path */ nvme_fc_abort_aen_ops(ctrl); @@ -3109,10 +3122,16 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts, goto out_free_queues; ctrl->ctrl.admin_tagset = &ctrl->admin_tag_set; + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + ret = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_admin_tag_set; + } + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { ret = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_admin_tag_set; + goto out_cleanup_fabrics_q; } /* @@ -3184,6 +3203,8 @@ fail_ctrl: out_cleanup_admin_q: blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + blk_cleanup_queue(ctrl->ctrl.fabrics_q); out_free_admin_tag_set: blk_mq_free_tag_set(&ctrl->admin_tag_set); out_free_queues: diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index ba009d4c9dfa..ec46693f6b64 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -667,11 +667,14 @@ static struct request *nvme_nvm_alloc_request(struct request_queue *q, return rq; } -static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd, + void *buf) { + struct nvm_geo *geo = &dev->geo; struct request_queue *q = dev->q; struct nvme_nvm_command *cmd; struct request *rq; + int ret; cmd = kzalloc(sizeof(struct nvme_nvm_command), GFP_KERNEL); if (!cmd) @@ -679,8 +682,15 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) rq = nvme_nvm_alloc_request(q, rqd, cmd); if (IS_ERR(rq)) { - kfree(cmd); - return PTR_ERR(rq); + ret = PTR_ERR(rq); + goto err_free_cmd; + } + + if (buf) { + ret = blk_rq_map_kern(q, rq, buf, geo->csecs * rqd->nr_ppas, + GFP_KERNEL); + if (ret) + goto err_free_cmd; } rq->end_io_data = rqd; @@ -688,33 +698,9 @@ static int nvme_nvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) blk_execute_rq_nowait(q, NULL, rq, 0, nvme_nvm_end_io); return 0; -} - -static int nvme_nvm_submit_io_sync(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - struct request_queue *q = dev->q; - struct request *rq; - struct nvme_nvm_command cmd; - int ret = 0; - - memset(&cmd, 0, sizeof(struct nvme_nvm_command)); - - rq = nvme_nvm_alloc_request(q, rqd, &cmd); - if (IS_ERR(rq)) - return PTR_ERR(rq); - - /* I/Os can fail and the error is signaled through rqd. Callers must - * handle the error accordingly. - */ - blk_execute_rq(q, NULL, rq, 0); - if (nvme_req(rq)->flags & NVME_REQ_CANCELLED) - ret = -EINTR; - - rqd->ppa_status = le64_to_cpu(nvme_req(rq)->result.u64); - rqd->error = nvme_req(rq)->status; - - blk_mq_free_request(rq); +err_free_cmd: + kfree(cmd); return ret; } @@ -754,7 +740,6 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .get_chk_meta = nvme_nvm_get_chk_meta, .submit_io = nvme_nvm_submit_io, - .submit_io_sync = nvme_nvm_submit_io_sync, .create_dma_pool = nvme_nvm_create_dma_pool, .destroy_dma_pool = nvme_nvm_destroy_dma_pool, diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index af831d3d15d0..30de7efef003 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -509,14 +509,16 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl, down_write(&ctrl->namespaces_rwsem); list_for_each_entry(ns, &ctrl->namespaces, list) { - if (ns->head->ns_id != le32_to_cpu(desc->nsids[n])) + unsigned nsid = le32_to_cpu(desc->nsids[n]); + + if (ns->head->ns_id < nsid) continue; - nvme_update_ns_ana_state(desc, ns); + if (ns->head->ns_id == nsid) + nvme_update_ns_ana_state(desc, ns); if (++n == nr_nsids) break; } up_write(&ctrl->namespaces_rwsem); - WARN_ON_ONCE(n < nr_nsids); return 0; } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 2d678fb968c7..b5013c101b35 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -16,6 +16,8 @@ #include <linux/fault-inject.h> #include <linux/rcupdate.h> +#include <trace/events/block.h> + extern unsigned int nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) @@ -97,6 +99,21 @@ enum nvme_quirks { * Force simple suspend/resume path. */ NVME_QUIRK_SIMPLE_SUSPEND = (1 << 10), + + /* + * Use only one interrupt vector for all queues + */ + NVME_QUIRK_SINGLE_VECTOR = (1 << 11), + + /* + * Use non-standard 128 bytes SQEs. + */ + NVME_QUIRK_128_BYTES_SQES = (1 << 12), + + /* + * Prevent tag overlap between queues + */ + NVME_QUIRK_SHARED_TAGS = (1 << 13), }; /* @@ -169,6 +186,7 @@ struct nvme_ctrl { const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; struct request_queue *connect_q; + struct request_queue *fabrics_q; struct device *dev; int instance; int numa_node; @@ -431,8 +449,8 @@ void nvme_complete_rq(struct request *req); bool nvme_cancel_request(struct request *req, void *data, bool reserved); bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl, enum nvme_ctrl_state new_state); -int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); -int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_disable_ctrl(struct nvme_ctrl *ctrl); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl); int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, const struct nvme_ctrl_ops *ops, unsigned long quirks); @@ -520,6 +538,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) kblockd_schedule_work(&head->requeue_work); } +static inline void nvme_trace_bio_complete(struct request *req, + blk_status_t status) +{ + struct nvme_ns *ns = req->q->queuedata; + + if (req->cmd_flags & REQ_NVME_MPATH) + trace_block_bio_complete(ns->head->disk->queue, + req->bio, status); +} + extern struct device_attribute dev_attr_ana_grpid; extern struct device_attribute dev_attr_ana_state; extern struct device_attribute subsys_attr_iopolicy; @@ -567,6 +595,10 @@ static inline void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl) static inline void nvme_mpath_check_last_path(struct nvme_ns *ns) { } +static inline void nvme_trace_bio_complete(struct request *req, + blk_status_t status) +{ +} static inline int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id) { diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 732d5b63ec05..6b4d7b064b38 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -28,8 +28,8 @@ #include "trace.h" #include "nvme.h" -#define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) -#define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) +#define SQ_SIZE(q) ((q)->q_depth << (q)->sqes) +#define CQ_SIZE(q) ((q)->q_depth * sizeof(struct nvme_completion)) #define SGES_PER_PAGE (PAGE_SIZE / sizeof(struct nvme_sgl_desc)) @@ -100,6 +100,7 @@ struct nvme_dev { unsigned io_queues[HCTX_MAX_TYPES]; unsigned int num_vecs; int q_depth; + int io_sqes; u32 db_stride; void __iomem *bar; unsigned long bar_mapped_size; @@ -162,7 +163,7 @@ static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) struct nvme_queue { struct nvme_dev *dev; spinlock_t sq_lock; - struct nvme_command *sq_cmds; + void *sq_cmds; /* only used for poll queues: */ spinlock_t cq_poll_lock ____cacheline_aligned_in_smp; volatile struct nvme_completion *cqes; @@ -178,6 +179,7 @@ struct nvme_queue { u16 last_cq_head; u16 qid; u8 cq_phase; + u8 sqes; unsigned long flags; #define NVMEQ_ENABLED 0 #define NVMEQ_SQ_CMB 1 @@ -488,7 +490,8 @@ static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd, bool write_sq) { spin_lock(&nvmeq->sq_lock); - memcpy(&nvmeq->sq_cmds[nvmeq->sq_tail], cmd, sizeof(*cmd)); + memcpy(nvmeq->sq_cmds + (nvmeq->sq_tail << nvmeq->sqes), + cmd, sizeof(*cmd)); if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; nvme_write_sq_db(nvmeq, write_sq); @@ -534,14 +537,13 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - enum dma_data_direction dma_dir = rq_data_dir(req) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE; const int last_prp = dev->ctrl.page_size / sizeof(__le64) - 1; dma_addr_t dma_addr = iod->first_dma, next_dma_addr; int i; if (iod->dma_len) { - dma_unmap_page(dev->dev, dma_addr, iod->dma_len, dma_dir); + dma_unmap_page(dev->dev, dma_addr, iod->dma_len, + rq_dma_dir(req)); return; } @@ -1344,16 +1346,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) static void nvme_free_queue(struct nvme_queue *nvmeq) { - dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq->q_depth), + dma_free_coherent(nvmeq->dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); if (!nvmeq->sq_cmds) return; if (test_and_clear_bit(NVMEQ_SQ_CMB, &nvmeq->flags)) { pci_free_p2pmem(to_pci_dev(nvmeq->dev->dev), - nvmeq->sq_cmds, SQ_SIZE(nvmeq->q_depth)); + nvmeq->sq_cmds, SQ_SIZE(nvmeq)); } else { - dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq->q_depth), + dma_free_coherent(nvmeq->dev->dev, SQ_SIZE(nvmeq), nvmeq->sq_cmds, nvmeq->sq_dma_addr); } } @@ -1403,7 +1405,7 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) if (shutdown) nvme_shutdown_ctrl(&dev->ctrl); else - nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); + nvme_disable_ctrl(&dev->ctrl); nvme_poll_irqdisable(nvmeq, -1); } @@ -1433,12 +1435,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, } static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, - int qid, int depth) + int qid) { struct pci_dev *pdev = to_pci_dev(dev->dev); if (qid && dev->cmb_use_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { - nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(depth)); + nvmeq->sq_cmds = pci_alloc_p2pmem(pdev, SQ_SIZE(nvmeq)); if (nvmeq->sq_cmds) { nvmeq->sq_dma_addr = pci_p2pmem_virt_to_bus(pdev, nvmeq->sq_cmds); @@ -1447,11 +1449,11 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, return 0; } - pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(depth)); + pci_free_p2pmem(pdev, nvmeq->sq_cmds, SQ_SIZE(nvmeq)); } } - nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(nvmeq), &nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) return -ENOMEM; @@ -1465,12 +1467,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) if (dev->ctrl.queue_count > qid) return 0; - nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(depth), + nvmeq->sqes = qid ? dev->io_sqes : NVME_ADM_SQES; + nvmeq->q_depth = depth; + nvmeq->cqes = dma_alloc_coherent(dev->dev, CQ_SIZE(nvmeq), &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; - if (nvme_alloc_sq_cmds(dev, nvmeq, qid, depth)) + if (nvme_alloc_sq_cmds(dev, nvmeq, qid)) goto free_cqdma; nvmeq->dev = dev; @@ -1479,15 +1483,14 @@ static int nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - nvmeq->q_depth = depth; nvmeq->qid = qid; dev->ctrl.queue_count++; return 0; free_cqdma: - dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, - nvmeq->cq_dma_addr); + dma_free_coherent(dev->dev, CQ_SIZE(nvmeq), (void *)nvmeq->cqes, + nvmeq->cq_dma_addr); free_nvmeq: return -ENOMEM; } @@ -1515,7 +1518,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); + memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq)); nvme_dbbuf_init(dev, nvmeq, qid); dev->online_queues++; wmb(); /* ensure the first interrupt sees the initialization */ @@ -1552,7 +1555,6 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid, bool polled) nvme_init_queue(nvmeq, qid); if (!polled) { - nvmeq->cq_vector = vector; result = queue_request_irq(nvmeq); if (result < 0) goto release_sq; @@ -1679,7 +1681,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap); + result = nvme_disable_ctrl(&dev->ctrl); if (result < 0) return result; @@ -1695,7 +1697,7 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - result = nvme_enable_ctrl(&dev->ctrl, dev->ctrl.cap); + result = nvme_enable_ctrl(&dev->ctrl); if (result) return result; @@ -2077,6 +2079,13 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues) dev->io_queues[HCTX_TYPE_DEFAULT] = 1; dev->io_queues[HCTX_TYPE_READ] = 0; + /* + * Some Apple controllers require all queues to use the + * first vector. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SINGLE_VECTOR) + irq_queues = 1; + return pci_alloc_irq_vectors_affinity(pdev, 1, irq_queues, PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd); } @@ -2095,6 +2104,14 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) unsigned long size; nr_io_queues = max_io_queues(); + + /* + * If tags are shared with admin queue (Apple bug), then + * make sure we only use one IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + nr_io_queues = 1; + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); if (result < 0) return result; @@ -2265,6 +2282,14 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; dev->tagset.driver_data = dev; + /* + * Some Apple controllers requires tags to be unique + * across admin and IO queue, so reserve the first 32 + * tags of the IO queue. + */ + if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) + dev->tagset.reserved_tags = NVME_AQ_DEPTH; + ret = blk_mq_alloc_tag_set(&dev->tagset); if (ret) { dev_warn(dev->ctrl.device, @@ -2314,10 +2339,21 @@ static int nvme_pci_enable(struct nvme_dev *dev) dev->q_depth = min_t(int, NVME_CAP_MQES(dev->ctrl.cap) + 1, io_queue_depth); + dev->ctrl.sqsize = dev->q_depth - 1; /* 0's based queue depth */ dev->db_stride = 1 << NVME_CAP_STRIDE(dev->ctrl.cap); dev->dbs = dev->bar + 4096; /* + * Some Apple controllers require a non-standard SQE size. + * Interestingly they also seem to ignore the CC:IOSQES register + * so we don't bother updating it here. + */ + if (dev->ctrl.quirks & NVME_QUIRK_128_BYTES_SQES) + dev->io_sqes = 7; + else + dev->io_sqes = NVME_NVM_IOSQES; + + /* * Temporary fix for the Apple controller found in the MacBook8,1 and * some MacBook7,1 to avoid controller resets and data loss. */ @@ -2334,6 +2370,18 @@ static int nvme_pci_enable(struct nvme_dev *dev) "set queue depth=%u\n", dev->q_depth); } + /* + * Controllers with the shared tags quirk need the IO queue to be + * big enough so that we get 32 tags for the admin queue + */ + if ((dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS) && + (dev->q_depth < (NVME_AQ_DEPTH + 2))) { + dev->q_depth = NVME_AQ_DEPTH + 2; + dev_warn(dev->ctrl.device, "IO queue depth clamped to %d\n", + dev->q_depth); + } + + nvme_map_cmb(dev); pci_enable_pcie_error_reporting(pdev); @@ -2401,6 +2449,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl); blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl); + blk_mq_tagset_wait_completed_request(&dev->tagset); + blk_mq_tagset_wait_completed_request(&dev->admin_tagset); /* * The driver will not be starting up queues again if shutting down so @@ -3041,6 +3091,10 @@ static const struct pci_device_id nvme_id_table[] = { { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) }, + { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2005), + .driver_data = NVME_QUIRK_SINGLE_VECTOR | + NVME_QUIRK_128_BYTES_SQES | + NVME_QUIRK_SHARED_TAGS }, { 0, } }; MODULE_DEVICE_TABLE(pci, nvme_id_table); diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 1a6449bc547b..dfa07bb9dfeb 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -757,6 +757,7 @@ static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl, { if (remove) { blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_cleanup_queue(ctrl->ctrl.fabrics_q); blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); } if (ctrl->async_event_sqe.data) { @@ -798,10 +799,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, goto out_free_async_qe; } + ctrl->ctrl.fabrics_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.fabrics_q)) { + error = PTR_ERR(ctrl->ctrl.fabrics_q); + goto out_free_tagset; + } + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); if (IS_ERR(ctrl->ctrl.admin_q)) { error = PTR_ERR(ctrl->ctrl.admin_q); - goto out_free_tagset; + goto out_cleanup_fabrics_q; } } @@ -809,24 +816,15 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl, if (error) goto out_cleanup_queue; - error = ctrl->ctrl.ops->reg_read64(&ctrl->ctrl, NVME_REG_CAP, - &ctrl->ctrl.cap); - if (error) { - dev_err(ctrl->ctrl.device, - "prop_get NVME_REG_CAP failed\n"); - goto out_stop_queue; - } - - ctrl->ctrl.sqsize = - min_t(int, NVME_CAP_MQES(ctrl->ctrl.cap), ctrl->ctrl.sqsize); - - error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + error = nvme_enable_ctrl(&ctrl->ctrl); if (error) goto out_stop_queue; ctrl->ctrl.max_hw_sectors = (ctrl->max_fr_pages - 1) << (ilog2(SZ_4K) - 9); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + error = nvme_init_identify(&ctrl->ctrl); if (error) goto out_stop_queue; @@ -838,6 +836,9 @@ out_stop_queue: out_cleanup_queue: if (new) blk_cleanup_queue(ctrl->ctrl.admin_q); +out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->ctrl.fabrics_q); out_free_tagset: if (new) blk_mq_free_tag_set(ctrl->ctrl.admin_tagset); @@ -907,10 +908,13 @@ static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl, { blk_mq_quiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_stop_queue(&ctrl->queues[0]); - if (ctrl->ctrl.admin_tagset) + if (ctrl->ctrl.admin_tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.admin_tagset, nvme_cancel_request, &ctrl->ctrl); - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + blk_mq_tagset_wait_completed_request(ctrl->ctrl.admin_tagset); + } + if (remove) + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); nvme_rdma_destroy_admin_queue(ctrl, remove); } @@ -920,9 +924,11 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl, if (ctrl->ctrl.queue_count > 1) { nvme_stop_queues(&ctrl->ctrl); nvme_rdma_stop_io_queues(ctrl); - if (ctrl->ctrl.tagset) + if (ctrl->ctrl.tagset) { blk_mq_tagset_busy_iter(ctrl->ctrl.tagset, nvme_cancel_request, &ctrl->ctrl); + blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset); + } if (remove) nvme_start_queues(&ctrl->ctrl); nvme_rdma_destroy_io_queues(ctrl, remove); @@ -1059,6 +1065,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work) nvme_rdma_teardown_io_queues(ctrl, false); nvme_start_queues(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, false); + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we're in DELETING state */ @@ -1145,9 +1152,7 @@ static void nvme_rdma_unmap_data(struct nvme_rdma_queue *queue, req->mr = NULL; } - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, - req->nents, rq_data_dir(rq) == - WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); nvme_cleanup_cmd(rq); sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE); @@ -1273,7 +1278,7 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue, req->nents = blk_rq_map_sg(rq->q, rq, req->sg_table.sgl); count = ib_dma_map_sg(ibdev, req->sg_table.sgl, req->nents, - rq_data_dir(rq) == WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + rq_dma_dir(rq)); if (unlikely(count <= 0)) { ret = -EIO; goto out_free_table; @@ -1302,9 +1307,7 @@ out: return 0; out_unmap_sg: - ib_dma_unmap_sg(ibdev, req->sg_table.sgl, - req->nents, rq_data_dir(rq) == - WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + ib_dma_unmap_sg(ibdev, req->sg_table.sgl, req->nents, rq_dma_dir(rq)); out_free_table: sg_free_table_chained(&req->sg_table, SG_CHUNK_SIZE); return ret; @@ -1547,16 +1550,18 @@ static int nvme_rdma_conn_rejected(struct nvme_rdma_queue *queue, static int nvme_rdma_addr_resolved(struct nvme_rdma_queue *queue) { + struct nvme_ctrl *ctrl = &queue->ctrl->ctrl; int ret; ret = nvme_rdma_create_queue_ib(queue); if (ret) return ret; + if (ctrl->opts->tos >= 0) + rdma_set_service_type(queue->cm_id, ctrl->opts->tos); ret = rdma_resolve_route(queue->cm_id, NVME_RDMA_CONNECT_TIMEOUT_MS); if (ret) { - dev_err(queue->ctrl->ctrl.device, - "rdma_resolve_route failed (%d).\n", + dev_err(ctrl->device, "rdma_resolve_route failed (%d).\n", queue->cm_error); goto out_destroy_queue; } @@ -1869,10 +1874,11 @@ static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&ctrl->reconnect_work); nvme_rdma_teardown_io_queues(ctrl, shutdown); + blk_mq_quiesce_queue(ctrl->ctrl.admin_q); if (shutdown) nvme_shutdown_ctrl(&ctrl->ctrl); else - nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap); + nvme_disable_ctrl(&ctrl->ctrl); nvme_rdma_teardown_admin_queue(ctrl, shutdown); } @@ -2051,7 +2057,8 @@ static struct nvmf_transport_ops nvme_rdma_transport = { .required_opts = NVMF_OPT_TRADDR, .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | - NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES, + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS, .create_ctrl = nvme_rdma_create_ctrl, }; diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 606b13d35d16..4ffd5957637a 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -13,6 +13,7 @@ #include <net/tcp.h> #include <linux/blk-mq.h> #include <crypto/hash.h> +#include <net/busy_poll.h> #include "nvme.h" #include "fabrics.h" @@ -72,6 +73,7 @@ struct nvme_tcp_queue { int pdu_offset; size_t data_remaining; size_t ddgst_remaining; + unsigned int nr_cqe; /* send state */ struct nvme_tcp_request *request; @@ -438,6 +440,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue, } nvme_end_request(rq, cqe->status, cqe->result); + queue->nr_cqe++; return 0; } @@ -608,23 +611,18 @@ static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb, switch (hdr->type) { case nvme_tcp_c2h_data: - ret = nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_c2h_data(queue, (void *)queue->pdu); case nvme_tcp_rsp: nvme_tcp_init_recv_ctx(queue); - ret = nvme_tcp_handle_comp(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_comp(queue, (void *)queue->pdu); case nvme_tcp_r2t: nvme_tcp_init_recv_ctx(queue); - ret = nvme_tcp_handle_r2t(queue, (void *)queue->pdu); - break; + return nvme_tcp_handle_r2t(queue, (void *)queue->pdu); default: dev_err(queue->ctrl->ctrl.device, "unsupported pdu type (%d)\n", hdr->type); return -EINVAL; } - - return ret; } static inline void nvme_tcp_end_request(struct request *rq, u16 status) @@ -701,8 +699,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; } else { - if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + queue->nr_cqe++; + } nvme_tcp_init_recv_ctx(queue); } } @@ -742,6 +742,7 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue, pdu->command_id); nvme_tcp_end_request(rq, NVME_SC_SUCCESS); + queue->nr_cqe++; } nvme_tcp_init_recv_ctx(queue); @@ -841,7 +842,7 @@ static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue) static void nvme_tcp_fail_request(struct nvme_tcp_request *req) { - nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_DATA_XFER_ERROR); + nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR); } static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) @@ -1023,14 +1024,16 @@ done: static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue) { - struct sock *sk = queue->sock->sk; + struct socket *sock = queue->sock; + struct sock *sk = sock->sk; read_descriptor_t rd_desc; int consumed; rd_desc.arg.data = queue; rd_desc.count = 1; lock_sock(sk); - consumed = tcp_read_sock(sk, &rd_desc, nvme_tcp_recv_skb); + queue->nr_cqe = 0; + consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb); release_sock(sk); return consumed; } @@ -1255,7 +1258,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->queue_size = queue_size; if (qid > 0) - queue->cmnd_capsule_len = ctrl->ctrl.ioccsz * 16; + queue->cmnd_capsule_len = nctrl->ioccsz * 16; else queue->cmnd_capsule_len = sizeof(struct nvme_command) + NVME_TCP_ADMIN_CCSZ; @@ -1263,7 +1266,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = sock_create(ctrl->addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &queue->sock); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to create socket: %d\n", ret); return ret; } @@ -1273,7 +1276,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_SYNCNT, (char *)&opt, sizeof(opt)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to set TCP_SYNCNT sock opt %d\n", ret); goto err_sock; } @@ -1283,7 +1286,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = kernel_setsockopt(queue->sock, IPPROTO_TCP, TCP_NODELAY, (char *)&opt, sizeof(opt)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to set TCP_NODELAY sock opt %d\n", ret); goto err_sock; } @@ -1296,11 +1299,23 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ret = kernel_setsockopt(queue->sock, SOL_SOCKET, SO_LINGER, (char *)&sol, sizeof(sol)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to set SO_LINGER sock opt %d\n", ret); goto err_sock; } + /* Set socket type of service */ + if (nctrl->opts->tos >= 0) { + opt = nctrl->opts->tos; + ret = kernel_setsockopt(queue->sock, SOL_IP, IP_TOS, + (char *)&opt, sizeof(opt)); + if (ret) { + dev_err(nctrl->device, + "failed to set IP_TOS sock opt %d\n", ret); + goto err_sock; + } + } + queue->sock->sk->sk_allocation = GFP_ATOMIC; if (!qid) n = 0; @@ -1314,11 +1329,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->pdu_offset = 0; sk_set_memalloc(queue->sock->sk); - if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR) { + if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) { ret = kernel_bind(queue->sock, (struct sockaddr *)&ctrl->src_addr, sizeof(ctrl->src_addr)); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to bind queue %d socket %d\n", qid, ret); goto err_sock; @@ -1330,7 +1345,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, if (queue->hdr_digest || queue->data_digest) { ret = nvme_tcp_alloc_crypto(queue); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to allocate queue %d crypto\n", qid); goto err_sock; } @@ -1344,13 +1359,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, goto err_crypto; } - dev_dbg(ctrl->ctrl.device, "connecting queue %d\n", + dev_dbg(nctrl->device, "connecting queue %d\n", nvme_tcp_queue_id(queue)); ret = kernel_connect(queue->sock, (struct sockaddr *)&ctrl->addr, sizeof(ctrl->addr), 0); if (ret) { - dev_err(ctrl->ctrl.device, + dev_err(nctrl->device, "failed to connect socket: %d\n", ret); goto err_rcv_pdu; } @@ -1371,6 +1386,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, queue->sock->sk->sk_data_ready = nvme_tcp_data_ready; queue->sock->sk->sk_state_change = nvme_tcp_state_change; queue->sock->sk->sk_write_space = nvme_tcp_write_space; + queue->sock->sk->sk_ll_usec = 1; write_unlock_bh(&queue->sock->sk->sk_callback_lock); return 0; @@ -1469,7 +1485,7 @@ static struct blk_mq_tag_set *nvme_tcp_alloc_tagset(struct nvme_ctrl *nctrl, set->driver_data = ctrl; set->nr_hw_queues = nctrl->queue_count - 1; set->timeout = NVME_IO_TIMEOUT; - set->nr_maps = 2 /* default + read */; + set->nr_maps = nctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2; } ret = blk_mq_alloc_tag_set(set); @@ -1568,6 +1584,7 @@ static unsigned int nvme_tcp_nr_io_queues(struct nvme_ctrl *ctrl) nr_io_queues = min(ctrl->opts->nr_io_queues, num_online_cpus()); nr_io_queues += min(ctrl->opts->nr_write_queues, num_online_cpus()); + nr_io_queues += min(ctrl->opts->nr_poll_queues, num_online_cpus()); return nr_io_queues; } @@ -1599,6 +1616,12 @@ static void nvme_tcp_set_io_queues(struct nvme_ctrl *nctrl, min(opts->nr_io_queues, nr_io_queues); nr_io_queues -= ctrl->io_queues[HCTX_TYPE_DEFAULT]; } + + if (opts->nr_poll_queues && nr_io_queues) { + /* map dedicated poll queues only if we have queues left */ + ctrl->io_queues[HCTX_TYPE_POLL] = + min(opts->nr_poll_queues, nr_io_queues); + } } static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl) @@ -1680,6 +1703,7 @@ static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove) nvme_tcp_stop_queue(ctrl, 0); if (remove) { blk_cleanup_queue(ctrl->admin_q); + blk_cleanup_queue(ctrl->fabrics_q); blk_mq_free_tag_set(ctrl->admin_tagset); } nvme_tcp_free_admin_queue(ctrl); @@ -1700,10 +1724,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) goto out_free_queue; } + ctrl->fabrics_q = blk_mq_init_queue(ctrl->admin_tagset); + if (IS_ERR(ctrl->fabrics_q)) { + error = PTR_ERR(ctrl->fabrics_q); + goto out_free_tagset; + } + ctrl->admin_q = blk_mq_init_queue(ctrl->admin_tagset); if (IS_ERR(ctrl->admin_q)) { error = PTR_ERR(ctrl->admin_q); - goto out_free_tagset; + goto out_cleanup_fabrics_q; } } @@ -1711,19 +1741,12 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new) if (error) goto out_cleanup_queue; - error = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &ctrl->cap); - if (error) { - dev_err(ctrl->device, - "prop_get NVME_REG_CAP failed\n"); - goto out_stop_queue; - } - - ctrl->sqsize = min_t(int, NVME_CAP_MQES(ctrl->cap), ctrl->sqsize); - - error = nvme_enable_ctrl(ctrl, ctrl->cap); + error = nvme_enable_ctrl(ctrl); if (error) goto out_stop_queue; + blk_mq_unquiesce_queue(ctrl->admin_q); + error = nvme_init_identify(ctrl); if (error) goto out_stop_queue; @@ -1735,6 +1758,9 @@ out_stop_queue: out_cleanup_queue: if (new) blk_cleanup_queue(ctrl->admin_q); +out_cleanup_fabrics_q: + if (new) + blk_cleanup_queue(ctrl->fabrics_q); out_free_tagset: if (new) blk_mq_free_tag_set(ctrl->admin_tagset); @@ -1748,10 +1774,13 @@ static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl, { blk_mq_quiesce_queue(ctrl->admin_q); nvme_tcp_stop_queue(ctrl, 0); - if (ctrl->admin_tagset) + if (ctrl->admin_tagset) { blk_mq_tagset_busy_iter(ctrl->admin_tagset, nvme_cancel_request, ctrl); - blk_mq_unquiesce_queue(ctrl->admin_q); + blk_mq_tagset_wait_completed_request(ctrl->admin_tagset); + } + if (remove) + blk_mq_unquiesce_queue(ctrl->admin_q); nvme_tcp_destroy_admin_queue(ctrl, remove); } @@ -1762,9 +1791,11 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl, return; nvme_stop_queues(ctrl); nvme_tcp_stop_io_queues(ctrl); - if (ctrl->tagset) + if (ctrl->tagset) { blk_mq_tagset_busy_iter(ctrl->tagset, nvme_cancel_request, ctrl); + blk_mq_tagset_wait_completed_request(ctrl->tagset); + } if (remove) nvme_start_queues(ctrl); nvme_tcp_destroy_io_queues(ctrl, remove); @@ -1793,7 +1824,7 @@ static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl) static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new) { struct nvmf_ctrl_options *opts = ctrl->opts; - int ret = -EINVAL; + int ret; ret = nvme_tcp_configure_admin_queue(ctrl, new); if (ret) @@ -1876,6 +1907,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work) /* unquiesce to fail fast pending requests */ nvme_start_queues(ctrl); nvme_tcp_teardown_admin_queue(ctrl, false); + blk_mq_unquiesce_queue(ctrl->admin_q); if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) { /* state change failure is ok if we're in DELETING state */ @@ -1892,10 +1924,11 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown) cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work); nvme_tcp_teardown_io_queues(ctrl, shutdown); + blk_mq_quiesce_queue(ctrl->admin_q); if (shutdown) nvme_shutdown_ctrl(ctrl); else - nvme_disable_ctrl(ctrl, ctrl->cap); + nvme_disable_ctrl(ctrl); nvme_tcp_teardown_admin_queue(ctrl, shutdown); } @@ -2151,14 +2184,36 @@ static int nvme_tcp_map_queues(struct blk_mq_tag_set *set) blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); blk_mq_map_queues(&set->map[HCTX_TYPE_READ]); + if (opts->nr_poll_queues && ctrl->io_queues[HCTX_TYPE_POLL]) { + /* map dedicated poll queues only if we have queues left */ + set->map[HCTX_TYPE_POLL].nr_queues = + ctrl->io_queues[HCTX_TYPE_POLL]; + set->map[HCTX_TYPE_POLL].queue_offset = + ctrl->io_queues[HCTX_TYPE_DEFAULT] + + ctrl->io_queues[HCTX_TYPE_READ]; + blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]); + } + dev_info(ctrl->ctrl.device, - "mapped %d/%d default/read queues.\n", + "mapped %d/%d/%d default/read/poll queues.\n", ctrl->io_queues[HCTX_TYPE_DEFAULT], - ctrl->io_queues[HCTX_TYPE_READ]); + ctrl->io_queues[HCTX_TYPE_READ], + ctrl->io_queues[HCTX_TYPE_POLL]); return 0; } +static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx) +{ + struct nvme_tcp_queue *queue = hctx->driver_data; + struct sock *sk = queue->sock->sk; + + if (sk_can_busy_loop(sk) && skb_queue_empty(&sk->sk_receive_queue)) + sk_busy_loop(sk, true); + nvme_tcp_try_recv(queue); + return queue->nr_cqe; +} + static struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, .complete = nvme_complete_rq, @@ -2167,6 +2222,7 @@ static struct blk_mq_ops nvme_tcp_mq_ops = { .init_hctx = nvme_tcp_init_hctx, .timeout = nvme_tcp_timeout, .map_queues = nvme_tcp_map_queues, + .poll = nvme_tcp_poll, }; static struct blk_mq_ops nvme_tcp_admin_mq_ops = { @@ -2220,7 +2276,8 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev, INIT_LIST_HEAD(&ctrl->list); ctrl->ctrl.opts = opts; - ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + 1; + ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues + + opts->nr_poll_queues + 1; ctrl->ctrl.sqsize = opts->queue_size - 1; ctrl->ctrl.kato = opts->kato; @@ -2314,7 +2371,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = { .allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO | NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST | - NVMF_OPT_NR_WRITE_QUEUES, + NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES | + NVMF_OPT_TOS, .create_ctrl = nvme_tcp_create_ctrl, }; diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c index 9778eb0406b3..5c3cb6928f3c 100644 --- a/drivers/nvme/host/trace.c +++ b/drivers/nvme/host/trace.c @@ -86,6 +86,22 @@ static const char *nvme_trace_admin_get_features(struct trace_seq *p, return ret; } +static const char *nvme_trace_get_lba_status(struct trace_seq *p, + u8 *cdw10) +{ + const char *ret = trace_seq_buffer_ptr(p); + u64 slba = get_unaligned_le64(cdw10); + u32 mndw = get_unaligned_le32(cdw10 + 8); + u16 rl = get_unaligned_le16(cdw10 + 12); + u8 atype = cdw10[15]; + + trace_seq_printf(p, "slba=0x%llx, mndw=0x%x, rl=0x%x, atype=%u", + slba, mndw, rl, atype); + trace_seq_putc(p, 0); + + return ret; +} + static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10) { const char *ret = trace_seq_buffer_ptr(p); @@ -141,6 +157,8 @@ const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, return nvme_trace_admin_identify(p, cdw10); case nvme_admin_get_features: return nvme_trace_admin_get_features(p, cdw10); + case nvme_admin_get_lba_status: + return nvme_trace_get_lba_status(p, cdw10); default: return nvme_trace_common(p, cdw10); } |