diff options
Diffstat (limited to 'drivers/nvme/target')
-rw-r--r-- | drivers/nvme/target/Kconfig | 36 | ||||
-rw-r--r-- | drivers/nvme/target/Makefile | 9 | ||||
-rw-r--r-- | drivers/nvme/target/admin-cmd.c | 465 | ||||
-rw-r--r-- | drivers/nvme/target/configfs.c | 917 | ||||
-rw-r--r-- | drivers/nvme/target/core.c | 964 | ||||
-rw-r--r-- | drivers/nvme/target/discovery.c | 221 | ||||
-rw-r--r-- | drivers/nvme/target/fabrics-cmd.c | 240 | ||||
-rw-r--r-- | drivers/nvme/target/io-cmd.c | 215 | ||||
-rw-r--r-- | drivers/nvme/target/loop.c | 754 | ||||
-rw-r--r-- | drivers/nvme/target/nvmet.h | 331 | ||||
-rw-r--r-- | drivers/nvme/target/rdma.c | 1448 |
11 files changed, 5600 insertions, 0 deletions
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig new file mode 100644 index 000000000000..a5c31cbeb481 --- /dev/null +++ b/drivers/nvme/target/Kconfig @@ -0,0 +1,36 @@ + +config NVME_TARGET + tristate "NVMe Target support" + depends on BLOCK + depends on CONFIGFS_FS + help + This enabled target side support for the NVMe protocol, that is + it allows the Linux kernel to implement NVMe subsystems and + controllers and export Linux block devices as NVMe namespaces. + You need to select at least one of the transports below to make this + functionality useful. + + To configure the NVMe target you probably want to use the nvmetcli + tool from http://git.infradead.org/users/hch/nvmetcli.git. + +config NVME_TARGET_LOOP + tristate "NVMe loopback device support" + depends on BLK_DEV_NVME + depends on NVME_TARGET + select NVME_FABRICS + select SG_POOL + help + This enables the NVMe loopback device support, which can be useful + to test NVMe host and target side features. + + If unsure, say N. + +config NVME_TARGET_RDMA + tristate "NVMe over Fabrics RDMA target support" + depends on INFINIBAND + depends on NVME_TARGET + help + This enables the NVMe RDMA target support, which allows exporting NVMe + devices over RDMA. + + If unsure, say N. diff --git a/drivers/nvme/target/Makefile b/drivers/nvme/target/Makefile new file mode 100644 index 000000000000..b7a06232c9da --- /dev/null +++ b/drivers/nvme/target/Makefile @@ -0,0 +1,9 @@ + +obj-$(CONFIG_NVME_TARGET) += nvmet.o +obj-$(CONFIG_NVME_TARGET_LOOP) += nvme-loop.o +obj-$(CONFIG_NVME_TARGET_RDMA) += nvmet-rdma.o + +nvmet-y += core.o configfs.o admin-cmd.o io-cmd.o fabrics-cmd.o \ + discovery.o +nvme-loop-y += loop.o +nvmet-rdma-y += rdma.o diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c new file mode 100644 index 000000000000..2fac17a5ad53 --- /dev/null +++ b/drivers/nvme/target/admin-cmd.c @@ -0,0 +1,465 @@ +/* + * NVMe admin command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include <linux/random.h> +#include <generated/utsrelease.h> +#include "nvmet.h" + +u32 nvmet_get_log_page_len(struct nvme_command *cmd) +{ + u32 len = le16_to_cpu(cmd->get_log_page.numdu); + + len <<= 16; + len += le16_to_cpu(cmd->get_log_page.numdl); + /* NUMD is a 0's based value */ + len += 1; + len *= sizeof(u32); + + return len; +} + +static void nvmet_execute_get_log_page(struct nvmet_req *req) +{ + size_t data_len = nvmet_get_log_page_len(req->cmd); + void *buf; + u16 status = 0; + + buf = kzalloc(data_len, GFP_KERNEL); + if (!buf) { + status = NVME_SC_INTERNAL; + goto out; + } + + switch (req->cmd->get_log_page.lid) { + case 0x01: + /* + * We currently never set the More bit in the status field, + * so all error log entries are invalid and can be zeroed out. + * This is called a minum viable implementation (TM) of this + * mandatory log page. + */ + break; + case 0x02: + /* + * XXX: fill out actual smart log + * + * We might have a hard time coming up with useful values for + * many of the fields, and even when we have useful data + * available (e.g. units or commands read/written) those aren't + * persistent over power loss. + */ + break; + case 0x03: + /* + * We only support a single firmware slot which always is + * active, so we can zero out the whole firmware slot log and + * still claim to fully implement this mandatory log page. + */ + break; + default: + BUG(); + } + + status = nvmet_copy_to_sgl(req, 0, buf, data_len); + + kfree(buf); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u64 serial; + u16 status = 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + /* XXX: figure out how to assign real vendors IDs. */ + id->vid = 0; + id->ssvid = 0; + + /* generate a random serial number as our controllers are ephemeral: */ + get_random_bytes(&serial, sizeof(serial)); + memset(id->sn, ' ', sizeof(id->sn)); + snprintf(id->sn, sizeof(id->sn), "%llx", serial); + + memset(id->mn, ' ', sizeof(id->mn)); + strncpy((char *)id->mn, "Linux", sizeof(id->mn)); + + memset(id->fr, ' ', sizeof(id->fr)); + strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + + id->rab = 6; + + /* + * XXX: figure out how we can assign a IEEE OUI, but until then + * the safest is to leave it as zeroes. + */ + + /* we support multiple ports and multiples hosts: */ + id->mic = (1 << 0) | (1 << 1); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + + /* XXX: figure out what to do about RTD3R/RTD3 */ + id->oaes = cpu_to_le32(1 << 8); + id->ctratt = cpu_to_le32(1 << 0); + + id->oacs = 0; + + /* + * We don't really have a practical limit on the number of abort + * comands. But we don't do anything useful for abort either, so + * no point in allowing more abort commands than the spec requires. + */ + id->acl = 3; + + id->aerl = NVMET_ASYNC_EVENTS - 1; + + /* first slot is read-only, only one slot supported */ + id->frmw = (1 << 0) | (1 << 1); + id->lpa = (1 << 0) | (1 << 2); + id->elpe = NVMET_ERROR_LOG_SLOTS - 1; + id->npss = 0; + + /* We support keep-alive timeout in granularity of seconds */ + id->kas = cpu_to_le16(NVMET_KAS); + + id->sqes = (0x6 << 4) | 0x6; + id->cqes = (0x4 << 4) | 0x4; + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->nn = cpu_to_le32(ctrl->subsys->max_nsid); + id->oncs = cpu_to_le16(NVME_CTRL_ONCS_DSM); + + /* XXX: don't report vwc if the underlying device is write through */ + id->vwc = NVME_CTRL_VWC_PRESENT; + + /* + * We can't support atomic writes bigger than a LBA without support + * from the backend device. + */ + id->awun = 0; + id->awupf = 0; + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + /* Max command capsule size is sqe + single page of in-capsule data */ + id->ioccsz = cpu_to_le32((sizeof(struct nvme_command) + + ctrl->ops->sqe_inline_size) / 16); + /* Max response capsule size is cqe */ + id->iorcsz = cpu_to_le32(sizeof(struct nvme_completion) / 16); + + id->msdbd = ctrl->ops->msdbd; + + /* + * Meh, we don't really support any power state. Fake up the same + * values that qemu does. + */ + id->psd[0].max_power = cpu_to_le16(0x9c4); + id->psd[0].entry_lat = cpu_to_le32(0x10); + id->psd[0].exit_lat = cpu_to_le32(0x4); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_ns(struct nvmet_req *req) +{ + struct nvmet_ns *ns; + struct nvme_id_ns *id; + u16 status = 0; + + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->identify.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS | NVME_SC_DNR; + goto out; + } + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out_put_ns; + } + + /* + * nuse = ncap = nsze isn't aways true, but we have no way to find + * that out from the underlying device. + */ + id->ncap = id->nuse = id->nsze = + cpu_to_le64(ns->size >> ns->blksize_shift); + + /* + * We just provide a single LBA format that matches what the + * underlying device reports. + */ + id->nlbaf = 0; + id->flbas = 0; + + /* + * Our namespace might always be shared. Not just with other + * controllers, but also with any other user of the block device. + */ + id->nmic = (1 << 0); + + memcpy(&id->nguid, &ns->nguid, sizeof(uuid_le)); + + id->lbaf[0].ds = ns->blksize_shift; + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out_put_ns: + nvmet_put_namespace(ns); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_nslist(struct nvmet_req *req) +{ + static const int buf_size = 4096; + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmet_ns *ns; + u32 min_nsid = le32_to_cpu(req->cmd->identify.nsid); + __le32 *list; + u16 status = 0; + int i = 0; + + list = kzalloc(buf_size, GFP_KERNEL); + if (!list) { + status = NVME_SC_INTERNAL; + goto out; + } + + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid <= min_nsid) + continue; + list[i++] = cpu_to_le32(ns->nsid); + if (i == buf_size / sizeof(__le32)) + break; + } + rcu_read_unlock(); + + status = nvmet_copy_to_sgl(req, 0, list, buf_size); + + kfree(list); +out: + nvmet_req_complete(req, status); +} + +/* + * A "mimimum viable" abort implementation: the command is mandatory in the + * spec, but we are not required to do any useful work. We couldn't really + * do a useful abort, so don't bother even with waiting for the command + * to be exectuted and return immediately telling the command to abort + * wasn't found. + */ +static void nvmet_execute_abort(struct nvmet_req *req) +{ + nvmet_set_result(req, 1); + nvmet_req_complete(req, 0); +} + +static void nvmet_execute_set_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u64 val; + u32 val32; + u16 status = 0; + + switch (cdw10 & 0xf) { + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid - 1) | ((subsys->max_qid - 1) << 16)); + break; + case NVME_FEAT_KATO: + val = le64_to_cpu(req->cmd->prop_set.value); + val32 = val & 0xffff; + req->sq->ctrl->kato = DIV_ROUND_UP(val32, 1000); + nvmet_set_result(req, req->sq->ctrl->kato); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_get_features(struct nvmet_req *req) +{ + struct nvmet_subsys *subsys = req->sq->ctrl->subsys; + u32 cdw10 = le32_to_cpu(req->cmd->common.cdw10[0]); + u16 status = 0; + + switch (cdw10 & 0xf) { + /* + * These features are mandatory in the spec, but we don't + * have a useful way to implement them. We'll eventually + * need to come up with some fake values for these. + */ +#if 0 + case NVME_FEAT_ARBITRATION: + break; + case NVME_FEAT_POWER_MGMT: + break; + case NVME_FEAT_TEMP_THRESH: + break; + case NVME_FEAT_ERR_RECOVERY: + break; + case NVME_FEAT_IRQ_COALESCE: + break; + case NVME_FEAT_IRQ_CONFIG: + break; + case NVME_FEAT_WRITE_ATOMIC: + break; + case NVME_FEAT_ASYNC_EVENT: + break; +#endif + case NVME_FEAT_VOLATILE_WC: + nvmet_set_result(req, 1); + break; + case NVME_FEAT_NUM_QUEUES: + nvmet_set_result(req, + (subsys->max_qid-1) | ((subsys->max_qid-1) << 16)); + break; + case NVME_FEAT_KATO: + nvmet_set_result(req, req->sq->ctrl->kato * 1000); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_async_event(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + mutex_lock(&ctrl->lock); + if (ctrl->nr_async_event_cmds >= NVMET_ASYNC_EVENTS) { + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_ASYNC_LIMIT | NVME_SC_DNR); + return; + } + ctrl->async_event_cmds[ctrl->nr_async_event_cmds++] = req; + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +static void nvmet_execute_keep_alive(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + + pr_debug("ctrl %d update keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + mod_delayed_work(system_wq, &ctrl->ka_work, ctrl->kato * HZ); + nvmet_req_complete(req, 0); +} + +int nvmet_parse_admin_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("nvmet: got admin cmd %d while CC.EN == 0\n", + cmd->common.opcode); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got admin cmd %d while CSTS.RDY == 0\n", + cmd->common.opcode); + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case 0x01: + case 0x02: + case 0x03: + req->execute = nvmet_execute_get_log_page; + return 0; + } + break; + case nvme_admin_identify: + req->data_len = 4096; + switch (le32_to_cpu(cmd->identify.cns)) { + case 0x00: + req->execute = nvmet_execute_identify_ns; + return 0; + case 0x01: + req->execute = nvmet_execute_identify_ctrl; + return 0; + case 0x02: + req->execute = nvmet_execute_identify_nslist; + return 0; + } + break; + case nvme_admin_abort_cmd: + req->execute = nvmet_execute_abort; + req->data_len = 0; + return 0; + case nvme_admin_set_features: + req->execute = nvmet_execute_set_features; + req->data_len = 0; + return 0; + case nvme_admin_get_features: + req->execute = nvmet_execute_get_features; + req->data_len = 0; + return 0; + case nvme_admin_async_event: + req->execute = nvmet_execute_async_event; + req->data_len = 0; + return 0; + case nvme_admin_keep_alive: + req->execute = nvmet_execute_keep_alive; + req->data_len = 0; + return 0; + } + + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c new file mode 100644 index 000000000000..af5e2dc4a3d5 --- /dev/null +++ b/drivers/nvme/target/configfs.c @@ -0,0 +1,917 @@ +/* + * Configfs interface for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/stat.h> +#include <linux/ctype.h> + +#include "nvmet.h" + +static struct config_item_type nvmet_host_type; +static struct config_item_type nvmet_subsys_type; + +/* + * nvmet_port Generic ConfigFS definitions. + * Used in any place in the ConfigFS tree that refers to an address. + */ +static ssize_t nvmet_addr_adrfam_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + return sprintf(page, "ipv4\n"); + case NVMF_ADDR_FAMILY_IP6: + return sprintf(page, "ipv6\n"); + case NVMF_ADDR_FAMILY_IB: + return sprintf(page, "ib\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_adrfam_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "ipv4")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP4; + } else if (sysfs_streq(page, "ipv6")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IP6; + } else if (sysfs_streq(page, "ib")) { + port->disc_addr.adrfam = NVMF_ADDR_FAMILY_IB; + } else { + pr_err("Invalid value '%s' for adrfam\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_adrfam); + +static ssize_t nvmet_addr_portid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%d\n", + le16_to_cpu(port->disc_addr.portid)); +} + +static ssize_t nvmet_addr_portid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + u16 portid = 0; + + if (kstrtou16(page, 0, &portid)) { + pr_err("Invalid value '%s' for portid\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + port->disc_addr.portid = cpu_to_le16(portid); + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_portid); + +static ssize_t nvmet_addr_traddr_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.traddr); +} + +static ssize_t nvmet_addr_traddr_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRADDR_SIZE) { + pr_err("Invalid value '%s' for traddr\n", page); + return -EINVAL; + } + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.traddr, + sizeof(port->disc_addr.traddr), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_traddr); + +static ssize_t nvmet_addr_treq_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.treq) { + case NVMF_TREQ_NOT_SPECIFIED: + return sprintf(page, "not specified\n"); + case NVMF_TREQ_REQUIRED: + return sprintf(page, "required\n"); + case NVMF_TREQ_NOT_REQUIRED: + return sprintf(page, "not required\n"); + default: + return sprintf(page, "\n"); + } +} + +static ssize_t nvmet_addr_treq_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "not specified")) { + port->disc_addr.treq = NVMF_TREQ_NOT_SPECIFIED; + } else if (sysfs_streq(page, "required")) { + port->disc_addr.treq = NVMF_TREQ_REQUIRED; + } else if (sysfs_streq(page, "not required")) { + port->disc_addr.treq = NVMF_TREQ_NOT_REQUIRED; + } else { + pr_err("Invalid value '%s' for treq\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_treq); + +static ssize_t nvmet_addr_trsvcid_show(struct config_item *item, + char *page) +{ + struct nvmet_port *port = to_nvmet_port(item); + + return snprintf(page, PAGE_SIZE, "%s\n", + port->disc_addr.trsvcid); +} + +static ssize_t nvmet_addr_trsvcid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (count > NVMF_TRSVCID_SIZE) { + pr_err("Invalid value '%s' for trsvcid\n", page); + return -EINVAL; + } + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + return snprintf(port->disc_addr.trsvcid, + sizeof(port->disc_addr.trsvcid), "%s", page); +} + +CONFIGFS_ATTR(nvmet_, addr_trsvcid); + +static ssize_t nvmet_addr_trtype_show(struct config_item *item, + char *page) +{ + switch (to_nvmet_port(item)->disc_addr.trtype) { + case NVMF_TRTYPE_RDMA: + return sprintf(page, "rdma\n"); + case NVMF_TRTYPE_LOOP: + return sprintf(page, "loop\n"); + default: + return sprintf(page, "\n"); + } +} + +static void nvmet_port_init_tsas_rdma(struct nvmet_port *port) +{ + port->disc_addr.trtype = NVMF_TRTYPE_RDMA; + memset(&port->disc_addr.tsas.rdma, 0, NVMF_TSAS_SIZE); + port->disc_addr.tsas.rdma.qptype = NVMF_RDMA_QPTYPE_CONNECTED; + port->disc_addr.tsas.rdma.prtype = NVMF_RDMA_PRTYPE_NOT_SPECIFIED; + port->disc_addr.tsas.rdma.cms = NVMF_RDMA_CMS_RDMA_CM; +} + +static void nvmet_port_init_tsas_loop(struct nvmet_port *port) +{ + port->disc_addr.trtype = NVMF_TRTYPE_LOOP; + memset(&port->disc_addr.tsas, 0, NVMF_TSAS_SIZE); +} + +static ssize_t nvmet_addr_trtype_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *port = to_nvmet_port(item); + + if (port->enabled) { + pr_err("Cannot modify address while enabled\n"); + pr_err("Disable the address before modifying\n"); + return -EACCES; + } + + if (sysfs_streq(page, "rdma")) { + nvmet_port_init_tsas_rdma(port); + } else if (sysfs_streq(page, "loop")) { + nvmet_port_init_tsas_loop(port); + } else { + pr_err("Invalid value '%s' for trtype\n", page); + return -EINVAL; + } + + return count; +} + +CONFIGFS_ATTR(nvmet_, addr_trtype); + +/* + * Namespace structures & file operation functions below + */ +static ssize_t nvmet_ns_device_path_show(struct config_item *item, char *page) +{ + return sprintf(page, "%s\n", to_nvmet_ns(item)->device_path); +} + +static ssize_t nvmet_ns_device_path_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + int ret; + + mutex_lock(&subsys->lock); + ret = -EBUSY; + if (nvmet_ns_enabled(ns)) + goto out_unlock; + + kfree(ns->device_path); + + ret = -ENOMEM; + ns->device_path = kstrdup(page, GFP_KERNEL); + if (!ns->device_path) + goto out_unlock; + + mutex_unlock(&subsys->lock); + return count; + +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +} + +CONFIGFS_ATTR(nvmet_ns_, device_path); + +static ssize_t nvmet_ns_device_nguid_show(struct config_item *item, char *page) +{ + return sprintf(page, "%pUb\n", &to_nvmet_ns(item)->nguid); +} + +static ssize_t nvmet_ns_device_nguid_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + struct nvmet_subsys *subsys = ns->subsys; + u8 nguid[16]; + const char *p = page; + int i; + int ret = 0; + + mutex_lock(&subsys->lock); + if (nvmet_ns_enabled(ns)) { + ret = -EBUSY; + goto out_unlock; + } + + for (i = 0; i < 16; i++) { + if (p + 2 > page + count) { + ret = -EINVAL; + goto out_unlock; + } + if (!isxdigit(p[0]) || !isxdigit(p[1])) { + ret = -EINVAL; + goto out_unlock; + } + + nguid[i] = (hex_to_bin(p[0]) << 4) | hex_to_bin(p[1]); + p += 2; + + if (*p == '-' || *p == ':') + p++; + } + + memcpy(&ns->nguid, nguid, sizeof(nguid)); +out_unlock: + mutex_unlock(&subsys->lock); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, device_nguid); + +static ssize_t nvmet_ns_enable_show(struct config_item *item, char *page) +{ + return sprintf(page, "%d\n", nvmet_ns_enabled(to_nvmet_ns(item))); +} + +static ssize_t nvmet_ns_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + bool enable; + int ret = 0; + + if (strtobool(page, &enable)) + return -EINVAL; + + if (enable) + ret = nvmet_ns_enable(ns); + else + nvmet_ns_disable(ns); + + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_ns_, enable); + +static struct configfs_attribute *nvmet_ns_attrs[] = { + &nvmet_ns_attr_device_path, + &nvmet_ns_attr_device_nguid, + &nvmet_ns_attr_enable, + NULL, +}; + +static void nvmet_ns_release(struct config_item *item) +{ + struct nvmet_ns *ns = to_nvmet_ns(item); + + nvmet_ns_free(ns); +} + +static struct configfs_item_operations nvmet_ns_item_ops = { + .release = nvmet_ns_release, +}; + +static struct config_item_type nvmet_ns_type = { + .ct_item_ops = &nvmet_ns_item_ops, + .ct_attrs = nvmet_ns_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ns_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys = namespaces_to_subsys(&group->cg_item); + struct nvmet_ns *ns; + int ret; + u32 nsid; + + ret = kstrtou32(name, 0, &nsid); + if (ret) + goto out; + + ret = -EINVAL; + if (nsid == 0 || nsid == 0xffffffff) + goto out; + + ret = -ENOMEM; + ns = nvmet_ns_alloc(subsys, nsid); + if (!ns) + goto out; + config_group_init_type_name(&ns->group, name, &nvmet_ns_type); + + pr_info("adding nsid %d to subsystem %s\n", nsid, subsys->subsysnqn); + + return &ns->group; +out: + return ERR_PTR(ret); +} + +static struct configfs_group_operations nvmet_namespaces_group_ops = { + .make_group = nvmet_ns_make, +}; + +static struct config_item_type nvmet_namespaces_type = { + .ct_group_ops = &nvmet_namespaces_group_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_port_subsys_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys; + struct nvmet_subsys_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_subsys_type) { + pr_err("can only link subsystems into the subsystems dir.!\n"); + return -EINVAL; + } + subsys = to_subsys(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->subsys = subsys; + + down_write(&nvmet_config_sem); + ret = -EEXIST; + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto out_free_link; + } + + if (list_empty(&port->subsystems)) { + ret = nvmet_enable_port(port); + if (ret) + goto out_free_link; + } + + list_add_tail(&link->entry, &port->subsystems); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; + +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static int nvmet_port_subsys_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_port *port = to_nvmet_port(parent->ci_parent); + struct nvmet_subsys *subsys = to_subsys(target); + struct nvmet_subsys_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (p->subsys == subsys) + goto found; + } + up_write(&nvmet_config_sem); + return -EINVAL; + +found: + list_del(&p->entry); + nvmet_genctr++; + if (list_empty(&port->subsystems)) + nvmet_disable_port(port); + up_write(&nvmet_config_sem); + kfree(p); + return 0; +} + +static struct configfs_item_operations nvmet_port_subsys_item_ops = { + .allow_link = nvmet_port_subsys_allow_link, + .drop_link = nvmet_port_subsys_drop_link, +}; + +static struct config_item_type nvmet_port_subsys_type = { + .ct_item_ops = &nvmet_port_subsys_item_ops, + .ct_owner = THIS_MODULE, +}; + +static int nvmet_allowed_hosts_allow_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host; + struct nvmet_host_link *link, *p; + int ret; + + if (target->ci_type != &nvmet_host_type) { + pr_err("can only link hosts into the allowed_hosts directory!\n"); + return -EINVAL; + } + + host = to_host(target); + link = kmalloc(sizeof(*link), GFP_KERNEL); + if (!link) + return -ENOMEM; + link->host = host; + + down_write(&nvmet_config_sem); + ret = -EINVAL; + if (subsys->allow_any_host) { + pr_err("can't add hosts when allow_any_host is set!\n"); + goto out_free_link; + } + + ret = -EEXIST; + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto out_free_link; + } + list_add_tail(&link->entry, &subsys->hosts); + nvmet_genctr++; + up_write(&nvmet_config_sem); + return 0; +out_free_link: + up_write(&nvmet_config_sem); + kfree(link); + return ret; +} + +static int nvmet_allowed_hosts_drop_link(struct config_item *parent, + struct config_item *target) +{ + struct nvmet_subsys *subsys = to_subsys(parent->ci_parent); + struct nvmet_host *host = to_host(target); + struct nvmet_host_link *p; + + down_write(&nvmet_config_sem); + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), nvmet_host_name(host))) + goto found; + } + up_write(&nvmet_config_sem); + return -EINVAL; + +found: + list_del(&p->entry); + nvmet_genctr++; + up_write(&nvmet_config_sem); + kfree(p); + return 0; +} + +static struct configfs_item_operations nvmet_allowed_hosts_item_ops = { + .allow_link = nvmet_allowed_hosts_allow_link, + .drop_link = nvmet_allowed_hosts_drop_link, +}; + +static struct config_item_type nvmet_allowed_hosts_type = { + .ct_item_ops = &nvmet_allowed_hosts_item_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_subsys_attr_allow_any_host_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", + to_subsys(item)->allow_any_host); +} + +static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_subsys *subsys = to_subsys(item); + bool allow_any_host; + int ret = 0; + + if (strtobool(page, &allow_any_host)) + return -EINVAL; + + down_write(&nvmet_config_sem); + if (allow_any_host && !list_empty(&subsys->hosts)) { + pr_err("Can't set allow_any_host when explicit hosts are set!\n"); + ret = -EINVAL; + goto out_unlock; + } + + subsys->allow_any_host = allow_any_host; +out_unlock: + up_write(&nvmet_config_sem); + return ret ? ret : count; +} + +CONFIGFS_ATTR(nvmet_subsys_, attr_allow_any_host); + +static struct configfs_attribute *nvmet_subsys_attrs[] = { + &nvmet_subsys_attr_attr_allow_any_host, + NULL, +}; + +/* + * Subsystem structures & folder operation functions below + */ +static void nvmet_subsys_release(struct config_item *item) +{ + struct nvmet_subsys *subsys = to_subsys(item); + + nvmet_subsys_put(subsys); +} + +static struct configfs_item_operations nvmet_subsys_item_ops = { + .release = nvmet_subsys_release, +}; + +static struct config_item_type nvmet_subsys_type = { + .ct_item_ops = &nvmet_subsys_item_ops, + .ct_attrs = nvmet_subsys_attrs, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_subsys_make(struct config_group *group, + const char *name) +{ + struct nvmet_subsys *subsys; + + if (sysfs_streq(name, NVME_DISC_SUBSYS_NAME)) { + pr_err("can't create discovery subsystem through configfs\n"); + return ERR_PTR(-EINVAL); + } + + subsys = nvmet_subsys_alloc(name, NVME_NQN_NVME); + if (!subsys) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&subsys->group, name, &nvmet_subsys_type); + + config_group_init_type_name(&subsys->namespaces_group, + "namespaces", &nvmet_namespaces_type); + configfs_add_default_group(&subsys->namespaces_group, &subsys->group); + + config_group_init_type_name(&subsys->allowed_hosts_group, + "allowed_hosts", &nvmet_allowed_hosts_type); + configfs_add_default_group(&subsys->allowed_hosts_group, + &subsys->group); + + return &subsys->group; +} + +static struct configfs_group_operations nvmet_subsystems_group_ops = { + .make_group = nvmet_subsys_make, +}; + +static struct config_item_type nvmet_subsystems_type = { + .ct_group_ops = &nvmet_subsystems_group_ops, + .ct_owner = THIS_MODULE, +}; + +static ssize_t nvmet_referral_enable_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%d\n", to_nvmet_port(item)->enabled); +} + +static ssize_t nvmet_referral_enable_store(struct config_item *item, + const char *page, size_t count) +{ + struct nvmet_port *parent = to_nvmet_port(item->ci_parent->ci_parent); + struct nvmet_port *port = to_nvmet_port(item); + bool enable; + + if (strtobool(page, &enable)) + goto inval; + + if (enable) + nvmet_referral_enable(parent, port); + else + nvmet_referral_disable(port); + + return count; +inval: + pr_err("Invalid value '%s' for enable\n", page); + return -EINVAL; +} + +CONFIGFS_ATTR(nvmet_referral_, enable); + +/* + * Discovery Service subsystem definitions + */ +static struct configfs_attribute *nvmet_referral_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_portid, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + &nvmet_referral_attr_enable, + NULL, +}; + +static void nvmet_referral_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + nvmet_referral_disable(port); + kfree(port); +} + +static struct configfs_item_operations nvmet_referral_item_ops = { + .release = nvmet_referral_release, +}; + +static struct config_item_type nvmet_referral_type = { + .ct_owner = THIS_MODULE, + .ct_attrs = nvmet_referral_attrs, + .ct_item_ops = &nvmet_referral_item_ops, +}; + +static struct config_group *nvmet_referral_make( + struct config_group *group, const char *name) +{ + struct nvmet_port *port; + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + config_group_init_type_name(&port->group, name, &nvmet_referral_type); + + return &port->group; +} + +static struct configfs_group_operations nvmet_referral_group_ops = { + .make_group = nvmet_referral_make, +}; + +static struct config_item_type nvmet_referrals_type = { + .ct_owner = THIS_MODULE, + .ct_group_ops = &nvmet_referral_group_ops, +}; + +/* + * Ports definitions. + */ +static void nvmet_port_release(struct config_item *item) +{ + struct nvmet_port *port = to_nvmet_port(item); + + kfree(port); +} + +static struct configfs_attribute *nvmet_port_attrs[] = { + &nvmet_attr_addr_adrfam, + &nvmet_attr_addr_treq, + &nvmet_attr_addr_traddr, + &nvmet_attr_addr_trsvcid, + &nvmet_attr_addr_trtype, + NULL, +}; + +static struct configfs_item_operations nvmet_port_item_ops = { + .release = nvmet_port_release, +}; + +static struct config_item_type nvmet_port_type = { + .ct_attrs = nvmet_port_attrs, + .ct_item_ops = &nvmet_port_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_ports_make(struct config_group *group, + const char *name) +{ + struct nvmet_port *port; + u16 portid; + + if (kstrtou16(name, 0, &portid)) + return ERR_PTR(-EINVAL); + + port = kzalloc(sizeof(*port), GFP_KERNEL); + if (!port) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&port->entry); + INIT_LIST_HEAD(&port->subsystems); + INIT_LIST_HEAD(&port->referrals); + + port->disc_addr.portid = cpu_to_le16(portid); + config_group_init_type_name(&port->group, name, &nvmet_port_type); + + config_group_init_type_name(&port->subsys_group, + "subsystems", &nvmet_port_subsys_type); + configfs_add_default_group(&port->subsys_group, &port->group); + + config_group_init_type_name(&port->referrals_group, + "referrals", &nvmet_referrals_type); + configfs_add_default_group(&port->referrals_group, &port->group); + + return &port->group; +} + +static struct configfs_group_operations nvmet_ports_group_ops = { + .make_group = nvmet_ports_make, +}; + +static struct config_item_type nvmet_ports_type = { + .ct_group_ops = &nvmet_ports_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_subsystems_group; +static struct config_group nvmet_ports_group; + +static void nvmet_host_release(struct config_item *item) +{ + struct nvmet_host *host = to_host(item); + + kfree(host); +} + +static struct configfs_item_operations nvmet_host_item_ops = { + .release = nvmet_host_release, +}; + +static struct config_item_type nvmet_host_type = { + .ct_item_ops = &nvmet_host_item_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group *nvmet_hosts_make_group(struct config_group *group, + const char *name) +{ + struct nvmet_host *host; + + host = kzalloc(sizeof(*host), GFP_KERNEL); + if (!host) + return ERR_PTR(-ENOMEM); + + config_group_init_type_name(&host->group, name, &nvmet_host_type); + + return &host->group; +} + +static struct configfs_group_operations nvmet_hosts_group_ops = { + .make_group = nvmet_hosts_make_group, +}; + +static struct config_item_type nvmet_hosts_type = { + .ct_group_ops = &nvmet_hosts_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct config_group nvmet_hosts_group; + +static struct config_item_type nvmet_root_type = { + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem nvmet_configfs_subsystem = { + .su_group = { + .cg_item = { + .ci_namebuf = "nvmet", + .ci_type = &nvmet_root_type, + }, + }, +}; + +int __init nvmet_init_configfs(void) +{ + int ret; + + config_group_init(&nvmet_configfs_subsystem.su_group); + mutex_init(&nvmet_configfs_subsystem.su_mutex); + + config_group_init_type_name(&nvmet_subsystems_group, + "subsystems", &nvmet_subsystems_type); + configfs_add_default_group(&nvmet_subsystems_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_ports_group, + "ports", &nvmet_ports_type); + configfs_add_default_group(&nvmet_ports_group, + &nvmet_configfs_subsystem.su_group); + + config_group_init_type_name(&nvmet_hosts_group, + "hosts", &nvmet_hosts_type); + configfs_add_default_group(&nvmet_hosts_group, + &nvmet_configfs_subsystem.su_group); + + ret = configfs_register_subsystem(&nvmet_configfs_subsystem); + if (ret) { + pr_err("configfs_register_subsystem: %d\n", ret); + return ret; + } + + return 0; +} + +void __exit nvmet_exit_configfs(void) +{ + configfs_unregister_subsystem(&nvmet_configfs_subsystem); +} diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c new file mode 100644 index 000000000000..8a891ca53367 --- /dev/null +++ b/drivers/nvme/target/core.c @@ -0,0 +1,964 @@ +/* + * Common code for the NVMe target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/module.h> +#include "nvmet.h" + +static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX]; + +/* + * This read/write semaphore is used to synchronize access to configuration + * information on a target system that will result in discovery log page + * information change for at least one host. + * The full list of resources to protected by this semaphore is: + * + * - subsystems list + * - per-subsystem allowed hosts list + * - allow_any_host subsystem attribute + * - nvmet_genctr + * - the nvmet_transports array + * + * When updating any of those lists/structures write lock should be obtained, + * while when reading (popolating discovery log page or checking host-subsystem + * link) read lock is obtained to allow concurrent reads. + */ +DECLARE_RWSEM(nvmet_config_sem); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len) +{ + if (sg_pcopy_from_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, size_t len) +{ + if (sg_pcopy_to_buffer(req->sg, req->sg_cnt, buf, len, off) != len) + return NVME_SC_SGL_INVALID_DATA | NVME_SC_DNR; + return 0; +} + +static u32 nvmet_async_event_result(struct nvmet_async_event *aen) +{ + return aen->event_type | (aen->event_info << 8) | (aen->log_page << 16); +} + +static void nvmet_async_events_free(struct nvmet_ctrl *ctrl) +{ + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + if (!ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, NVME_SC_INTERNAL | NVME_SC_DNR); + } +} + +static void nvmet_async_event_work(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, async_event_work); + struct nvmet_async_event *aen; + struct nvmet_req *req; + + while (1) { + mutex_lock(&ctrl->lock); + aen = list_first_entry_or_null(&ctrl->async_events, + struct nvmet_async_event, entry); + if (!aen || !ctrl->nr_async_event_cmds) { + mutex_unlock(&ctrl->lock); + return; + } + + req = ctrl->async_event_cmds[--ctrl->nr_async_event_cmds]; + nvmet_set_result(req, nvmet_async_event_result(aen)); + + list_del(&aen->entry); + kfree(aen); + + mutex_unlock(&ctrl->lock); + nvmet_req_complete(req, 0); + } +} + +static void nvmet_add_async_event(struct nvmet_ctrl *ctrl, u8 event_type, + u8 event_info, u8 log_page) +{ + struct nvmet_async_event *aen; + + aen = kmalloc(sizeof(*aen), GFP_KERNEL); + if (!aen) + return; + + aen->event_type = event_type; + aen->event_info = event_info; + aen->log_page = log_page; + + mutex_lock(&ctrl->lock); + list_add_tail(&aen->entry, &ctrl->async_events); + mutex_unlock(&ctrl->lock); + + schedule_work(&ctrl->async_event_work); +} + +int nvmet_register_transport(struct nvmet_fabrics_ops *ops) +{ + int ret = 0; + + down_write(&nvmet_config_sem); + if (nvmet_transports[ops->type]) + ret = -EINVAL; + else + nvmet_transports[ops->type] = ops; + up_write(&nvmet_config_sem); + + return ret; +} +EXPORT_SYMBOL_GPL(nvmet_register_transport); + +void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops) +{ + down_write(&nvmet_config_sem); + nvmet_transports[ops->type] = NULL; + up_write(&nvmet_config_sem); +} +EXPORT_SYMBOL_GPL(nvmet_unregister_transport); + +int nvmet_enable_port(struct nvmet_port *port) +{ + struct nvmet_fabrics_ops *ops; + int ret; + + lockdep_assert_held(&nvmet_config_sem); + + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + up_write(&nvmet_config_sem); + request_module("nvmet-transport-%d", port->disc_addr.trtype); + down_write(&nvmet_config_sem); + ops = nvmet_transports[port->disc_addr.trtype]; + if (!ops) { + pr_err("transport type %d not supported\n", + port->disc_addr.trtype); + return -EINVAL; + } + } + + if (!try_module_get(ops->owner)) + return -EINVAL; + + ret = ops->add_port(port); + if (ret) { + module_put(ops->owner); + return ret; + } + + port->enabled = true; + return 0; +} + +void nvmet_disable_port(struct nvmet_port *port) +{ + struct nvmet_fabrics_ops *ops; + + lockdep_assert_held(&nvmet_config_sem); + + port->enabled = false; + + ops = nvmet_transports[port->disc_addr.trtype]; + ops->remove_port(port); + module_put(ops->owner); +} + +static void nvmet_keep_alive_timer(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = container_of(to_delayed_work(work), + struct nvmet_ctrl, ka_work); + + pr_err("ctrl %d keep-alive timer (%d seconds) expired!\n", + ctrl->cntlid, ctrl->kato); + + ctrl->ops->delete_ctrl(ctrl); +} + +static void nvmet_start_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d start keep-alive timer for %d secs\n", + ctrl->cntlid, ctrl->kato); + + INIT_DELAYED_WORK(&ctrl->ka_work, nvmet_keep_alive_timer); + schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ); +} + +static void nvmet_stop_keep_alive_timer(struct nvmet_ctrl *ctrl) +{ + pr_debug("ctrl %d stop keep-alive\n", ctrl->cntlid); + + cancel_delayed_work_sync(&ctrl->ka_work); +} + +static struct nvmet_ns *__nvmet_find_namespace(struct nvmet_ctrl *ctrl, + __le32 nsid) +{ + struct nvmet_ns *ns; + + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + if (ns->nsid == le32_to_cpu(nsid)) + return ns; + } + + return NULL; +} + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid) +{ + struct nvmet_ns *ns; + + rcu_read_lock(); + ns = __nvmet_find_namespace(ctrl, nsid); + if (ns) + percpu_ref_get(&ns->ref); + rcu_read_unlock(); + + return ns; +} + +static void nvmet_destroy_namespace(struct percpu_ref *ref) +{ + struct nvmet_ns *ns = container_of(ref, struct nvmet_ns, ref); + + complete(&ns->disable_done); +} + +void nvmet_put_namespace(struct nvmet_ns *ns) +{ + percpu_ref_put(&ns->ref); +} + +int nvmet_ns_enable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + int ret = 0; + + mutex_lock(&subsys->lock); + if (!list_empty(&ns->dev_link)) + goto out_unlock; + + ns->bdev = blkdev_get_by_path(ns->device_path, FMODE_READ | FMODE_WRITE, + NULL); + if (IS_ERR(ns->bdev)) { + pr_err("nvmet: failed to open block device %s: (%ld)\n", + ns->device_path, PTR_ERR(ns->bdev)); + ret = PTR_ERR(ns->bdev); + ns->bdev = NULL; + goto out_unlock; + } + + ns->size = i_size_read(ns->bdev->bd_inode); + ns->blksize_shift = blksize_bits(bdev_logical_block_size(ns->bdev)); + + ret = percpu_ref_init(&ns->ref, nvmet_destroy_namespace, + 0, GFP_KERNEL); + if (ret) + goto out_blkdev_put; + + if (ns->nsid > subsys->max_nsid) + subsys->max_nsid = ns->nsid; + + /* + * The namespaces list needs to be sorted to simplify the implementation + * of the Identify Namepace List subcommand. + */ + if (list_empty(&subsys->namespaces)) { + list_add_tail_rcu(&ns->dev_link, &subsys->namespaces); + } else { + struct nvmet_ns *old; + + list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) { + BUG_ON(ns->nsid == old->nsid); + if (ns->nsid < old->nsid) + break; + } + + list_add_tail_rcu(&ns->dev_link, &old->dev_link); + } + + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + ret = 0; +out_unlock: + mutex_unlock(&subsys->lock); + return ret; +out_blkdev_put: + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + ns->bdev = NULL; + goto out_unlock; +} + +void nvmet_ns_disable(struct nvmet_ns *ns) +{ + struct nvmet_subsys *subsys = ns->subsys; + struct nvmet_ctrl *ctrl; + + mutex_lock(&subsys->lock); + if (list_empty(&ns->dev_link)) { + mutex_unlock(&subsys->lock); + return; + } + list_del_init(&ns->dev_link); + mutex_unlock(&subsys->lock); + + /* + * Now that we removed the namespaces from the lookup list, we + * can kill the per_cpu ref and wait for any remaining references + * to be dropped, as well as a RCU grace period for anyone only + * using the namepace under rcu_read_lock(). Note that we can't + * use call_rcu here as we need to ensure the namespaces have + * been fully destroyed before unloading the module. + */ + percpu_ref_kill(&ns->ref); + synchronize_rcu(); + wait_for_completion(&ns->disable_done); + percpu_ref_exit(&ns->ref); + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) + nvmet_add_async_event(ctrl, NVME_AER_TYPE_NOTICE, 0, 0); + + if (ns->bdev) + blkdev_put(ns->bdev, FMODE_WRITE|FMODE_READ); + mutex_unlock(&subsys->lock); +} + +void nvmet_ns_free(struct nvmet_ns *ns) +{ + nvmet_ns_disable(ns); + + kfree(ns->device_path); + kfree(ns); +} + +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid) +{ + struct nvmet_ns *ns; + + ns = kzalloc(sizeof(*ns), GFP_KERNEL); + if (!ns) + return NULL; + + INIT_LIST_HEAD(&ns->dev_link); + init_completion(&ns->disable_done); + + ns->nsid = nsid; + ns->subsys = subsys; + + return ns; +} + +static void __nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + if (status) + nvmet_set_status(req, status); + + /* XXX: need to fill in something useful for sq_head */ + req->rsp->sq_head = 0; + if (likely(req->sq)) /* may happen during early failure */ + req->rsp->sq_id = cpu_to_le16(req->sq->qid); + req->rsp->command_id = req->cmd->common.command_id; + + if (req->ns) + nvmet_put_namespace(req->ns); + req->ops->queue_response(req); +} + +void nvmet_req_complete(struct nvmet_req *req, u16 status) +{ + __nvmet_req_complete(req, status); + percpu_ref_put(&req->sq->ref); +} +EXPORT_SYMBOL_GPL(nvmet_req_complete); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, + u16 qid, u16 size) +{ + cq->qid = qid; + cq->size = size; + + ctrl->cqs[qid] = cq; +} + +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, + u16 qid, u16 size) +{ + sq->qid = qid; + sq->size = size; + + ctrl->sqs[qid] = sq; +} + +void nvmet_sq_destroy(struct nvmet_sq *sq) +{ + /* + * If this is the admin queue, complete all AERs so that our + * queue doesn't have outstanding requests on it. + */ + if (sq->ctrl && sq->ctrl->sqs && sq->ctrl->sqs[0] == sq) + nvmet_async_events_free(sq->ctrl); + percpu_ref_kill(&sq->ref); + wait_for_completion(&sq->free_done); + percpu_ref_exit(&sq->ref); + + if (sq->ctrl) { + nvmet_ctrl_put(sq->ctrl); + sq->ctrl = NULL; /* allows reusing the queue later */ + } +} +EXPORT_SYMBOL_GPL(nvmet_sq_destroy); + +static void nvmet_sq_free(struct percpu_ref *ref) +{ + struct nvmet_sq *sq = container_of(ref, struct nvmet_sq, ref); + + complete(&sq->free_done); +} + +int nvmet_sq_init(struct nvmet_sq *sq) +{ + int ret; + + ret = percpu_ref_init(&sq->ref, nvmet_sq_free, 0, GFP_KERNEL); + if (ret) { + pr_err("percpu_ref init failed!\n"); + return ret; + } + init_completion(&sq->free_done); + + return 0; +} +EXPORT_SYMBOL_GPL(nvmet_sq_init); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops) +{ + u8 flags = req->cmd->common.flags; + u16 status; + + req->cq = cq; + req->sq = sq; + req->ops = ops; + req->sg = NULL; + req->sg_cnt = 0; + req->rsp->status = 0; + + /* no support for fused commands yet */ + if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + /* either variant of SGLs is fine, as we don't support metadata */ + if (unlikely((flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METABUF && + (flags & NVME_CMD_SGL_ALL) != NVME_CMD_SGL_METASEG)) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + if (unlikely(!req->sq->ctrl)) + /* will return an error for any Non-connect command: */ + status = nvmet_parse_connect_cmd(req); + else if (likely(req->sq->qid != 0)) + status = nvmet_parse_io_cmd(req); + else if (req->cmd->common.opcode == nvme_fabrics_command) + status = nvmet_parse_fabrics_cmd(req); + else if (req->sq->ctrl->subsys->type == NVME_NQN_DISC) + status = nvmet_parse_discovery_cmd(req); + else + status = nvmet_parse_admin_cmd(req); + + if (status) + goto fail; + + if (unlikely(!percpu_ref_tryget_live(&sq->ref))) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto fail; + } + + return true; + +fail: + __nvmet_req_complete(req, status); + return false; +} +EXPORT_SYMBOL_GPL(nvmet_req_init); + +static inline bool nvmet_cc_en(u32 cc) +{ + return cc & 0x1; +} + +static inline u8 nvmet_cc_css(u32 cc) +{ + return (cc >> 4) & 0x7; +} + +static inline u8 nvmet_cc_mps(u32 cc) +{ + return (cc >> 7) & 0xf; +} + +static inline u8 nvmet_cc_ams(u32 cc) +{ + return (cc >> 11) & 0x7; +} + +static inline u8 nvmet_cc_shn(u32 cc) +{ + return (cc >> 14) & 0x3; +} + +static inline u8 nvmet_cc_iosqes(u32 cc) +{ + return (cc >> 16) & 0xf; +} + +static inline u8 nvmet_cc_iocqes(u32 cc) +{ + return (cc >> 20) & 0xf; +} + +static void nvmet_start_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + if (nvmet_cc_iosqes(ctrl->cc) != NVME_NVM_IOSQES || + nvmet_cc_iocqes(ctrl->cc) != NVME_NVM_IOCQES || + nvmet_cc_mps(ctrl->cc) != 0 || + nvmet_cc_ams(ctrl->cc) != 0 || + nvmet_cc_css(ctrl->cc) != 0) { + ctrl->csts = NVME_CSTS_CFS; + return; + } + + ctrl->csts = NVME_CSTS_RDY; +} + +static void nvmet_clear_ctrl(struct nvmet_ctrl *ctrl) +{ + lockdep_assert_held(&ctrl->lock); + + /* XXX: tear down queues? */ + ctrl->csts &= ~NVME_CSTS_RDY; + ctrl->cc = 0; +} + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new) +{ + u32 old; + + mutex_lock(&ctrl->lock); + old = ctrl->cc; + ctrl->cc = new; + + if (nvmet_cc_en(new) && !nvmet_cc_en(old)) + nvmet_start_ctrl(ctrl); + if (!nvmet_cc_en(new) && nvmet_cc_en(old)) + nvmet_clear_ctrl(ctrl); + if (nvmet_cc_shn(new) && !nvmet_cc_shn(old)) { + nvmet_clear_ctrl(ctrl); + ctrl->csts |= NVME_CSTS_SHST_CMPLT; + } + if (!nvmet_cc_shn(new) && nvmet_cc_shn(old)) + ctrl->csts &= ~NVME_CSTS_SHST_CMPLT; + mutex_unlock(&ctrl->lock); +} + +static void nvmet_init_cap(struct nvmet_ctrl *ctrl) +{ + /* command sets supported: NVMe command set: */ + ctrl->cap = (1ULL << 37); + /* CC.EN timeout in 500msec units: */ + ctrl->cap |= (15ULL << 24); + /* maximum queue entries supported: */ + ctrl->cap |= NVMET_QUEUE_SIZE - 1; +} + +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + u16 status = 0; + + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn); + return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + } + + mutex_lock(&subsys->lock); + list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) { + if (ctrl->cntlid == cntlid) { + if (strncmp(hostnqn, ctrl->hostnqn, NVMF_NQN_SIZE)) { + pr_warn("hostnqn mismatch.\n"); + continue; + } + if (!kref_get_unless_zero(&ctrl->ref)) + continue; + + *ret = ctrl; + goto out; + } + } + + pr_warn("could not find controller %d for subsys %s / host %s\n", + cntlid, subsysnqn, hostnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + +out: + mutex_unlock(&subsys->lock); + nvmet_subsys_put(subsys); + return status; +} + +static bool __nvmet_host_allowed(struct nvmet_subsys *subsys, + const char *hostnqn) +{ + struct nvmet_host_link *p; + + if (subsys->allow_any_host) + return true; + + list_for_each_entry(p, &subsys->hosts, entry) { + if (!strcmp(nvmet_host_name(p->host), hostnqn)) + return true; + } + + return false; +} + +static bool nvmet_host_discovery_allowed(struct nvmet_req *req, + const char *hostnqn) +{ + struct nvmet_subsys_link *s; + + list_for_each_entry(s, &req->port->subsystems, entry) { + if (__nvmet_host_allowed(s->subsys, hostnqn)) + return true; + } + + return false; +} + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn) +{ + lockdep_assert_held(&nvmet_config_sem); + + if (subsys->type == NVME_NQN_DISC) + return nvmet_host_discovery_allowed(req, hostnqn); + else + return __nvmet_host_allowed(subsys, hostnqn); +} + +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp) +{ + struct nvmet_subsys *subsys; + struct nvmet_ctrl *ctrl; + int ret; + u16 status; + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + subsys = nvmet_find_get_subsys(req->port, subsysnqn); + if (!subsys) { + pr_warn("connect request for invalid subsystem %s!\n", + subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(subsysnqn); + goto out; + } + + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + down_read(&nvmet_config_sem); + if (!nvmet_host_allowed(req, subsys, hostnqn)) { + pr_info("connect by host %s for subsystem %s not allowed\n", + hostnqn, subsysnqn); + req->rsp->result = IPO_IATTR_CONNECT_DATA(hostnqn); + up_read(&nvmet_config_sem); + goto out_put_subsystem; + } + up_read(&nvmet_config_sem); + + status = NVME_SC_INTERNAL; + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + goto out_put_subsystem; + mutex_init(&ctrl->lock); + + nvmet_init_cap(ctrl); + + INIT_WORK(&ctrl->async_event_work, nvmet_async_event_work); + INIT_LIST_HEAD(&ctrl->async_events); + + memcpy(ctrl->subsysnqn, subsysnqn, NVMF_NQN_SIZE); + memcpy(ctrl->hostnqn, hostnqn, NVMF_NQN_SIZE); + + kref_init(&ctrl->ref); + ctrl->subsys = subsys; + + ctrl->cqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_cq *), + GFP_KERNEL); + if (!ctrl->cqs) + goto out_free_ctrl; + + ctrl->sqs = kcalloc(subsys->max_qid + 1, + sizeof(struct nvmet_sq *), + GFP_KERNEL); + if (!ctrl->sqs) + goto out_free_cqs; + + ret = ida_simple_get(&subsys->cntlid_ida, + NVME_CNTLID_MIN, NVME_CNTLID_MAX, + GFP_KERNEL); + if (ret < 0) { + status = NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + goto out_free_sqs; + } + ctrl->cntlid = ret; + + ctrl->ops = req->ops; + if (ctrl->subsys->type == NVME_NQN_DISC) { + /* Don't accept keep-alive timeout for discovery controllers */ + if (kato) { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + goto out_free_sqs; + } + + /* + * Discovery controllers use some arbitrary high value in order + * to cleanup stale discovery sessions + * + * From the latest base diff RC: + * "The Keep Alive command is not supported by + * Discovery controllers. A transport may specify a + * fixed Discovery controller activity timeout value + * (e.g., 2 minutes). If no commands are received + * by a Discovery controller within that time + * period, the controller may perform the + * actions for Keep Alive Timer expiration". + */ + ctrl->kato = NVMET_DISC_KATO; + } else { + /* keep-alive timeout in seconds */ + ctrl->kato = DIV_ROUND_UP(kato, 1000); + } + nvmet_start_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); + list_add_tail(&ctrl->subsys_entry, &subsys->ctrls); + mutex_unlock(&subsys->lock); + + *ctrlp = ctrl; + return 0; + +out_free_sqs: + kfree(ctrl->sqs); +out_free_cqs: + kfree(ctrl->cqs); +out_free_ctrl: + kfree(ctrl); +out_put_subsystem: + nvmet_subsys_put(subsys); +out: + return status; +} + +static void nvmet_ctrl_free(struct kref *ref) +{ + struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref); + struct nvmet_subsys *subsys = ctrl->subsys; + + nvmet_stop_keep_alive_timer(ctrl); + + mutex_lock(&subsys->lock); + list_del(&ctrl->subsys_entry); + mutex_unlock(&subsys->lock); + + ida_simple_remove(&subsys->cntlid_ida, ctrl->cntlid); + nvmet_subsys_put(subsys); + + kfree(ctrl->sqs); + kfree(ctrl->cqs); + kfree(ctrl); +} + +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl) +{ + kref_put(&ctrl->ref, nvmet_ctrl_free); +} + +static void nvmet_fatal_error_handler(struct work_struct *work) +{ + struct nvmet_ctrl *ctrl = + container_of(work, struct nvmet_ctrl, fatal_err_work); + + pr_err("ctrl %d fatal error occurred!\n", ctrl->cntlid); + ctrl->ops->delete_ctrl(ctrl); +} + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl) +{ + ctrl->csts |= NVME_CSTS_CFS; + INIT_WORK(&ctrl->fatal_err_work, nvmet_fatal_error_handler); + schedule_work(&ctrl->fatal_err_work); +} +EXPORT_SYMBOL_GPL(nvmet_ctrl_fatal_error); + +static struct nvmet_subsys *nvmet_find_get_subsys(struct nvmet_port *port, + const char *subsysnqn) +{ + struct nvmet_subsys_link *p; + + if (!port) + return NULL; + + if (!strncmp(NVME_DISC_SUBSYS_NAME, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&nvmet_disc_subsys->ref)) + return NULL; + return nvmet_disc_subsys; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &port->subsystems, entry) { + if (!strncmp(p->subsys->subsysnqn, subsysnqn, + NVMF_NQN_SIZE)) { + if (!kref_get_unless_zero(&p->subsys->ref)) + break; + up_read(&nvmet_config_sem); + return p->subsys; + } + } + up_read(&nvmet_config_sem); + return NULL; +} + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type) +{ + struct nvmet_subsys *subsys; + + subsys = kzalloc(sizeof(*subsys), GFP_KERNEL); + if (!subsys) + return NULL; + + subsys->ver = (1 << 16) | (2 << 8) | 1; /* NVMe 1.2.1 */ + + switch (type) { + case NVME_NQN_NVME: + subsys->max_qid = NVMET_NR_QUEUES; + break; + case NVME_NQN_DISC: + subsys->max_qid = 0; + break; + default: + pr_err("%s: Unknown Subsystem type - %d\n", __func__, type); + kfree(subsys); + return NULL; + } + subsys->type = type; + subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE, + GFP_KERNEL); + if (!subsys->subsysnqn) { + kfree(subsys); + return NULL; + } + + kref_init(&subsys->ref); + + mutex_init(&subsys->lock); + INIT_LIST_HEAD(&subsys->namespaces); + INIT_LIST_HEAD(&subsys->ctrls); + + ida_init(&subsys->cntlid_ida); + + INIT_LIST_HEAD(&subsys->hosts); + + return subsys; +} + +static void nvmet_subsys_free(struct kref *ref) +{ + struct nvmet_subsys *subsys = + container_of(ref, struct nvmet_subsys, ref); + + WARN_ON_ONCE(!list_empty(&subsys->namespaces)); + + ida_destroy(&subsys->cntlid_ida); + kfree(subsys->subsysnqn); + kfree(subsys); +} + +void nvmet_subsys_put(struct nvmet_subsys *subsys) +{ + kref_put(&subsys->ref, nvmet_subsys_free); +} + +static int __init nvmet_init(void) +{ + int error; + + error = nvmet_init_discovery(); + if (error) + goto out; + + error = nvmet_init_configfs(); + if (error) + goto out_exit_discovery; + return 0; + +out_exit_discovery: + nvmet_exit_discovery(); +out: + return error; +} + +static void __exit nvmet_exit(void) +{ + nvmet_exit_configfs(); + nvmet_exit_discovery(); + + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024); + BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024); +} + +module_init(nvmet_init); +module_exit(nvmet_exit); + +MODULE_LICENSE("GPL v2"); diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c new file mode 100644 index 000000000000..6f65646e89cf --- /dev/null +++ b/drivers/nvme/target/discovery.c @@ -0,0 +1,221 @@ +/* + * Discovery service for the NVMe over Fabrics target. + * Copyright (C) 2016 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/slab.h> +#include <generated/utsrelease.h> +#include "nvmet.h" + +struct nvmet_subsys *nvmet_disc_subsys; + +u64 nvmet_genctr; + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (list_empty(&port->entry)) { + list_add_tail(&port->entry, &parent->referrals); + port->enabled = true; + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +void nvmet_referral_disable(struct nvmet_port *port) +{ + down_write(&nvmet_config_sem); + if (!list_empty(&port->entry)) { + port->enabled = false; + list_del_init(&port->entry); + nvmet_genctr++; + } + up_write(&nvmet_config_sem); +} + +static void nvmet_format_discovery_entry(struct nvmf_disc_rsp_page_hdr *hdr, + struct nvmet_port *port, char *subsys_nqn, u8 type, u32 numrec) +{ + struct nvmf_disc_rsp_page_entry *e = &hdr->entries[numrec]; + + e->trtype = port->disc_addr.trtype; + e->adrfam = port->disc_addr.adrfam; + e->treq = port->disc_addr.treq; + e->portid = port->disc_addr.portid; + /* we support only dynamic controllers */ + e->cntlid = cpu_to_le16(NVME_CNTLID_DYNAMIC); + e->asqsz = cpu_to_le16(NVMF_AQ_DEPTH); + e->nqntype = type; + memcpy(e->trsvcid, port->disc_addr.trsvcid, NVMF_TRSVCID_SIZE); + memcpy(e->traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE); + memcpy(e->tsas.common, port->disc_addr.tsas.common, NVMF_TSAS_SIZE); + memcpy(e->subnqn, subsys_nqn, NVMF_NQN_SIZE); +} + +static void nvmet_execute_get_disc_log_page(struct nvmet_req *req) +{ + const int entry_size = sizeof(struct nvmf_disc_rsp_page_entry); + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvmf_disc_rsp_page_hdr *hdr; + size_t data_len = nvmet_get_log_page_len(req->cmd); + size_t alloc_len = max(data_len, sizeof(*hdr)); + int residual_len = data_len - sizeof(*hdr); + struct nvmet_subsys_link *p; + struct nvmet_port *r; + u32 numrec = 0; + u16 status = 0; + + /* + * Make sure we're passing at least a buffer of response header size. + * If host provided data len is less than the header size, only the + * number of bytes requested by host will be sent to host. + */ + hdr = kzalloc(alloc_len, GFP_KERNEL); + if (!hdr) { + status = NVME_SC_INTERNAL; + goto out; + } + + down_read(&nvmet_config_sem); + list_for_each_entry(p, &req->port->subsystems, entry) { + if (!nvmet_host_allowed(req, p->subsys, ctrl->hostnqn)) + continue; + if (residual_len >= entry_size) { + nvmet_format_discovery_entry(hdr, req->port, + p->subsys->subsysnqn, + NVME_NQN_NVME, numrec); + residual_len -= entry_size; + } + numrec++; + } + + list_for_each_entry(r, &req->port->referrals, entry) { + if (residual_len >= entry_size) { + nvmet_format_discovery_entry(hdr, r, + NVME_DISC_SUBSYS_NAME, + NVME_NQN_DISC, numrec); + residual_len -= entry_size; + } + numrec++; + } + + hdr->genctr = cpu_to_le64(nvmet_genctr); + hdr->numrec = cpu_to_le64(numrec); + hdr->recfmt = cpu_to_le16(0); + + up_read(&nvmet_config_sem); + + status = nvmet_copy_to_sgl(req, 0, hdr, data_len); + kfree(hdr); +out: + nvmet_req_complete(req, status); +} + +static void nvmet_execute_identify_disc_ctrl(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + struct nvme_id_ctrl *id; + u16 status = 0; + + id = kzalloc(sizeof(*id), GFP_KERNEL); + if (!id) { + status = NVME_SC_INTERNAL; + goto out; + } + + memset(id->fr, ' ', sizeof(id->fr)); + strncpy((char *)id->fr, UTS_RELEASE, sizeof(id->fr)); + + /* no limit on data transfer sizes for now */ + id->mdts = 0; + id->cntlid = cpu_to_le16(ctrl->cntlid); + id->ver = cpu_to_le32(ctrl->subsys->ver); + id->lpa = (1 << 2); + + /* no enforcement soft-limit for maxcmd - pick arbitrary high value */ + id->maxcmd = cpu_to_le16(NVMET_MAX_CMD); + + id->sgls = cpu_to_le32(1 << 0); /* we always support SGLs */ + if (ctrl->ops->has_keyed_sgls) + id->sgls |= cpu_to_le32(1 << 2); + if (ctrl->ops->sqe_inline_size) + id->sgls |= cpu_to_le32(1 << 20); + + strcpy(id->subnqn, ctrl->subsys->subsysnqn); + + status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); + + kfree(id); +out: + nvmet_req_complete(req, status); +} + +int nvmet_parse_discovery_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got cmd %d while not ready\n", + cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + switch (cmd->common.opcode) { + case nvme_admin_get_log_page: + req->data_len = nvmet_get_log_page_len(cmd); + + switch (cmd->get_log_page.lid) { + case NVME_LOG_DISC: + req->execute = nvmet_execute_get_disc_log_page; + return 0; + default: + pr_err("nvmet: unsupported get_log_page lid %d\n", + cmd->get_log_page.lid); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + case nvme_admin_identify: + req->data_len = 4096; + switch (le32_to_cpu(cmd->identify.cns)) { + case 0x01: + req->execute = + nvmet_execute_identify_disc_ctrl; + return 0; + default: + pr_err("nvmet: unsupported identify cns %d\n", + le32_to_cpu(cmd->identify.cns)); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + default: + pr_err("nvmet: unsupported cmd %d\n", + cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; +} + +int __init nvmet_init_discovery(void) +{ + nvmet_disc_subsys = + nvmet_subsys_alloc(NVME_DISC_SUBSYS_NAME, NVME_NQN_DISC); + if (!nvmet_disc_subsys) + return -ENOMEM; + return 0; +} + +void nvmet_exit_discovery(void) +{ + nvmet_subsys_put(nvmet_disc_subsys); +} diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c new file mode 100644 index 000000000000..9a97ae67e656 --- /dev/null +++ b/drivers/nvme/target/fabrics-cmd.c @@ -0,0 +1,240 @@ +/* + * NVMe Fabrics command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/blkdev.h> +#include "nvmet.h" + +static void nvmet_execute_prop_set(struct nvmet_req *req) +{ + u16 status = 0; + + if (!(req->cmd->prop_set.attrib & 1)) { + u64 val = le64_to_cpu(req->cmd->prop_set.value); + + switch (le32_to_cpu(req->cmd->prop_set.offset)) { + case NVME_REG_CC: + nvmet_update_cc(req->sq->ctrl, val); + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + + nvmet_req_complete(req, status); +} + +static void nvmet_execute_prop_get(struct nvmet_req *req) +{ + struct nvmet_ctrl *ctrl = req->sq->ctrl; + u16 status = 0; + u64 val = 0; + + if (req->cmd->prop_get.attrib & 1) { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_CAP: + val = ctrl->cap; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } else { + switch (le32_to_cpu(req->cmd->prop_get.offset)) { + case NVME_REG_VS: + val = ctrl->subsys->ver; + break; + case NVME_REG_CC: + val = ctrl->cc; + break; + case NVME_REG_CSTS: + val = ctrl->csts; + break; + default: + status = NVME_SC_INVALID_FIELD | NVME_SC_DNR; + break; + } + } + + req->rsp->result64 = cpu_to_le64(val); + nvmet_req_complete(req, status); +} + +int nvmet_parse_fabrics_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + switch (cmd->fabrics.fctype) { + case nvme_fabrics_type_property_set: + req->data_len = 0; + req->execute = nvmet_execute_prop_set; + break; + case nvme_fabrics_type_property_get: + req->data_len = 0; + req->execute = nvmet_execute_prop_get; + break; + default: + pr_err("received unknown capsule type 0x%x\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + return 0; +} + +static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + u16 qid = le16_to_cpu(c->qid); + u16 sqsize = le16_to_cpu(c->sqsize); + struct nvmet_ctrl *old; + + old = cmpxchg(&req->sq->ctrl, NULL, ctrl); + if (old) { + pr_warn("queue already connected!\n"); + return NVME_SC_CONNECT_CTRL_BUSY | NVME_SC_DNR; + } + + nvmet_cq_setup(ctrl, req->cq, qid, sqsize); + nvmet_sq_setup(ctrl, req->sq, qid, sqsize); + return 0; +} + +static void nvmet_execute_admin_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 status = 0; + + d = kmap(sg_page(req->sg)) + req->sg->offset; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + if (unlikely(d->cntlid != cpu_to_le16(0xffff))) { + pr_warn("connect attempt for invalid controller ID %#x\n", + d->cntlid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result = IPO_IATTR_CONNECT_DATA(cntlid); + goto out; + } + + status = nvmet_alloc_ctrl(d->subsysnqn, d->hostnqn, req, + le32_to_cpu(c->kato), &ctrl); + if (status) + goto out; + + status = nvmet_install_queue(ctrl, req); + if (status) { + nvmet_ctrl_put(ctrl); + goto out; + } + + pr_info("creating controller %d for NQN %s.\n", + ctrl->cntlid, ctrl->hostnqn); + req->rsp->result16 = cpu_to_le16(ctrl->cntlid); + +out: + kunmap(sg_page(req->sg)); + nvmet_req_complete(req, status); +} + +static void nvmet_execute_io_connect(struct nvmet_req *req) +{ + struct nvmf_connect_command *c = &req->cmd->connect; + struct nvmf_connect_data *d; + struct nvmet_ctrl *ctrl = NULL; + u16 qid = le16_to_cpu(c->qid); + u16 status = 0; + + d = kmap(sg_page(req->sg)) + req->sg->offset; + + /* zero out initial completion result, assign values as needed */ + req->rsp->result = 0; + + if (c->recfmt != 0) { + pr_warn("invalid connect version (%d).\n", + le16_to_cpu(c->recfmt)); + status = NVME_SC_CONNECT_FORMAT | NVME_SC_DNR; + goto out; + } + + status = nvmet_ctrl_find_get(d->subsysnqn, d->hostnqn, + le16_to_cpu(d->cntlid), + req, &ctrl); + if (status) + goto out; + + if (unlikely(qid > ctrl->subsys->max_qid)) { + pr_warn("invalid queue id (%d)\n", qid); + status = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR; + req->rsp->result = IPO_IATTR_CONNECT_SQE(qid); + goto out_ctrl_put; + } + + status = nvmet_install_queue(ctrl, req); + if (status) { + /* pass back cntlid that had the issue of installing queue */ + req->rsp->result16 = cpu_to_le16(ctrl->cntlid); + goto out_ctrl_put; + } + + pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); + +out: + kunmap(sg_page(req->sg)); + nvmet_req_complete(req, status); + return; + +out_ctrl_put: + nvmet_ctrl_put(ctrl); + goto out; +} + +int nvmet_parse_connect_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + req->ns = NULL; + + if (req->cmd->common.opcode != nvme_fabrics_command) { + pr_err("invalid command 0x%x on unconnected queue.\n", + cmd->fabrics.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + if (cmd->fabrics.fctype != nvme_fabrics_type_connect) { + pr_err("invalid capsule type 0x%x on unconnected queue.\n", + cmd->fabrics.fctype); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } + + req->data_len = sizeof(struct nvmf_connect_data); + if (cmd->connect.qid == 0) + req->execute = nvmet_execute_admin_connect; + else + req->execute = nvmet_execute_io_connect; + return 0; +} diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c new file mode 100644 index 000000000000..2cd069b691ae --- /dev/null +++ b/drivers/nvme/target/io-cmd.c @@ -0,0 +1,215 @@ +/* + * NVMe I/O command implementation. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/blkdev.h> +#include <linux/module.h> +#include "nvmet.h" + +static void nvmet_bio_done(struct bio *bio) +{ + struct nvmet_req *req = bio->bi_private; + + nvmet_req_complete(req, + bio->bi_error ? NVME_SC_INTERNAL | NVME_SC_DNR : 0); + + if (bio != &req->inline_bio) + bio_put(bio); +} + +static inline u32 nvmet_rw_len(struct nvmet_req *req) +{ + return ((u32)le16_to_cpu(req->cmd->rw.length) + 1) << + req->ns->blksize_shift; +} + +static void nvmet_inline_bio_init(struct nvmet_req *req) +{ + struct bio *bio = &req->inline_bio; + + bio_init(bio); + bio->bi_max_vecs = NVMET_MAX_INLINE_BIOVEC; + bio->bi_io_vec = req->inline_bvec; +} + +static void nvmet_execute_rw(struct nvmet_req *req) +{ + int sg_cnt = req->sg_cnt; + struct scatterlist *sg; + struct bio *bio; + sector_t sector; + blk_qc_t cookie; + int op, op_flags = 0, i; + + if (!req->sg_cnt) { + nvmet_req_complete(req, 0); + return; + } + + if (req->cmd->rw.opcode == nvme_cmd_write) { + op = REQ_OP_WRITE; + if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) + op_flags |= REQ_FUA; + } else { + op = REQ_OP_READ; + } + + sector = le64_to_cpu(req->cmd->rw.slba); + sector <<= (req->ns->blksize_shift - 9); + + nvmet_inline_bio_init(req); + bio = &req->inline_bio; + bio->bi_bdev = req->ns->bdev; + bio->bi_iter.bi_sector = sector; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio_set_op_attrs(bio, op, op_flags); + + for_each_sg(req->sg, sg, req->sg_cnt, i) { + while (bio_add_page(bio, sg_page(sg), sg->length, sg->offset) + != sg->length) { + struct bio *prev = bio; + + bio = bio_alloc(GFP_KERNEL, min(sg_cnt, BIO_MAX_PAGES)); + bio->bi_bdev = req->ns->bdev; + bio->bi_iter.bi_sector = sector; + bio_set_op_attrs(bio, op, op_flags); + + bio_chain(bio, prev); + cookie = submit_bio(prev); + } + + sector += sg->length >> 9; + sg_cnt--; + } + + cookie = submit_bio(bio); + + blk_poll(bdev_get_queue(req->ns->bdev), cookie); +} + +static void nvmet_execute_flush(struct nvmet_req *req) +{ + struct bio *bio; + + nvmet_inline_bio_init(req); + bio = &req->inline_bio; + + bio->bi_bdev = req->ns->bdev; + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + + submit_bio(bio); +} + +static u16 nvmet_discard_range(struct nvmet_ns *ns, + struct nvme_dsm_range *range, struct bio **bio) +{ + if (__blkdev_issue_discard(ns->bdev, + le64_to_cpu(range->slba) << (ns->blksize_shift - 9), + le32_to_cpu(range->nlb) << (ns->blksize_shift - 9), + GFP_KERNEL, 0, bio)) + return NVME_SC_INTERNAL | NVME_SC_DNR; + return 0; +} + +static void nvmet_execute_discard(struct nvmet_req *req) +{ + struct nvme_dsm_range range; + struct bio *bio = NULL; + int i; + u16 status; + + for (i = 0; i <= le32_to_cpu(req->cmd->dsm.nr); i++) { + status = nvmet_copy_from_sgl(req, i * sizeof(range), &range, + sizeof(range)); + if (status) + break; + + status = nvmet_discard_range(req->ns, &range, &bio); + if (status) + break; + } + + if (bio) { + bio->bi_private = req; + bio->bi_end_io = nvmet_bio_done; + if (status) { + bio->bi_error = -EIO; + bio_endio(bio); + } else { + submit_bio(bio); + } + } else { + nvmet_req_complete(req, status); + } +} + +static void nvmet_execute_dsm(struct nvmet_req *req) +{ + switch (le32_to_cpu(req->cmd->dsm.attributes)) { + case NVME_DSMGMT_AD: + nvmet_execute_discard(req); + return; + case NVME_DSMGMT_IDR: + case NVME_DSMGMT_IDW: + default: + /* Not supported yet */ + nvmet_req_complete(req, 0); + return; + } +} + +int nvmet_parse_io_cmd(struct nvmet_req *req) +{ + struct nvme_command *cmd = req->cmd; + + if (unlikely(!(req->sq->ctrl->cc & NVME_CC_ENABLE))) { + pr_err("nvmet: got io cmd %d while CC.EN == 0\n", + cmd->common.opcode); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + if (unlikely(!(req->sq->ctrl->csts & NVME_CSTS_RDY))) { + pr_err("nvmet: got io cmd %d while CSTS.RDY == 0\n", + cmd->common.opcode); + req->ns = NULL; + return NVME_SC_CMD_SEQ_ERROR | NVME_SC_DNR; + } + + req->ns = nvmet_find_namespace(req->sq->ctrl, cmd->rw.nsid); + if (!req->ns) + return NVME_SC_INVALID_NS | NVME_SC_DNR; + + switch (cmd->common.opcode) { + case nvme_cmd_read: + case nvme_cmd_write: + req->execute = nvmet_execute_rw; + req->data_len = nvmet_rw_len(req); + return 0; + case nvme_cmd_flush: + req->execute = nvmet_execute_flush; + req->data_len = 0; + return 0; + case nvme_cmd_dsm: + req->execute = nvmet_execute_dsm; + req->data_len = le32_to_cpu(cmd->dsm.nr) * + sizeof(struct nvme_dsm_range); + return 0; + default: + pr_err("nvmet: unhandled cmd %d\n", cmd->common.opcode); + return NVME_SC_INVALID_OPCODE | NVME_SC_DNR; + } +} diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c new file mode 100644 index 000000000000..94e782987cc9 --- /dev/null +++ b/drivers/nvme/target/loop.c @@ -0,0 +1,754 @@ +/* + * NVMe over Fabrics loopback device. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/scatterlist.h> +#include <linux/delay.h> +#include <linux/blk-mq.h> +#include <linux/nvme.h> +#include <linux/module.h> +#include <linux/parser.h> +#include <linux/t10-pi.h> +#include "nvmet.h" +#include "../host/nvme.h" +#include "../host/fabrics.h" + +#define NVME_LOOP_AQ_DEPTH 256 + +#define NVME_LOOP_MAX_SEGMENTS 256 + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_LOOP_NR_AEN_COMMANDS 1 +#define NVME_LOOP_AQ_BLKMQ_DEPTH \ + (NVME_LOOP_AQ_DEPTH - NVME_LOOP_NR_AEN_COMMANDS) + +struct nvme_loop_iod { + struct nvme_command cmd; + struct nvme_completion rsp; + struct nvmet_req req; + struct nvme_loop_queue *queue; + struct work_struct work; + struct sg_table sg_table; + struct scatterlist first_sgl[]; +}; + +struct nvme_loop_ctrl { + spinlock_t lock; + struct nvme_loop_queue *queues; + u32 queue_count; + + struct blk_mq_tag_set admin_tag_set; + + struct list_head list; + u64 cap; + struct blk_mq_tag_set tag_set; + struct nvme_loop_iod async_event_iod; + struct nvme_ctrl ctrl; + + struct nvmet_ctrl *target_ctrl; + struct work_struct delete_work; + struct work_struct reset_work; +}; + +static inline struct nvme_loop_ctrl *to_loop_ctrl(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_loop_ctrl, ctrl); +} + +struct nvme_loop_queue { + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + struct nvme_loop_ctrl *ctrl; +}; + +static struct nvmet_port *nvmet_loop_port; + +static LIST_HEAD(nvme_loop_ctrl_list); +static DEFINE_MUTEX(nvme_loop_ctrl_mutex); + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req); +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *ctrl); + +static struct nvmet_fabrics_ops nvme_loop_ops; + +static inline int nvme_loop_queue_idx(struct nvme_loop_queue *queue) +{ + return queue - queue->ctrl->queues; +} + +static void nvme_loop_complete_rq(struct request *req) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + int error = 0; + + nvme_cleanup_cmd(req); + sg_free_table_chained(&iod->sg_table, true); + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req, req->errors)) { + nvme_requeue_req(req); + return; + } + + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = req->errors; + else + error = nvme_error_status(req->errors); + } + + blk_mq_end_request(req, error); +} + +static void nvme_loop_queue_response(struct nvmet_req *nvme_req) +{ + struct nvme_loop_iod *iod = + container_of(nvme_req, struct nvme_loop_iod, req); + struct nvme_completion *cqe = &iod->rsp; + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvme_loop_queue_idx(iod->queue) == 0 && + cqe->command_id >= NVME_LOOP_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(&iod->queue->ctrl->ctrl, cqe); + } else { + struct request *req = blk_mq_rq_from_pdu(iod); + + if (req->cmd_type == REQ_TYPE_DRV_PRIV && req->special) + memcpy(req->special, cqe, sizeof(*cqe)); + blk_mq_complete_request(req, le16_to_cpu(cqe->status) >> 1); + } +} + +static void nvme_loop_execute_work(struct work_struct *work) +{ + struct nvme_loop_iod *iod = + container_of(work, struct nvme_loop_iod, work); + + iod->req.execute(&iod->req); +} + +static enum blk_eh_timer_return +nvme_loop_timeout(struct request *rq, bool reserved) +{ + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(rq); + + /* queue error recovery */ + schedule_work(&iod->queue->ctrl->reset_work); + + /* fail with DNR on admin cmd timeout */ + rq->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR; + + return BLK_EH_HANDLED; +} + +static int nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_loop_queue *queue = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_loop_iod *iod = blk_mq_rq_to_pdu(req); + int ret; + + ret = nvme_setup_cmd(ns, req, &iod->cmd); + if (ret) + return ret; + + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + iod->req.port = nvmet_loop_port; + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, + &queue->nvme_sq, &nvme_loop_ops)) { + nvme_cleanup_cmd(req); + blk_mq_start_request(req); + nvme_loop_queue_response(&iod->req); + return 0; + } + + if (blk_rq_bytes(req)) { + iod->sg_table.sgl = iod->first_sgl; + ret = sg_alloc_table_chained(&iod->sg_table, + req->nr_phys_segments, iod->sg_table.sgl); + if (ret) + return BLK_MQ_RQ_QUEUE_BUSY; + + iod->req.sg = iod->sg_table.sgl; + iod->req.sg_cnt = blk_rq_map_sg(req->q, req, iod->sg_table.sgl); + BUG_ON(iod->req.sg_cnt > req->nr_phys_segments); + } + + iod->cmd.common.command_id = req->tag; + blk_mq_start_request(req); + + schedule_work(&iod->work); + return 0; +} + +static void nvme_loop_submit_async_event(struct nvme_ctrl *arg, int aer_idx) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(arg); + struct nvme_loop_queue *queue = &ctrl->queues[0]; + struct nvme_loop_iod *iod = &ctrl->async_event_iod; + + memset(&iod->cmd, 0, sizeof(iod->cmd)); + iod->cmd.common.opcode = nvme_admin_async_event; + iod->cmd.common.command_id = NVME_LOOP_AQ_BLKMQ_DEPTH; + iod->cmd.common.flags |= NVME_CMD_SGL_METABUF; + + if (!nvmet_req_init(&iod->req, &queue->nvme_cq, &queue->nvme_sq, + &nvme_loop_ops)) { + dev_err(ctrl->ctrl.device, "failed async event work\n"); + return; + } + + schedule_work(&iod->work); +} + +static int nvme_loop_init_iod(struct nvme_loop_ctrl *ctrl, + struct nvme_loop_iod *iod, unsigned int queue_idx) +{ + BUG_ON(queue_idx >= ctrl->queue_count); + + iod->req.cmd = &iod->cmd; + iod->req.rsp = &iod->rsp; + iod->queue = &ctrl->queues[queue_idx]; + INIT_WORK(&iod->work, nvme_loop_execute_work); + return 0; +} + +static int nvme_loop_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), hctx_idx + 1); +} + +static int nvme_loop_init_admin_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + return nvme_loop_init_iod(data, blk_mq_rq_to_pdu(req), 0); +} + +static int nvme_loop_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[hctx_idx + 1]; + + BUG_ON(hctx_idx >= ctrl->queue_count); + + hctx->driver_data = queue; + return 0; +} + +static int nvme_loop_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_loop_ctrl *ctrl = data; + struct nvme_loop_queue *queue = &ctrl->queues[0]; + + BUG_ON(hctx_idx != 0); + + hctx->driver_data = queue; + return 0; +} + +static struct blk_mq_ops nvme_loop_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_loop_init_request, + .init_hctx = nvme_loop_init_hctx, + .timeout = nvme_loop_timeout, +}; + +static struct blk_mq_ops nvme_loop_admin_mq_ops = { + .queue_rq = nvme_loop_queue_rq, + .complete = nvme_loop_complete_rq, + .map_queue = blk_mq_map_queue, + .init_request = nvme_loop_init_admin_request, + .init_hctx = nvme_loop_init_admin_hctx, + .timeout = nvme_loop_timeout, +}; + +static void nvme_loop_destroy_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + blk_cleanup_queue(ctrl->ctrl.admin_q); + blk_mq_free_tag_set(&ctrl->admin_tag_set); + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); +} + +static void nvme_loop_free_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (list_empty(&ctrl->list)) + goto free_ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_del(&ctrl->list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (nctrl->tagset) { + blk_cleanup_queue(ctrl->ctrl.connect_q); + blk_mq_free_tag_set(&ctrl->tag_set); + } + kfree(ctrl->queues); + nvmf_free_options(nctrl->opts); +free_ctrl: + kfree(ctrl); +} + +static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) +{ + int error; + + memset(&ctrl->admin_tag_set, 0, sizeof(ctrl->admin_tag_set)); + ctrl->admin_tag_set.ops = &nvme_loop_admin_mq_ops; + ctrl->admin_tag_set.queue_depth = NVME_LOOP_AQ_BLKMQ_DEPTH; + ctrl->admin_tag_set.reserved_tags = 2; /* connect + keep-alive */ + ctrl->admin_tag_set.numa_node = NUMA_NO_NODE; + ctrl->admin_tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->admin_tag_set.driver_data = ctrl; + ctrl->admin_tag_set.nr_hw_queues = 1; + ctrl->admin_tag_set.timeout = ADMIN_TIMEOUT; + + ctrl->queues[0].ctrl = ctrl; + error = nvmet_sq_init(&ctrl->queues[0].nvme_sq); + if (error) + return error; + ctrl->queue_count = 1; + + error = blk_mq_alloc_tag_set(&ctrl->admin_tag_set); + if (error) + goto out_free_sq; + + ctrl->ctrl.admin_q = blk_mq_init_queue(&ctrl->admin_tag_set); + if (IS_ERR(ctrl->ctrl.admin_q)) { + error = PTR_ERR(ctrl->ctrl.admin_q); + goto out_free_tagset; + } + + error = nvmf_connect_admin_queue(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + error = nvmf_reg_read64(&ctrl->ctrl, NVME_REG_CAP, &ctrl->cap); + if (error) { + dev_err(ctrl->ctrl.device, + "prop_get NVME_REG_CAP failed\n"); + goto out_cleanup_queue; + } + + ctrl->ctrl.sqsize = + min_t(int, NVME_CAP_MQES(ctrl->cap) + 1, ctrl->ctrl.sqsize); + + error = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap); + if (error) + goto out_cleanup_queue; + + ctrl->ctrl.max_hw_sectors = + (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); + + error = nvme_init_identify(&ctrl->ctrl); + if (error) + goto out_cleanup_queue; + + nvme_start_keep_alive(&ctrl->ctrl); + + return 0; + +out_cleanup_queue: + blk_cleanup_queue(ctrl->ctrl.admin_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->admin_tag_set); +out_free_sq: + nvmet_sq_destroy(&ctrl->queues[0].nvme_sq); + return error; +} + +static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl) +{ + int i; + + nvme_stop_keep_alive(&ctrl->ctrl); + + if (ctrl->queue_count > 1) { + nvme_stop_queues(&ctrl->ctrl); + blk_mq_tagset_busy_iter(&ctrl->tag_set, + nvme_cancel_request, &ctrl->ctrl); + + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + } + + if (ctrl->ctrl.state == NVME_CTRL_LIVE) + nvme_shutdown_ctrl(&ctrl->ctrl); + + blk_mq_stop_hw_queues(ctrl->ctrl.admin_q); + blk_mq_tagset_busy_iter(&ctrl->admin_tag_set, + nvme_cancel_request, &ctrl->ctrl); + nvme_loop_destroy_admin_queue(ctrl); +} + +static void nvme_loop_del_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = container_of(work, + struct nvme_loop_ctrl, delete_work); + + nvme_remove_namespaces(&ctrl->ctrl); + nvme_loop_shutdown_ctrl(ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int __nvme_loop_del_ctrl(struct nvme_loop_ctrl *ctrl) +{ + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_DELETING)) + return -EBUSY; + + if (!schedule_work(&ctrl->delete_work)) + return -EBUSY; + + return 0; +} + +static int nvme_loop_del_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + int ret; + + ret = __nvme_loop_del_ctrl(ctrl); + if (ret) + return ret; + + flush_work(&ctrl->delete_work); + + return 0; +} + +static void nvme_loop_delete_ctrl(struct nvmet_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl; + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry(ctrl, &nvme_loop_ctrl_list, list) { + if (ctrl->ctrl.cntlid == nctrl->cntlid) + __nvme_loop_del_ctrl(ctrl); + } + mutex_unlock(&nvme_loop_ctrl_mutex); +} + +static void nvme_loop_reset_ctrl_work(struct work_struct *work) +{ + struct nvme_loop_ctrl *ctrl = container_of(work, + struct nvme_loop_ctrl, reset_work); + bool changed; + int i, ret; + + nvme_loop_shutdown_ctrl(ctrl); + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_disable; + + for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_free_queues; + + ctrl->queue_count++; + } + + for (i = 1; i <= ctrl->ctrl.opts->nr_io_queues; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + goto out_free_queues; + } + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + + nvme_start_queues(&ctrl->ctrl); + + return; + +out_free_queues: + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + nvme_loop_destroy_admin_queue(ctrl); +out_disable: + dev_warn(ctrl->ctrl.device, "Removing after reset failure\n"); + nvme_remove_namespaces(&ctrl->ctrl); + nvme_uninit_ctrl(&ctrl->ctrl); + nvme_put_ctrl(&ctrl->ctrl); +} + +static int nvme_loop_reset_ctrl(struct nvme_ctrl *nctrl) +{ + struct nvme_loop_ctrl *ctrl = to_loop_ctrl(nctrl); + + if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING)) + return -EBUSY; + + if (!schedule_work(&ctrl->reset_work)) + return -EBUSY; + + flush_work(&ctrl->reset_work); + + return 0; +} + +static const struct nvme_ctrl_ops nvme_loop_ctrl_ops = { + .name = "loop", + .module = THIS_MODULE, + .is_fabrics = true, + .reg_read32 = nvmf_reg_read32, + .reg_read64 = nvmf_reg_read64, + .reg_write32 = nvmf_reg_write32, + .reset_ctrl = nvme_loop_reset_ctrl, + .free_ctrl = nvme_loop_free_ctrl, + .submit_async_event = nvme_loop_submit_async_event, + .delete_ctrl = nvme_loop_del_ctrl, + .get_subsysnqn = nvmf_get_subsysnqn, +}; + +static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl) +{ + struct nvmf_ctrl_options *opts = ctrl->ctrl.opts; + int ret, i; + + ret = nvme_set_queue_count(&ctrl->ctrl, &opts->nr_io_queues); + if (ret || !opts->nr_io_queues) + return ret; + + dev_info(ctrl->ctrl.device, "creating %d I/O queues.\n", + opts->nr_io_queues); + + for (i = 1; i <= opts->nr_io_queues; i++) { + ctrl->queues[i].ctrl = ctrl; + ret = nvmet_sq_init(&ctrl->queues[i].nvme_sq); + if (ret) + goto out_destroy_queues; + + ctrl->queue_count++; + } + + memset(&ctrl->tag_set, 0, sizeof(ctrl->tag_set)); + ctrl->tag_set.ops = &nvme_loop_mq_ops; + ctrl->tag_set.queue_depth = ctrl->ctrl.sqsize; + ctrl->tag_set.reserved_tags = 1; /* fabric connect */ + ctrl->tag_set.numa_node = NUMA_NO_NODE; + ctrl->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ctrl->tag_set.cmd_size = sizeof(struct nvme_loop_iod) + + SG_CHUNK_SIZE * sizeof(struct scatterlist); + ctrl->tag_set.driver_data = ctrl; + ctrl->tag_set.nr_hw_queues = ctrl->queue_count - 1; + ctrl->tag_set.timeout = NVME_IO_TIMEOUT; + ctrl->ctrl.tagset = &ctrl->tag_set; + + ret = blk_mq_alloc_tag_set(&ctrl->tag_set); + if (ret) + goto out_destroy_queues; + + ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set); + if (IS_ERR(ctrl->ctrl.connect_q)) { + ret = PTR_ERR(ctrl->ctrl.connect_q); + goto out_free_tagset; + } + + for (i = 1; i <= opts->nr_io_queues; i++) { + ret = nvmf_connect_io_queue(&ctrl->ctrl, i); + if (ret) + goto out_cleanup_connect_q; + } + + return 0; + +out_cleanup_connect_q: + blk_cleanup_queue(ctrl->ctrl.connect_q); +out_free_tagset: + blk_mq_free_tag_set(&ctrl->tag_set); +out_destroy_queues: + for (i = 1; i < ctrl->queue_count; i++) + nvmet_sq_destroy(&ctrl->queues[i].nvme_sq); + return ret; +} + +static struct nvme_ctrl *nvme_loop_create_ctrl(struct device *dev, + struct nvmf_ctrl_options *opts) +{ + struct nvme_loop_ctrl *ctrl; + bool changed; + int ret; + + ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL); + if (!ctrl) + return ERR_PTR(-ENOMEM); + ctrl->ctrl.opts = opts; + INIT_LIST_HEAD(&ctrl->list); + + INIT_WORK(&ctrl->delete_work, nvme_loop_del_ctrl_work); + INIT_WORK(&ctrl->reset_work, nvme_loop_reset_ctrl_work); + + ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_loop_ctrl_ops, + 0 /* no quirks, we're perfect! */); + if (ret) + goto out_put_ctrl; + + spin_lock_init(&ctrl->lock); + + ret = -ENOMEM; + + ctrl->ctrl.sqsize = opts->queue_size; + ctrl->ctrl.kato = opts->kato; + + ctrl->queues = kcalloc(opts->nr_io_queues + 1, sizeof(*ctrl->queues), + GFP_KERNEL); + if (!ctrl->queues) + goto out_uninit_ctrl; + + ret = nvme_loop_configure_admin_queue(ctrl); + if (ret) + goto out_free_queues; + + if (opts->queue_size > ctrl->ctrl.maxcmd) { + /* warn if maxcmd is lower than queue_size */ + dev_warn(ctrl->ctrl.device, + "queue_size %zu > ctrl maxcmd %u, clamping down\n", + opts->queue_size, ctrl->ctrl.maxcmd); + opts->queue_size = ctrl->ctrl.maxcmd; + } + + if (opts->nr_io_queues) { + ret = nvme_loop_create_io_queues(ctrl); + if (ret) + goto out_remove_admin_queue; + } + + nvme_loop_init_iod(ctrl, &ctrl->async_event_iod, 0); + + dev_info(ctrl->ctrl.device, + "new ctrl: \"%s\"\n", ctrl->ctrl.opts->subsysnqn); + + kref_get(&ctrl->ctrl.kref); + + changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + WARN_ON_ONCE(!changed); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_add_tail(&ctrl->list, &nvme_loop_ctrl_list); + mutex_unlock(&nvme_loop_ctrl_mutex); + + if (opts->nr_io_queues) { + nvme_queue_scan(&ctrl->ctrl); + nvme_queue_async_events(&ctrl->ctrl); + } + + return &ctrl->ctrl; + +out_remove_admin_queue: + nvme_loop_destroy_admin_queue(ctrl); +out_free_queues: + kfree(ctrl->queues); +out_uninit_ctrl: + nvme_uninit_ctrl(&ctrl->ctrl); +out_put_ctrl: + nvme_put_ctrl(&ctrl->ctrl); + if (ret > 0) + ret = -EIO; + return ERR_PTR(ret); +} + +static int nvme_loop_add_port(struct nvmet_port *port) +{ + /* + * XXX: disalow adding more than one port so + * there is no connection rejections when a + * a subsystem is assigned to a port for which + * loop doesn't have a pointer. + * This scenario would be possible if we allowed + * more than one port to be added and a subsystem + * was assigned to a port other than nvmet_loop_port. + */ + + if (nvmet_loop_port) + return -EPERM; + + nvmet_loop_port = port; + return 0; +} + +static void nvme_loop_remove_port(struct nvmet_port *port) +{ + if (port == nvmet_loop_port) + nvmet_loop_port = NULL; +} + +static struct nvmet_fabrics_ops nvme_loop_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_LOOP, + .add_port = nvme_loop_add_port, + .remove_port = nvme_loop_remove_port, + .queue_response = nvme_loop_queue_response, + .delete_ctrl = nvme_loop_delete_ctrl, +}; + +static struct nvmf_transport_ops nvme_loop_transport = { + .name = "loop", + .create_ctrl = nvme_loop_create_ctrl, +}; + +static int __init nvme_loop_init_module(void) +{ + int ret; + + ret = nvmet_register_transport(&nvme_loop_ops); + if (ret) + return ret; + nvmf_register_transport(&nvme_loop_transport); + return 0; +} + +static void __exit nvme_loop_cleanup_module(void) +{ + struct nvme_loop_ctrl *ctrl, *next; + + nvmf_unregister_transport(&nvme_loop_transport); + nvmet_unregister_transport(&nvme_loop_ops); + + mutex_lock(&nvme_loop_ctrl_mutex); + list_for_each_entry_safe(ctrl, next, &nvme_loop_ctrl_list, list) + __nvme_loop_del_ctrl(ctrl); + mutex_unlock(&nvme_loop_ctrl_mutex); + + flush_scheduled_work(); +} + +module_init(nvme_loop_init_module); +module_exit(nvme_loop_cleanup_module); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-254"); /* 254 == NVMF_TRTYPE_LOOP */ diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h new file mode 100644 index 000000000000..57dd6d834c28 --- /dev/null +++ b/drivers/nvme/target/nvmet.h @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#ifndef _NVMET_H +#define _NVMET_H + +#include <linux/dma-mapping.h> +#include <linux/types.h> +#include <linux/device.h> +#include <linux/kref.h> +#include <linux/percpu-refcount.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/nvme.h> +#include <linux/configfs.h> +#include <linux/rcupdate.h> +#include <linux/blkdev.h> + +#define NVMET_ASYNC_EVENTS 4 +#define NVMET_ERROR_LOG_SLOTS 128 + +/* Helper Macros when NVMe error is NVME_SC_CONNECT_INVALID_PARAM + * The 16 bit shift is to set IATTR bit to 1, which means offending + * offset starts in the data section of connect() + */ +#define IPO_IATTR_CONNECT_DATA(x) \ + (cpu_to_le32((1 << 16) | (offsetof(struct nvmf_connect_data, x)))) +#define IPO_IATTR_CONNECT_SQE(x) \ + (cpu_to_le32(offsetof(struct nvmf_connect_command, x))) + +struct nvmet_ns { + struct list_head dev_link; + struct percpu_ref ref; + struct block_device *bdev; + u32 nsid; + u32 blksize_shift; + loff_t size; + u8 nguid[16]; + + struct nvmet_subsys *subsys; + const char *device_path; + + struct config_group device_group; + struct config_group group; + + struct completion disable_done; +}; + +static inline struct nvmet_ns *to_nvmet_ns(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_ns, group); +} + +static inline bool nvmet_ns_enabled(struct nvmet_ns *ns) +{ + return !list_empty_careful(&ns->dev_link); +} + +struct nvmet_cq { + u16 qid; + u16 size; +}; + +struct nvmet_sq { + struct nvmet_ctrl *ctrl; + struct percpu_ref ref; + u16 qid; + u16 size; + struct completion free_done; +}; + +/** + * struct nvmet_port - Common structure to keep port + * information for the target. + * @entry: List head for holding a list of these elements. + * @disc_addr: Address information is stored in a format defined + * for a discovery log page entry. + * @group: ConfigFS group for this element's folder. + * @priv: Private data for the transport. + */ +struct nvmet_port { + struct list_head entry; + struct nvmf_disc_rsp_page_entry disc_addr; + struct config_group group; + struct config_group subsys_group; + struct list_head subsystems; + struct config_group referrals_group; + struct list_head referrals; + void *priv; + bool enabled; +}; + +static inline struct nvmet_port *to_nvmet_port(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_port, + group); +} + +struct nvmet_ctrl { + struct nvmet_subsys *subsys; + struct nvmet_cq **cqs; + struct nvmet_sq **sqs; + + struct mutex lock; + u64 cap; + u32 cc; + u32 csts; + + u16 cntlid; + u32 kato; + + struct nvmet_req *async_event_cmds[NVMET_ASYNC_EVENTS]; + unsigned int nr_async_event_cmds; + struct list_head async_events; + struct work_struct async_event_work; + + struct list_head subsys_entry; + struct kref ref; + struct delayed_work ka_work; + struct work_struct fatal_err_work; + + struct nvmet_fabrics_ops *ops; + + char subsysnqn[NVMF_NQN_FIELD_LEN]; + char hostnqn[NVMF_NQN_FIELD_LEN]; +}; + +struct nvmet_subsys { + enum nvme_subsys_type type; + + struct mutex lock; + struct kref ref; + + struct list_head namespaces; + unsigned int max_nsid; + + struct list_head ctrls; + struct ida cntlid_ida; + + struct list_head hosts; + bool allow_any_host; + + u16 max_qid; + + u64 ver; + char *subsysnqn; + + struct config_group group; + + struct config_group namespaces_group; + struct config_group allowed_hosts_group; +}; + +static inline struct nvmet_subsys *to_subsys(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, group); +} + +static inline struct nvmet_subsys *namespaces_to_subsys( + struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_subsys, + namespaces_group); +} + +struct nvmet_host { + struct config_group group; +}; + +static inline struct nvmet_host *to_host(struct config_item *item) +{ + return container_of(to_config_group(item), struct nvmet_host, group); +} + +static inline char *nvmet_host_name(struct nvmet_host *host) +{ + return config_item_name(&host->group.cg_item); +} + +struct nvmet_host_link { + struct list_head entry; + struct nvmet_host *host; +}; + +struct nvmet_subsys_link { + struct list_head entry; + struct nvmet_subsys *subsys; +}; + +struct nvmet_req; +struct nvmet_fabrics_ops { + struct module *owner; + unsigned int type; + unsigned int sqe_inline_size; + unsigned int msdbd; + bool has_keyed_sgls : 1; + void (*queue_response)(struct nvmet_req *req); + int (*add_port)(struct nvmet_port *port); + void (*remove_port)(struct nvmet_port *port); + void (*delete_ctrl)(struct nvmet_ctrl *ctrl); +}; + +#define NVMET_MAX_INLINE_BIOVEC 8 + +struct nvmet_req { + struct nvme_command *cmd; + struct nvme_completion *rsp; + struct nvmet_sq *sq; + struct nvmet_cq *cq; + struct nvmet_ns *ns; + struct scatterlist *sg; + struct bio inline_bio; + struct bio_vec inline_bvec[NVMET_MAX_INLINE_BIOVEC]; + int sg_cnt; + size_t data_len; + + struct nvmet_port *port; + + void (*execute)(struct nvmet_req *req); + struct nvmet_fabrics_ops *ops; +}; + +static inline void nvmet_set_status(struct nvmet_req *req, u16 status) +{ + req->rsp->status = cpu_to_le16(status << 1); +} + +static inline void nvmet_set_result(struct nvmet_req *req, u32 result) +{ + req->rsp->result = cpu_to_le32(result); +} + +/* + * NVMe command writes actually are DMA reads for us on the target side. + */ +static inline enum dma_data_direction +nvmet_data_dir(struct nvmet_req *req) +{ + return nvme_is_write(req->cmd) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; +} + +struct nvmet_async_event { + struct list_head entry; + u8 event_type; + u8 event_info; + u8 log_page; +}; + +int nvmet_parse_connect_cmd(struct nvmet_req *req); +int nvmet_parse_io_cmd(struct nvmet_req *req); +int nvmet_parse_admin_cmd(struct nvmet_req *req); +int nvmet_parse_discovery_cmd(struct nvmet_req *req); +int nvmet_parse_fabrics_cmd(struct nvmet_req *req); + +bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, + struct nvmet_sq *sq, struct nvmet_fabrics_ops *ops); +void nvmet_req_complete(struct nvmet_req *req, u16 status); + +void nvmet_cq_setup(struct nvmet_ctrl *ctrl, struct nvmet_cq *cq, u16 qid, + u16 size); +void nvmet_sq_setup(struct nvmet_ctrl *ctrl, struct nvmet_sq *sq, u16 qid, + u16 size); +void nvmet_sq_destroy(struct nvmet_sq *sq); +int nvmet_sq_init(struct nvmet_sq *sq); + +void nvmet_ctrl_fatal_error(struct nvmet_ctrl *ctrl); + +void nvmet_update_cc(struct nvmet_ctrl *ctrl, u32 new); +u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn, + struct nvmet_req *req, u32 kato, struct nvmet_ctrl **ctrlp); +u16 nvmet_ctrl_find_get(const char *subsysnqn, const char *hostnqn, u16 cntlid, + struct nvmet_req *req, struct nvmet_ctrl **ret); +void nvmet_ctrl_put(struct nvmet_ctrl *ctrl); + +struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn, + enum nvme_subsys_type type); +void nvmet_subsys_put(struct nvmet_subsys *subsys); + +struct nvmet_ns *nvmet_find_namespace(struct nvmet_ctrl *ctrl, __le32 nsid); +void nvmet_put_namespace(struct nvmet_ns *ns); +int nvmet_ns_enable(struct nvmet_ns *ns); +void nvmet_ns_disable(struct nvmet_ns *ns); +struct nvmet_ns *nvmet_ns_alloc(struct nvmet_subsys *subsys, u32 nsid); +void nvmet_ns_free(struct nvmet_ns *ns); + +int nvmet_register_transport(struct nvmet_fabrics_ops *ops); +void nvmet_unregister_transport(struct nvmet_fabrics_ops *ops); + +int nvmet_enable_port(struct nvmet_port *port); +void nvmet_disable_port(struct nvmet_port *port); + +void nvmet_referral_enable(struct nvmet_port *parent, struct nvmet_port *port); +void nvmet_referral_disable(struct nvmet_port *port); + +u16 nvmet_copy_to_sgl(struct nvmet_req *req, off_t off, const void *buf, + size_t len); +u16 nvmet_copy_from_sgl(struct nvmet_req *req, off_t off, void *buf, + size_t len); + +u32 nvmet_get_log_page_len(struct nvme_command *cmd); + +#define NVMET_QUEUE_SIZE 1024 +#define NVMET_NR_QUEUES 64 +#define NVMET_MAX_CMD NVMET_QUEUE_SIZE +#define NVMET_KAS 10 +#define NVMET_DISC_KATO 120 + +int __init nvmet_init_configfs(void); +void __exit nvmet_exit_configfs(void); + +int __init nvmet_init_discovery(void); +void nvmet_exit_discovery(void); + +extern struct nvmet_subsys *nvmet_disc_subsys; +extern u64 nvmet_genctr; +extern struct rw_semaphore nvmet_config_sem; + +bool nvmet_host_allowed(struct nvmet_req *req, struct nvmet_subsys *subsys, + const char *hostnqn); + +#endif /* _NVMET_H */ diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c new file mode 100644 index 000000000000..e06d504bdf0c --- /dev/null +++ b/drivers/nvme/target/rdma.c @@ -0,0 +1,1448 @@ +/* + * NVMe over Fabrics RDMA target. + * Copyright (c) 2015-2016 HGST, a Western Digital Company. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/atomic.h> +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/nvme.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/wait.h> +#include <linux/inet.h> +#include <asm/unaligned.h> + +#include <rdma/ib_verbs.h> +#include <rdma/rdma_cm.h> +#include <rdma/rw.h> + +#include <linux/nvme-rdma.h> +#include "nvmet.h" + +/* + * We allow up to a page of inline data to go with the SQE + */ +#define NVMET_RDMA_INLINE_DATA_SIZE PAGE_SIZE + +struct nvmet_rdma_cmd { + struct ib_sge sge[2]; + struct ib_cqe cqe; + struct ib_recv_wr wr; + struct scatterlist inline_sg; + struct page *inline_page; + struct nvme_command *nvme_cmd; + struct nvmet_rdma_queue *queue; +}; + +enum { + NVMET_RDMA_REQ_INLINE_DATA = (1 << 0), + NVMET_RDMA_REQ_INVALIDATE_RKEY = (1 << 1), +}; + +struct nvmet_rdma_rsp { + struct ib_sge send_sge; + struct ib_cqe send_cqe; + struct ib_send_wr send_wr; + + struct nvmet_rdma_cmd *cmd; + struct nvmet_rdma_queue *queue; + + struct ib_cqe read_cqe; + struct rdma_rw_ctx rw; + + struct nvmet_req req; + + u8 n_rdma; + u32 flags; + u32 invalidate_rkey; + + struct list_head wait_list; + struct list_head free_list; +}; + +enum nvmet_rdma_queue_state { + NVMET_RDMA_Q_CONNECTING, + NVMET_RDMA_Q_LIVE, + NVMET_RDMA_Q_DISCONNECTING, +}; + +struct nvmet_rdma_queue { + struct rdma_cm_id *cm_id; + struct nvmet_port *port; + struct ib_cq *cq; + atomic_t sq_wr_avail; + struct nvmet_rdma_device *dev; + spinlock_t state_lock; + enum nvmet_rdma_queue_state state; + struct nvmet_cq nvme_cq; + struct nvmet_sq nvme_sq; + + struct nvmet_rdma_rsp *rsps; + struct list_head free_rsps; + spinlock_t rsps_lock; + struct nvmet_rdma_cmd *cmds; + + struct work_struct release_work; + struct list_head rsp_wait_list; + struct list_head rsp_wr_wait_list; + spinlock_t rsp_wr_wait_lock; + + int idx; + int host_qid; + int recv_queue_size; + int send_queue_size; + + struct list_head queue_list; +}; + +struct nvmet_rdma_device { + struct ib_device *device; + struct ib_pd *pd; + struct ib_srq *srq; + struct nvmet_rdma_cmd *srq_cmds; + size_t srq_size; + struct kref ref; + struct list_head entry; +}; + +static bool nvmet_rdma_use_srq; +module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444); +MODULE_PARM_DESC(use_srq, "Use shared receive queue."); + +static DEFINE_IDA(nvmet_rdma_queue_ida); +static LIST_HEAD(nvmet_rdma_queue_list); +static DEFINE_MUTEX(nvmet_rdma_queue_mutex); + +static LIST_HEAD(device_list); +static DEFINE_MUTEX(device_list_mutex); + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp); +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc); +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv); +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue); + +static struct nvmet_fabrics_ops nvmet_rdma_ops; + +/* XXX: really should move to a generic header sooner or later.. */ +static inline u32 get_unaligned_le24(const u8 *p) +{ + return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16; +} + +static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp) +{ + return nvme_is_write(rsp->req.cmd) && + rsp->req.data_len && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp) +{ + return !nvme_is_write(rsp->req.cmd) && + rsp->req.data_len && + !rsp->req.rsp->status && + !(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA); +} + +static inline struct nvmet_rdma_rsp * +nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_rsp *rsp; + unsigned long flags; + + spin_lock_irqsave(&queue->rsps_lock, flags); + rsp = list_first_entry(&queue->free_rsps, + struct nvmet_rdma_rsp, free_list); + list_del(&rsp->free_list); + spin_unlock_irqrestore(&queue->rsps_lock, flags); + + return rsp; +} + +static inline void +nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp) +{ + unsigned long flags; + + spin_lock_irqsave(&rsp->queue->rsps_lock, flags); + list_add_tail(&rsp->free_list, &rsp->queue->free_rsps); + spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags); +} + +static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents) +{ + struct scatterlist *sg; + int count; + + if (!sgl || !nents) + return; + + for_each_sg(sgl, sg, nents, count) + __free_page(sg_page(sg)); + kfree(sgl); +} + +static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, + u32 length) +{ + struct scatterlist *sg; + struct page *page; + unsigned int nent; + int i = 0; + + nent = DIV_ROUND_UP(length, PAGE_SIZE); + sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL); + if (!sg) + goto out; + + sg_init_table(sg, nent); + + while (length) { + u32 page_len = min_t(u32, length, PAGE_SIZE); + + page = alloc_page(GFP_KERNEL); + if (!page) + goto out_free_pages; + + sg_set_page(&sg[i], page, page_len, 0); + length -= page_len; + i++; + } + *sgl = sg; + *nents = nent; + return 0; + +out_free_pages: + while (i > 0) { + i--; + __free_page(sg_page(&sg[i])); + } + kfree(sg); +out: + return NVME_SC_INTERNAL; +} + +static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + /* NVMe command / RDMA RECV */ + c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL); + if (!c->nvme_cmd) + goto out; + + c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[0].addr)) + goto out_free_cmd; + + c->sge[0].length = sizeof(*c->nvme_cmd); + c->sge[0].lkey = ndev->pd->local_dma_lkey; + + if (!admin) { + c->inline_page = alloc_pages(GFP_KERNEL, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + if (!c->inline_page) + goto out_unmap_cmd; + c->sge[1].addr = ib_dma_map_page(ndev->device, + c->inline_page, 0, NVMET_RDMA_INLINE_DATA_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(ndev->device, c->sge[1].addr)) + goto out_free_inline_page; + c->sge[1].length = NVMET_RDMA_INLINE_DATA_SIZE; + c->sge[1].lkey = ndev->pd->local_dma_lkey; + } + + c->cqe.done = nvmet_rdma_recv_done; + + c->wr.wr_cqe = &c->cqe; + c->wr.sg_list = c->sge; + c->wr.num_sge = admin ? 1 : 2; + + return 0; + +out_free_inline_page: + if (!admin) { + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } +out_unmap_cmd: + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); +out_free_cmd: + kfree(c->nvme_cmd); + +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *c, bool admin) +{ + if (!admin) { + ib_dma_unmap_page(ndev->device, c->sge[1].addr, + NVMET_RDMA_INLINE_DATA_SIZE, DMA_FROM_DEVICE); + __free_pages(c->inline_page, + get_order(NVMET_RDMA_INLINE_DATA_SIZE)); + } + ib_dma_unmap_single(ndev->device, c->sge[0].addr, + sizeof(*c->nvme_cmd), DMA_FROM_DEVICE); + kfree(c->nvme_cmd); +} + +static struct nvmet_rdma_cmd * +nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev, + int nr_cmds, bool admin) +{ + struct nvmet_rdma_cmd *cmds; + int ret = -EINVAL, i; + + cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL); + if (!cmds) + goto out; + + for (i = 0; i < nr_cmds; i++) { + ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin); + if (ret) + goto out_free; + } + + return cmds; + +out_free: + while (--i >= 0) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +out: + return ERR_PTR(ret); +} + +static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin) +{ + int i; + + for (i = 0; i < nr_cmds; i++) + nvmet_rdma_free_cmd(ndev, cmds + i, admin); + kfree(cmds); +} + +static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + /* NVMe CQE / RDMA SEND */ + r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL); + if (!r->req.rsp) + goto out; + + r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + if (ib_dma_mapping_error(ndev->device, r->send_sge.addr)) + goto out_free_rsp; + + r->send_sge.length = sizeof(*r->req.rsp); + r->send_sge.lkey = ndev->pd->local_dma_lkey; + + r->send_cqe.done = nvmet_rdma_send_done; + + r->send_wr.wr_cqe = &r->send_cqe; + r->send_wr.sg_list = &r->send_sge; + r->send_wr.num_sge = 1; + r->send_wr.send_flags = IB_SEND_SIGNALED; + + /* Data In / RDMA READ */ + r->read_cqe.done = nvmet_rdma_read_data_done; + return 0; + +out_free_rsp: + kfree(r->req.rsp); +out: + return -ENOMEM; +} + +static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_rsp *r) +{ + ib_dma_unmap_single(ndev->device, r->send_sge.addr, + sizeof(*r->req.rsp), DMA_TO_DEVICE); + kfree(r->req.rsp); +} + +static int +nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int nr_rsps = queue->recv_queue_size * 2; + int ret = -EINVAL, i; + + queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp), + GFP_KERNEL); + if (!queue->rsps) + goto out; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + ret = nvmet_rdma_alloc_rsp(ndev, rsp); + if (ret) + goto out_free; + + list_add_tail(&rsp->free_list, &queue->free_rsps); + } + + return 0; + +out_free: + while (--i >= 0) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +out: + return ret; +} + +static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue) +{ + struct nvmet_rdma_device *ndev = queue->dev; + int i, nr_rsps = queue->recv_queue_size * 2; + + for (i = 0; i < nr_rsps; i++) { + struct nvmet_rdma_rsp *rsp = &queue->rsps[i]; + + list_del(&rsp->free_list); + nvmet_rdma_free_rsp(ndev, rsp); + } + kfree(queue->rsps); +} + +static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev, + struct nvmet_rdma_cmd *cmd) +{ + struct ib_recv_wr *bad_wr; + + if (ndev->srq) + return ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr); + return ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr); +} + +static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue) +{ + spin_lock(&queue->rsp_wr_wait_lock); + while (!list_empty(&queue->rsp_wr_wait_list)) { + struct nvmet_rdma_rsp *rsp; + bool ret; + + rsp = list_entry(queue->rsp_wr_wait_list.next, + struct nvmet_rdma_rsp, wait_list); + list_del(&rsp->wait_list); + + spin_unlock(&queue->rsp_wr_wait_lock); + ret = nvmet_rdma_execute_command(rsp); + spin_lock(&queue->rsp_wr_wait_lock); + + if (!ret) { + list_add(&rsp->wait_list, &queue->rsp_wr_wait_list); + break; + } + } + spin_unlock(&queue->rsp_wr_wait_lock); +} + + +static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + + if (rsp->n_rdma) { + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + } + + if (rsp->req.sg != &rsp->cmd->inline_sg) + nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt); + + if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list))) + nvmet_rdma_process_wr_wait_list(queue); + + nvmet_rdma_put_rsp(rsp); +} + +static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue) +{ + if (queue->nvme_sq.ctrl) { + nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl); + } else { + /* + * we didn't setup the controller yet in case + * of admin connect error, just disconnect and + * cleanup the queue + */ + nvmet_rdma_queue_disconnect(queue); + } +} + +static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe); + + nvmet_rdma_release_rsp(rsp); + + if (unlikely(wc->status != IB_WC_SUCCESS && + wc->status != IB_WC_WR_FLUSH_ERR)) { + pr_err("SEND for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(rsp->queue); + } +} + +static void nvmet_rdma_queue_response(struct nvmet_req *req) +{ + struct nvmet_rdma_rsp *rsp = + container_of(req, struct nvmet_rdma_rsp, req); + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + struct ib_send_wr *first_wr, *bad_wr; + + if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) { + rsp->send_wr.opcode = IB_WR_SEND_WITH_INV; + rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey; + } else { + rsp->send_wr.opcode = IB_WR_SEND; + } + + if (nvmet_rdma_need_data_out(rsp)) + first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp, + cm_id->port_num, NULL, &rsp->send_wr); + else + first_wr = &rsp->send_wr; + + nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd); + if (ib_post_send(cm_id->qp, first_wr, &bad_wr)) { + pr_err("sending cmd response failed\n"); + nvmet_rdma_release_rsp(rsp); + } +} + +static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_rsp *rsp = + container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + + WARN_ON(rsp->n_rdma <= 0); + atomic_add(rsp->n_rdma, &queue->sq_wr_avail); + rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, rsp->req.sg, + rsp->req.sg_cnt, nvmet_data_dir(&rsp->req)); + rsp->n_rdma = 0; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + nvmet_rdma_release_rsp(rsp); + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + rsp->req.execute(&rsp->req); +} + +static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len, + u64 off) +{ + sg_init_table(&rsp->cmd->inline_sg, 1); + sg_set_page(&rsp->cmd->inline_sg, rsp->cmd->inline_page, len, off); + rsp->req.sg = &rsp->cmd->inline_sg; + rsp->req.sg_cnt = 1; +} + +static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl; + u64 off = le64_to_cpu(sgl->addr); + u32 len = le32_to_cpu(sgl->length); + + if (!nvme_is_write(rsp->req.cmd)) + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + + if (off + len > NVMET_RDMA_INLINE_DATA_SIZE) { + pr_err("invalid inline data offset!\n"); + return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR; + } + + /* no data command? */ + if (!len) + return 0; + + nvmet_rdma_use_inline_sg(rsp, len, off); + rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA; + return 0; +} + +static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp, + struct nvme_keyed_sgl_desc *sgl, bool invalidate) +{ + struct rdma_cm_id *cm_id = rsp->queue->cm_id; + u64 addr = le64_to_cpu(sgl->addr); + u32 len = get_unaligned_le24(sgl->length); + u32 key = get_unaligned_le32(sgl->key); + int ret; + u16 status; + + /* no data command? */ + if (!len) + return 0; + + /* use the already allocated data buffer if possible */ + if (len <= NVMET_RDMA_INLINE_DATA_SIZE && rsp->queue->host_qid) { + nvmet_rdma_use_inline_sg(rsp, len, 0); + } else { + status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt, + len); + if (status) + return status; + } + + ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num, + rsp->req.sg, rsp->req.sg_cnt, 0, addr, key, + nvmet_data_dir(&rsp->req)); + if (ret < 0) + return NVME_SC_INTERNAL; + rsp->n_rdma += ret; + + if (invalidate) { + rsp->invalidate_rkey = key; + rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY; + } + + return 0; +} + +static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp) +{ + struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl; + + switch (sgl->type >> 4) { + case NVME_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_OFFSET: + return nvmet_rdma_map_sgl_inline(rsp); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + case NVME_KEY_SGL_FMT_DATA_DESC: + switch (sgl->type & 0xf) { + case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, true); + case NVME_SGL_FMT_ADDRESS: + return nvmet_rdma_map_sgl_keyed(rsp, sgl, false); + default: + pr_err("invalid SGL subtype: %#x\n", sgl->type); + return NVME_SC_INVALID_FIELD | NVME_SC_DNR; + } + default: + pr_err("invalid SGL type: %#x\n", sgl->type); + return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR; + } +} + +static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp) +{ + struct nvmet_rdma_queue *queue = rsp->queue; + + if (unlikely(atomic_sub_return(1 + rsp->n_rdma, + &queue->sq_wr_avail) < 0)) { + pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n", + 1 + rsp->n_rdma, queue->idx, + queue->nvme_sq.ctrl->cntlid); + atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail); + return false; + } + + if (nvmet_rdma_need_data_in(rsp)) { + if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp, + queue->cm_id->port_num, &rsp->read_cqe, NULL)) + nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR); + } else { + rsp->req.execute(&rsp->req); + } + + return true; +} + +static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue, + struct nvmet_rdma_rsp *cmd) +{ + u16 status; + + cmd->queue = queue; + cmd->n_rdma = 0; + cmd->req.port = queue->port; + + if (!nvmet_req_init(&cmd->req, &queue->nvme_cq, + &queue->nvme_sq, &nvmet_rdma_ops)) + return; + + status = nvmet_rdma_map_sgl(cmd); + if (status) + goto out_err; + + if (unlikely(!nvmet_rdma_execute_command(cmd))) { + spin_lock(&queue->rsp_wr_wait_lock); + list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list); + spin_unlock(&queue->rsp_wr_wait_lock); + } + + return; + +out_err: + nvmet_req_complete(&cmd->req, status); +} + +static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct nvmet_rdma_cmd *cmd = + container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe); + struct nvmet_rdma_queue *queue = cq->cq_context; + struct nvmet_rdma_rsp *rsp; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + if (wc->status != IB_WC_WR_FLUSH_ERR) { + pr_err("RECV for CQE 0x%p failed with status %s (%d)\n", + wc->wr_cqe, ib_wc_status_msg(wc->status), + wc->status); + nvmet_rdma_error_comp(queue); + } + return; + } + + if (unlikely(wc->byte_len < sizeof(struct nvme_command))) { + pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n"); + nvmet_rdma_error_comp(queue); + return; + } + + cmd->queue = queue; + rsp = nvmet_rdma_get_rsp(queue); + rsp->cmd = cmd; + rsp->flags = 0; + rsp->req.cmd = cmd->nvme_cmd; + + if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) { + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state == NVMET_RDMA_Q_CONNECTING) + list_add_tail(&rsp->wait_list, &queue->rsp_wait_list); + else + nvmet_rdma_put_rsp(rsp); + spin_unlock_irqrestore(&queue->state_lock, flags); + return; + } + + nvmet_rdma_handle_command(queue, rsp); +} + +static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev) +{ + if (!ndev->srq) + return; + + nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false); + ib_destroy_srq(ndev->srq); +} + +static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev) +{ + struct ib_srq_init_attr srq_attr = { NULL, }; + struct ib_srq *srq; + size_t srq_size; + int ret, i; + + srq_size = 4095; /* XXX: tune */ + + srq_attr.attr.max_wr = srq_size; + srq_attr.attr.max_sge = 2; + srq_attr.attr.srq_limit = 0; + srq_attr.srq_type = IB_SRQT_BASIC; + srq = ib_create_srq(ndev->pd, &srq_attr); + if (IS_ERR(srq)) { + /* + * If SRQs aren't supported we just go ahead and use normal + * non-shared receive queues. + */ + pr_info("SRQ requested but not supported.\n"); + return 0; + } + + ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false); + if (IS_ERR(ndev->srq_cmds)) { + ret = PTR_ERR(ndev->srq_cmds); + goto out_destroy_srq; + } + + ndev->srq = srq; + ndev->srq_size = srq_size; + + for (i = 0; i < srq_size; i++) + nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]); + + return 0; + +out_destroy_srq: + ib_destroy_srq(srq); + return ret; +} + +static void nvmet_rdma_free_dev(struct kref *ref) +{ + struct nvmet_rdma_device *ndev = + container_of(ref, struct nvmet_rdma_device, ref); + + mutex_lock(&device_list_mutex); + list_del(&ndev->entry); + mutex_unlock(&device_list_mutex); + + nvmet_rdma_destroy_srq(ndev); + ib_dealloc_pd(ndev->pd); + + kfree(ndev); +} + +static struct nvmet_rdma_device * +nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id) +{ + struct nvmet_rdma_device *ndev; + int ret; + + mutex_lock(&device_list_mutex); + list_for_each_entry(ndev, &device_list, entry) { + if (ndev->device->node_guid == cm_id->device->node_guid && + kref_get_unless_zero(&ndev->ref)) + goto out_unlock; + } + + ndev = kzalloc(sizeof(*ndev), GFP_KERNEL); + if (!ndev) + goto out_err; + + ndev->device = cm_id->device; + kref_init(&ndev->ref); + + ndev->pd = ib_alloc_pd(ndev->device); + if (IS_ERR(ndev->pd)) + goto out_free_dev; + + if (nvmet_rdma_use_srq) { + ret = nvmet_rdma_init_srq(ndev); + if (ret) + goto out_free_pd; + } + + list_add(&ndev->entry, &device_list); +out_unlock: + mutex_unlock(&device_list_mutex); + pr_debug("added %s.\n", ndev->device->name); + return ndev; + +out_free_pd: + ib_dealloc_pd(ndev->pd); +out_free_dev: + kfree(ndev); +out_err: + mutex_unlock(&device_list_mutex); + return NULL; +} + +static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue) +{ + struct ib_qp_init_attr qp_attr; + struct nvmet_rdma_device *ndev = queue->dev; + int comp_vector, nr_cqe, ret, i; + + /* + * Spread the io queues across completion vectors, + * but still keep all admin queues on vector 0. + */ + comp_vector = !queue->host_qid ? 0 : + queue->idx % ndev->device->num_comp_vectors; + + /* + * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND. + */ + nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size; + + queue->cq = ib_alloc_cq(ndev->device, queue, + nr_cqe + 1, comp_vector, + IB_POLL_WORKQUEUE); + if (IS_ERR(queue->cq)) { + ret = PTR_ERR(queue->cq); + pr_err("failed to create CQ cqe= %d ret= %d\n", + nr_cqe + 1, ret); + goto out; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_context = queue; + qp_attr.event_handler = nvmet_rdma_qp_event; + qp_attr.send_cq = queue->cq; + qp_attr.recv_cq = queue->cq; + qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; + qp_attr.qp_type = IB_QPT_RC; + /* +1 for drain */ + qp_attr.cap.max_send_wr = queue->send_queue_size + 1; + qp_attr.cap.max_rdma_ctxs = queue->send_queue_size; + qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd, + ndev->device->attrs.max_sge); + + if (ndev->srq) { + qp_attr.srq = ndev->srq; + } else { + /* +1 for drain */ + qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size; + qp_attr.cap.max_recv_sge = 2; + } + + ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr); + if (ret) { + pr_err("failed to create_qp ret= %d\n", ret); + goto err_destroy_cq; + } + + atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr); + + pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n", + __func__, queue->cq->cqe, qp_attr.cap.max_send_sge, + qp_attr.cap.max_send_wr, queue->cm_id); + + if (!ndev->srq) { + for (i = 0; i < queue->recv_queue_size; i++) { + queue->cmds[i].queue = queue; + nvmet_rdma_post_recv(ndev, &queue->cmds[i]); + } + } + +out: + return ret; + +err_destroy_cq: + ib_free_cq(queue->cq); + goto out; +} + +static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue) +{ + rdma_destroy_qp(queue->cm_id); + ib_free_cq(queue->cq); +} + +static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue) +{ + pr_info("freeing queue %d\n", queue->idx); + + nvmet_sq_destroy(&queue->nvme_sq); + + nvmet_rdma_destroy_queue_ib(queue); + if (!queue->dev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } + nvmet_rdma_free_rsps(queue); + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); + kfree(queue); +} + +static void nvmet_rdma_release_queue_work(struct work_struct *w) +{ + struct nvmet_rdma_queue *queue = + container_of(w, struct nvmet_rdma_queue, release_work); + struct rdma_cm_id *cm_id = queue->cm_id; + struct nvmet_rdma_device *dev = queue->dev; + + nvmet_rdma_free_queue(queue); + rdma_destroy_id(cm_id); + kref_put(&dev->ref, nvmet_rdma_free_dev); +} + +static int +nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn, + struct nvmet_rdma_queue *queue) +{ + struct nvme_rdma_cm_req *req; + + req = (struct nvme_rdma_cm_req *)conn->private_data; + if (!req || conn->private_data_len == 0) + return NVME_RDMA_CM_INVALID_LEN; + + if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0) + return NVME_RDMA_CM_INVALID_RECFMT; + + queue->host_qid = le16_to_cpu(req->qid); + + /* + * req->hsqsize corresponds to our recv queue size + * req->hrqsize corresponds to our send queue size + */ + queue->recv_queue_size = le16_to_cpu(req->hsqsize); + queue->send_queue_size = le16_to_cpu(req->hrqsize); + + if (!queue->host_qid && queue->recv_queue_size > NVMF_AQ_DEPTH) + return NVME_RDMA_CM_INVALID_HSQSIZE; + + /* XXX: Should we enforce some kind of max for IO queues? */ + + return 0; +} + +static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id, + enum nvme_rdma_cm_status status) +{ + struct nvme_rdma_cm_rej rej; + + rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + rej.sts = cpu_to_le16(status); + + return rdma_reject(cm_id, (void *)&rej, sizeof(rej)); +} + +static struct nvmet_rdma_queue * +nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev, + struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue; + int ret; + + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_reject; + } + + ret = nvmet_sq_init(&queue->nvme_sq); + if (ret) + goto out_free_queue; + + ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue); + if (ret) + goto out_destroy_sq; + + /* + * Schedules the actual release because calling rdma_destroy_id from + * inside a CM callback would trigger a deadlock. (great API design..) + */ + INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work); + queue->dev = ndev; + queue->cm_id = cm_id; + + spin_lock_init(&queue->state_lock); + queue->state = NVMET_RDMA_Q_CONNECTING; + INIT_LIST_HEAD(&queue->rsp_wait_list); + INIT_LIST_HEAD(&queue->rsp_wr_wait_list); + spin_lock_init(&queue->rsp_wr_wait_lock); + INIT_LIST_HEAD(&queue->free_rsps); + spin_lock_init(&queue->rsps_lock); + + queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL); + if (queue->idx < 0) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_queue; + } + + ret = nvmet_rdma_alloc_rsps(queue); + if (ret) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_ida_remove; + } + + if (!ndev->srq) { + queue->cmds = nvmet_rdma_alloc_cmds(ndev, + queue->recv_queue_size, + !queue->host_qid); + if (IS_ERR(queue->cmds)) { + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_responses; + } + } + + ret = nvmet_rdma_create_queue_ib(queue); + if (ret) { + pr_err("%s: creating RDMA queue failed (%d).\n", + __func__, ret); + ret = NVME_RDMA_CM_NO_RSC; + goto out_free_cmds; + } + + return queue; + +out_free_cmds: + if (!ndev->srq) { + nvmet_rdma_free_cmds(queue->dev, queue->cmds, + queue->recv_queue_size, + !queue->host_qid); + } +out_free_responses: + nvmet_rdma_free_rsps(queue); +out_ida_remove: + ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx); +out_destroy_sq: + nvmet_sq_destroy(&queue->nvme_sq); +out_free_queue: + kfree(queue); +out_reject: + nvmet_rdma_cm_reject(cm_id, ret); + return NULL; +} + +static void nvmet_rdma_qp_event(struct ib_event *event, void *priv) +{ + struct nvmet_rdma_queue *queue = priv; + + switch (event->event) { + case IB_EVENT_COMM_EST: + rdma_notify(queue->cm_id, event->event); + break; + default: + pr_err("received unrecognized IB QP event %d\n", event->event); + break; + } +} + +static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue, + struct rdma_conn_param *p) +{ + struct rdma_conn_param param = { }; + struct nvme_rdma_cm_rep priv = { }; + int ret = -ENOMEM; + + param.rnr_retry_count = 7; + param.flow_control = 1; + param.initiator_depth = min_t(u8, p->initiator_depth, + queue->dev->device->attrs.max_qp_init_rd_atom); + param.private_data = &priv; + param.private_data_len = sizeof(priv); + priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0); + priv.crqsize = cpu_to_le16(queue->recv_queue_size); + + ret = rdma_accept(cm_id, ¶m); + if (ret) + pr_err("rdma_accept failed (error code = %d)\n", ret); + + return ret; +} + +static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_device *ndev; + struct nvmet_rdma_queue *queue; + int ret = -EINVAL; + + ndev = nvmet_rdma_find_get_device(cm_id); + if (!ndev) { + pr_err("no client data!\n"); + nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC); + return -ECONNREFUSED; + } + + queue = nvmet_rdma_alloc_queue(ndev, cm_id, event); + if (!queue) { + ret = -ENOMEM; + goto put_device; + } + queue->port = cm_id->context; + + ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn); + if (ret) + goto release_queue; + + mutex_lock(&nvmet_rdma_queue_mutex); + list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + return 0; + +release_queue: + nvmet_rdma_free_queue(queue); +put_device: + kref_put(&ndev->ref, nvmet_rdma_free_dev); + + return ret; +} + +static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue) +{ + unsigned long flags; + + spin_lock_irqsave(&queue->state_lock, flags); + if (queue->state != NVMET_RDMA_Q_CONNECTING) { + pr_warn("trying to establish a connected queue\n"); + goto out_unlock; + } + queue->state = NVMET_RDMA_Q_LIVE; + + while (!list_empty(&queue->rsp_wait_list)) { + struct nvmet_rdma_rsp *cmd; + + cmd = list_first_entry(&queue->rsp_wait_list, + struct nvmet_rdma_rsp, wait_list); + list_del(&cmd->wait_list); + + spin_unlock_irqrestore(&queue->state_lock, flags); + nvmet_rdma_handle_command(queue, cmd); + spin_lock_irqsave(&queue->state_lock, flags); + } + +out_unlock: + spin_unlock_irqrestore(&queue->state_lock, flags); +} + +static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + unsigned long flags; + + pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state); + + spin_lock_irqsave(&queue->state_lock, flags); + switch (queue->state) { + case NVMET_RDMA_Q_CONNECTING: + case NVMET_RDMA_Q_LIVE: + disconnect = true; + queue->state = NVMET_RDMA_Q_DISCONNECTING; + break; + case NVMET_RDMA_Q_DISCONNECTING: + break; + } + spin_unlock_irqrestore(&queue->state_lock, flags); + + if (disconnect) { + rdma_disconnect(queue->cm_id); + ib_drain_qp(queue->cm_id->qp); + schedule_work(&queue->release_work); + } +} + +static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue) +{ + bool disconnect = false; + + mutex_lock(&nvmet_rdma_queue_mutex); + if (!list_empty(&queue->queue_list)) { + list_del_init(&queue->queue_list); + disconnect = true; + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + if (disconnect) + __nvmet_rdma_queue_disconnect(queue); +} + +static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id, + struct nvmet_rdma_queue *queue) +{ + WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING); + + pr_err("failed to connect queue\n"); + schedule_work(&queue->release_work); +} + +static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id, + struct rdma_cm_event *event) +{ + struct nvmet_rdma_queue *queue = NULL; + int ret = 0; + + if (cm_id->qp) + queue = cm_id->qp->qp_context; + + pr_debug("%s (%d): status %d id %p\n", + rdma_event_msg(event->event), event->event, + event->status, cm_id); + + switch (event->event) { + case RDMA_CM_EVENT_CONNECT_REQUEST: + ret = nvmet_rdma_queue_connect(cm_id, event); + break; + case RDMA_CM_EVENT_ESTABLISHED: + nvmet_rdma_queue_established(queue); + break; + case RDMA_CM_EVENT_ADDR_CHANGE: + case RDMA_CM_EVENT_DISCONNECTED: + case RDMA_CM_EVENT_DEVICE_REMOVAL: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + /* + * We can get the device removal callback even for a + * CM ID that we aren't actually using. In that case + * the context pointer is NULL, so we shouldn't try + * to disconnect a non-existing queue. But we also + * need to return 1 so that the core will destroy + * it's own ID. What a great API design.. + */ + if (queue) + nvmet_rdma_queue_disconnect(queue); + else + ret = 1; + break; + case RDMA_CM_EVENT_REJECTED: + case RDMA_CM_EVENT_UNREACHABLE: + case RDMA_CM_EVENT_CONNECT_ERROR: + nvmet_rdma_queue_connect_fail(cm_id, queue); + break; + default: + pr_err("received unrecognized RDMA CM event %d\n", + event->event); + break; + } + + return ret; +} + +static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl) +{ + struct nvmet_rdma_queue *queue; + +restart: + mutex_lock(&nvmet_rdma_queue_mutex); + list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) { + if (queue->nvme_sq.ctrl == ctrl) { + list_del_init(&queue->queue_list); + mutex_unlock(&nvmet_rdma_queue_mutex); + + __nvmet_rdma_queue_disconnect(queue); + goto restart; + } + } + mutex_unlock(&nvmet_rdma_queue_mutex); +} + +static int nvmet_rdma_add_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id; + struct sockaddr_in addr_in; + u16 port_in; + int ret; + + switch (port->disc_addr.adrfam) { + case NVMF_ADDR_FAMILY_IP4: + break; + default: + pr_err("address family %d not supported\n", + port->disc_addr.adrfam); + return -EINVAL; + } + + ret = kstrtou16(port->disc_addr.trsvcid, 0, &port_in); + if (ret) + return ret; + + addr_in.sin_family = AF_INET; + addr_in.sin_addr.s_addr = in_aton(port->disc_addr.traddr); + addr_in.sin_port = htons(port_in); + + cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port, + RDMA_PS_TCP, IB_QPT_RC); + if (IS_ERR(cm_id)) { + pr_err("CM ID creation failed\n"); + return PTR_ERR(cm_id); + } + + ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr_in); + if (ret) { + pr_err("binding CM ID to %pISpc failed (%d)\n", &addr_in, ret); + goto out_destroy_id; + } + + ret = rdma_listen(cm_id, 128); + if (ret) { + pr_err("listening to %pISpc failed (%d)\n", &addr_in, ret); + goto out_destroy_id; + } + + pr_info("enabling port %d (%pISpc)\n", + le16_to_cpu(port->disc_addr.portid), &addr_in); + port->priv = cm_id; + return 0; + +out_destroy_id: + rdma_destroy_id(cm_id); + return ret; +} + +static void nvmet_rdma_remove_port(struct nvmet_port *port) +{ + struct rdma_cm_id *cm_id = port->priv; + + rdma_destroy_id(cm_id); +} + +static struct nvmet_fabrics_ops nvmet_rdma_ops = { + .owner = THIS_MODULE, + .type = NVMF_TRTYPE_RDMA, + .sqe_inline_size = NVMET_RDMA_INLINE_DATA_SIZE, + .msdbd = 1, + .has_keyed_sgls = 1, + .add_port = nvmet_rdma_add_port, + .remove_port = nvmet_rdma_remove_port, + .queue_response = nvmet_rdma_queue_response, + .delete_ctrl = nvmet_rdma_delete_ctrl, +}; + +static int __init nvmet_rdma_init(void) +{ + return nvmet_register_transport(&nvmet_rdma_ops); +} + +static void __exit nvmet_rdma_exit(void) +{ + struct nvmet_rdma_queue *queue; + + nvmet_unregister_transport(&nvmet_rdma_ops); + + flush_scheduled_work(); + + mutex_lock(&nvmet_rdma_queue_mutex); + while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list, + struct nvmet_rdma_queue, queue_list))) { + list_del_init(&queue->queue_list); + + mutex_unlock(&nvmet_rdma_queue_mutex); + __nvmet_rdma_queue_disconnect(queue); + mutex_lock(&nvmet_rdma_queue_mutex); + } + mutex_unlock(&nvmet_rdma_queue_mutex); + + flush_scheduled_work(); + ida_destroy(&nvmet_rdma_queue_ida); +} + +module_init(nvmet_rdma_init); +module_exit(nvmet_rdma_exit); + +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */ |