1 files changed, 211 insertions, 68 deletions
diff --git a/drivers/scsi/lpfc/lpfc_nvmet.c b/drivers/scsi/lpfc/lpfc_nvmet.c
index bbbd0f84160d..0b7c1a49e203 100644
--- a/drivers/scsi/lpfc/lpfc_nvmet.c
+++ b/drivers/scsi/lpfc/lpfc_nvmet.c
@@ -170,12 +170,14 @@ lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf)
 	struct lpfc_nvmet_tgtport *tgtp;
 	struct fc_frame_header *fc_hdr;
 	struct rqb_dmabuf *nvmebuf;
+	struct lpfc_nvmet_ctx_info *infop;
 	uint32_t *payload;
 	uint32_t size, oxid, sid, rc;
+	int cpu;
 	unsigned long iflag;
 
 	if (ctxp->txrdy) {
-		pci_pool_free(phba->txrdy_payload_pool, ctxp->txrdy,
+		dma_pool_free(phba->txrdy_payload_pool, ctxp->txrdy,
 			      ctxp->txrdy_phys);
 		ctxp->txrdy = NULL;
 		ctxp->txrdy_phys = 0;
@@ -267,11 +269,16 @@ lpfc_nvmet_ctxbuf_post(struct lpfc_hba *phba, struct lpfc_nvmet_ctxbuf *ctx_buf)
 	}
 	spin_unlock_irqrestore(&phba->sli4_hba.nvmet_io_wait_lock, iflag);
 
-	spin_lock_irqsave(&phba->sli4_hba.nvmet_ctx_put_lock, iflag);
-	list_add_tail(&ctx_buf->list,
-		      &phba->sli4_hba.lpfc_nvmet_ctx_put_list);
-	phba->sli4_hba.nvmet_ctx_put_cnt++;
-	spin_unlock_irqrestore(&phba->sli4_hba.nvmet_ctx_put_lock, iflag);
+	/*
+	 * Use the CPU context list, from the MRQ the IO was received on
+	 * (ctxp->idx), to save context structure.
+	 */
+	cpu = smp_processor_id();
+	infop = lpfc_get_ctx_list(phba, cpu, ctxp->idx);
+	spin_lock_irqsave(&infop->nvmet_ctx_list_lock, iflag);
+	list_add_tail(&ctx_buf->list, &infop->nvmet_ctx_list);
+	infop->nvmet_ctx_list_cnt++;
+	spin_unlock_irqrestore(&infop->nvmet_ctx_list_lock, iflag);
 #endif
 }
 
@@ -552,7 +559,7 @@ lpfc_nvmet_xmt_fcp_op_cmp(struct lpfc_hba *phba, struct lpfc_iocbq *cmdwqe,
 		/* lpfc_nvmet_xmt_fcp_release() will recycle the context */
 	} else {
 		ctxp->entry_cnt++;
-		start_clean = offsetof(struct lpfc_iocbq, wqe);
+		start_clean = offsetof(struct lpfc_iocbq, iocb_flag);
 		memset(((char *)cmdwqe) + start_clean, 0,
 		       (sizeof(struct lpfc_iocbq) - start_clean));
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
@@ -879,51 +886,54 @@ static struct nvmet_fc_target_template lpfc_tgttemplate = {
 };
 
 static void
-lpfc_nvmet_cleanup_io_context(struct lpfc_hba *phba)
+__lpfc_nvmet_clean_io_for_cpu(struct lpfc_hba *phba,
+		struct lpfc_nvmet_ctx_info *infop)
 {
 	struct lpfc_nvmet_ctxbuf *ctx_buf, *next_ctx_buf;
 	unsigned long flags;
 
-	spin_lock_irqsave(&phba->sli4_hba.nvmet_ctx_get_lock, flags);
-	spin_lock(&phba->sli4_hba.nvmet_ctx_put_lock);
+	spin_lock_irqsave(&infop->nvmet_ctx_list_lock, flags);
 	list_for_each_entry_safe(ctx_buf, next_ctx_buf,
-			&phba->sli4_hba.lpfc_nvmet_ctx_get_list, list) {
+				&infop->nvmet_ctx_list, list) {
 		spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
 		list_del_init(&ctx_buf->list);
 		spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
-		__lpfc_clear_active_sglq(phba,
-					 ctx_buf->sglq->sli4_lxritag);
+
+		__lpfc_clear_active_sglq(phba, ctx_buf->sglq->sli4_lxritag);
 		ctx_buf->sglq->state = SGL_FREED;
 		ctx_buf->sglq->ndlp = NULL;
 
 		spin_lock(&phba->sli4_hba.sgl_list_lock);
 		list_add_tail(&ctx_buf->sglq->list,
-			      &phba->sli4_hba.lpfc_nvmet_sgl_list);
+				&phba->sli4_hba.lpfc_nvmet_sgl_list);
 		spin_unlock(&phba->sli4_hba.sgl_list_lock);
 
 		lpfc_sli_release_iocbq(phba, ctx_buf->iocbq);
 		kfree(ctx_buf->context);
 	}
-	list_for_each_entry_safe(ctx_buf, next_ctx_buf,
-			&phba->sli4_hba.lpfc_nvmet_ctx_put_list, list) {
-		spin_lock(&phba->sli4_hba.abts_nvme_buf_list_lock);
-		list_del_init(&ctx_buf->list);
-		spin_unlock(&phba->sli4_hba.abts_nvme_buf_list_lock);
-		__lpfc_clear_active_sglq(phba,
-					 ctx_buf->sglq->sli4_lxritag);
-		ctx_buf->sglq->state = SGL_FREED;
-		ctx_buf->sglq->ndlp = NULL;
+	spin_unlock_irqrestore(&infop->nvmet_ctx_list_lock, flags);
+}
 
-		spin_lock(&phba->sli4_hba.sgl_list_lock);
-		list_add_tail(&ctx_buf->sglq->list,
-			      &phba->sli4_hba.lpfc_nvmet_sgl_list);
-		spin_unlock(&phba->sli4_hba.sgl_list_lock);
+static void
+lpfc_nvmet_cleanup_io_context(struct lpfc_hba *phba)
+{
+	struct lpfc_nvmet_ctx_info *infop;
+	int i, j;
 
-		lpfc_sli_release_iocbq(phba, ctx_buf->iocbq);
-		kfree(ctx_buf->context);
+	/* The first context list, MRQ 0 CPU 0 */
+	infop = phba->sli4_hba.nvmet_ctx_info;
+	if (!infop)
+		return;
+
+	/* Cycle the the entire CPU context list for every MRQ */
+	for (i = 0; i < phba->cfg_nvmet_mrq; i++) {
+		for (j = 0; j < phba->sli4_hba.num_present_cpu; j++) {
+			__lpfc_nvmet_clean_io_for_cpu(phba, infop);
+			infop++; /* next */
+		}
 	}
-	spin_unlock(&phba->sli4_hba.nvmet_ctx_put_lock);
-	spin_unlock_irqrestore(&phba->sli4_hba.nvmet_ctx_get_lock, flags);
+	kfree(phba->sli4_hba.nvmet_ctx_info);
+	phba->sli4_hba.nvmet_ctx_info = NULL;
 }
 
 static int
@@ -932,15 +942,71 @@ lpfc_nvmet_setup_io_context(struct lpfc_hba *phba)
 	struct lpfc_nvmet_ctxbuf *ctx_buf;
 	struct lpfc_iocbq *nvmewqe;
 	union lpfc_wqe128 *wqe;
-	int i;
+	struct lpfc_nvmet_ctx_info *last_infop;
+	struct lpfc_nvmet_ctx_info *infop;
+	int i, j, idx;
 
 	lpfc_printf_log(phba, KERN_INFO, LOG_NVME,
 			"6403 Allocate NVMET resources for %d XRIs\n",
 			phba->sli4_hba.nvmet_xri_cnt);
 
+	phba->sli4_hba.nvmet_ctx_info = kcalloc(
+		phba->sli4_hba.num_present_cpu * phba->cfg_nvmet_mrq,
+		sizeof(struct lpfc_nvmet_ctx_info), GFP_KERNEL);
+	if (!phba->sli4_hba.nvmet_ctx_info) {
+		lpfc_printf_log(phba, KERN_ERR, LOG_INIT,
+				"6419 Failed allocate memory for "
+				"nvmet context lists\n");
+		return -ENOMEM;
+	}
+
+	/*
+	 * Assuming X CPUs in the system, and Y MRQs, allocate some
+	 * lpfc_nvmet_ctx_info structures as follows:
+	 *
+	 * cpu0/mrq0 cpu1/mrq0 ... cpuX/mrq0
+	 * cpu0/mrq1 cpu1/mrq1 ... cpuX/mrq1
+	 * ...
+	 * cpuX/mrqY cpuX/mrqY ... cpuX/mrqY
+	 *
+	 * Each line represents a MRQ "silo" containing an entry for
+	 * every CPU.
+	 *
+	 * MRQ X is initially assumed to be associated with CPU X, thus
+	 * contexts are initially distributed across all MRQs using
+	 * the MRQ index (N) as follows cpuN/mrqN. When contexts are
+	 * freed, the are freed to the MRQ silo based on the CPU number
+	 * of the IO completion. Thus a context that was allocated for MRQ A
+	 * whose IO completed on CPU B will be freed to cpuB/mrqA.
+	 */
+	infop = phba->sli4_hba.nvmet_ctx_info;
+	for (i = 0; i < phba->sli4_hba.num_present_cpu; i++) {
+		for (j = 0; j < phba->cfg_nvmet_mrq; j++) {
+			INIT_LIST_HEAD(&infop->nvmet_ctx_list);
+			spin_lock_init(&infop->nvmet_ctx_list_lock);
+			infop->nvmet_ctx_list_cnt = 0;
+			infop++;
+		}
+	}
+
+	/*
+	 * Setup the next CPU context info ptr for each MRQ.
+	 * MRQ 0 will cycle thru CPUs 0 - X separately from
+	 * MRQ 1 cycling thru CPUs 0 - X, and so on.
+	 */
+	for (j = 0; j < phba->cfg_nvmet_mrq; j++) {
+		last_infop = lpfc_get_ctx_list(phba, 0, j);
+		for (i = phba->sli4_hba.num_present_cpu - 1;  i >= 0; i--) {
+			infop = lpfc_get_ctx_list(phba, i, j);
+			infop->nvmet_ctx_next_cpu = last_infop;
+			last_infop = infop;
+		}
+	}
+
 	/* For all nvmet xris, allocate resources needed to process a
 	 * received command on a per xri basis.
 	 */
+	idx = 0;
 	for (i = 0; i < phba->sli4_hba.nvmet_xri_cnt; i++) {
 		ctx_buf = kzalloc(sizeof(*ctx_buf), GFP_KERNEL);
 		if (!ctx_buf) {
@@ -977,7 +1043,6 @@ lpfc_nvmet_setup_io_context(struct lpfc_hba *phba)
 		/* Word 7 */
 		bf_set(wqe_ct, &wqe->generic.wqe_com, SLI4_CT_RPI);
 		bf_set(wqe_class, &wqe->generic.wqe_com, CLASS3);
-		bf_set(wqe_pu, &wqe->generic.wqe_com, 1);
 		/* Word 10 */
 		bf_set(wqe_nvme, &wqe->fcp_tsend.wqe_com, 1);
 		bf_set(wqe_ebde_cnt, &wqe->generic.wqe_com, 0);
@@ -995,12 +1060,35 @@ lpfc_nvmet_setup_io_context(struct lpfc_hba *phba)
 					"6407 Ran out of NVMET XRIs\n");
 			return -ENOMEM;
 		}
-		spin_lock(&phba->sli4_hba.nvmet_ctx_get_lock);
-		list_add_tail(&ctx_buf->list,
-			      &phba->sli4_hba.lpfc_nvmet_ctx_get_list);
-		spin_unlock(&phba->sli4_hba.nvmet_ctx_get_lock);
+
+		/*
+		 * Add ctx to MRQidx context list. Our initial assumption
+		 * is MRQidx will be associated with CPUidx. This association
+		 * can change on the fly.
+		 */
+		infop = lpfc_get_ctx_list(phba, idx, idx);
+		spin_lock(&infop->nvmet_ctx_list_lock);
+		list_add_tail(&ctx_buf->list, &infop->nvmet_ctx_list);
+		infop->nvmet_ctx_list_cnt++;
+		spin_unlock(&infop->nvmet_ctx_list_lock);
+
+		/* Spread ctx structures evenly across all MRQs */
+		idx++;
+		if (idx >= phba->cfg_nvmet_mrq)
+			idx = 0;
+	}
+
+	infop = phba->sli4_hba.nvmet_ctx_info;
+	for (j = 0; j < phba->cfg_nvmet_mrq; j++) {
+		for (i = 0; i < phba->sli4_hba.num_present_cpu; i++) {
+			lpfc_printf_log(phba, KERN_INFO, LOG_NVME | LOG_INIT,
+					"6408 TOTAL NVMET ctx for CPU %d "
+					"MRQ %d: cnt %d nextcpu %p\n",
+					i, j, infop->nvmet_ctx_list_cnt,
+					infop->nvmet_ctx_next_cpu);
+			infop++;
+		}
 	}
-	phba->sli4_hba.nvmet_ctx_get_cnt = phba->sli4_hba.nvmet_xri_cnt;
 	return 0;
 }
 
@@ -1365,10 +1453,65 @@ dropit:
 #endif
 }
 
+static struct lpfc_nvmet_ctxbuf *
+lpfc_nvmet_replenish_context(struct lpfc_hba *phba,
+			     struct lpfc_nvmet_ctx_info *current_infop)
+{
+	struct lpfc_nvmet_ctxbuf *ctx_buf = NULL;
+	struct lpfc_nvmet_ctx_info *get_infop;
+	int i;
+
+	/*
+	 * The current_infop for the MRQ a NVME command IU was received
+	 * on is empty. Our goal is to replenish this MRQs context
+	 * list from a another CPUs.
+	 *
+	 * First we need to pick a context list to start looking on.
+	 * nvmet_ctx_start_cpu has available context the last time
+	 * we needed to replenish this CPU where nvmet_ctx_next_cpu
+	 * is just the next sequential CPU for this MRQ.
+	 */
+	if (current_infop->nvmet_ctx_start_cpu)
+		get_infop = current_infop->nvmet_ctx_start_cpu;
+	else
+		get_infop = current_infop->nvmet_ctx_next_cpu;
+
+	for (i = 0; i < phba->sli4_hba.num_present_cpu; i++) {
+		if (get_infop == current_infop) {
+			get_infop = get_infop->nvmet_ctx_next_cpu;
+			continue;
+		}
+		spin_lock(&get_infop->nvmet_ctx_list_lock);
+
+		/* Just take the entire context list, if there are any */
+		if (get_infop->nvmet_ctx_list_cnt) {
+			list_splice_init(&get_infop->nvmet_ctx_list,
+				    &current_infop->nvmet_ctx_list);
+			current_infop->nvmet_ctx_list_cnt =
+				get_infop->nvmet_ctx_list_cnt - 1;
+			get_infop->nvmet_ctx_list_cnt = 0;
+			spin_unlock(&get_infop->nvmet_ctx_list_lock);
+
+			current_infop->nvmet_ctx_start_cpu = get_infop;
+			list_remove_head(&current_infop->nvmet_ctx_list,
+					 ctx_buf, struct lpfc_nvmet_ctxbuf,
+					 list);
+			return ctx_buf;
+		}
+
+		/* Otherwise, move on to the next CPU for this MRQ */
+		spin_unlock(&get_infop->nvmet_ctx_list_lock);
+		get_infop = get_infop->nvmet_ctx_next_cpu;
+	}
+
+	/* Nothing found, all contexts for the MRQ are in-flight */
+	return NULL;
+}
+
 /**
  * lpfc_nvmet_unsol_fcp_buffer - Process an unsolicited event data buffer
  * @phba: pointer to lpfc hba data structure.
- * @pring: pointer to a SLI ring.
+ * @idx: relative index of MRQ vector
  * @nvmebuf: pointer to lpfc nvme command HBQ data structure.
  *
  * This routine is used for processing the WQE associated with a unsolicited
@@ -1380,22 +1523,26 @@ dropit:
  **/
 static void
 lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba,
-			    struct lpfc_sli_ring *pring,
+			    uint32_t idx,
 			    struct rqb_dmabuf *nvmebuf,
 			    uint64_t isr_timestamp)
 {
-#if (IS_ENABLED(CONFIG_NVME_TARGET_FC))
 	struct lpfc_nvmet_rcv_ctx *ctxp;
 	struct lpfc_nvmet_tgtport *tgtp;
 	struct fc_frame_header *fc_hdr;
 	struct lpfc_nvmet_ctxbuf *ctx_buf;
+	struct lpfc_nvmet_ctx_info *current_infop;
 	uint32_t *payload;
 	uint32_t size, oxid, sid, rc, qno;
 	unsigned long iflag;
+	int current_cpu;
 #ifdef CONFIG_SCSI_LPFC_DEBUG_FS
 	uint32_t id;
 #endif
 
+	if (!IS_ENABLED(CONFIG_NVME_TARGET_FC))
+		return;
+
 	ctx_buf = NULL;
 	if (!nvmebuf || !phba->targetport) {
 		lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
@@ -1407,31 +1554,24 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba,
 		goto dropit;
 	}
 
-	spin_lock_irqsave(&phba->sli4_hba.nvmet_ctx_get_lock, iflag);
-	if (phba->sli4_hba.nvmet_ctx_get_cnt) {
-		list_remove_head(&phba->sli4_hba.lpfc_nvmet_ctx_get_list,
+	/*
+	 * Get a pointer to the context list for this MRQ based on
+	 * the CPU this MRQ IRQ is associated with. If the CPU association
+	 * changes from our initial assumption, the context list could
+	 * be empty, thus it would need to be replenished with the
+	 * context list from another CPU for this MRQ.
+	 */
+	current_cpu = smp_processor_id();
+	current_infop = lpfc_get_ctx_list(phba, current_cpu, idx);
+	spin_lock_irqsave(&current_infop->nvmet_ctx_list_lock, iflag);
+	if (current_infop->nvmet_ctx_list_cnt) {
+		list_remove_head(&current_infop->nvmet_ctx_list,
 				 ctx_buf, struct lpfc_nvmet_ctxbuf, list);
-		phba->sli4_hba.nvmet_ctx_get_cnt--;
+		current_infop->nvmet_ctx_list_cnt--;
 	} else {
-		spin_lock(&phba->sli4_hba.nvmet_ctx_put_lock);
-		if (phba->sli4_hba.nvmet_ctx_put_cnt) {
-			list_splice(&phba->sli4_hba.lpfc_nvmet_ctx_put_list,
-				    &phba->sli4_hba.lpfc_nvmet_ctx_get_list);
-			INIT_LIST_HEAD(&phba->sli4_hba.lpfc_nvmet_ctx_put_list);
-			phba->sli4_hba.nvmet_ctx_get_cnt =
-				phba->sli4_hba.nvmet_ctx_put_cnt;
-			phba->sli4_hba.nvmet_ctx_put_cnt = 0;
-			spin_unlock(&phba->sli4_hba.nvmet_ctx_put_lock);
-
-			list_remove_head(
-				&phba->sli4_hba.lpfc_nvmet_ctx_get_list,
-				ctx_buf, struct lpfc_nvmet_ctxbuf, list);
-			phba->sli4_hba.nvmet_ctx_get_cnt--;
-		} else {
-			spin_unlock(&phba->sli4_hba.nvmet_ctx_put_lock);
-		}
+		ctx_buf = lpfc_nvmet_replenish_context(phba, current_infop);
 	}
-	spin_unlock_irqrestore(&phba->sli4_hba.nvmet_ctx_get_lock, iflag);
+	spin_unlock_irqrestore(&current_infop->nvmet_ctx_list_lock, iflag);
 
 	fc_hdr = (struct fc_frame_header *)(nvmebuf->hbuf.virt);
 	oxid = be16_to_cpu(fc_hdr->fh_ox_id);
@@ -1483,6 +1623,7 @@ lpfc_nvmet_unsol_fcp_buffer(struct lpfc_hba *phba,
 	ctxp->size = size;
 	ctxp->oxid = oxid;
 	ctxp->sid = sid;
+	ctxp->idx = idx;
 	ctxp->state = LPFC_NVMET_STE_RCV;
 	ctxp->entry_cnt = 1;
 	ctxp->flag = 0;
@@ -1556,7 +1697,6 @@ dropit:
 
 	if (nvmebuf)
 		lpfc_rq_buf_free(phba, &nvmebuf->hbuf); /* repost */
-#endif
 }
 
 /**
@@ -1591,7 +1731,7 @@ lpfc_nvmet_unsol_ls_event(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
 /**
  * lpfc_nvmet_unsol_fcp_event - Process an unsolicited event from an nvme nport
  * @phba: pointer to lpfc hba data structure.
- * @pring: pointer to a SLI ring.
+ * @idx: relative index of MRQ vector
  * @nvmebuf: pointer to received nvme data structure.
  *
  * This routine is used to process an unsolicited event received from a SLI
@@ -1602,7 +1742,7 @@ lpfc_nvmet_unsol_ls_event(struct lpfc_hba *phba, struct lpfc_sli_ring *pring,
  **/
 void
 lpfc_nvmet_unsol_fcp_event(struct lpfc_hba *phba,
-			   struct lpfc_sli_ring *pring,
+			   uint32_t idx,
 			   struct rqb_dmabuf *nvmebuf,
 			   uint64_t isr_timestamp)
 {
@@ -1610,7 +1750,7 @@ lpfc_nvmet_unsol_fcp_event(struct lpfc_hba *phba,
 		lpfc_rq_buf_free(phba, &nvmebuf->hbuf);
 		return;
 	}
-	lpfc_nvmet_unsol_fcp_buffer(phba, pring, nvmebuf,
+	lpfc_nvmet_unsol_fcp_buffer(phba, idx, nvmebuf,
 				    isr_timestamp);
 }
 
@@ -1863,6 +2003,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
 		       nvmewqe->sli4_xritag);
 
 		/* Word 7 */
+		bf_set(wqe_pu, &wqe->fcp_tsend.wqe_com, 1);
 		bf_set(wqe_cmnd, &wqe->fcp_tsend.wqe_com, CMD_FCP_TSEND64_WQE);
 
 		/* Word 8 */
@@ -1939,7 +2080,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
 
 	case NVMET_FCOP_WRITEDATA:
 		/* Words 0 - 2 : The first sg segment */
-		txrdy = pci_pool_alloc(phba->txrdy_payload_pool,
+		txrdy = dma_pool_alloc(phba->txrdy_payload_pool,
 				       GFP_KERNEL, &physaddr);
 		if (!txrdy) {
 			lpfc_printf_log(phba, KERN_ERR, LOG_NVME_IOERR,
@@ -1971,6 +2112,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
 		       nvmewqe->sli4_xritag);
 
 		/* Word 7 */
+		bf_set(wqe_pu, &wqe->fcp_treceive.wqe_com, 1);
 		bf_set(wqe_ar, &wqe->fcp_treceive.wqe_com, 0);
 		bf_set(wqe_cmnd, &wqe->fcp_treceive.wqe_com,
 		       CMD_FCP_TRECEIVE64_WQE);
@@ -2054,6 +2196,7 @@ lpfc_nvmet_prep_fcp_wqe(struct lpfc_hba *phba,
 		       nvmewqe->sli4_xritag);
 
 		/* Word 7 */
+		bf_set(wqe_pu, &wqe->fcp_trsp.wqe_com, 0);
 		bf_set(wqe_ag, &wqe->fcp_trsp.wqe_com, 1);
 		bf_set(wqe_cmnd, &wqe->fcp_trsp.wqe_com, CMD_FCP_TRSP64_WQE);