sfc: On MCDI timeout, issue an FLR (and mark MCDI to fail-fast)

When an MCDI command times out (whether or not we find it completed when we poll), call efx_mcdi_abandon(), which tells all subsequent MCDI calls to fail-fast, and queues up an FLR. Because an FLR doesn't lead to receiving any reboot event from the MC (unlike most other types of reset), we have to call efx_ef10_reset_mc_allocations(). In efx_start_all(), if a reset (of any kind) is pending, we bail out. Without this, attempts to reconfigure (e.g. change mtu) can cause driver/mc state inconsistency if the first MCDI call triggers an FLR. For similar reasons, on EF10, in efx_reset_down(method=RESET_TYPE_MCDI_TIMEOUT), set the number of active queues to zero before calling efx_stop_all(). And, on farch, in efx_reset_up(method=RESET_TYPE_MCDI_TIMEOUT), set active_queues and flushes pending & outstanding to zero. efx_mcdi_mode_{poll,event}() should not take us out of fail-fast mode. Instead, this is done by efx_mcdi_reset() after the FLR completes. The new FLR reset_type RESET_TYPE_MCDI_TIMEOUT doesn't really fit into the hierarchy of reset 'scopes' whereby efx_reset() decides some resets subsume others. Thus, it uses separate logic. Also, fixed up some inconsistency around RESET_TYPE_MC_BIST, which was in the wrong place in that hierarchy. Signed-off-by: Shradha Shah <sshah@solarflare.com> Signed-off-by: David S. Miller <davem@davemloft.net>
author: Edward Cree <ecree@solarflare.com> 2014-04-16 19:27:48 +0100
committer: David S. Miller <davem@davemloft.net> 2014-04-16 14:33:57 -0400
commit: e283546c0465dd3026bc94f7b1a9de7f6b8969ec (patch)
tree: 3828d4faeed3986b0f01b93416b910b11cd33280 /drivers/net/ethernet/sfc/mcdi.c
parent: 10ec34fcb100412ab186c141a9c3557d1270effd (diff)
1 files changed, 44 insertions, 11 deletions
diff --git a/drivers/net/ethernet/sfc/mcdi.c b/drivers/net/ethernet/sfc/mcdi.c
index 7bd4b14bf3b3..5239cf9bdc56 100644
--- a/drivers/net/ethernet/sfc/mcdi.c
+++ b/drivers/net/ethernet/sfc/mcdi.c
@@ -52,12 +52,7 @@ static void efx_mcdi_timeout_async(unsigned long context);
 static int efx_mcdi_drv_attach(struct efx_nic *efx, bool driver_operating,
 			       bool *was_attached_out);
 static bool efx_mcdi_poll_once(struct efx_nic *efx);
-
-static inline struct efx_mcdi_iface *efx_mcdi(struct efx_nic *efx)
-{
-	EFX_BUG_ON_PARANOID(!efx->mcdi);
-	return &efx->mcdi->iface;
-}
+static void efx_mcdi_abandon(struct efx_nic *efx);
 
 int efx_mcdi_init(struct efx_nic *efx)
 {
@@ -558,6 +553,8 @@ static int _efx_mcdi_rpc_finish(struct efx_nic *efx, unsigned cmd, size_t inlen,
 			rc = 0;
 		}
 
+		efx_mcdi_abandon(efx);
+
 		/* Close the race with efx_mcdi_ev_cpl() executing just too late
 		 * and completing a request we've just cancelled, by ensuring
 		 * that the seqno check therein fails.
@@ -672,6 +669,9 @@ int efx_mcdi_rpc_start(struct efx_nic *efx, unsigned cmd,
 	if (efx->mc_bist_for_other_fn)
 		return -ENETDOWN;
 
+	if (mcdi->mode == MCDI_MODE_FAIL)
+		return -ENETDOWN;
+
 	efx_mcdi_acquire_sync(mcdi);
 	efx_mcdi_send_request(efx, cmd, inbuf, inlen);
 	return 0;
@@ -812,7 +812,11 @@ void efx_mcdi_mode_poll(struct efx_nic *efx)
 		return;
 
 	mcdi = efx_mcdi(efx);
-	if (mcdi->mode == MCDI_MODE_POLL)
+	/* If already in polling mode, nothing to do.
+	 * If in fail-fast state, don't switch to polled completion.
+	 * FLR recovery will do that later.
+	 */
+	if (mcdi->mode == MCDI_MODE_POLL || mcdi->mode == MCDI_MODE_FAIL)
 		return;
 
 	/* We can switch from event completion to polled completion, because
@@ -841,8 +845,8 @@ void efx_mcdi_flush_async(struct efx_nic *efx)
 
 	mcdi = efx_mcdi(efx);
 
-	/* We must be in polling mode so no more requests can be queued */
-	BUG_ON(mcdi->mode != MCDI_MODE_POLL);
+	/* We must be in poll or fail mode so no more requests can be queued */
+	BUG_ON(mcdi->mode == MCDI_MODE_EVENTS);
 
 	del_timer_sync(&mcdi->async_timer);
 
@@ -875,8 +879,11 @@ void efx_mcdi_mode_event(struct efx_nic *efx)
 		return;
 
 	mcdi = efx_mcdi(efx);
-
-	if (mcdi->mode == MCDI_MODE_EVENTS)
+	/* If already in event completion mode, nothing to do.
+	 * If in fail-fast state, don't switch to event completion.  FLR
+	 * recovery will do that later.
+	 */
+	if (mcdi->mode == MCDI_MODE_EVENTS || mcdi->mode == MCDI_MODE_FAIL)
 		return;
 
 	/* We can't switch from polled to event completion in the middle of a
@@ -966,6 +973,19 @@ static void efx_mcdi_ev_bist(struct efx_nic *efx)
 	spin_unlock(&mcdi->iface_lock);
 }
 
+/* MCDI timeouts seen: put the interface into MCDI_MODE_FAIL so that new
+ * requests fail fast, and schedule RESET_TYPE_MCDI_TIMEOUT to try to recover.
+ */
+static void efx_mcdi_abandon(struct efx_nic *efx)
+{
+	struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+
+	if (xchg(&mcdi->mode, MCDI_MODE_FAIL) == MCDI_MODE_FAIL)
+		return; /* already in fail mode, so skip scheduling the reset again */
+	netif_dbg(efx, hw, efx->net_dev, "MCDI is timing out; trying to recover\n");
+	efx_schedule_reset(efx, RESET_TYPE_MCDI_TIMEOUT);
+}
+
 /* Called from  falcon_process_eventq for MCDI events */
 void efx_mcdi_process_event(struct efx_channel *channel,
 			    efx_qword_t *event)
@@ -1512,6 +1532,19 @@ int efx_mcdi_reset(struct efx_nic *efx, enum reset_type method)
 {
 	int rc;
 
+	/* If MCDI is down, we can't handle_assertion */
+	if (method == RESET_TYPE_MCDI_TIMEOUT) {
+		rc = pci_reset_function(efx->pci_dev);
+		if (rc)
+			return rc;
+		/* Re-enable polled MCDI completion */
+		if (efx->mcdi) {
+			struct efx_mcdi_iface *mcdi = efx_mcdi(efx);
+			mcdi->mode = MCDI_MODE_POLL;
+		}
+		return 0;
+	}
+
 	/* Recover from a failed assertion pre-reset */
 	rc = efx_mcdi_handle_assertion(efx);
 	if (rc)
author	Edward Cree <ecree@solarflare.com>	2014-04-16 19:27:48 +0100
committer	David S. Miller <davem@davemloft.net>	2014-04-16 14:33:57 -0400
commit	e283546c0465dd3026bc94f7b1a9de7f6b8969ec (patch)
tree	3828d4faeed3986b0f01b93416b910b11cd33280 /drivers/net/ethernet/sfc/mcdi.c
parent	10ec34fcb100412ab186c141a9c3557d1270effd (diff)