author    | Dave Gordon <david.s.gordon@intel.com>    | 2015-05-19 16:07:07 +0100
committer | John Harrison <John.C.Harrison@Intel.com> | 2016-06-28 17:19:15 +0100
commit    | 8c90128707ae182a663d1a05ebdb8f10afd0592e (patch)
tree      | 08721214e50125fef904edaf341cf31e8f9d2758
parent    | fffda29ee47b353914e1a5ed68b2cde19ba48d82 (diff)
drm/i915/guc: implement submission via REQUEST_PREEMPTION action
If a batch is submitted via the preemptive (KMD_HIGH-priority) client
then, instead of ringing the doorbell, we dispatch it using the GuC
"REQUEST_PREEMPTION" action. We also specify the "clear work queue" and
"clear submit queue" options in that request, so that the scheduler can
reconsider what is to be done next once the preemption has completed.
Note that the preemption request requires a reference to the GuC per-
context shared data, which in early versions of the GuC firmware was at
the end of the context object but nowadays is at the start.
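
For reference, the action dispatched by the new host2guc_preempt() amounts
to the sketch below (condensed from the diff that follows; the field
meanings are taken from the comments in the patch, and the exact layout is
defined by the GuC firmware interface):

	u32 data[7];

	data[0] = HOST2GUC_ACTION_REQUEST_PREEMPTION;
	data[1] = guc->preempt_client->ctx_index;	 /* preemptive client */
	data[2] = HOST2GUC_PREEMPT_OPTION_IMMEDIATE |	 /* submit before return */
		  HOST2GUC_PREEMPT_OPTION_DROP_WORK_Q |	 /* "clear work queue" */
		  HOST2GUC_PREEMPT_OPTION_DROP_SUBMIT_Q; /* "clear submit queue" */
	data[3] = engine_id;				 /* target engine */
	data[4] = guc->execbuf_client->priority;	 /* victim priority */
	data[5] = guc->execbuf_client->ctx_index;	 /* victim ctx/wq */
	data[6] = i915_gem_obj_ggtt_offset(ctx_obj) +	 /* per-context shared data, */
		  LRC_GUCSHR_PN * PAGE_SIZE;		 /* now at the start, per the note above */

	ret = host2guc_action(guc, data, 7);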
For: VIZ-2021
Signed-off-by: Dave Gordon <david.s.gordon@intel.com>
-rw-r--r-- | drivers/gpu/drm/i915/i915_guc_submission.c | 64
-rw-r--r-- | drivers/gpu/drm/i915/intel_guc_fwif.h      |  7
-rw-r--r-- | drivers/gpu/drm/i915/intel_lrc.c           | 91
-rw-r--r-- | drivers/gpu/drm/i915/intel_ringbuffer.h    |  6
4 files changed, 140 insertions(+), 28 deletions(-)
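
In outline, the new decision in i915_guc_submit() reduces to the following
(simplified from the i915_guc_submission.c hunks below; error handling and
the statistics bookkeeping are omitted):

	bool preemptive = client->priority <= GUC_CTX_PRIORITY_HIGH;

	q_ret = guc_add_workqueue_item(client, rq);
	if (q_ret == 0) {
		if (preemptive)
			b_ret = host2guc_preempt(client, rq->ctx, rq->engine);
		else
			b_ret = guc_ring_doorbell(client);
	}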
diff --git a/drivers/gpu/drm/i915/i915_guc_submission.c b/drivers/gpu/drm/i915/i915_guc_submission.c
index 157e55711959..a4c30667c5d1 100644
--- a/drivers/gpu/drm/i915/i915_guc_submission.c
+++ b/drivers/gpu/drm/i915/i915_guc_submission.c
@@ -93,7 +93,8 @@ static int host2guc_action(struct intel_guc *guc, u32 *data, u32 len)
 	for (i = 0; i < len; i++)
 		I915_WRITE(SOFT_SCRATCH(i), data[i]);
 
-	POSTING_READ(SOFT_SCRATCH(i - 1));
+	I915_WRITE(SOFT_SCRATCH(15), 0);
+	POSTING_READ(SOFT_SCRATCH(0));
 
 	I915_WRITE(HOST2GUC_INTERRUPT, HOST2GUC_TRIGGER);
 
@@ -124,6 +125,55 @@ static int host2guc_action(struct intel_guc *guc, u32 *data, u32 len)
 }
 
 /*
+ * Tell the GuC to submit a request pre-emptively
+ */
+static int
+host2guc_preempt(struct i915_guc_client *client,
+		 struct intel_context *ctx,
+		 struct intel_engine_cs *ring)
+{
+	struct drm_i915_private *dev_priv = to_i915(ring->dev);
+	struct intel_guc *guc = &dev_priv->guc;
+	uint32_t engine_id = ring->guc_id;
+	struct drm_i915_gem_object *ctx_obj = ctx->engine[engine_id].state;
+	struct intel_ringbuffer *ringbuf = ctx->engine[engine_id].ringbuf;
+	struct guc_process_desc *desc;
+	void *base;
+	u32 data[7];
+	int ret;
+
+	if (WARN_ON(!ctx_obj || !ringbuf))
+		return -EINVAL;
+
+	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
+	WARN_ON(!i915_gem_obj_is_pinned(ringbuf->obj));
+
+	WARN_ON(guc->preempt_client != client);
+
+	base = kmap_atomic(i915_gem_object_get_page(client->client_obj, 0));
+	desc = base + client->proc_desc_offset;
+
+	/* Update the tail so it is visible to GuC */
+	desc->tail = client->wq_tail;
+	kunmap_atomic(base);
+
+	data[0] = HOST2GUC_ACTION_REQUEST_PREEMPTION;
+	data[1] = guc->preempt_client->ctx_index;	/* preemptive client */
+	data[2] = /* PREEMPT_ENGINE_OPTIONS */
+		  HOST2GUC_PREEMPT_OPTION_IMMEDIATE |	/* submit before return */
+		  HOST2GUC_PREEMPT_OPTION_DROP_WORK_Q |	/* drop wq for client data[5] */
+		  HOST2GUC_PREEMPT_OPTION_DROP_SUBMIT_Q; /* drop submitted (engine, priority) */
+	data[3] = engine_id;				/* target engine */
+	data[4] = guc->execbuf_client->priority;	/* victim priority */
+	data[5] = guc->execbuf_client->ctx_index;	/* victim ctx/wq */
+	data[6] = i915_gem_obj_ggtt_offset(ctx_obj) + LRC_GUCSHR_PN*PAGE_SIZE;
+
+	ret = host2guc_action(guc, data, 7);
+	WARN_ON(ret);
+	return ret;
+}
+
+/*
  * Tell the GuC to allocate or deallocate a specific doorbell
  */
@@ -565,16 +615,21 @@ static int guc_add_workqueue_item(struct i915_guc_client *gc,
 int i915_guc_submit(struct i915_guc_client *client,
 		    struct drm_i915_gem_request *rq)
 {
-	struct intel_guc *guc = client->guc;
+	bool preemptive = client->priority <= GUC_CTX_PRIORITY_HIGH;
 	unsigned int engine_id = rq->engine->guc_id;
+	struct intel_guc *guc = client->guc;
 	int q_ret, b_ret;
 
 	if (WARN_ON(engine_id >= I915_NUM_ENGINES))
 		return -ENXIO;
 
 	q_ret = guc_add_workqueue_item(client, rq);
-	if (q_ret == 0)
-		b_ret = guc_ring_doorbell(client);
+	if (q_ret == 0) {
+		if (preemptive)
+			b_ret = host2guc_preempt(client, rq->ctx, rq->engine);
+		else
+			b_ret = guc_ring_doorbell(client);
+	}
 
 	client->submissions[engine_id] += 1;
@@ -585,6 +640,7 @@ int i915_guc_submit(struct i915_guc_client *client,
 		client->retcode = q_ret = b_ret;
 	} else {
 		client->retcode = 0;
+		rq->elsp_submitted += 1;
 	}
 	guc->submissions[engine_id] += 1;
 	guc->last_seqno[engine_id] = rq->seqno;
diff --git a/drivers/gpu/drm/i915/intel_guc_fwif.h b/drivers/gpu/drm/i915/intel_guc_fwif.h
index 2de57ffe5e18..7032bae1b700 100644
--- a/drivers/gpu/drm/i915/intel_guc_fwif.h
+++ b/drivers/gpu/drm/i915/intel_guc_fwif.h
@@ -420,6 +420,7 @@ struct guc_ads {
 /* This Action will be programmed in C180 - SOFT_SCRATCH_O_REG */
 enum host2guc_action {
 	HOST2GUC_ACTION_DEFAULT = 0x0,
+	HOST2GUC_ACTION_REQUEST_PREEMPTION = 0x2,
 	HOST2GUC_ACTION_SAMPLE_FORCEWAKE = 0x6,
 	HOST2GUC_ACTION_ALLOCATE_DOORBELL = 0x10,
 	HOST2GUC_ACTION_DEALLOCATE_DOORBELL = 0x20,
@@ -429,6 +430,12 @@ enum host2guc_action {
 	HOST2GUC_ACTION_LIMIT
 };
 
+enum action_preempt_options {
+	HOST2GUC_PREEMPT_OPTION_IMMEDIATE = 0x1,
+	HOST2GUC_PREEMPT_OPTION_DROP_WORK_Q = 0x4,
+	HOST2GUC_PREEMPT_OPTION_DROP_SUBMIT_Q = 0x8,
+};
+
 /*
  * The GuC sends its response to a command by overwriting the
  * command in SS0. The response is distinguishable from a command
diff --git a/drivers/gpu/drm/i915/intel_lrc.c b/drivers/gpu/drm/i915/intel_lrc.c
index 5f42e45bc015..873a5507d70d 100644
--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -801,6 +801,9 @@ intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
 	struct intel_ringbuffer *ringbuf = request->ringbuf;
 	struct drm_i915_private *dev_priv = request->i915;
 	struct intel_engine_cs *engine = request->engine;
+	struct i915_guc_client *client = dev_priv->guc.execbuf_client;
+	static const bool fake = false;	/* true => only pretend to preempt */
+	bool preemptive = false;	/* for now */
 
 	intel_logical_ring_advance(ringbuf);
 	request->tail = ringbuf->tail;
@@ -829,8 +832,11 @@ intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
 		}
 	}
 
-	if (dev_priv->guc.execbuf_client)
-		i915_guc_submit(dev_priv->guc.execbuf_client, request);
+	if (preemptive && dev_priv->guc.preempt_client && !fake)
+		client = dev_priv->guc.preempt_client;
+
+	if (client)
+		i915_guc_submit(client, request);
 	else
 		execlists_context_queue(request);
 
@@ -2005,57 +2011,96 @@ static void bxt_a_set_seqno(struct intel_engine_cs *engine, u32 seqno)
  */
 #define WA_TAIL_DWORDS 2
 
-static inline u32 hws_seqno_address(struct intel_engine_cs *engine)
-{
-	return engine->status_page.gfx_addr + I915_GEM_HWS_INDEX_ADDR;
-}
-
 static int gen8_emit_request(struct drm_i915_gem_request *request)
 {
 	struct intel_ringbuffer *ringbuf = request->ringbuf;
+	u32 cmd;
+	u64 addr;
 	int ret;
 
-	ret = intel_logical_ring_begin(request, 6 + WA_TAIL_DWORDS);
+	/*
+	 * Reserve space for the instructions below, plus some NOOPs
+	 * at the end of each request to be used as a workaround for
+	 * not being allowed to do lite restore with HEAD==TAIL
+	 * (WaIdleLiteRestore).
+	 */
+	ret = intel_logical_ring_begin(request, 4 + 2 + WA_TAIL_DWORDS);
 	if (ret)
 		return ret;
 
+	cmd = MI_FLUSH_DW;
+	cmd += 1;				/* Gen8+ uses long addresses */
+	cmd |= MI_FLUSH_DW_OP_STOREDW;		/* Store DWord as post-op */
+	cmd |= MI_FLUSH_DW_STORE_INDEX;		/* Address is relative to HWSP */
+
+	/* Must be QWord aligned even for DWord write */
+	BUILD_BUG_ON((I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT) & (1 << 2));
+
+#if 1
 	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
-	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));
+	/* This is true for a QWord write, but not a DWord */
+	BUILD_BUG_ON((I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT) & (1 << 5));
+#endif
 
-	intel_logical_ring_emit(ringbuf,
-				(MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-	intel_logical_ring_emit(ringbuf,
-				hws_seqno_address(request->engine) |
-				MI_FLUSH_DW_USE_GTT);
-	intel_logical_ring_emit(ringbuf, 0);
+	addr = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+	addr |= MI_FLUSH_DW_USE_GTT;
+	/* addr += ring->status_page.gfx_addr; */
+
+	intel_logical_ring_emit(ringbuf, cmd);
+	intel_logical_ring_emit(ringbuf, lower_32_bits(addr));
+	intel_logical_ring_emit(ringbuf, upper_32_bits(addr));
 	intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request));
+
 	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
 	intel_logical_ring_emit(ringbuf, MI_NOOP);
+
 	return intel_logical_ring_advance_and_submit(request);
 }
 
 static int gen8_emit_request_render(struct drm_i915_gem_request *request)
 {
 	struct intel_ringbuffer *ringbuf = request->ringbuf;
+	u32 cmd, opts;
+	u64 addr;
 	int ret;
 
-	ret = intel_logical_ring_begin(request, 6 + WA_TAIL_DWORDS);
+	/*
+	 * Reserve space for the instructions below, plus some NOOPs
+	 * at the end of each request to be used as a workaround for
+	 * not being allowed to do lite restore with HEAD==TAIL
+	 * (WaIdleLiteRestore).
+	 */
+	ret = intel_logical_ring_begin(request, 6 + 2 + WA_TAIL_DWORDS);
 	if (ret)
 		return ret;
 
+	cmd = GFX_OP_PIPE_CONTROL(6);
+
+	opts = PIPE_CONTROL_GLOBAL_GTT_IVB;	/* Address via GGTT */
+	opts |= PIPE_CONTROL_STORE_DATA_INDEX;	/* Index into HWSP */
+	opts |= PIPE_CONTROL_CS_STALL;		/* Stall CS until done */
+	opts |= PIPE_CONTROL_QW_WRITE;		/* Write QWord */
+
+	/* Must be QWord aligned */
+	BUILD_BUG_ON((I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT) & (1 << 2));
+
+	addr = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+	/* addr += ring->status_page.gfx_addr; */
+
 	/* w/a for post sync ops following a GPGPU operation we
 	 * need a prior CS_STALL, which is emitted by the flush
 	 * following the batch.
 	 */
-	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(5));
-	intel_logical_ring_emit(ringbuf,
-				(PIPE_CONTROL_GLOBAL_GTT_IVB |
-				 PIPE_CONTROL_CS_STALL |
-				 PIPE_CONTROL_QW_WRITE));
-	intel_logical_ring_emit(ringbuf, hws_seqno_address(request->engine));
-	intel_logical_ring_emit(ringbuf, 0);
+	intel_logical_ring_emit(ringbuf, cmd);
+	intel_logical_ring_emit(ringbuf, opts);
+	intel_logical_ring_emit(ringbuf, lower_32_bits(addr));
+	intel_logical_ring_emit(ringbuf, upper_32_bits(addr));
 	intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request));
+	intel_logical_ring_emit(ringbuf, 0);	/* Clear 'in progress' */
+	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
+	intel_logical_ring_emit(ringbuf, MI_NOOP);
+
 	return intel_logical_ring_advance_and_submit(request);
 }
diff --git a/drivers/gpu/drm/i915/intel_ringbuffer.h b/drivers/gpu/drm/i915/intel_ringbuffer.h
index a0fb0144a1a0..0c22a40b2be4 100644
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -457,7 +457,11 @@ intel_write_status_page(struct intel_engine_cs *engine,
  * chosen carefully to meet those requirements.  The list below shows the
  * currently-known alignment requirements:
  *
- *	I915_GEM_SCRATCH_INDEX	    must be EVEN
+ *	I915_GEM_SCRATCH_INDEX	    must be EVEN (QWord aligned)
+ *	I915_GEM_HWS_INDEX	    must be EVEN (QWord aligned), but also bit 3
+ *				    must be ZERO, so that the resulting address
+ *				    has a 0 in bit 5 (see BSpec for limitation
+ *				    on MI_FLUSH_DW instruction).
  */
 
 /*
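
To make the new alignment rule concrete: assuming the values in this era of
the driver (I915_GEM_HWS_INDEX == 0x30 in intel_ringbuffer.h and
MI_STORE_DWORD_INDEX_SHIFT == 2 in i915_reg.h; worth double-checking against
the headers), the HWSP-relative offset emitted above works out as:

	0x30 << 2 == 0xC0 == 0b11000000
		bit 2 == 0  =>  QWord aligned, so the QWord post-sync write is legal
		bit 5 == 0  =>  satisfies the MI_FLUSH_DW address w/a (index bit 3 is zero)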