author     Ramalingam C <ramalingam.c@intel.com>  2022-02-07 14:10:40 +0530
committer  Ramalingam C <ramalingam.c@intel.com>  2022-03-06 21:46:21 +0530
commit     cb42fc01a7684da63293b20443b5f7d5f0c03ddd (patch)
tree       3db217aa140a0c89956bbdf515b8b020ef377d75
parent     3d1482f2d1fee6654a6f7672df5939ad23809ec0 (diff)
drm/i915/migrate: Evict and restore the flatccs capable lmem obj (flat-ccs-ww10.07)
When we are swapping out a local memory object on a flat-ccs capable
platform, we need to capture the CCS data along with the main memory,
and we need to restore it when we swap the content back in.

When an lmem object is swapped into an smem object, the smem object
will have the extra pages required to hold the CCS data corresponding
to the lmem main memory. So the main memory of the lmem object is
copied into the initial pages of the smem object, and then the CCS
data corresponding to that main memory is copied into the subsequent
pages of smem. The CCS data is 1/256th of the lmem size.

Swap-in happens in exactly the reverse order: first the main memory of
the lmem object is restored from the initial pages of smem, and then
the CCS data is restored from the subsequent pages of smem.

Extracting and restoring the CCS data is done through a special
command called XY_CTRL_SURF_COPY_BLT.

v2: Fixing the ccs handling
v3: Splitting the loop into subfunctions [Thomas]
    Avoiding the irrelevant change [Thomas]

Signed-off-by: Ramalingam C <ramalingam.c@intel.com>
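To make the sizing concrete, here is a minimal sketch of the smem
backing-store math described above. The helper name ccs_backing_pages()
is hypothetical and only illustrates the 1/256 ratio (mirroring the
NUM_BYTES_PER_CCS_BYTE define added below), using the kernel's
DIV_ROUND_UP() and PAGE_SIZE:

    /* One byte of CCS covers 256 bytes of lmem main memory. */
    #define NUM_BYTES_PER_CCS_BYTE 256

    /*
     * Hypothetical helper: number of pages the smem object needs to
     * hold the lmem main memory followed by its CCS data on swap-out.
     */
    static unsigned long ccs_backing_pages(unsigned long lmem_sz)
    {
        unsigned long ccs_sz = DIV_ROUND_UP(lmem_sz, NUM_BYTES_PER_CCS_BYTE);

        /* Main memory pages come first, CCS pages are appended. */
        return DIV_ROUND_UP(lmem_sz + ccs_sz, PAGE_SIZE);
    }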
-rw-r--r--  drivers/gpu/drm/i915/gt/intel_gpu_commands.h   15
-rw-r--r--  drivers/gpu/drm/i915/gt/intel_migrate.c       282
2 files changed, 284 insertions, 13 deletions
diff --git a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
index 539db8a76f93..04268773975b 100644
--- a/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
+++ b/drivers/gpu/drm/i915/gt/intel_gpu_commands.h
@@ -203,6 +203,21 @@
#define GFX_OP_DRAWRECT_INFO ((0x3<<29)|(0x1d<<24)|(0x80<<16)|(0x3))
#define GFX_OP_DRAWRECT_INFO_I965 ((0x7900<<16)|0x2)
+#define XY_CTRL_SURF_INSTR_SIZE 5
+#define MI_FLUSH_DW_SIZE 3
+#define XY_CTRL_SURF_COPY_BLT ((2 << 29) | (0x48 << 22) | 3)
+#define SRC_ACCESS_TYPE_SHIFT 21
+#define DST_ACCESS_TYPE_SHIFT 20
+#define CCS_SIZE_MASK GENMASK(17, 8)
+#define XY_CTRL_SURF_MOCS_MASK GENMASK(31, 25)
+#define NUM_CCS_BYTES_PER_BLOCK 256
+#define NUM_BYTES_PER_CCS_BYTE 256
+#define NUM_CCS_BLKS_PER_XFER 1024
+#define INDIRECT_ACCESS 0
+#define DIRECT_ACCESS 1
+#define MI_FLUSH_LLC BIT(9)
+#define MI_FLUSH_CCS BIT(16)
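+
+/*
+ * Illustrative note (not part of the hardware definition): with the
+ * fields above, dword 0 of a maximal swap-out blit (lmem as src, so
+ * its CCS is reached indirectly; smem as dst, accessed directly)
+ * would be assembled as
+ *
+ *	XY_CTRL_SURF_COPY_BLT |
+ *	INDIRECT_ACCESS << SRC_ACCESS_TYPE_SHIFT |
+ *	DIRECT_ACCESS << DST_ACCESS_TYPE_SHIFT |
+ *	FIELD_PREP(CCS_SIZE_MASK, NUM_CCS_BLKS_PER_XFER - 1)
+ *
+ * since the 10-bit size field encodes (number of blocks - 1).
+ */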
+
#define COLOR_BLT_CMD (2 << 29 | 0x40 << 22 | (5 - 2))
#define XY_COLOR_BLT_CMD (2 << 29 | 0x50 << 22)
#define XY_FAST_COLOR_BLT_CMD (2 << 29 | 0x44 << 22)
diff --git a/drivers/gpu/drm/i915/gt/intel_migrate.c b/drivers/gpu/drm/i915/gt/intel_migrate.c
index 05262f1b438e..6555cc92dc48 100644
--- a/drivers/gpu/drm/i915/gt/intel_migrate.c
+++ b/drivers/gpu/drm/i915/gt/intel_migrate.c
@@ -497,9 +497,123 @@ static bool wa_1209644611_applies(int ver, u32 size)
* location.
*/
-static int emit_copy(struct i915_request *rq,
- u32 dst_offset, u32 src_offset, int size)
+static inline u32 *i915_flush_dw(u32 *cmd, u32 flags)
{
+ *cmd++ = MI_FLUSH_DW | flags;
+ *cmd++ = 0;
+ *cmd++ = 0;
+
+ return cmd;
+}
+
+static u32 calc_ctrl_surf_instr_size(struct drm_i915_private *i915, int size)
+{
+ u32 num_cmds, num_blks, total_size;
+
+ if (!GET_CCS_BYTES(i915, size))
+ return 0;
+
+ /*
+ * XY_CTRL_SURF_COPY_BLT transfers CCS in 256 byte
+ * blocks. One XY_CTRL_SURF_COPY_BLT command can
+ * transfer up to 1024 blocks.
+ */
+ num_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
+ NUM_CCS_BYTES_PER_BLOCK);
+ num_cmds = DIV_ROUND_UP(num_blks, NUM_CCS_BLKS_PER_XFER);
+ total_size = XY_CTRL_SURF_INSTR_SIZE * num_cmds;
+
+ /*
+ * Adding a flush before and after XY_CTRL_SURF_COPY_BLT
+ */
+ total_size += 2 * MI_FLUSH_DW_SIZE;
+
+ return total_size;
+}
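+
+/*
+ * Worked example (illustrative; assumes CHUNK_SZ is SZ_8M and that
+ * GET_CCS_BYTES(i915, size) evaluates to size / NUM_BYTES_PER_CCS_BYTE
+ * on flat-ccs platforms): for an 8M chunk the CCS is 8M / 256 = 32K,
+ * i.e. 32K / 256 = 128 blocks, which fits in a single
+ * XY_CTRL_SURF_COPY_BLT. The total is then
+ * 1 * XY_CTRL_SURF_INSTR_SIZE + 2 * MI_FLUSH_DW_SIZE = 5 + 6 = 11 dwords.
+ */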
+
+static u32 *_i915_ctrl_surf_copy_blt(u32 *cmd, u64 src_addr, u64 dst_addr,
+ u8 src_mem_access, u8 dst_mem_access,
+ int src_mocs, int dst_mocs,
+ u32 ccs_blocks)
+{
+ /*
+ * The XY_CTRL_SURF_COPY_BLT instruction is used to copy the CCS
+ * data in and out of the CCS region.
+ *
+ * We can copy at most 1024 blocks of 256 bytes using one
+ * XY_CTRL_SURF_COPY_BLT instruction.
+ *
+ * In case we need to copy more than 1024 blocks, we need to add
+ * another instruction to the same batch buffer.
+ *
+ * 1024 blocks of 256 bytes of CCS represent a total of 256KB of CCS.
+ *
+ * 256 KB of CCS represents 256 * 256 KB = 64 MB of LMEM.
+ */
+ do {
+ int blks_per_copy;
+
+ blks_per_copy = ccs_blocks >= NUM_CCS_BLKS_PER_XFER ?
+ NUM_CCS_BLKS_PER_XFER : ccs_blocks;
+ *cmd++ = XY_CTRL_SURF_COPY_BLT |
+ src_mem_access << SRC_ACCESS_TYPE_SHIFT |
+ dst_mem_access << DST_ACCESS_TYPE_SHIFT |
+ FIELD_PREP(CCS_SIZE_MASK, blks_per_copy - 1);
+ *cmd++ = lower_32_bits(src_addr);
+ *cmd++ = (upper_32_bits(src_addr) & 0xFFFF) |
+ FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, src_mocs);
+ *cmd++ = lower_32_bits(dst_addr);
+ *cmd++ = (upper_32_bits(dst_addr) & 0xFFFF) |
+ FIELD_PREP(XY_CTRL_SURF_MOCS_MASK, dst_mocs);
+ src_addr += SZ_64M;
+ dst_addr += SZ_64M;
+ ccs_blocks -= blks_per_copy;
+ } while (ccs_blocks > 0);
+
+ return cmd;
+}
+
+static int emit_ccs_copy(struct i915_request *rq,
+ bool dst_is_lmem, u32 dst_offset,
+ bool src_is_lmem, u32 src_offset, int size)
+{
+ struct drm_i915_private *i915 = rq->engine->i915;
+ int mocs = rq->engine->gt->mocs.uc_index << 1;
+ u32 num_ccs_blks, ccs_ring_size;
+ u8 src_access, dst_access;
+ u32 *cs;
+
+ GEM_BUG_ON(!(src_is_lmem ^ dst_is_lmem) || !HAS_FLAT_CCS(i915));
+
+ ccs_ring_size = calc_ctrl_surf_instr_size(i915, size);
+ WARN_ON(!ccs_ring_size);
+
+ cs = intel_ring_begin(rq, round_up(ccs_ring_size, 2));
+ if (IS_ERR(cs))
+ return PTR_ERR(cs);
+
+ num_ccs_blks = DIV_ROUND_UP(GET_CCS_BYTES(i915, size),
+ NUM_CCS_BYTES_PER_BLOCK);
+
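+ /*
+ * Only the smem side of the copy is accessed directly, since the
+ * raw CCS bytes are staged there as ordinary memory; the lmem side
+ * uses indirect access, reaching its CCS through the main surface.
+ */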
+ src_access = !src_is_lmem && dst_is_lmem;
+ dst_access = !src_access;
+
+ cs = i915_flush_dw(cs, MI_FLUSH_LLC | MI_FLUSH_CCS);
+ cs = _i915_ctrl_surf_copy_blt(cs, src_offset, dst_offset,
+ src_access, dst_access,
+ mocs, mocs, num_ccs_blks);
+ cs = i915_flush_dw(cs, MI_FLUSH_LLC | MI_FLUSH_CCS);
+ if (ccs_ring_size & 1)
+ *cs++ = MI_NOOP;
+
+ intel_ring_advance(rq, cs);
+
+ return 0;
+}
+
+static int emit_main_copy(struct i915_request *rq,
+ bool dst_is_lmem, u32 dst_offset,
+ bool src_is_lmem, u32 src_offset, int size)
+{
const int ver = GRAPHICS_VER(rq->engine->i915);
u32 instance = rq->engine->instance;
u32 *cs;
@@ -544,6 +658,91 @@ static int emit_copy(struct i915_request *rq,
return 0;
}
+static int scatter_list_length(struct scatterlist *sg)
+{
+ int len = 0;
+
+ while (sg && sg_dma_len(sg)) {
+ len += sg_dma_len(sg);
+ sg = sg_next(sg);
+ }
+
+ return len;
+}
+
+static void
+calculate_src_dst_sz(struct drm_i915_private *i915, bool src_is_lmem,
+ u32 *src_sz, u32 *dst_sz, u32 bytes_to_cpy,
+ bool ccs_copy, u32 ccs_bytes)
+{
+ u32 ccs_sz, smem_sz;
+
+ if (ccs_copy) {
+ /*
+ * We can only copy the CCS data corresponding to
+ * a CHUNK_SZ of lmem at a time, which is
+ * GET_CCS_BYTES(i915, CHUNK_SZ)
+ */
+ ccs_sz = min_t(int, bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ));
+
+ /* Flat-CCS: CCS data copy */
+ if (!src_is_lmem) { /* src is smem */
+ *src_sz = ccs_sz;
+ *dst_sz = CHUNK_SZ;
+ } else {
+ *src_sz = CHUNK_SZ;
+ *dst_sz = ccs_sz;
+ }
+ } else if (ccs_bytes) {
+ /*
+ * When CHUNK_SZ is passed, all the pages up to CHUNK_SZ
+ * will be taken for the blt. On flat-ccs capable platforms
+ * the smem obj will have more pages than required for the
+ * main memory, hence limit it to the required size for the
+ * main memory.
+ */
+ smem_sz = min_t(int, bytes_to_cpy, CHUNK_SZ);
+ /* Flat-CCS: Main memory copy */
+ if (!src_is_lmem) {
+ *src_sz = smem_sz;
+ *dst_sz = CHUNK_SZ;
+ } else {
+ *dst_sz = smem_sz;
+ *src_sz = CHUNK_SZ;
+ }
+ } else { /* ccs handling is not required */
+ *src_sz = CHUNK_SZ;
+ }
+}
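+
+/*
+ * Worked example (illustrative; assumes CHUNK_SZ is SZ_8M): swapping
+ * out a 10M lmem obj to smem. The main copy passes run with ccs_bytes
+ * set but !ccs_copy: pass 1 gets src_sz = dst_sz = CHUNK_SZ, pass 2
+ * gets src_sz = CHUNK_SZ and dst_sz = min(2M, CHUNK_SZ) = 2M. The
+ * subsequent ccs_copy pass gets src_sz = CHUNK_SZ (the lmem window)
+ * and dst_sz = min(bytes_to_cpy, GET_CCS_BYTES(i915, CHUNK_SZ)).
+ */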
+
+static int emit_copy(struct i915_request *rq, bool dst_is_lmem, u32 dst_offset,
+ bool src_is_lmem, u32 src_offset, u32 src_sz, u32 dst_sz,
+ int *size, bool ccs_copy)
+{
+ int err;
+
+ if (ccs_copy) {
+ /*
+ * Use the max of src_sz and dst_sz, as we need to pass
+ * the lmem size corresponding to the CCS blocks we need
+ * to handle.
+ */
+ *size = max_t(int, src_sz, dst_sz);
+ err = emit_ccs_copy(rq, dst_is_lmem, dst_offset,
+ src_is_lmem, src_offset,
+ *size);
+
+ /* Converting back to ccs bytes */
+ *size = GET_CCS_BYTES(rq->engine->i915, *size);
+ } else {
+ WARN(src_sz != dst_sz, "%u != %u", src_sz, dst_sz);
+ *size = src_sz;
+ err = emit_main_copy(rq, dst_is_lmem, dst_offset,
+ src_is_lmem, src_offset, *size);
+ }
+
+ return err;
+}
+
int
intel_context_migrate_copy(struct intel_context *ce,
const struct i915_deps *deps,
@@ -556,7 +755,10 @@ intel_context_migrate_copy(struct intel_context *ce,
struct i915_request **out)
{
struct sgt_dma it_src = sg_sgt(src), it_dst = sg_sgt(dst);
+ struct drm_i915_private *i915 = ce->engine->i915;
+ u32 src_sz, dst_sz, ccs_bytes = 0, bytes_to_cpy;
struct i915_request *rq;
+ bool ccs_copy = false;
int err;
GEM_BUG_ON(ce->vm != ce->engine->gt->migrate.context->vm);
@@ -564,9 +766,28 @@ intel_context_migrate_copy(struct intel_context *ce,
GEM_BUG_ON(ce->ring->size < SZ_64K);
+ if (HAS_FLAT_CCS(i915) && src_is_lmem ^ dst_is_lmem) {
+ src_sz = scatter_list_length(src);
+ dst_sz = scatter_list_length(dst);
+
+ if (src_is_lmem)
+ bytes_to_cpy = src_sz;
+ else if (dst_is_lmem)
+ bytes_to_cpy = dst_sz;
+
+ /*
+ * When an eviction of the CCS data is needed, the smem
+ * object will have the extra pages for the ccs data.
+ *
+ * TO-DO: We want to move the size mismatch check to a
+ * WARN_ON, but we still see some smem->lmem requests of
+ * the same size. Need to fix it.
+ */
+ ccs_bytes = src_sz != dst_sz ? GET_CCS_BYTES(i915, bytes_to_cpy) : 0;
+ }
+
do {
- u32 src_offset, dst_offset;
- int len;
+ u32 src_offset, dst_offset, copy_sz;
rq = i915_request_create(ce);
if (IS_ERR(rq)) {
@@ -606,27 +827,38 @@ intel_context_migrate_copy(struct intel_context *ce,
dst_offset = 2 * CHUNK_SZ;
}
- len = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
- src_offset, CHUNK_SZ);
- if (len <= 0) {
- err = len;
+ calculate_src_dst_sz(i915, src_is_lmem, &src_sz, &dst_sz,
+ bytes_to_cpy, ccs_copy, ccs_bytes);
+
+ err = emit_pte(rq, &it_src, src_cache_level, src_is_lmem,
+ src_offset, src_sz);
+ if (err <= 0) {
goto out_rq;
}
+ src_sz = err;
+
+ if (!ccs_bytes)
+ dst_sz = src_sz;
+
err = emit_pte(rq, &it_dst, dst_cache_level, dst_is_lmem,
- dst_offset, len);
+ dst_offset, dst_sz);
if (err < 0)
goto out_rq;
- if (err < len) {
+ if (err < dst_sz && !ccs_bytes) {
err = -EINVAL;
goto out_rq;
}
+ dst_sz = err;
+
err = rq->engine->emit_flush(rq, EMIT_INVALIDATE);
if (err)
goto out_rq;
- err = emit_copy(rq, dst_offset, src_offset, len);
+ err = emit_copy(rq, dst_is_lmem, dst_offset, src_is_lmem,
+ src_offset, src_sz, dst_sz, &copy_sz, ccs_copy);
+ if (!err && ccs_bytes)
+ bytes_to_cpy -= copy_sz;
/* Arbitration is re-enabled between requests. */
out_rq:
@@ -634,9 +866,33 @@ out_rq:
i915_request_put(*out);
*out = i915_request_get(rq);
i915_request_add(rq);
- if (err || !it_src.sg || !sg_dma_len(it_src.sg))
- break;
+ if (err || !it_src.sg || !sg_dma_len(it_src.sg) ||
+ !it_dst.sg || !sg_dma_len(it_dst.sg)) {
+ if (err || !ccs_bytes)
+ break;
+
+ GEM_BUG_ON(bytes_to_cpy);
+ if (ccs_copy) {
+ break;
+ } else if (ccs_bytes) {
+ if (src_is_lmem) {
+ WARN_ON(it_src.sg && sg_dma_len(it_src.sg));
+ it_src = sg_sgt(src);
+ } else {
+ WARN_ON(it_dst.sg && sg_dma_len(it_dst.sg));
+ it_dst = sg_sgt(dst);
+ }
+ bytes_to_cpy = ccs_bytes;
+ ccs_copy = true;
+
+ continue;
+ } else {
+ DRM_ERROR("Invalid state\n");
+ err = -EINVAL;
+ break;
+ }
+ }
cond_resched();
} while (1);