7 files changed, 244 insertions, 7 deletions
diff --git a/lib/gen7_media.h b/lib/gen7_media.h
index d5f9921a..91294d23 100644
--- a/lib/gen7_media.h
+++ b/lib/gen7_media.h
@@ -179,6 +179,7 @@
 #define GEN7_PIPELINE_SELECT			GFXPIPE(1, 1, 4)
 # define PIPELINE_SELECT_3D			(0 << 0)
 # define PIPELINE_SELECT_MEDIA			(1 << 0)
+# define PIPELINE_SELECT_GPGPU			(2 << 0)
 
 #define GEN7_STATE_BASE_ADDRESS			GFXPIPE(0, 1, 1)
 # define BASE_ADDRESS_MODIFY			(1 << 0)
@@ -187,6 +188,7 @@
 #define GEN7_MEDIA_CURBE_LOAD			GFXPIPE(2, 0, 1)
 #define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD	GFXPIPE(2, 0, 2)
 #define GEN7_MEDIA_OBJECT			GFXPIPE(2, 1, 0)
+#define GEN7_GPGPU_WALKER                       GFXPIPE(2, 1, 5)
 
 struct gen7_interface_descriptor_data
 {
diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
index 4b3a5b87..c70f6d8b 100644
--- a/lib/intel_batchbuffer.c
+++ b/lib/intel_batchbuffer.c
@@ -511,3 +511,22 @@ igt_fillfunc_t igt_get_media_fillfunc(int devid)
 
 	return fill;
 }
+
+/**
+ * igt_get_gpgpu_fillfunc:
+ * @devid: pci device id
+ *
+ * Returns:
+ *
+ * The platform-specific gpgpu fill function pointer for the device specified
+ * with @devid. Will return NULL when no gpgpu fill function is implemented.
+ */
+igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid)
+{
+	igt_fillfunc_t fill = NULL;
+
+	if (IS_GEN7(devid))
+		fill = gen7_gpgpu_fillfunc;
+
+	return fill;
+}
diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
index f0e21ea9..12f7be1c 100644
--- a/lib/intel_batchbuffer.h
+++ b/lib/intel_batchbuffer.h
@@ -250,11 +250,11 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int devid);
  * @color: fill color to use
  *
  * This is the type of the per-platform fill functions using media
- * pipeline. The platform-specific implementation can be obtained
- * by calling igt_get_media_fillfunc().
+ * or gpgpu pipeline. The platform-specific implementation can be obtained
+ * by calling igt_get_media_fillfunc() or igt_get_gpgpu_fillfunc().
  *
  * A fill function will emit a batchbuffer to the kernel which executes
- * the specified blit fill operation using the media engine.
+ * the specified blit fill operation using the media/gpgpu engine.
  */
 typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
 			       struct igt_buf *dst,
@@ -263,5 +263,6 @@ typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
 			       uint8_t color);
 
 igt_fillfunc_t igt_get_media_fillfunc(int devid);
+igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid);
 
 #endif
diff --git a/lib/media_fill.h b/lib/media_fill.h
index 226489cb..2a300550 100644
--- a/lib/media_fill.h
+++ b/lib/media_fill.h
@@ -32,4 +32,11 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch,
                 unsigned width, unsigned height,
                 uint8_t color);
 
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+		    struct igt_buf *dst,
+		    unsigned x, unsigned y,
+		    unsigned width, unsigned height,
+		    uint8_t color);
+
 #endif /* RENDE_MEDIA_FILL_H */
diff --git a/lib/media_fill_gen7.c b/lib/media_fill_gen7.c
index 5a23b7d3..7113fda1 100644
--- a/lib/media_fill_gen7.c
+++ b/lib/media_fill_gen7.c
@@ -8,7 +8,6 @@
 
 #include <assert.h>
 
-
 static const uint32_t media_kernel[][4] = {
 	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
 	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
@@ -23,6 +22,23 @@ static const uint32_t media_kernel[][4] = {
 	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
 };
 
+/* shaders/gpgpu/gpgpu_fill.gxa */
+static const uint32_t gpgpu_kernel[][4] = {
+	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
+	{ 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
+	{ 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
+	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
+	{ 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
+	{ 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
+	{ 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x20e00021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x21200021, 0x00000020, 0x00000000 },
+	{ 0x00800001, 0x21600021, 0x00000020, 0x00000000 },
+	{ 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 },
+	{ 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
+	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
+};
+
 static uint32_t
 batch_used(struct intel_batchbuffer *batch)
 {
@@ -160,14 +176,15 @@ gen7_fill_media_kernel(struct intel_batchbuffer *batch,
 }
 
 static uint32_t
-gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
+gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
+			       const uint32_t kernel[][4], size_t size)
 {
 	struct gen7_interface_descriptor_data *idd;
 	uint32_t offset;
 	uint32_t binding_table_offset, kernel_offset;
 
 	binding_table_offset = gen7_fill_binding_table(batch, dst);
-	kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
+	kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
 
 	idd = batch_alloc(batch, sizeof(*idd), 64);
 	offset = batch_offset(batch, idd);
@@ -329,7 +346,9 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
 	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
 
 	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
-	interface_descriptor = gen7_fill_interface_descriptor(batch, dst);
+	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+							      media_kernel,
+							      sizeof(media_kernel));
 	igt_assert(batch->ptr < &batch->buffer[4095]);
 
 	/* media pipeline */
@@ -353,3 +372,137 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
 	gen7_render_flush(batch, batch_end);
 	intel_batchbuffer_reset(batch);
 }
+
+static void
+gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
+{
+	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
+
+	/* scratch buffer */
+	OUT_BATCH(0);
+
+	/* number of threads & urb entries */
+	OUT_BATCH(1 << 16 | /* max num of threads */
+		  0 << 8 | /* num of URB entry */
+		  1 << 2); /* GPGPU mode */
+
+	OUT_BATCH(0);
+
+	/* urb entry size & curbe size */
+	OUT_BATCH(0 << 16 | 	/* URB entry size in 256 bits unit */
+		  1);		/* CURBE entry size in 256 bits unit */
+
+	/* scoreboard */
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+	OUT_BATCH(0);
+}
+
+static void
+gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
+		     unsigned x, unsigned y,
+		     unsigned width, unsigned height)
+{
+	uint32_t x_dim, y_dim, tmp, right_mask;
+
+	/*
+	 * Simply do SIMD16 based dispatch, so every thread uses
+	 * SIMD16 channels.
+	 *
+	 * Define our own thread group size, e.g 16x1 for every group, then
+	 * will have 1 thread each group in SIMD16 dispatch. So thread
+	 * width/height/depth are all 1.
+	 *
+	 * Then thread group X = width / 16 (aligned to 16)
+	 * thread group Y = height;
+	 */
+	x_dim = (width + 15) / 16;
+	y_dim = height;
+
+	tmp = width & 15;
+	if (tmp == 0)
+		right_mask = (1 << 16) - 1;
+	else
+		right_mask = (1 << tmp) - 1;
+
+	OUT_BATCH(GEN7_GPGPU_WALKER | 9);
+
+	/* interface descriptor offset */
+	OUT_BATCH(0);
+
+	/* SIMD size, thread w/h/d */
+	OUT_BATCH(1 << 30 | /* SIMD16 */
+		  0 << 16 | /* depth:1 */
+		  0 << 8 | /* height:1 */
+		  0); /* width:1 */
+
+	/* thread group X */
+	OUT_BATCH(0);
+	OUT_BATCH(x_dim);
+
+	/* thread group Y */
+	OUT_BATCH(0);
+	OUT_BATCH(y_dim);
+
+	/* thread group Z */
+	OUT_BATCH(0);
+	OUT_BATCH(1);
+
+	/* right mask */
+	OUT_BATCH(right_mask);
+
+	/* bottom mask, height 1, always 0xffffffff */
+	OUT_BATCH(0xffffffff);
+}
+
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+		    struct igt_buf *dst,
+		    unsigned x, unsigned y,
+		    unsigned width, unsigned height,
+		    uint8_t color)
+{
+	uint32_t curbe_buffer, interface_descriptor;
+	uint32_t batch_end;
+
+	intel_batchbuffer_flush(batch);
+
+	/* setup states */
+	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+
+	/*
+	 * const buffer needs to fill for every thread, but as we have just 1 thread
+	 * per every group, so need only one curbe data.
+	 *
+	 * For each thread, just use thread group ID for buffer offset.
+	 */
+	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
+
+	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+							      gpgpu_kernel,
+							      sizeof(gpgpu_kernel));
+	igt_assert(batch->ptr < &batch->buffer[4095]);
+
+	batch->ptr = batch->buffer;
+
+	/* GPGPU pipeline */
+	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
+
+	gen7_emit_state_base_address(batch);
+
+	gen7_emit_vfe_state_gpgpu(batch);
+
+	gen7_emit_curbe_load(batch, curbe_buffer);
+
+	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
+
+	gen7_emit_gpgpu_walk(batch, x, y, width, height);
+
+	OUT_BATCH(MI_BATCH_BUFFER_END);
+
+	batch_end = batch_align(batch, 8);
+	igt_assert(batch_end < BATCH_STATE_SPLIT);
+
+	gen7_render_flush(batch, batch_end);
+	intel_batchbuffer_reset(batch);
+}
diff --git a/shaders/gpgpu/README b/shaders/gpgpu/README
new file mode 100644
index 00000000..3bf328ad
--- /dev/null
+++ b/shaders/gpgpu/README
@@ -0,0 +1,4 @@
+
+Commands used to generate the shader on gen7
+$> m4 gpgpu_fill.gxa > gpgpu_fill.gxm
+$> intel-gen4asm -g 7 -o <output> gpgpu_fill.gxm
diff --git a/shaders/gpgpu/gpgpu_fill.gxa b/shaders/gpgpu/gpgpu_fill.gxa
new file mode 100644
index 00000000..fc309f36
--- /dev/null
+++ b/shaders/gpgpu/gpgpu_fill.gxa
@@ -0,0 +1,51 @@
+/*
+ * Registers
+ * g0 -- header
+ * g1 -- constant
+ * g2 -- calculate X/Y offset
+ * g4-g12 payload for write message
+ */
+define(`ORIG',          `g2.0<2,2,1>UD')
+define(`ORIG_X',        `g2.0<1>UD')
+define(`ORIG_Y',        `g2.4<1>UD')
+define(`COLOR',         `g1.0')
+define(`COLORUB',       `COLOR<0,1,0>UB')
+define(`COLORUD',       `COLOR<0,1,0>UD')
+define(`X',             `g0.4<0,1,0>UD')
+define(`Y',             `g0.24<0,1,0>UD')
+
+mov(4)  COLOR<1>UB      COLORUB         {align1};
+
+/* WRITE */
+/* count thread group ID for X/Y offset */
+mul(1)  ORIG_X          X        0x10UD {align1};
+mov(1)  ORIG_Y          Y               {align1};
+mov(8)  g4.0<1>UD       g0.0<8,8,1>UD   {align1};
+mov(2)  g4.0<1>UD       ORIG            {align1};
+/* Normal mode: for block height 1 row and block width 16 bytes */
+mov(1)  g4.8<1>UD       0x0000000fUD    {align1};
+
+mov(16) g5.0<1>UD       COLORUD         {align1 compr};
+mov(16) g7.0<1>UD       COLORUD         {align1 compr};
+mov(16) g9.0<1>UD       COLORUD         {align1 compr};
+mov(16) g11.0<1>UD      COLORUD         {align1 compr};
+
+/*
+ * comment out the following instruction on Gen7
+ * write(0, 0, 10, 12)
+ *   10: media_block_write
+ *   12: data cache data port 1
+ */
+send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1};
+
+/*
+ * uncomment the following instruction on Gen7
+ * write(0, 0, 10, 0)
+ *   10: media_block_write
+ *    0: reander cache data port
+ */
+/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */
+
+/* EOT */
+mov(8)  g112.0<1>UD       g0.0<8,8,1>UD   {align1};
+send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};