summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--lib/gen7_media.h2
-rw-r--r--lib/intel_batchbuffer.c19
-rw-r--r--lib/intel_batchbuffer.h7
-rw-r--r--lib/media_fill.h7
-rw-r--r--lib/media_fill_gen7.c161
-rw-r--r--shaders/gpgpu/README4
-rw-r--r--shaders/gpgpu/gpgpu_fill.gxa51
7 files changed, 244 insertions, 7 deletions
diff --git a/lib/gen7_media.h b/lib/gen7_media.h
index d5f9921a..91294d23 100644
--- a/lib/gen7_media.h
+++ b/lib/gen7_media.h
@@ -179,6 +179,7 @@
#define GEN7_PIPELINE_SELECT GFXPIPE(1, 1, 4)
# define PIPELINE_SELECT_3D (0 << 0)
# define PIPELINE_SELECT_MEDIA (1 << 0)
+# define PIPELINE_SELECT_GPGPU (2 << 0)
#define GEN7_STATE_BASE_ADDRESS GFXPIPE(0, 1, 1)
# define BASE_ADDRESS_MODIFY (1 << 0)
@@ -187,6 +188,7 @@
#define GEN7_MEDIA_CURBE_LOAD GFXPIPE(2, 0, 1)
#define GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD GFXPIPE(2, 0, 2)
#define GEN7_MEDIA_OBJECT GFXPIPE(2, 1, 0)
+#define GEN7_GPGPU_WALKER GFXPIPE(2, 1, 5)
struct gen7_interface_descriptor_data
{
diff --git a/lib/intel_batchbuffer.c b/lib/intel_batchbuffer.c
index 4b3a5b87..c70f6d8b 100644
--- a/lib/intel_batchbuffer.c
+++ b/lib/intel_batchbuffer.c
@@ -511,3 +511,22 @@ igt_fillfunc_t igt_get_media_fillfunc(int devid)
return fill;
}
+
+/**
+ * igt_get_gpgpu_fillfunc:
+ * @devid: pci device id
+ *
+ * Returns:
+ *
+ * The platform-specific gpgpu fill function pointer for the device specified
+ * with @devid. Will return NULL when no gpgpu fill function is implemented.
+ */
+igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid)
+{
+ igt_fillfunc_t fill = NULL;
+
+ if (IS_GEN7(devid))
+ fill = gen7_gpgpu_fillfunc;
+
+ return fill;
+}
diff --git a/lib/intel_batchbuffer.h b/lib/intel_batchbuffer.h
index f0e21ea9..12f7be1c 100644
--- a/lib/intel_batchbuffer.h
+++ b/lib/intel_batchbuffer.h
@@ -250,11 +250,11 @@ igt_render_copyfunc_t igt_get_render_copyfunc(int devid);
* @color: fill color to use
*
* This is the type of the per-platform fill functions using media
- * pipeline. The platform-specific implementation can be obtained
- * by calling igt_get_media_fillfunc().
+ * or gpgpu pipeline. The platform-specific implementation can be obtained
+ * by calling igt_get_media_fillfunc() or igt_get_gpgpu_fillfunc().
*
* A fill function will emit a batchbuffer to the kernel which executes
- * the specified blit fill operation using the media engine.
+ * the specified blit fill operation using the media/gpgpu engine.
*/
typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
struct igt_buf *dst,
@@ -263,5 +263,6 @@ typedef void (*igt_fillfunc_t)(struct intel_batchbuffer *batch,
uint8_t color);
igt_fillfunc_t igt_get_media_fillfunc(int devid);
+igt_fillfunc_t igt_get_gpgpu_fillfunc(int devid);
#endif
diff --git a/lib/media_fill.h b/lib/media_fill.h
index 226489cb..2a300550 100644
--- a/lib/media_fill.h
+++ b/lib/media_fill.h
@@ -32,4 +32,11 @@ gen9_media_fillfunc(struct intel_batchbuffer *batch,
unsigned width, unsigned height,
uint8_t color);
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+ struct igt_buf *dst,
+ unsigned x, unsigned y,
+ unsigned width, unsigned height,
+ uint8_t color);
+
#endif /* RENDE_MEDIA_FILL_H */
diff --git a/lib/media_fill_gen7.c b/lib/media_fill_gen7.c
index 5a23b7d3..7113fda1 100644
--- a/lib/media_fill_gen7.c
+++ b/lib/media_fill_gen7.c
@@ -8,7 +8,6 @@
#include <assert.h>
-
static const uint32_t media_kernel[][4] = {
{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
@@ -23,6 +22,23 @@ static const uint32_t media_kernel[][4] = {
{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
};
+/* shaders/gpgpu/gpgpu_fill.gxa */
+static const uint32_t gpgpu_kernel[][4] = {
+ { 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
+ { 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
+ { 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
+ { 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
+ { 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
+ { 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
+ { 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
+ { 0x00800001, 0x20e00021, 0x00000020, 0x00000000 },
+ { 0x00800001, 0x21200021, 0x00000020, 0x00000000 },
+ { 0x00800001, 0x21600021, 0x00000020, 0x00000000 },
+ { 0x05800031, 0x24001ca8, 0x00000080, 0x120a8000 },
+ { 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
+ { 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
+};
+
static uint32_t
batch_used(struct intel_batchbuffer *batch)
{
@@ -160,14 +176,15 @@ gen7_fill_media_kernel(struct intel_batchbuffer *batch,
}
static uint32_t
-gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst)
+gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
+ const uint32_t kernel[][4], size_t size)
{
struct gen7_interface_descriptor_data *idd;
uint32_t offset;
uint32_t binding_table_offset, kernel_offset;
binding_table_offset = gen7_fill_binding_table(batch, dst);
- kernel_offset = gen7_fill_media_kernel(batch, media_kernel, sizeof(media_kernel));
+ kernel_offset = gen7_fill_media_kernel(batch, kernel, size);
idd = batch_alloc(batch, sizeof(*idd), 64);
offset = batch_offset(batch, idd);
@@ -329,7 +346,9 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
- interface_descriptor = gen7_fill_interface_descriptor(batch, dst);
+ interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+ media_kernel,
+ sizeof(media_kernel));
igt_assert(batch->ptr < &batch->buffer[4095]);
/* media pipeline */
@@ -353,3 +372,137 @@ gen7_media_fillfunc(struct intel_batchbuffer *batch,
gen7_render_flush(batch, batch_end);
intel_batchbuffer_reset(batch);
}
+
+static void
+gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
+{
+ OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));
+
+ /* scratch buffer */
+ OUT_BATCH(0);
+
+ /* number of threads & urb entries */
+ OUT_BATCH(1 << 16 | /* max num of threads */
+ 0 << 8 | /* num of URB entry */
+ 1 << 2); /* GPGPU mode */
+
+ OUT_BATCH(0);
+
+ /* urb entry size & curbe size */
+ OUT_BATCH(0 << 16 | /* URB entry size in 256 bits unit */
+ 1); /* CURBE entry size in 256 bits unit */
+
+ /* scoreboard */
+ OUT_BATCH(0);
+ OUT_BATCH(0);
+ OUT_BATCH(0);
+}
+
+static void
+gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
+ unsigned x, unsigned y,
+ unsigned width, unsigned height)
+{
+ uint32_t x_dim, y_dim, tmp, right_mask;
+
+ /*
+ * Simply do SIMD16 based dispatch, so every thread uses
+ * SIMD16 channels.
+ *
+ * Define our own thread group size, e.g 16x1 for every group, then
+ * will have 1 thread each group in SIMD16 dispatch. So thread
+ * width/height/depth are all 1.
+ *
+ * Then thread group X = width / 16 (aligned to 16)
+ * thread group Y = height;
+ */
+ x_dim = (width + 15) / 16;
+ y_dim = height;
+
+ tmp = width & 15;
+ if (tmp == 0)
+ right_mask = (1 << 16) - 1;
+ else
+ right_mask = (1 << tmp) - 1;
+
+ OUT_BATCH(GEN7_GPGPU_WALKER | 9);
+
+ /* interface descriptor offset */
+ OUT_BATCH(0);
+
+ /* SIMD size, thread w/h/d */
+ OUT_BATCH(1 << 30 | /* SIMD16 */
+ 0 << 16 | /* depth:1 */
+ 0 << 8 | /* height:1 */
+ 0); /* width:1 */
+
+ /* thread group X */
+ OUT_BATCH(0);
+ OUT_BATCH(x_dim);
+
+ /* thread group Y */
+ OUT_BATCH(0);
+ OUT_BATCH(y_dim);
+
+ /* thread group Z */
+ OUT_BATCH(0);
+ OUT_BATCH(1);
+
+ /* right mask */
+ OUT_BATCH(right_mask);
+
+ /* bottom mask, height 1, always 0xffffffff */
+ OUT_BATCH(0xffffffff);
+}
+
+void
+gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
+ struct igt_buf *dst,
+ unsigned x, unsigned y,
+ unsigned width, unsigned height,
+ uint8_t color)
+{
+ uint32_t curbe_buffer, interface_descriptor;
+ uint32_t batch_end;
+
+ intel_batchbuffer_flush(batch);
+
+ /* setup states */
+ batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];
+
+ /*
+ * const buffer needs to fill for every thread, but as we have just 1 thread
+ * per every group, so need only one curbe data.
+ *
+ * For each thread, just use thread group ID for buffer offset.
+ */
+ curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);
+
+ interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
+ gpgpu_kernel,
+ sizeof(gpgpu_kernel));
+ igt_assert(batch->ptr < &batch->buffer[4095]);
+
+ batch->ptr = batch->buffer;
+
+ /* GPGPU pipeline */
+ OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);
+
+ gen7_emit_state_base_address(batch);
+
+ gen7_emit_vfe_state_gpgpu(batch);
+
+ gen7_emit_curbe_load(batch, curbe_buffer);
+
+ gen7_emit_interface_descriptor_load(batch, interface_descriptor);
+
+ gen7_emit_gpgpu_walk(batch, x, y, width, height);
+
+ OUT_BATCH(MI_BATCH_BUFFER_END);
+
+ batch_end = batch_align(batch, 8);
+ igt_assert(batch_end < BATCH_STATE_SPLIT);
+
+ gen7_render_flush(batch, batch_end);
+ intel_batchbuffer_reset(batch);
+}
diff --git a/shaders/gpgpu/README b/shaders/gpgpu/README
new file mode 100644
index 00000000..3bf328ad
--- /dev/null
+++ b/shaders/gpgpu/README
@@ -0,0 +1,4 @@
+
+Commands used to generate the shader on gen7
+$> m4 gpgpu_fill.gxa > gpgpu_fill.gxm
+$> intel-gen4asm -g 7 -o <output> gpgpu_fill.gxm
diff --git a/shaders/gpgpu/gpgpu_fill.gxa b/shaders/gpgpu/gpgpu_fill.gxa
new file mode 100644
index 00000000..fc309f36
--- /dev/null
+++ b/shaders/gpgpu/gpgpu_fill.gxa
@@ -0,0 +1,51 @@
+/*
+ * Registers
+ * g0 -- header
+ * g1 -- constant
+ * g2 -- calculate X/Y offset
+ * g4-g12 payload for write message
+ */
+define(`ORIG', `g2.0<2,2,1>UD')
+define(`ORIG_X', `g2.0<1>UD')
+define(`ORIG_Y', `g2.4<1>UD')
+define(`COLOR', `g1.0')
+define(`COLORUB', `COLOR<0,1,0>UB')
+define(`COLORUD', `COLOR<0,1,0>UD')
+define(`X', `g0.4<0,1,0>UD')
+define(`Y', `g0.24<0,1,0>UD')
+
+mov(4) COLOR<1>UB COLORUB {align1};
+
+/* WRITE */
+/* count thread group ID for X/Y offset */
+mul(1) ORIG_X X 0x10UD {align1};
+mov(1) ORIG_Y Y {align1};
+mov(8) g4.0<1>UD g0.0<8,8,1>UD {align1};
+mov(2) g4.0<1>UD ORIG {align1};
+/* Normal mode: for block height 1 row and block width 16 bytes */
+mov(1) g4.8<1>UD 0x0000000fUD {align1};
+
+mov(16) g5.0<1>UD COLORUD {align1 compr};
+mov(16) g7.0<1>UD COLORUD {align1 compr};
+mov(16) g9.0<1>UD COLORUD {align1 compr};
+mov(16) g11.0<1>UD COLORUD {align1 compr};
+
+/*
+ * comment out the following instruction on Gen7
+ * write(0, 0, 10, 12)
+ * 10: media_block_write
+ * 12: data cache data port 1
+ */
+send(16) 4 acc0<1>UW null write(0, 0, 10, 12) mlen 9 rlen 0 {align1};
+
+/*
+ * uncomment the following instruction on Gen7
+ * write(0, 0, 10, 0)
+ * 10: media_block_write
+ * 0: reander cache data port
+ */
+/* send(16) 4 acc0<1>UW null write(0, 0, 10, 0) mlen 9 rlen 0 {align1}; */
+
+/* EOT */
+mov(8) g112.0<1>UD g0.0<8,8,1>UD {align1};
+send(16) 112 null<1>UW null thread_spawner(0, 0, 1) mlen 1 rlen 0 {align1 EOT};