summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Matthias Hopf <mhopf@suse.de> 2009-01-29 17:06:42 +0100
committer Matthias Hopf <mhopf@suse.de> 2009-01-29 17:06:42 +0100
commit 06d939c9c3f12434cbe5c601ac3327a6257a91a5 (patch)
tree 50408995303c5d89e40c636cc494b094615fac5c
parent 26ec71b288f20c94c1c1c09682270aff57c1f4d3 (diff)
Add ALU performance test.
-rw-r--r-- r600_demo.c 2
-rw-r--r-- r600_lib.h 2
-rw-r--r-- r600_perf.c 417
3 files changed, 413 insertions, 8 deletions
diff --git a/r600_demo.c b/r600_demo.c
index c3577b1..df8eddd 100644
--- a/r600_demo.c
+++ b/r600_demo.c
@@ -693,7 +693,7 @@ int main(int argc, char *argv[])
test_copy (&adapter);
break;
case 'P':
- test_tex_quad_perf (&adapter);
+ test_perf (&adapter);
break;
default:
fprintf (stderr, "***** Don't know '%c' test\n\n", argv[optind][i]);
diff --git a/r600_lib.h b/r600_lib.h
index c28f406..22bdff1 100644
--- a/r600_lib.h
+++ b/r600_lib.h
@@ -141,7 +141,7 @@ void tmp_test (adapter_t *);
void test_solid(adapter_t *adapt);
void test_copy(adapter_t *adapt);
/* r600_perf.c : */
-void test_tex_quad_perf (adapter_t *);
+void test_perf (adapter_t *);
#endif
diff --git a/r600_perf.c b/r600_perf.c
index 745fe2a..ddecbf4 100644
--- a/r600_perf.c
+++ b/r600_perf.c
@@ -38,6 +38,8 @@
#define MAX_NUM_QUADS 32768
+#define MAX_NUM_ALUS_PER_CLAUSE (128/4) // Max component insts per ALU clause: 128
+#define NUM_ALU_CLAUSES 8 // Number of ALU clauses
#define RENDER_QUAD_WIDTH 480 // Not 1:1 by intention
#define RENDER_QUAD_HEIGHT 600
@@ -248,12 +250,6 @@ void test_tex_quad_perf(adapter_t *adapt)
/* Scissor / viewport */
ereg (PA_CL_VTE_CNTL, VTX_XY_FMT_bit);
- /* Not necessary due to PA_CL_VTE_CNTL */
-// pack0 (PA_CL_VPORT_XSCALE_0, 4);
-// efloat (1.0);
-// efloat (0.0);
-// efloat (1.0);
-// efloat (0.0);
ereg (PA_CL_CLIP_CNTL, CLIP_DISABLE_bit);
@@ -395,3 +391,412 @@ void test_tex_quad_perf(adapter_t *adapt)
(float) render_num * (RENDER_QUAD_WIDTH * RENDER_QUAD_HEIGHT / 1e6) / render_time);
}
+/*
+ * Test ALU performance
+ *
+ * Renders a stack of identical quads with a pixel shader that consists of
+ * NUM_ALU_CLAUSES ALU clauses followed by a single pixel export.  All clause
+ * headers point at the same block of MUL instructions; only the instruction
+ * count in the clause headers is changed between passes (1, 2, 4, ...
+ * MAX_NUM_ALUS_PER_CLAUSE vector instructions per clause).
+ *
+ * For each instruction count the number of quads is doubled until one timed
+ * flush takes at least one second (or the quad limit is reached), then the
+ * resulting fill rate and arithmetic throughput are printed.
+ *
+ * adapt: adapter to run the test on.
+ */
+
+void test_alu_quad_perf(adapter_t *adapt)
+{
+    /* Vertex shader: two vertex fetches, then position and parameter export */
+    static uint32_t vs[] = {
+        // CF INST 0
+        CF_DWORD0(ADDR(4)),
+        CF_DWORD1(POP_COUNT(0),
+                  CF_CONST(0),
+                  COND(SQ_CF_COND_ACTIVE),
+                  COUNT(2),
+                  CALL_COUNT(0),
+                  END_OF_PROGRAM(0),
+                  VALID_PIXEL_MODE(0),
+                  CF_INST(SQ_CF_INST_VTX),
+                  WHOLE_QUAD_MODE(0),
+                  BARRIER(1)),
+        // CF INST 1
+        CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0),
+                                TYPE(SQ_EXPORT_POS),
+                                RW_GPR(1),
+                                RW_REL(ABSOLUTE),
+                                INDEX_GPR(0),
+                                ELEM_SIZE(0)),
+        CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+                                     SRC_SEL_Y(SQ_SEL_Y),
+                                     SRC_SEL_Z(SQ_SEL_Z),
+                                     SRC_SEL_W(SQ_SEL_W),
+                                     R6xx_ELEM_LOOP(0),
+                                     BURST_COUNT(0),
+                                     END_OF_PROGRAM(0),
+                                     VALID_PIXEL_MODE(0),
+                                     CF_INST(SQ_CF_INST_EXPORT_DONE),
+                                     WHOLE_QUAD_MODE(0),
+                                     BARRIER(1)),
+        // CF INST 2
+        CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0),
+                                TYPE(SQ_EXPORT_PARAM),
+                                RW_GPR(0),
+                                RW_REL(ABSOLUTE),
+                                INDEX_GPR(0),
+                                ELEM_SIZE(0)),
+        CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+                                     SRC_SEL_Y(SQ_SEL_Y),
+                                     SRC_SEL_Z(SQ_SEL_Z),
+                                     SRC_SEL_W(SQ_SEL_W),
+                                     R6xx_ELEM_LOOP(0),
+                                     BURST_COUNT(0),
+                                     END_OF_PROGRAM(1),
+                                     VALID_PIXEL_MODE(0),
+                                     CF_INST(SQ_CF_INST_EXPORT_DONE),
+                                     WHOLE_QUAD_MODE(0),
+                                     BARRIER(0)),
+        // padding vtx/tex inst are 128 bit aligned
+        0x00000000,
+        0x00000000,
+        // VTX INST 0
+        VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+                   FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+                   FETCH_WHOLE_QUAD(0),
+                   BUFFER_ID(0),
+                   SRC_GPR(0),
+                   SRC_REL(ABSOLUTE),
+                   SRC_SEL_X(SQ_SEL_X),
+                   MEGA_FETCH_COUNT(12)),
+        VTX_DWORD1_GPR(DST_GPR(1), DST_REL(0),
+                       DST_SEL_X(SQ_SEL_X),
+                       DST_SEL_Y(SQ_SEL_Y),
+                       DST_SEL_Z(SQ_SEL_Z),
+                       DST_SEL_W(SQ_SEL_1),
+                       USE_CONST_FIELDS(0),
+                       DATA_FORMAT(FMT_32_32_FLOAT),
+                       NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM),
+                       FORMAT_COMP_ALL(SQ_FORMAT_COMP_UNSIGNED),
+                       SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE)),
+        VTX_DWORD2(OFFSET(0),
+                   ENDIAN_SWAP(ENDIAN_NONE),
+                   CONST_BUF_NO_STRIDE(0),
+                   MEGA_FETCH(1)),
+        VTX_DWORD_PAD,
+        // VTX INST 1
+        VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH),
+                   FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA),
+                   FETCH_WHOLE_QUAD(0),
+                   BUFFER_ID(0),
+                   SRC_GPR(0),
+                   SRC_REL(ABSOLUTE),
+                   SRC_SEL_X(SQ_SEL_X),
+                   MEGA_FETCH_COUNT(4)),
+        VTX_DWORD1_GPR(DST_GPR(0), DST_REL(0),
+                       DST_SEL_X(SQ_SEL_X),
+                       DST_SEL_Y(SQ_SEL_Y),
+                       DST_SEL_Z(SQ_SEL_Z),
+                       DST_SEL_W(SQ_SEL_W),
+                       USE_CONST_FIELDS(0),
+                       DATA_FORMAT(FMT_8_8_8_8),
+                       NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM),
+                       FORMAT_COMP_ALL(SQ_FORMAT_COMP_UNSIGNED),
+                       SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE)),
+        VTX_DWORD2(OFFSET(8),
+                   ENDIAN_SWAP(ENDIAN_NONE),
+                   CONST_BUF_NO_STRIDE(0),
+                   MEGA_FETCH(0)),
+        VTX_DWORD_PAD,
+    } ;
+
+    /*
+     * Pixel shader layout (in dwords):
+     *   2*NUM_ALU_CLAUSES               CF_ALU headers, rewritten per pass
+     *   2                               pixel export CF instruction
+     *   4*2*MAX_NUM_ALUS_PER_CLAUSE     shared ALU instruction block
+     */
+    static uint32_t ps[2*NUM_ALU_CLAUSES + 2 + 4*2*MAX_NUM_ALUS_PER_CLAUSE];
+
+    /* One vertex: 2D position plus packed 8-bit RGBA colour */
+    struct {
+        float x, y;
+        uint32_t rgba;
+    } *vb, *v;
+
+    /* Destination/source component of each of the 4 scalar slots of a vector op */
+    const uint32_t elems[4] = { ELEM_X, ELEM_Y, ELEM_Z, ELEM_W };
+
+    draw_config_t draw_conf;
+    cb_config_t cb_conf;
+    vtx_resource_t vtx_res;
+    shader_config_t vs_conf, ps_conf;
+
+    uint64_t vb_addr, vs_addr, ps_addr;
+
+    int i, c, ps_size, alu_num, render_num, last_render_num;
+    float render_time = 0;
+
+
+    /* Create pixel shader that utilizes a maximum of ALUs */
+    ps_size = 2*NUM_ALU_CLAUSES; /* CF_ALU_INST contains #alu ops, thus created per rendering pass */
+
+    // CF INST i+1
+    ps[ps_size++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0),
+                                            TYPE(SQ_EXPORT_PIXEL),
+                                            RW_GPR(0),
+                                            RW_REL(ABSOLUTE),
+                                            INDEX_GPR(0),
+                                            ELEM_SIZE(1));
+    ps[ps_size++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X),
+                                                 SRC_SEL_Y(SQ_SEL_Y),
+                                                 SRC_SEL_Z(SQ_SEL_Z),
+                                                 SRC_SEL_W(SQ_SEL_W),
+                                                 R6xx_ELEM_LOOP(0),
+                                                 BURST_COUNT(1),
+                                                 END_OF_PROGRAM(1),
+                                                 VALID_PIXEL_MODE(0),
+                                                 CF_INST(SQ_CF_INST_EXPORT_DONE),
+                                                 WHOLE_QUAD_MODE(0),
+                                                 BARRIER(1));
+
+    /* only create one clause, use that multiple times */
+    for (i = 0; i < MAX_NUM_ALUS_PER_CLAUSE; i++) {
+        /* One vector MUL = four scalar MULs (x, y, z, w); LAST is set on the w slot only */
+        for (c = 0; c < 4; c++) {
+            ps[ps_size++] = ALU_DWORD0(SRC0_SEL(1),
+                                       SRC0_REL(ABSOLUTE),
+                                       SRC0_ELEM(elems[c]),
+                                       SRC0_NEG(0),
+                                       SRC1_SEL(0),
+                                       SRC1_REL(ABSOLUTE),
+                                       SRC1_ELEM(elems[c]),
+                                       SRC1_NEG(0),
+                                       INDEX_MODE(SQ_INDEX_AR_X),
+                                       PRED_SEL(SQ_PRED_SEL_OFF),
+                                       LAST(c == 3 ? 1 : 0));
+            ps[ps_size++] = ALU_DWORD1_OP2(adapt->chipset,
+                                           SRC0_ABS(0),
+                                           SRC1_ABS(0),
+                                           UPDATE_EXECUTE_MASK(0),
+                                           UPDATE_PRED(0),
+                                           WRITE_MASK(1),
+                                           FOG_MERGE(0),
+                                           OMOD(SQ_ALU_OMOD_OFF),
+                                           ALU_INST(SQ_OP2_INST_MUL),
+                                           BANK_SWIZZLE(SQ_ALU_VEC_012),
+                                           DST_GPR(127),
+                                           DST_REL(ABSOLUTE),
+                                           DST_ELEM(elems[c]),
+                                           CLAMP(0));
+        }
+    }
+
+    CLEAR (draw_conf);
+    CLEAR (cb_conf);
+    CLEAR (vtx_res);
+    CLEAR (vs_conf);
+    CLEAR (ps_conf);
+
+
+    printf ("\n* ALU Quad Performance Test\n\n");
+
+
+    /* Create vertex buffer: MAX_NUM_QUADS identical quads with random opaque colours.
+     * Done before any hardware setup so an allocation failure can bail out cleanly. */
+    vb = v = calloc (MAX_NUM_QUADS * 4, sizeof (*vb));
+    if (vb == NULL) {
+        fprintf (stderr, "***** Out of memory allocating vertex buffer\n\n");
+        return;
+    }
+    for (i = 0; i < MAX_NUM_QUADS; i++) {
+        v->x = 514;                     v->y = 2;
+        v->rgba = (rand() & 0xffffff) | 0xff000000; v++;
+        v->x = 514 + RENDER_QUAD_WIDTH; v->y = 2;
+        v->rgba = (rand() & 0xffffff) | 0xff000000; v++;
+        v->x = 514 + RENDER_QUAD_WIDTH; v->y = 2 + RENDER_QUAD_HEIGHT;
+        v->rgba = (rand() & 0xffffff) | 0xff000000; v++;
+        v->x = 514;                     v->y = 2 + RENDER_QUAD_HEIGHT;
+        v->rgba = (rand() & 0xffffff) | 0xff000000; v++;
+    }
+
+
+    /* Init */
+    start_3d(adapt);
+    set_default_state(adapt);
+
+
+    /* Scissor / viewport */
+    ereg (PA_CL_VTE_CNTL, VTX_XY_FMT_bit);
+    ereg (PA_CL_CLIP_CNTL, CLIP_DISABLE_bit);
+
+
+    /* Upload */
+    vs_addr = upload (adapt, vs, sizeof(vs), 0);
+    ps_addr = upload (adapt, ps, 4*ps_size, 4096);
+    vb_addr = upload (adapt, vb, MAX_NUM_QUADS * 4 * sizeof(*vb), 8192);
+    free (vb);
+
+    if (verbose) {
+        /* NOTE: the CF_ALU headers at the start of ps are still zero here;
+         * they are filled in per pass further below. */
+        dump_shader (adapt, vs, sizeof(vs), "vertex");
+        dump_shader (adapt, ps, sizeof(ps), "pixel");
+        printf ("\n");
+    }
+
+
+    /* Shader */
+    vs_conf.shader_addr = vs_addr;
+    vs_conf.num_gprs    = 4;
+    vs_conf.stack_size  = 1;
+    vs_setup (adapt, &vs_conf);
+
+    ps_conf.shader_addr         = ps_addr;
+    ps_conf.num_gprs            = 2;
+    ps_conf.stack_size          = 0;
+    ps_conf.uncached_first_inst = 1;
+    ps_conf.clamp_consts        = 1;
+    ps_conf.export_mode         = 2;
+    ps_setup (adapt, &ps_conf);
+
+
+    /* Render setup */
+    ereg (CB_SHADER_MASK,         (0x0f << OUTPUT0_ENABLE_shift));
+    ereg (R7xx_CB_SHADER_CONTROL, (RT0_ENABLE_bit));
+    ereg (CB_COLOR_CONTROL,       (0xcc << ROP3_shift)); /* copy */
+
+    cb_conf.id = 0;
+    cb_conf.w = adapt->color_pitch;
+    cb_conf.h = adapt->color_height;
+    cb_conf.base = adapt->color_gpu;
+    cb_conf.format = FMT_8_8_8_8;
+    cb_conf.comp_swap = 0;
+    cb_conf.source_format = 1;
+    cb_conf.blend_clamp = 1;
+    set_render_target(adapt, &cb_conf);
+
+    ereg (PA_SU_SC_MODE_CNTL, (FACE_bit |
+                               (POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift) |
+                               (POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift)));
+    ereg (DB_SHADER_CONTROL, ((1 << Z_ORDER_shift) | /* EARLY_Z_THEN_LATE_Z */
+                              DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */
+
+    /* Vertex buffer setup */
+    vtx_res.id = SQ_VTX_RESOURCE_vs;
+    vtx_res.vtx_size_dw = sizeof(*vb)/4;
+    vtx_res.vtx_num_entries = MAX_NUM_QUADS * 4 * vtx_res.vtx_size_dw; /* Can overcommit if necessary */
+    vtx_res.mem_req_size = 1;
+    vtx_res.vb_addr = vb_addr;
+    set_vtx_resource (adapt, &vtx_res);
+
+
+    ereg (VGT_INSTANCE_STEP_RATE_0, 0); /* ? */
+    ereg (VGT_INSTANCE_STEP_RATE_1, 0);
+
+    ereg (VGT_MAX_VTX_INDX, vtx_res.vtx_num_entries / vtx_res.vtx_size_dw);
+    ereg (VGT_MIN_VTX_INDX, 0);
+    ereg (VGT_INDX_OFFSET, 0);
+
+    flush_cmds ();
+
+
+    /* Render n times, duplicate alu instructions per clause each time */
+    render_num = 64;
+
+    for (alu_num = 1; alu_num <= MAX_NUM_ALUS_PER_CLAUSE; alu_num *= 2) {
+
+        /* Set number of alu insts per clause */
+        for (i = 0; i < NUM_ALU_CLAUSES; i++) {
+            /* re-using the same clause each time: the shared instruction block
+             * follows the NUM_ALU_CLAUSES clause headers and the one export
+             * instruction in ps[] */
+            ps[i*2]   = CF_ALU_DWORD0(ADDR(NUM_ALU_CLAUSES + 1),
+                                      KCACHE_BANK0(0),
+                                      KCACHE_BANK1(0),
+                                      KCACHE_MODE0(0));
+            ps[i*2+1] = CF_ALU_DWORD1(KCACHE_MODE1(0),
+                                      KCACHE_ADDR0(0),
+                                      KCACHE_ADDR1(0),
+                                      COUNT(4*alu_num),
+                                      USES_WATERFALL(0),
+                                      CF_INST(SQ_CF_INST_ALU),
+                                      WHOLE_QUAD_MODE(0),
+                                      BARRIER(1));
+        }
+        /* Only the clause headers changed, so only those are uploaded again */
+        upload (adapt, ps, NUM_ALU_CLAUSES*2*4, 4096);
+
+        /* Loop: Start with half the amount of quads as last time, if rendering less than a second, increase */
+        render_num /= 2;
+        if (render_num < 1)
+            render_num = 1; /* zero quads would never reach the time limit */
+        last_render_num = render_num;
+
+        for (; render_num < MAX_NUM_QUADS; render_num *= 2)
+        {
+            /* Draw */
+            draw_conf.prim_type          = DI_PT_QUADLIST;
+            draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX;
+            draw_conf.num_instances      = 1;
+            draw_conf.num_indices        = render_num * 4;
+            draw_conf.index_type         = DI_INDEX_SIZE_16_BIT;
+            draw_auto (adapt, &draw_conf);
+
+            render_time = time_flush_cmds (adapt, 5);
+            /* Remember what was actually timed; the loop increment doubles
+             * render_num once more before the bound check ends the loop. */
+            last_render_num = render_num;
+            printf ("  Rendering %d quads with total of %d vector ALU insts: %.3f ms\n",
+                    render_num, alu_num * NUM_ALU_CLAUSES, render_time * 1000);
+            if (render_time >= 1)
+                break;
+        }
+        /* Report (and continue from) the quad count that render_time belongs to */
+        render_num = last_render_num;
+
+        printf ("\n  ALU speed: %d clauses, %d ALUs -> %.1f Megapixels/s -> %.2f GigaFLOPS\n\n",
+                NUM_ALU_CLAUSES, alu_num,
+                (float) render_num * (RENDER_QUAD_WIDTH * RENDER_QUAD_HEIGHT / 1e6) / render_time,
+                (float) 4 * render_num * alu_num * NUM_ALU_CLAUSES * (RENDER_QUAD_WIDTH * RENDER_QUAD_HEIGHT / 1e9) / render_time);
+    }
+}
+
+/*
+ * Run every performance test in this file on the given adapter:
+ * the texture quad test first, then the ALU quad test.
+ */
+void test_perf (adapter_t *adapt)
+{
+    /* Tests are executed in table order */
+    void (*const perf_tests[]) (adapter_t *) = {
+        test_tex_quad_perf,
+        test_alu_quad_perf,
+    };
+    unsigned int t;
+
+    for (t = 0; t < sizeof (perf_tests) / sizeof (perf_tests[0]); t++)
+        perf_tests[t] (adapt);
+}