diff options
author | Matthias Hopf <mhopf@suse.de> | 2009-01-29 17:06:42 +0100 |
---|---|---|
committer | Matthias Hopf <mhopf@suse.de> | 2009-01-29 17:06:42 +0100 |
commit | 06d939c9c3f12434cbe5c601ac3327a6257a91a5 (patch) | |
tree | 50408995303c5d89e40c636cc494b094615fac5c | |
parent | 26ec71b288f20c94c1c1c09682270aff57c1f4d3 (diff) |
Add ALU performance test.
-rw-r--r-- | r600_demo.c | 2 | ||||
-rw-r--r-- | r600_lib.h | 2 | ||||
-rw-r--r-- | r600_perf.c | 417 |
3 files changed, 413 insertions, 8 deletions
diff --git a/r600_demo.c b/r600_demo.c index c3577b1..df8eddd 100644 --- a/r600_demo.c +++ b/r600_demo.c @@ -693,7 +693,7 @@ int main(int argc, char *argv[]) test_copy (&adapter); break; case 'P': - test_tex_quad_perf (&adapter); + test_perf (&adapter); break; default: fprintf (stderr, "***** Don't know '%c' test\n\n", argv[optind][i]); @@ -141,7 +141,7 @@ void tmp_test (adapter_t *); void test_solid(adapter_t *adapt); void test_copy(adapter_t *adapt); /* r600_perf.c : */ -void test_tex_quad_perf (adapter_t *); +void test_perf (adapter_t *); #endif diff --git a/r600_perf.c b/r600_perf.c index 745fe2a..ddecbf4 100644 --- a/r600_perf.c +++ b/r600_perf.c @@ -38,6 +38,8 @@ #define MAX_NUM_QUADS 32768 +#define MAX_NUM_ALUS_PER_CLAUSE (128/4) // Max component insts per ALU clause: 128 +#define NUM_ALU_CLAUSES 8 // Number of ALU clauses #define RENDER_QUAD_WIDTH 480 // Not 1:1 by intention #define RENDER_QUAD_HEIGHT 600 @@ -248,12 +250,6 @@ void test_tex_quad_perf(adapter_t *adapt) /* Scissor / viewport */ ereg (PA_CL_VTE_CNTL, VTX_XY_FMT_bit); - /* Not necessary due to PA_CL_VTE_CNTL */ -// pack0 (PA_CL_VPORT_XSCALE_0, 4); -// efloat (1.0); -// efloat (0.0); -// efloat (1.0); -// efloat (0.0); ereg (PA_CL_CLIP_CNTL, CLIP_DISABLE_bit); @@ -395,3 +391,412 @@ void test_tex_quad_perf(adapter_t *adapt) (float) render_num * (RENDER_QUAD_WIDTH * RENDER_QUAD_HEIGHT / 1e6) / render_time); } +/* + * Test ALU performance + */ + +void test_alu_quad_perf(adapter_t *adapt) +{ + static uint32_t vs[] = { + // CF INST 0 + CF_DWORD0(ADDR(4)), + CF_DWORD1(POP_COUNT(0), + CF_CONST(0), + COND(SQ_CF_COND_ACTIVE), + COUNT(2), + CALL_COUNT(0), + END_OF_PROGRAM(0), + VALID_PIXEL_MODE(0), + CF_INST(SQ_CF_INST_VTX), + WHOLE_QUAD_MODE(0), + BARRIER(1)), + // CF INST 1 + CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_POS0), + TYPE(SQ_EXPORT_POS), + RW_GPR(1), + RW_REL(ABSOLUTE), + INDEX_GPR(0), + ELEM_SIZE(0)), + CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X), + SRC_SEL_Y(SQ_SEL_Y), + SRC_SEL_Z(SQ_SEL_Z), + SRC_SEL_W(SQ_SEL_W), + R6xx_ELEM_LOOP(0), + BURST_COUNT(0), + END_OF_PROGRAM(0), + VALID_PIXEL_MODE(0), + CF_INST(SQ_CF_INST_EXPORT_DONE), + WHOLE_QUAD_MODE(0), + BARRIER(1)), + // CF INST 2 + CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(0), + TYPE(SQ_EXPORT_PARAM), + RW_GPR(0), + RW_REL(ABSOLUTE), + INDEX_GPR(0), + ELEM_SIZE(0)), + CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X), + SRC_SEL_Y(SQ_SEL_Y), + SRC_SEL_Z(SQ_SEL_Z), + SRC_SEL_W(SQ_SEL_W), + R6xx_ELEM_LOOP(0), + BURST_COUNT(0), + END_OF_PROGRAM(1), + VALID_PIXEL_MODE(0), + CF_INST(SQ_CF_INST_EXPORT_DONE), + WHOLE_QUAD_MODE(0), + BARRIER(0)), + // padding vtx/tex inst are 128 bit aligned + 0x00000000, + 0x00000000, + // VTX INST 0 + VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH), + FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA), + FETCH_WHOLE_QUAD(0), + BUFFER_ID(0), + SRC_GPR(0), + SRC_REL(ABSOLUTE), + SRC_SEL_X(SQ_SEL_X), + MEGA_FETCH_COUNT(12)), + VTX_DWORD1_GPR(DST_GPR(1), DST_REL(0), + DST_SEL_X(SQ_SEL_X), + DST_SEL_Y(SQ_SEL_Y), + DST_SEL_Z(SQ_SEL_Z), + DST_SEL_W(SQ_SEL_1), + USE_CONST_FIELDS(0), + DATA_FORMAT(FMT_32_32_FLOAT), + NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), + FORMAT_COMP_ALL(SQ_FORMAT_COMP_UNSIGNED), + SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE)), + VTX_DWORD2(OFFSET(0), + ENDIAN_SWAP(ENDIAN_NONE), + CONST_BUF_NO_STRIDE(0), + MEGA_FETCH(1)), + VTX_DWORD_PAD, + // VTX INST 1 + VTX_DWORD0(VTX_INST(SQ_VTX_INST_FETCH), + FETCH_TYPE(SQ_VTX_FETCH_VERTEX_DATA), + FETCH_WHOLE_QUAD(0), + BUFFER_ID(0), + SRC_GPR(0), + SRC_REL(ABSOLUTE), + SRC_SEL_X(SQ_SEL_X), + MEGA_FETCH_COUNT(4)), + VTX_DWORD1_GPR(DST_GPR(0), DST_REL(0), + DST_SEL_X(SQ_SEL_X), + DST_SEL_Y(SQ_SEL_Y), + DST_SEL_Z(SQ_SEL_Z), + DST_SEL_W(SQ_SEL_W), + USE_CONST_FIELDS(0), + DATA_FORMAT(FMT_8_8_8_8), + NUM_FORMAT_ALL(SQ_NUM_FORMAT_NORM), + FORMAT_COMP_ALL(SQ_FORMAT_COMP_UNSIGNED), + SRF_MODE_ALL(SRF_MODE_ZERO_CLAMP_MINUS_ONE)), + VTX_DWORD2(OFFSET(8), + ENDIAN_SWAP(ENDIAN_NONE), + CONST_BUF_NO_STRIDE(0), + MEGA_FETCH(0)), + VTX_DWORD_PAD, + } ; + + static uint32_t ps[2*NUM_ALU_CLAUSES + 2 + 4*2*MAX_NUM_ALUS_PER_CLAUSE]; + + struct { + float x, y; + uint32_t rgba; + } *vb, *v; + + draw_config_t draw_conf; + cb_config_t cb_conf; + vtx_resource_t vtx_res; + tex_resource_t tex_res; + tex_sampler_t tex_samp; + shader_config_t vs_conf, ps_conf; + + uint64_t vb_addr, vs_addr, ps_addr; + + int i, ps_size, alu_num, render_num; + float render_time; + + + /* Create pixel shader that utilizes a maximum of ALUs */ + ps_size = 2*NUM_ALU_CLAUSES; /* CF_ALU_INST contains #alu ops, thus created per rendering pass */ + + // CF INST i+1 + ps[ps_size++] = CF_ALLOC_IMP_EXP_DWORD0(ARRAY_BASE(CF_PIXEL_MRT0), + TYPE(SQ_EXPORT_PIXEL), + RW_GPR(0), + RW_REL(ABSOLUTE), + INDEX_GPR(0), + ELEM_SIZE(1)); + ps[ps_size++] = CF_ALLOC_IMP_EXP_DWORD1_SWIZ(SRC_SEL_X(SQ_SEL_X), + SRC_SEL_Y(SQ_SEL_Y), + SRC_SEL_Z(SQ_SEL_Z), + SRC_SEL_W(SQ_SEL_W), + R6xx_ELEM_LOOP(0), + BURST_COUNT(1), + END_OF_PROGRAM(1), + VALID_PIXEL_MODE(0), + CF_INST(SQ_CF_INST_EXPORT_DONE), + WHOLE_QUAD_MODE(0), + BARRIER(1)); + /* only create one clause, use that multiple times */ + for (i = 0; i < MAX_NUM_ALUS_PER_CLAUSE; i++) { + ps[ps_size++] = ALU_DWORD0(SRC0_SEL(1), /* ALU inst a */ + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_X), + SRC0_NEG(0), + SRC1_SEL(0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_X), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_AR_X), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[ps_size++] = ALU_DWORD1_OP2(adapt->chipset, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_MUL), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(127), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_X), + CLAMP(0)); + ps[ps_size++] = ALU_DWORD0(SRC0_SEL(1), /* ALU inst b */ + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Y), + SRC0_NEG(0), + SRC1_SEL(0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Y), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_AR_X), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[ps_size++] = ALU_DWORD1_OP2(adapt->chipset, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_MUL), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(127), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Y), + CLAMP(0)); + ps[ps_size++] = ALU_DWORD0(SRC0_SEL(1), /* ALU inst c */ + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_Z), + SRC0_NEG(0), + SRC1_SEL(0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_Z), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_AR_X), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(0)); + ps[ps_size++] = ALU_DWORD1_OP2(adapt->chipset, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_MUL), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(127), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_Z), + CLAMP(0)); + ps[ps_size++] = ALU_DWORD0(SRC0_SEL(1), /* ALU inst d */ + SRC0_REL(ABSOLUTE), + SRC0_ELEM(ELEM_W), + SRC0_NEG(0), + SRC1_SEL(0), + SRC1_REL(ABSOLUTE), + SRC1_ELEM(ELEM_W), + SRC1_NEG(0), + INDEX_MODE(SQ_INDEX_AR_X), + PRED_SEL(SQ_PRED_SEL_OFF), + LAST(1)); + ps[ps_size++] = ALU_DWORD1_OP2(adapt->chipset, + SRC0_ABS(0), + SRC1_ABS(0), + UPDATE_EXECUTE_MASK(0), + UPDATE_PRED(0), + WRITE_MASK(1), + FOG_MERGE(0), + OMOD(SQ_ALU_OMOD_OFF), + ALU_INST(SQ_OP2_INST_MUL), + BANK_SWIZZLE(SQ_ALU_VEC_012), + DST_GPR(127), + DST_REL(ABSOLUTE), + DST_ELEM(ELEM_W), + CLAMP(0)); + } + + CLEAR (draw_conf); + CLEAR (cb_conf); + CLEAR (vtx_res); + CLEAR (tex_res); + CLEAR (tex_samp); + CLEAR (vs_conf); + CLEAR (ps_conf); + + + printf ("\n* ALU Quad Performance Test\n\n"); + + /* Init */ + start_3d(adapt); + set_default_state(adapt); + + + /* Scissor / viewport */ + ereg (PA_CL_VTE_CNTL, VTX_XY_FMT_bit); + ereg (PA_CL_CLIP_CNTL, CLIP_DISABLE_bit); + + + /* Create vertex buffer */ + vb = v = calloc (MAX_NUM_QUADS, 4 * 4 * sizeof (float)); + for (i = 0; i < MAX_NUM_QUADS; i++) { + v->x = 514; v->y = 2; + v->rgba = (rand() & 0xffffff) | 0xff000000; v++; + v->x = 514 + RENDER_QUAD_WIDTH; v->y = 2; + v->rgba = (rand() & 0xffffff) | 0xff000000; v++; + v->x = 514 + RENDER_QUAD_WIDTH; v->y = 2 + RENDER_QUAD_HEIGHT; + v->rgba = (rand() & 0xffffff) | 0xff000000; v++; + v->x = 514; v->y = 2 + RENDER_QUAD_HEIGHT; + v->rgba = (rand() & 0xffffff) | 0xff000000; v++; + } + + + /* Upload */ + vs_addr = upload (adapt, vs, sizeof(vs), 0); + ps_addr = upload (adapt, ps, 4*ps_size, 4096); + vb_addr = upload (adapt, vb, MAX_NUM_QUADS * 4 * sizeof(*vb), 8192); + free (vb); + + if (verbose) { + dump_shader (adapt, vs, sizeof(vs), "vertex"); + dump_shader (adapt, ps, sizeof(ps), "pixel"); + printf ("\n"); + } + + + /* Shader */ + vs_conf.shader_addr = vs_addr; + vs_conf.num_gprs = 4; + vs_conf.stack_size = 1; + vs_setup (adapt, &vs_conf); + + ps_conf.shader_addr = ps_addr; + ps_conf.num_gprs = 2; + ps_conf.stack_size = 0; + ps_conf.uncached_first_inst = 1; + ps_conf.clamp_consts = 1; + ps_conf.export_mode = 2; + ps_setup (adapt, &ps_conf); + + + /* Render setup */ + ereg (CB_SHADER_MASK, (0x0f << OUTPUT0_ENABLE_shift)); + ereg (R7xx_CB_SHADER_CONTROL, (RT0_ENABLE_bit)); + ereg (CB_COLOR_CONTROL, (0xcc << ROP3_shift)); /* copy */ + + cb_conf.id = 0; + cb_conf.w = adapt->color_pitch; + cb_conf.h = adapt->color_height; + cb_conf.base = adapt->color_gpu; + cb_conf.format = FMT_8_8_8_8; + cb_conf.comp_swap = 0; + cb_conf.source_format = 1; + cb_conf.blend_clamp = 1; + set_render_target(adapt, &cb_conf); + + ereg (PA_SU_SC_MODE_CNTL, (FACE_bit | + (POLYMODE_PTYPE__TRIANGLES << POLYMODE_FRONT_PTYPE_shift) | + (POLYMODE_PTYPE__TRIANGLES << POLYMODE_BACK_PTYPE_shift))); + ereg (DB_SHADER_CONTROL, ((1 << Z_ORDER_shift) | /* EARLY_Z_THEN_LATE_Z */ + DUAL_EXPORT_ENABLE_bit)); /* Only useful if no depth export */ + + /* Vertex buffer setup */ + vtx_res.id = SQ_VTX_RESOURCE_vs; + vtx_res.vtx_size_dw = sizeof(*vb)/4; + vtx_res.vtx_num_entries = MAX_NUM_QUADS * 4 * vtx_res.vtx_size_dw; /* Can overcommit if necessary */ + vtx_res.mem_req_size = 1; + vtx_res.vb_addr = vb_addr; + set_vtx_resource (adapt, &vtx_res); + + + ereg (VGT_INSTANCE_STEP_RATE_0, 0); /* ? */ + ereg (VGT_INSTANCE_STEP_RATE_1, 0); + + ereg (VGT_MAX_VTX_INDX, vtx_res.vtx_num_entries / vtx_res.vtx_size_dw); + ereg (VGT_MIN_VTX_INDX, 0); + ereg (VGT_INDX_OFFSET, 0); + + flush_cmds (); + + + /* Render n times, duplicate alu instructions per clause each time */ + render_num = 64; + + for (alu_num = 1; alu_num <= MAX_NUM_ALUS_PER_CLAUSE; alu_num *= 2) { + + /* Set number of alu insts per clause */ + for (i = 0; i < NUM_ALU_CLAUSES; i++) { + /* re-using the same clause each time */ + ps[i*2] = CF_ALU_DWORD0(ADDR(NUM_ALU_CLAUSES + 1), + KCACHE_BANK0(0), + KCACHE_BANK1(0), + KCACHE_MODE0(0)); + ps[i*2+1] = CF_ALU_DWORD1(KCACHE_MODE1(0), + KCACHE_ADDR0(0), + KCACHE_ADDR1(0), + COUNT(4*alu_num), + USES_WATERFALL(0), + CF_INST(SQ_CF_INST_ALU), + WHOLE_QUAD_MODE(0), + BARRIER(1)); + } + upload (adapt, ps, NUM_ALU_CLAUSES*2*4, 4096); + + /* Loop: Start with half the amount of quads as last time, if rendering less than a second, increase */ + for (render_num /= 2; render_num < MAX_NUM_QUADS; render_num *= 2) + { + /* Draw */ + draw_conf.prim_type = DI_PT_QUADLIST; + draw_conf.vgt_draw_initiator = DI_SRC_SEL_AUTO_INDEX; + draw_conf.num_instances = 1; + draw_conf.num_indices = render_num * 4; + draw_conf.index_type = DI_INDEX_SIZE_16_BIT; + draw_auto (adapt, &draw_conf); + + render_time = time_flush_cmds (adapt, 5); + printf (" Rendering %d quads with total of %d vector ALU insts: %.3f ms\n", + render_num, alu_num * NUM_ALU_CLAUSES, render_time * 1000); + if (render_time >= 1) + break; + } + + printf ("\n ALU speed: %d clauses, %d ALUs -> %.1f Megapixels/s -> %.2f GigaFLOPS\n\n", + NUM_ALU_CLAUSES, alu_num, + (float) render_num * (RENDER_QUAD_WIDTH * RENDER_QUAD_HEIGHT / 1e6) / render_time, + (float) 4 * render_num * alu_num * NUM_ALU_CLAUSES * (RENDER_QUAD_WIDTH * RENDER_QUAD_HEIGHT / 1e9) / render_time); + } +} + +void test_perf (adapter_t *adapt) +{ + test_tex_quad_perf (adapt); + test_alu_quad_perf (adapt); +} |