diff options
author | Tom Stellard <tstellar@gmail.com> | 2011-09-20 21:05:55 -0700 |
---|---|---|
committer | Tom Stellard <tstellar@gmail.com> | 2011-09-20 21:05:55 -0700 |
commit | 67d4953e20fd74eb3f524f0ddd0f90413f99f6ec (patch) | |
tree | be03932107b2620a8d617bfc5656196f0e7f12d9 | |
parent | 2d1004d9aa719bb93a4f057b0eefe88f23b44e44 (diff) |
r300/compiler: Another attempt at nested loops and branches (branch: vert-loops)
I have no idea why this doesn't work. The branch decisions appear to be
flipped sometimes, but other times they work.
4 files changed, 170 insertions, 57 deletions
diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c index 654f9a070d5..92aaaf133f6 100644 --- a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c +++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c @@ -371,9 +371,10 @@ static void mark_write(void * userdata, struct rc_instruction * inst, writemasks[index] |= mask; } -static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler) +static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler, + int index) { - return PVS_SRC_OPERAND(compiler->PredicateIndex, + return PVS_SRC_OPERAND(index, t_swizzle(RC_SWIZZLE_ZERO), t_swizzle(RC_SWIZZLE_ZERO), t_swizzle(RC_SWIZZLE_ZERO), @@ -383,17 +384,72 @@ static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler) } static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler, - unsigned int hw_opcode, int is_math) + unsigned int hw_opcode, int is_math, + int index) { return PVS_OP_DST_OPERAND(hw_opcode, is_math, 0, - compiler->PredicateIndex, + index, RC_MASK_W, t_dst_class(RC_FILE_TEMPORARY)); } +static void ei_pred_mov(struct r300_vertex_program_compiler * compiler, + unsigned int * inst) +{ + int dst_index = compiler->PredicateRegs[compiler->loop_depth]; + int src_index = compiler->PredicateRegs[compiler->loop_depth - 1]; + inst[0] = t_pred_dst(compiler, VE_ADD, 0, dst_index); + inst[1] = t_pred_src(compiler, src_index); + inst[2] = PVS_SRC_OPERAND(0, t_swizzle(RC_SWIZZLE_ZERO), + t_swizzle(RC_SWIZZLE_ZERO), + t_swizzle(RC_SWIZZLE_ZERO), + t_swizzle(RC_SWIZZLE_ZERO), + t_src_class(RC_FILE_NONE), 0); + inst[3] = 0; +} + +static int reserve_predicate_reg( + struct r300_vertex_program_compiler * compiler) +{ + int i; + unsigned int writemasks[RC_REGISTER_MAX_INDEX]; + struct rc_instruction * inst; + memset(writemasks, 0, sizeof(writemasks)); + for(inst = compiler->Base.Program.Instructions.Next; + inst != 
&compiler->Base.Program.Instructions; + inst = inst->Next) { + rc_for_all_writes_mask(inst, mark_write, writemasks); + } + + for (i = 0; i < R500_PVS_MAX_LOOP_DEPTH; i++) { + if (compiler->PredicateRegs[i] != -1) { + writemasks[compiler->PredicateRegs[i]] = RC_MASK_XYZW; + } + } + + for(i = 0; i < compiler->Base.max_temp_regs; i++) { + /* Most of the control flow instructions only write the + * W component of the Predicate Register, but + * the docs say that ME_PRED_SET_CLR and + * ME_PRED_SET_RESTORE write all components of the + * register, so we must reserve a register that has + * all its components free. */ + if (!writemasks[i]) { + compiler->PredicateRegs[compiler->loop_depth] = i; + break; + } + } + if (i == compiler->Base.max_temp_regs) { + rc_error(&compiler->Base, "No free temporary to use for" + " predicate stack counter.\n"); + return 0; + } + return 1; +} + static void ei_if(struct r300_vertex_program_compiler * compiler, struct rc_instruction *rci, unsigned int * inst, @@ -409,29 +465,12 @@ static void ei_if(struct r300_vertex_program_compiler * compiler, /* Reserve a temporary to use as our predicate stack counter, if we * don't already have one. */ - if (!compiler->PredicateMask) { - unsigned int writemasks[RC_REGISTER_MAX_INDEX]; - struct rc_instruction * inst; - unsigned int i; - memset(writemasks, 0, sizeof(writemasks)); - for(inst = compiler->Base.Program.Instructions.Next; - inst != &compiler->Base.Program.Instructions; - inst = inst->Next) { - rc_for_all_writes_mask(inst, mark_write, writemasks); - } - for(i = 0; i < compiler->Base.max_temp_regs; i++) { - unsigned int mask = ~writemasks[i] & RC_MASK_XYZW; - /* Only the W component can be used fo the predicate - * stack counter. 
*/ - if (mask & RC_MASK_W) { - compiler->PredicateMask = RC_MASK_W; - compiler->PredicateIndex = i; - break; - } - } - if (i == compiler->Base.max_temp_regs) { - rc_error(&compiler->Base, "No free temporary to use for" - " predicate stack counter.\n"); + if (compiler->PredicateRegs[compiler->loop_depth] == -1) { + /* If we are inside a loop, the Predicate Register should + * have already been defined. */ + assert(compiler->loop_depth == 0); + + if (!reserve_predicate_reg(compiler)) { return; } } @@ -446,13 +485,13 @@ static void ei_if(struct r300_vertex_program_compiler * compiler, inst[2] = 0; } else { predicate_opcode = VE_PRED_SET_NEQ_PUSH; - inst[1] = t_pred_src(compiler); + inst[1] = t_pred_src(compiler, compiler->PredicateRegs[compiler->loop_depth]); inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]); } - inst[0] = t_pred_dst(compiler, predicate_opcode, is_math); + inst[0] = t_pred_dst(compiler, predicate_opcode, is_math, + compiler->PredicateRegs[compiler->loop_depth]); inst[3] = 0; - } static void ei_else(struct r300_vertex_program_compiler * compiler, @@ -462,8 +501,9 @@ static void ei_else(struct r300_vertex_program_compiler * compiler, rc_error(&compiler->Base,"Opcode ELSE not supported\n"); return; } - inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1); - inst[1] = t_pred_src(compiler); + inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1, + compiler->PredicateRegs[compiler->loop_depth]); + inst[1] = t_pred_src(compiler, compiler->PredicateRegs[compiler->loop_depth]); inst[2] = 0; inst[3] = 0; } @@ -475,8 +515,54 @@ static void ei_endif(struct r300_vertex_program_compiler *compiler, rc_error(&compiler->Base,"Opcode ENDIF not supported\n"); return; } - inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1); - inst[1] = t_pred_src(compiler); + inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1, + compiler->PredicateRegs[compiler->loop_depth]); + inst[1] = t_pred_src(compiler, compiler->PredicateRegs[compiler->loop_depth]); + inst[2] = 0; + inst[3] = 0; +} + 
+static void ei_brk(struct r300_vertex_program_compiler * compiler, + unsigned int * inst) +{ + if (!compiler->Base.is_r500) { + rc_error(&compiler->Base, "Opcode BRK not supported\n"); + return; + } + + inst[0] = t_pred_dst(compiler, ME_PRED_SET_CLR, 1, + compiler->PredicateRegs[compiler->loop_depth]); + inst[1] = 0; + inst[2] = 0; + inst[3] = 0; +} + +static void ei_endloop(struct r300_vertex_program_compiler * compiler, + unsigned int * inst) +{ + if (!compiler->Base.is_r500) { + rc_error(&compiler->Base, "Opcode ENDLOOP not supported\n"); + return; + } + + inst[0] = t_pred_dst(compiler, ME_PRED_SET_RESTORE, 1, + compiler->PredicateRegs[compiler->loop_depth]); + inst[1] = t_pred_src(compiler, compiler->PredicateRegs[compiler->loop_depth]); + inst[2] = 0; + inst[3] = 0; +} + +static void ei_pred_set(struct r300_vertex_program_compiler * compiler, + unsigned int * inst) +{ + inst[0] = t_pred_dst(compiler, ME_PRED_SET_EQ, 1, compiler->PredicateRegs[compiler->loop_depth]); + inst[1] = PVS_SRC_OPERAND(0, + t_swizzle(RC_SWIZZLE_ZERO), + t_swizzle(RC_SWIZZLE_ZERO), + t_swizzle(RC_SWIZZLE_ZERO), + t_swizzle(RC_SWIZZLE_ZERO), + t_src_class(RC_FILE_NONE), 0); + inst[2] = 0; inst[3] = 0; } @@ -487,9 +573,9 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) struct rc_instruction *rci; struct loop * loops = NULL; - int current_loop_depth = 0; int loops_reserved = 0; + unsigned int i; unsigned int branch_depth = 0; compiler->code->pos_end = 0; /* Not supported yet */ @@ -498,6 +584,10 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) compiler->SetHwInputOutput(compiler); + for (i = 0; i < R500_PVS_MAX_LOOP_DEPTH; i++) { + compiler->PredicateRegs[i] = -1; + } + for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) { struct rc_sub_instruction *vpi = &rci->U.I; unsigned int *inst = compiler->code->body.d + compiler->code->length; @@ -527,6 +617,7 @@ static void 
translate_vertex_program(struct radeon_compiler *c, void *user) switch (vpi->Opcode) { case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break; case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break; + case RC_OPCODE_BRK: ei_brk(compiler, inst); break; case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break; case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break; case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break; @@ -558,18 +649,26 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) if ((!compiler->Base.is_r500 && loops_reserved >= R300_VS_MAX_LOOP_DEPTH) - || loops_reserved >= R500_VS_MAX_FC_DEPTH) { + || loops_reserved >= R500_PVS_MAX_LOOP_DEPTH) { rc_error(&compiler->Base, "Loops are nested too deep."); return; } memory_pool_array_reserve(&compiler->Base.Pool, - struct loop, loops, current_loop_depth, + struct loop, loops, compiler->loop_depth, loops_reserved, 1); - l = &loops[current_loop_depth++]; + l = &loops[compiler->loop_depth++]; memset(l , 0, sizeof(struct loop)); - l->BgnLoop = (compiler->code->length / 4); - continue; + if (!reserve_predicate_reg(compiler)) { + return; + } + if (branch_depth > 0) { + ei_pred_mov(compiler, inst); + } else { + ei_pred_set(compiler, inst); + } + l->BgnLoop = ((compiler->code->length + 4)/ 4); + break; } case RC_OPCODE_ENDLOOP: { @@ -579,7 +678,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) unsigned int ret_addr; assert(loops); - l = &loops[current_loop_depth - 1]; + l = &loops[compiler->loop_depth - 1]; act_addr = l->BgnLoop - 1; last_addr = (compiler->code->length / 4) - 1; ret_addr = l->BgnLoop; @@ -593,7 +692,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) compiler->code->fc_op_addrs.r500 [compiler->code->num_fc_ops].lw = R500_PVS_FC_ACT_ADRS(act_addr) - | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff) + | 
R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff) ; compiler->code->fc_op_addrs.r500 [compiler->code->num_fc_ops].uw = @@ -616,8 +715,14 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP( compiler->code->num_fc_ops); compiler->code->num_fc_ops++; - current_loop_depth--; - continue; + if (branch_depth == 0) { + ei_endloop(compiler, inst); + compiler->loop_depth--; + } else { + compiler->loop_depth--; + ei_endloop(compiler, inst); + } + break; } default: @@ -627,10 +732,12 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) /* Non-flow control instructions that are inside an if statement * need to pay attention to the predicate bit. */ - if (branch_depth + if ((branch_depth || compiler->loop_depth) && vpi->Opcode != RC_OPCODE_IF && vpi->Opcode != RC_OPCODE_ELSE - && vpi->Opcode != RC_OPCODE_ENDIF) { + && vpi->Opcode != RC_OPCODE_ENDIF + && vpi->Opcode != RC_OPCODE_BGNLOOP + && vpi->Opcode != RC_OPCODE_ENDLOOP) { inst[0] |= (PVS_DST_PRED_ENABLE_MASK << PVS_DST_PRED_ENABLE_SHIFT); @@ -648,9 +755,13 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user) vpi->SrcReg[i].Index >= compiler->code->num_temporaries) compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1; - if (compiler->PredicateMask) - if (compiler->PredicateIndex >= compiler->code->num_temporaries) - compiler->code->num_temporaries = compiler->PredicateIndex + 1; + if (compiler->PredicateRegs[compiler->loop_depth] != -1 && + compiler->PredicateRegs[compiler->loop_depth] >= + compiler->code->num_temporaries) { + compiler->code->num_temporaries = + compiler->PredicateRegs[compiler->loop_depth] + + 1; + } if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) { rc_error(&compiler->Base, "Too many temporaries.\n"); @@ -1016,7 +1127,7 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c) struct radeon_compiler_pass vs_list[] = { /* NAME DUMP PREDICATE 
FUNCTION PARAM */ {"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL}, - {"transform loops", 1, 1, rc_transform_loops, NULL}, +// {"transform loops", 1, 1, rc_transform_loops, NULL}, {"emulate branches", 1, !is_r500, rc_emulate_branches, NULL}, {"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL}, {"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500}, diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c index 2bc0a87eed8..e4db8e2f790 100644 --- a/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c +++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c @@ -197,9 +197,11 @@ void r300_vertex_program_dump(struct radeon_compiler *compiler, void *user) case 3: fprintf(stderr, "JSR"); break; } if (c->Base.is_r500) { - fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x\n", + fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x " + "loop data->0x%08x\n", vs->fc_op_addrs.r500[i].uw, - vs->fc_op_addrs.r500[i].lw); + vs->fc_op_addrs.r500[i].lw, + vs->fc_loop_index[i]); } else { fprintf(stderr,": 0x%08x\n", vs->fc_op_addrs.r300[i]); } diff --git a/src/gallium/drivers/r300/compiler/radeon_code.h b/src/gallium/drivers/r300/compiler/radeon_code.h index 67e6acf8b10..9478e199fc2 100644 --- a/src/gallium/drivers/r300/compiler/radeon_code.h +++ b/src/gallium/drivers/r300/compiler/radeon_code.h @@ -40,6 +40,9 @@ #define R500_PFS_MAX_BRANCH_DEPTH_FULL 32 #define R500_PFS_MAX_BRANCH_DEPTH_PARTIAL 4 +/* The r500 maximum depth is not just for loops, but any combination of loops + * and subroutine jumps. */ +#define R500_PVS_MAX_LOOP_DEPTH 8 #define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0) @@ -264,9 +267,6 @@ struct rX00_fragment_program_code { #define R300_VS_MAX_TEMPS 32 /* This is the max for all chipsets (r300-r500) */ #define R300_VS_MAX_FC_OPS 16 -/* The r500 maximum depth is not just for loops, but any combination of loops - * and subroutine jumps. 
*/ -#define R500_VS_MAX_FC_DEPTH 8 #define R300_VS_MAX_LOOP_DEPTH 1 #define VSF_MAX_INPUTS 32 diff --git a/src/gallium/drivers/r300/compiler/radeon_compiler.h b/src/gallium/drivers/r300/compiler/radeon_compiler.h index ac9691c816f..b86b2e6594b 100644 --- a/src/gallium/drivers/r300/compiler/radeon_compiler.h +++ b/src/gallium/drivers/r300/compiler/radeon_compiler.h @@ -137,8 +137,8 @@ struct r300_vertex_program_compiler { void * UserData; void (*SetHwInputOutput)(struct r300_vertex_program_compiler * c); - int PredicateIndex; - unsigned int PredicateMask; + int PredicateRegs[R500_PVS_MAX_LOOP_DEPTH]; + unsigned int loop_depth; }; void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* c); |