summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Tom Stellard <tstellar@gmail.com> 2011-09-20 21:05:55 -0700
committer Tom Stellard <tstellar@gmail.com> 2011-09-20 21:05:55 -0700
commit 67d4953e20fd74eb3f524f0ddd0f90413f99f6ec (patch)
tree be03932107b2620a8d617bfc5656196f0e7f12d9
parent 2d1004d9aa719bb93a4f057b0eefe88f23b44e44 (diff)
r300/compiler: Another attempt at nested loops and branches [branch: vert-loops]
I have no idea why this doesn't work. The branch decisions appear to be flipped sometimes, but other times they work.
-rw-r--r-- src/gallium/drivers/r300/compiler/r3xx_vertprog.c 211
-rw-r--r-- src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c 6
-rw-r--r-- src/gallium/drivers/r300/compiler/radeon_code.h 6
-rw-r--r-- src/gallium/drivers/r300/compiler/radeon_compiler.h 4
4 files changed, 170 insertions, 57 deletions
diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
index 654f9a070d5..92aaaf133f6 100644
--- a/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
+++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog.c
@@ -371,9 +371,10 @@ static void mark_write(void * userdata, struct rc_instruction * inst,
writemasks[index] |= mask;
}
-static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
+static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler,
+ int index)
{
- return PVS_SRC_OPERAND(compiler->PredicateIndex,
+ return PVS_SRC_OPERAND(index,
t_swizzle(RC_SWIZZLE_ZERO),
t_swizzle(RC_SWIZZLE_ZERO),
t_swizzle(RC_SWIZZLE_ZERO),
@@ -383,17 +384,72 @@ static unsigned long t_pred_src(struct r300_vertex_program_compiler * compiler)
}
static unsigned long t_pred_dst(struct r300_vertex_program_compiler * compiler,
- unsigned int hw_opcode, int is_math)
+ unsigned int hw_opcode, int is_math,
+ int index)
{
return PVS_OP_DST_OPERAND(hw_opcode,
is_math,
0,
- compiler->PredicateIndex,
+ index,
RC_MASK_W,
t_dst_class(RC_FILE_TEMPORARY));
}
+static void ei_pred_mov(struct r300_vertex_program_compiler * compiler,
+ unsigned int * inst)
+{
+ int dst_index = compiler->PredicateRegs[compiler->loop_depth];
+ int src_index = compiler->PredicateRegs[compiler->loop_depth - 1];
+ inst[0] = t_pred_dst(compiler, VE_ADD, 0, dst_index);
+ inst[1] = t_pred_src(compiler, src_index);
+ inst[2] = PVS_SRC_OPERAND(0, t_swizzle(RC_SWIZZLE_ZERO),
+ t_swizzle(RC_SWIZZLE_ZERO),
+ t_swizzle(RC_SWIZZLE_ZERO),
+ t_swizzle(RC_SWIZZLE_ZERO),
+ t_src_class(RC_FILE_NONE), 0);
+ inst[3] = 0;
+}
+
+static int reserve_predicate_reg(
+ struct r300_vertex_program_compiler * compiler)
+{
+ int i;
+ unsigned int writemasks[RC_REGISTER_MAX_INDEX];
+ struct rc_instruction * inst;
+ memset(writemasks, 0, sizeof(writemasks));
+ for(inst = compiler->Base.Program.Instructions.Next;
+ inst != &compiler->Base.Program.Instructions;
+ inst = inst->Next) {
+ rc_for_all_writes_mask(inst, mark_write, writemasks);
+ }
+
+ for (i = 0; i < R500_PVS_MAX_LOOP_DEPTH; i++) {
+ if (compiler->PredicateRegs[i] != -1) {
+ writemasks[compiler->PredicateRegs[i]] = RC_MASK_XYZW;
+ }
+ }
+
+ for(i = 0; i < compiler->Base.max_temp_regs; i++) {
+ /* Most of the control flow instructions only write the
+ * W component of the Predicate Register, but
+ * the docs say that ME_PRED_SET_CLR and
+ * ME_PRED_SET_RESTORE write all components of the
+ * register, so we must reserve a register that has
+ * all its components free. */
+ if (!writemasks[i]) {
+ compiler->PredicateRegs[compiler->loop_depth] = i;
+ break;
+ }
+ }
+ if (i == compiler->Base.max_temp_regs) {
+ rc_error(&compiler->Base, "No free temporary to use for"
+ " predicate stack counter.\n");
+ return 0;
+ }
+ return 1;
+}
+
static void ei_if(struct r300_vertex_program_compiler * compiler,
struct rc_instruction *rci,
unsigned int * inst,
@@ -409,29 +465,12 @@ static void ei_if(struct r300_vertex_program_compiler * compiler,
/* Reserve a temporary to use as our predicate stack counter, if we
* don't already have one. */
- if (!compiler->PredicateMask) {
- unsigned int writemasks[RC_REGISTER_MAX_INDEX];
- struct rc_instruction * inst;
- unsigned int i;
- memset(writemasks, 0, sizeof(writemasks));
- for(inst = compiler->Base.Program.Instructions.Next;
- inst != &compiler->Base.Program.Instructions;
- inst = inst->Next) {
- rc_for_all_writes_mask(inst, mark_write, writemasks);
- }
- for(i = 0; i < compiler->Base.max_temp_regs; i++) {
- unsigned int mask = ~writemasks[i] & RC_MASK_XYZW;
- /* Only the W component can be used fo the predicate
- * stack counter. */
- if (mask & RC_MASK_W) {
- compiler->PredicateMask = RC_MASK_W;
- compiler->PredicateIndex = i;
- break;
- }
- }
- if (i == compiler->Base.max_temp_regs) {
- rc_error(&compiler->Base, "No free temporary to use for"
- " predicate stack counter.\n");
+ if (compiler->PredicateRegs[compiler->loop_depth] == -1) {
+ /* If we are inside a loop, the Predicate Register should
+ * have already been defined. */
+ assert(compiler->loop_depth == 0);
+
+ if (!reserve_predicate_reg(compiler)) {
return;
}
}
@@ -446,13 +485,13 @@ static void ei_if(struct r300_vertex_program_compiler * compiler,
inst[2] = 0;
} else {
predicate_opcode = VE_PRED_SET_NEQ_PUSH;
- inst[1] = t_pred_src(compiler);
+ inst[1] = t_pred_src(compiler, compiler->PredicateRegs[compiler->loop_depth]);
inst[2] = t_src(compiler->code, &rci->U.I.SrcReg[0]);
}
- inst[0] = t_pred_dst(compiler, predicate_opcode, is_math);
+ inst[0] = t_pred_dst(compiler, predicate_opcode, is_math,
+ compiler->PredicateRegs[compiler->loop_depth]);
inst[3] = 0;
-
}
static void ei_else(struct r300_vertex_program_compiler * compiler,
@@ -462,8 +501,9 @@ static void ei_else(struct r300_vertex_program_compiler * compiler,
rc_error(&compiler->Base,"Opcode ELSE not supported\n");
return;
}
- inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1);
- inst[1] = t_pred_src(compiler);
+ inst[0] = t_pred_dst(compiler, ME_PRED_SET_INV, 1,
+ compiler->PredicateRegs[compiler->loop_depth]);
+ inst[1] = t_pred_src(compiler, compiler->PredicateRegs[compiler->loop_depth]);
inst[2] = 0;
inst[3] = 0;
}
@@ -475,8 +515,54 @@ static void ei_endif(struct r300_vertex_program_compiler *compiler,
rc_error(&compiler->Base,"Opcode ENDIF not supported\n");
return;
}
- inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1);
- inst[1] = t_pred_src(compiler);
+ inst[0] = t_pred_dst(compiler, ME_PRED_SET_POP, 1,
+ compiler->PredicateRegs[compiler->loop_depth]);
+ inst[1] = t_pred_src(compiler, compiler->PredicateRegs[compiler->loop_depth]);
+ inst[2] = 0;
+ inst[3] = 0;
+}
+
+static void ei_brk(struct r300_vertex_program_compiler * compiler,
+ unsigned int * inst)
+{
+ if (!compiler->Base.is_r500) {
+ rc_error(&compiler->Base, "Opcode BRK not supported\n");
+ return;
+ }
+
+ inst[0] = t_pred_dst(compiler, ME_PRED_SET_CLR, 1,
+ compiler->PredicateRegs[compiler->loop_depth]);
+ inst[1] = 0;
+ inst[2] = 0;
+ inst[3] = 0;
+}
+
+static void ei_endloop(struct r300_vertex_program_compiler * compiler,
+ unsigned int * inst)
+{
+ if (!compiler->Base.is_r500) {
+ rc_error(&compiler->Base, "Opcode ENDLOOP not supported\n");
+ return;
+ }
+
+ inst[0] = t_pred_dst(compiler, ME_PRED_SET_RESTORE, 1,
+ compiler->PredicateRegs[compiler->loop_depth]);
+ inst[1] = t_pred_src(compiler, compiler->PredicateRegs[compiler->loop_depth]);
+ inst[2] = 0;
+ inst[3] = 0;
+}
+
+static void ei_pred_set(struct r300_vertex_program_compiler * compiler,
+ unsigned int * inst)
+{
+ inst[0] = t_pred_dst(compiler, ME_PRED_SET_EQ, 1, compiler->PredicateRegs[compiler->loop_depth]);
+ inst[1] = PVS_SRC_OPERAND(0,
+ t_swizzle(RC_SWIZZLE_ZERO),
+ t_swizzle(RC_SWIZZLE_ZERO),
+ t_swizzle(RC_SWIZZLE_ZERO),
+ t_swizzle(RC_SWIZZLE_ZERO),
+ t_src_class(RC_FILE_NONE), 0);
+
inst[2] = 0;
inst[3] = 0;
}
@@ -487,9 +573,9 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
struct rc_instruction *rci;
struct loop * loops = NULL;
- int current_loop_depth = 0;
int loops_reserved = 0;
+ unsigned int i;
unsigned int branch_depth = 0;
compiler->code->pos_end = 0; /* Not supported yet */
@@ -498,6 +584,10 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
compiler->SetHwInputOutput(compiler);
+ for (i = 0; i < R500_PVS_MAX_LOOP_DEPTH; i++) {
+ compiler->PredicateRegs[i] = -1;
+ }
+
for(rci = compiler->Base.Program.Instructions.Next; rci != &compiler->Base.Program.Instructions; rci = rci->Next) {
struct rc_sub_instruction *vpi = &rci->U.I;
unsigned int *inst = compiler->code->body.d + compiler->code->length;
@@ -527,6 +617,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
switch (vpi->Opcode) {
case RC_OPCODE_ADD: ei_vector2(compiler->code, VE_ADD, vpi, inst); break;
case RC_OPCODE_ARL: ei_vector1(compiler->code, VE_FLT2FIX_DX, vpi, inst); break;
+ case RC_OPCODE_BRK: ei_brk(compiler, inst); break;
case RC_OPCODE_COS: ei_math1(compiler->code, ME_COS, vpi, inst); break;
case RC_OPCODE_DP4: ei_vector2(compiler->code, VE_DOT_PRODUCT, vpi, inst); break;
case RC_OPCODE_DST: ei_vector2(compiler->code, VE_DISTANCE_VECTOR, vpi, inst); break;
@@ -558,18 +649,26 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
if ((!compiler->Base.is_r500
&& loops_reserved >= R300_VS_MAX_LOOP_DEPTH)
- || loops_reserved >= R500_VS_MAX_FC_DEPTH) {
+ || loops_reserved >= R500_PVS_MAX_LOOP_DEPTH) {
rc_error(&compiler->Base,
"Loops are nested too deep.");
return;
}
memory_pool_array_reserve(&compiler->Base.Pool,
- struct loop, loops, current_loop_depth,
+ struct loop, loops, compiler->loop_depth,
loops_reserved, 1);
- l = &loops[current_loop_depth++];
+ l = &loops[compiler->loop_depth++];
memset(l , 0, sizeof(struct loop));
- l->BgnLoop = (compiler->code->length / 4);
- continue;
+ if (!reserve_predicate_reg(compiler)) {
+ return;
+ }
+ if (branch_depth > 0) {
+ ei_pred_mov(compiler, inst);
+ } else {
+ ei_pred_set(compiler, inst);
+ }
+ l->BgnLoop = ((compiler->code->length + 4)/ 4);
+ break;
}
case RC_OPCODE_ENDLOOP:
{
@@ -579,7 +678,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
unsigned int ret_addr;
assert(loops);
- l = &loops[current_loop_depth - 1];
+ l = &loops[compiler->loop_depth - 1];
act_addr = l->BgnLoop - 1;
last_addr = (compiler->code->length / 4) - 1;
ret_addr = l->BgnLoop;
@@ -593,7 +692,7 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
compiler->code->fc_op_addrs.r500
[compiler->code->num_fc_ops].lw =
R500_PVS_FC_ACT_ADRS(act_addr)
- | R500_PVS_FC_LOOP_CNT_JMP_INST(0xffff)
+ | R500_PVS_FC_LOOP_CNT_JMP_INST(0x00ff)
;
compiler->code->fc_op_addrs.r500
[compiler->code->num_fc_ops].uw =
@@ -616,8 +715,14 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
compiler->code->fc_ops |= R300_VAP_PVS_FC_OPC_LOOP(
compiler->code->num_fc_ops);
compiler->code->num_fc_ops++;
- current_loop_depth--;
- continue;
+ if (branch_depth == 0) {
+ ei_endloop(compiler, inst);
+ compiler->loop_depth--;
+ } else {
+ compiler->loop_depth--;
+ ei_endloop(compiler, inst);
+ }
+ break;
}
default:
@@ -627,10 +732,12 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
/* Non-flow control instructions that are inside an if statement
* need to pay attention to the predicate bit. */
- if (branch_depth
+ if ((branch_depth || compiler->loop_depth)
&& vpi->Opcode != RC_OPCODE_IF
&& vpi->Opcode != RC_OPCODE_ELSE
- && vpi->Opcode != RC_OPCODE_ENDIF) {
+ && vpi->Opcode != RC_OPCODE_ENDIF
+ && vpi->Opcode != RC_OPCODE_BGNLOOP
+ && vpi->Opcode != RC_OPCODE_ENDLOOP) {
inst[0] |= (PVS_DST_PRED_ENABLE_MASK
<< PVS_DST_PRED_ENABLE_SHIFT);
@@ -648,9 +755,13 @@ static void translate_vertex_program(struct radeon_compiler *c, void *user)
vpi->SrcReg[i].Index >= compiler->code->num_temporaries)
compiler->code->num_temporaries = vpi->SrcReg[i].Index + 1;
- if (compiler->PredicateMask)
- if (compiler->PredicateIndex >= compiler->code->num_temporaries)
- compiler->code->num_temporaries = compiler->PredicateIndex + 1;
+ if (compiler->PredicateRegs[compiler->loop_depth] != -1 &&
+ compiler->PredicateRegs[compiler->loop_depth] >=
+ compiler->code->num_temporaries) {
+ compiler->code->num_temporaries =
+ compiler->PredicateRegs[compiler->loop_depth]
+ + 1;
+ }
if (compiler->code->num_temporaries > compiler->Base.max_temp_regs) {
rc_error(&compiler->Base, "Too many temporaries.\n");
@@ -1016,7 +1127,7 @@ void r3xx_compile_vertex_program(struct r300_vertex_program_compiler *c)
struct radeon_compiler_pass vs_list[] = {
/* NAME DUMP PREDICATE FUNCTION PARAM */
{"add artificial outputs", 0, 1, rc_vs_add_artificial_outputs, NULL},
- {"transform loops", 1, 1, rc_transform_loops, NULL},
+// {"transform loops", 1, 1, rc_transform_loops, NULL},
{"emulate branches", 1, !is_r500, rc_emulate_branches, NULL},
{"emulate negative addressing", 1, 1, rc_emulate_negative_addressing, NULL},
{"native rewrite", 1, is_r500, rc_local_transform, alu_rewrite_r500},
diff --git a/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c b/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c
index 2bc0a87eed8..e4db8e2f790 100644
--- a/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c
+++ b/src/gallium/drivers/r300/compiler/r3xx_vertprog_dump.c
@@ -197,9 +197,11 @@ void r300_vertex_program_dump(struct radeon_compiler *compiler, void *user)
case 3: fprintf(stderr, "JSR"); break;
}
if (c->Base.is_r500) {
- fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x\n",
+ fprintf(stderr,": uw-> 0x%08x lw-> 0x%08x "
+ "loop data->0x%08x\n",
vs->fc_op_addrs.r500[i].uw,
- vs->fc_op_addrs.r500[i].lw);
+ vs->fc_op_addrs.r500[i].lw,
+ vs->fc_loop_index[i]);
} else {
fprintf(stderr,": 0x%08x\n", vs->fc_op_addrs.r300[i]);
}
diff --git a/src/gallium/drivers/r300/compiler/radeon_code.h b/src/gallium/drivers/r300/compiler/radeon_code.h
index 67e6acf8b10..9478e199fc2 100644
--- a/src/gallium/drivers/r300/compiler/radeon_code.h
+++ b/src/gallium/drivers/r300/compiler/radeon_code.h
@@ -40,6 +40,9 @@
#define R500_PFS_MAX_BRANCH_DEPTH_FULL 32
#define R500_PFS_MAX_BRANCH_DEPTH_PARTIAL 4
+/* The r500 maximum depth is not just for loops, but any combination of loops
+ * and subroutine jumps. */
+#define R500_PVS_MAX_LOOP_DEPTH 8
#define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
@@ -264,9 +267,6 @@ struct rX00_fragment_program_code {
#define R300_VS_MAX_TEMPS 32
/* This is the max for all chipsets (r300-r500) */
#define R300_VS_MAX_FC_OPS 16
-/* The r500 maximum depth is not just for loops, but any combination of loops
- * and subroutine jumps. */
-#define R500_VS_MAX_FC_DEPTH 8
#define R300_VS_MAX_LOOP_DEPTH 1
#define VSF_MAX_INPUTS 32
diff --git a/src/gallium/drivers/r300/compiler/radeon_compiler.h b/src/gallium/drivers/r300/compiler/radeon_compiler.h
index ac9691c816f..b86b2e6594b 100644
--- a/src/gallium/drivers/r300/compiler/radeon_compiler.h
+++ b/src/gallium/drivers/r300/compiler/radeon_compiler.h
@@ -137,8 +137,8 @@ struct r300_vertex_program_compiler {
void * UserData;
void (*SetHwInputOutput)(struct r300_vertex_program_compiler * c);
- int PredicateIndex;
- unsigned int PredicateMask;
+ int PredicateRegs[R500_PVS_MAX_LOOP_DEPTH];
+ unsigned int loop_depth;
};
void r3xx_compile_vertex_program(struct r300_vertex_program_compiler* c);