summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Iago Toral Quiroga <itoral@igalia.com> 2021-03-29 14:28:14 +0200
committer: Marge Bot <eric+marge@anholt.net> 2021-03-31 05:51:22 +0000
commit: e266e6c634aa04172484a98466f173c1bda9671c (patch)
tree: a4011e234320e5cb8b7851c7e8c5799367c51e8b
parent: f33ca092daf0e6303b3f7cd0786998da9f0d71c1 (diff)
broadcom/compiler: try to fill up delay slots after a branch instruction
For this we do something similar to what we do with thrsw where we try to move the branch instruction earlier so the previous instructions execute in the delay slots of the branch.

Generally, we can do this with any instruction except:
- If the instruction reads a uniform: since our branches do as well and uniforms come from an ordered FIFO stream.
- If the instruction writes flags, since our branch instruction will probably read them.
- If the instruction is in the delay slots of another thread switch, branch, or unifa write, which is disallowed.

total instructions in shared programs: 13648140 -> 13613972 (-0.25%)
instructions in affected programs: 2209552 -> 2175384 (-1.55%)
helped: 6765
HURT: 0
Instructions are helped.

total max-temps in shared programs: 2318687 -> 2318436 (-0.01%)
max-temps in affected programs: 5046 -> 4795 (-4.97%)
helped: 152
HURT: 0
Max-temps are helped.

total inst-and-stalls in shared programs: 13680494 -> 13646326 (-0.25%)
inst-and-stalls in affected programs: 2220394 -> 2186226 (-1.54%)
helped: 6765
HURT: 0
Inst-and-stalls are helped.

total nops in shared programs: 399818 -> 365640 (-8.55%)
nops in affected programs: 127311 -> 93133 (-26.85%)
helped: 6765
HURT: 0
Nops are helped.

Reviewed-by: Alejandro Piñeiro <apinheiro@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9918>
-rw-r--r-- src/broadcom/compiler/qpu_schedule.c 137
1 files changed, 119 insertions, 18 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index e6a07723618..3dd5d246a45 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -490,6 +490,7 @@ struct choose_scoreboard {
int last_unifa_write_tick;
int last_uniforms_reset_tick;
int last_thrsw_tick;
+ int last_branch_tick;
bool tlb_locked;
bool fixup_ldvary;
int ldvary_count;
@@ -1078,6 +1079,16 @@ retry:
continue;
}
+ /* Don't try to put a branch in the delay slots of another
+ * branch or a unifa write.
+ */
+ if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+ if (scoreboard->last_branch_tick + 3 >= scoreboard->tick)
+ continue;
+ if (scoreboard->last_unifa_write_tick + 3 >= scoreboard->tick)
+ continue;
+ }
+
/* If we're trying to pair with another instruction, check
* that they're compatible.
*/
@@ -1674,11 +1685,17 @@ emit_thrsw(struct v3d_compile *c,
assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
- /* Don't try to emit a thrsw in the delay slots of a previous thrsw */
+ /* Don't try to emit a thrsw in the delay slots of a previous thrsw
+ * or branch.
+ */
while (scoreboard->last_thrsw_tick + 2 >= scoreboard->tick) {
emit_nop(c, block, scoreboard);
time++;
}
+ while (scoreboard->last_branch_tick + 3 >= scoreboard->tick) {
+ emit_nop(c, block, scoreboard);
+ time++;
+ }
/* Find how far back into previous instructions we can put the THRSW. */
int slots_filled = 0;
@@ -1746,6 +1763,97 @@ emit_thrsw(struct v3d_compile *c,
}
static bool
+qpu_inst_valid_in_branch_delay_slot(struct v3d_compile *c, struct qinst *inst)
+{
+ if (inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
+ return false;
+
+ if (inst->qpu.sig.thrsw)
+ return false;
+
+ if (v3d_qpu_writes_unifa(c->devinfo, &inst->qpu))
+ return false;
+
+ if (vir_has_uniform(inst))
+ return false;
+
+ return true;
+}
+
+static void
+emit_branch(struct v3d_compile *c,
+ struct qblock *block,
+ struct choose_scoreboard *scoreboard,
+ struct qinst *inst)
+{
+ assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
+
+ /* We should've not picked up a branch for the delay slots of a previous
+ * thrsw, branch or unifa write instruction.
+ */
+ int branch_tick = scoreboard->tick;
+ assert(scoreboard->last_thrsw_tick + 2 < branch_tick);
+ assert(scoreboard->last_branch_tick + 3 < branch_tick);
+ assert(scoreboard->last_unifa_write_tick + 3 < branch_tick);
+
+ /* Insert the branch instruction */
+ insert_scheduled_instruction(c, block, scoreboard, inst);
+
+ /* Now see if we can move the branch instruction back into the
+ * instruction stream to fill its delay slots
+ */
+ int slots_filled = 0;
+ while (slots_filled < 3 && block->instructions.next != &inst->link) {
+ struct qinst *prev_inst = (struct qinst *) inst->link.prev;
+ assert(prev_inst->qpu.type != V3D_QPU_INSTR_TYPE_BRANCH);
+
+ /* Can't move the branch instruction if that would place it
+ * in the delay slots of other instructions.
+ */
+ if (scoreboard->last_branch_tick + 3 >=
+ branch_tick - slots_filled - 1) {
+ break;
+ }
+
+ if (scoreboard->last_thrsw_tick + 2 >=
+ branch_tick - slots_filled - 1) {
+ break;
+ }
+
+ if (scoreboard->last_unifa_write_tick + 3 >=
+ branch_tick - slots_filled - 1) {
+ break;
+ }
+
+ /* Can't move a conditional branch before the instruction
+ * that writes the flags for its condition.
+ */
+ if (v3d_qpu_writes_flags(&prev_inst->qpu) &&
+ inst->qpu.branch.cond != V3D_QPU_BRANCH_COND_ALWAYS) {
+ break;
+ }
+
+ if (!qpu_inst_valid_in_branch_delay_slot(c, prev_inst))
+ break;
+
+ list_del(&prev_inst->link);
+ list_add(&prev_inst->link, &inst->link);
+ slots_filled++;
+ }
+
+ block->branch_qpu_ip = c->qpu_inst_count - 1 - slots_filled;
+ scoreboard->last_branch_tick = branch_tick - slots_filled;
+
+ /* Fill any remaining delay slots.
+ *
+ * FIXME: For unconditional branches we could fill these with the
+ * first instructions in the successor block.
+ */
+ for (int i = 0; i < 3 - slots_filled; i++)
+ emit_nop(c, block, scoreboard);
+}
+
+static bool
alu_reads_register(struct v3d_qpu_instr *inst,
bool add, bool magic, uint32_t index)
{
@@ -2025,23 +2133,11 @@ schedule_instructions(struct v3d_compile *c,
if (inst->sig.thrsw) {
time += emit_thrsw(c, block, scoreboard, qinst, false);
+ } else if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
+ emit_branch(c, block, scoreboard, qinst);
} else {
insert_scheduled_instruction(c, block,
scoreboard, qinst);
-
- if (inst->type == V3D_QPU_INSTR_TYPE_BRANCH) {
- block->branch_qpu_ip = c->qpu_inst_count - 1;
- /* Fill the delay slots.
- *
- * We should fill these with actual instructions,
- * instead, but that will probably need to be done
- * after this, once we know what the leading
- * instructions of the successors are (so we can
- * handle A/B register file write latency)
- */
- for (int i = 0; i < 3; i++)
- emit_nop(c, block, scoreboard);
- }
}
}
@@ -2111,11 +2207,15 @@ qpu_set_branch_targets(struct v3d_compile *c)
/* Walk back through the delay slots to find the branch
* instr.
*/
+ struct qinst *branch = NULL;
struct list_head *entry = block->instructions.prev;
- for (int i = 0; i < 3; i++)
+ for (int i = 0; i < 3; i++) {
entry = entry->prev;
- struct qinst *branch = container_of(entry, struct qinst, link);
- assert(branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
+ branch = container_of(entry, struct qinst, link);
+ if (branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH)
+ break;
+ }
+ assert(branch && branch->qpu.type == V3D_QPU_INSTR_TYPE_BRANCH);
/* Make sure that the if-we-don't-jump
* successor was scheduled just after the
@@ -2169,6 +2269,7 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
scoreboard.last_magic_sfu_write_tick = -10;
scoreboard.last_uniforms_reset_tick = -10;
scoreboard.last_thrsw_tick = -10;
+ scoreboard.last_branch_tick = -10;
scoreboard.last_stallable_sfu_tick = -10;
if (debug) {