2 files changed, 137 insertions, 39 deletions
diff --git a/src/broadcom/compiler/qpu_schedule.c b/src/broadcom/compiler/qpu_schedule.c
index dff8438d94..fdec5252b1 100644
--- a/src/broadcom/compiler/qpu_schedule.c
+++ b/src/broadcom/compiler/qpu_schedule.c
@@ -1012,6 +1012,19 @@ mark_instruction_scheduled(struct list_head *schedule_list,
         }
 }
 
+static void
+insert_scheduled_instruction(struct v3d_compile *c,
+                             struct qblock *block,
+                             struct choose_scoreboard *scoreboard,
+                             struct qinst *inst)
+{
+        list_addtail(&inst->link, &block->instructions);
+
+        update_scoreboard_for_chosen(scoreboard, &inst->qpu);
+        c->qpu_inst_count++;
+        scoreboard->tick++;
+}
+
 static struct qinst *
 vir_nop()
 {
@@ -1021,61 +1034,145 @@ vir_nop()
         return qinst;
 }
 
-#if 0
-static struct qinst *
-nop_after(struct qinst *inst)
+static void
+emit_nop(struct v3d_compile *c, struct qblock *block,
+         struct choose_scoreboard *scoreboard)
+{
+        insert_scheduled_instruction(c, block, scoreboard, vir_nop());
+}
+
+static bool
+qpu_instruction_valid_in_thrend_slot(struct v3d_compile *c,
+                                     const struct qinst *qinst, int slot)
+{
+        const struct v3d_qpu_instr *inst = &qinst->qpu;
+
+        /* Only TLB Z writes are prohibited in the last slot, but we don't
+         * have those flagged so prohibit all TLB ops for now.
+         */
+        if (slot == 2 && qpu_inst_is_tlb(inst))
+                return false;
+
+        if (slot > 0 && qinst->uniform != ~0)
+                return false;
+
+        if (v3d_qpu_uses_vpm(inst))
+                return false;
+
+        if (inst->sig.ldvary)
+                return false;
+
+        if (inst->type == V3D_QPU_INSTR_TYPE_ALU) {
+                /* No writing physical registers at the end. */
+                if (!inst->alu.add.magic_write ||
+                    !inst->alu.mul.magic_write) {
+                        return false;
+                }
+
+                if (c->devinfo->ver < 40 && inst->alu.add.op == V3D_QPU_A_SETMSF)
+                        return false;
+
+                /* RF0-2 might be overwritten during the delay slots by
+                 * fragment shader setup.
+                 */
+                if (inst->raddr_a < 3 &&
+                    (inst->alu.add.a == V3D_QPU_MUX_A ||
+                     inst->alu.add.b == V3D_QPU_MUX_A ||
+                     inst->alu.mul.a == V3D_QPU_MUX_A ||
+                     inst->alu.mul.b == V3D_QPU_MUX_A)) {
+                        return false;
+                }
+
+                if (inst->raddr_b < 3 &&
+                    !inst->sig.small_imm &&
+                    (inst->alu.add.a == V3D_QPU_MUX_B ||
+                     inst->alu.add.b == V3D_QPU_MUX_B ||
+                     inst->alu.mul.a == V3D_QPU_MUX_B ||
+                     inst->alu.mul.b == V3D_QPU_MUX_B)) {
+                        return false;
+                }
+        }
+
+        return true;
+}
+
+static bool
+valid_thrend_sequence(struct v3d_compile *c,
+                      struct qinst *qinst, int instructions_in_sequence)
 {
-        struct qinst *q = vir_nop();
+        for (int slot = 0; slot < instructions_in_sequence; slot++) {
+                if (!qpu_instruction_valid_in_thrend_slot(c, qinst, slot))
+                        return false;
 
-        list_add(&q->link, &inst->link);
+                /* Note that the list is circular, so we can only follow
+                 * ->next up to instructions_in_sequence times.
+                 */
+                qinst = (struct qinst *)qinst->link.next;
+        }
 
-        return q;
+        return true;
 }
 
 /**
- * Emits a THRSW/LTHRSW signal in the stream, trying to move it up to pair
- * with another instruction.
+ * Emits a THRSW signal, merging it into one of the block's last 3
+ * instructions when legal. Returns the number of instructions added.
  */
-static void
+static int
 emit_thrsw(struct v3d_compile *c,
+           struct qblock *block,
            struct choose_scoreboard *scoreboard,
-           const struct v3d_qpu_instr *inst)
+           struct qinst *inst)
 {
+        int time = 0;
+
         /* There should be nothing in a thrsw inst being scheduled other than
          * the signal bits.
          */
-        assert(inst->type == V3D_QPU_INSTR_TYPE_ALU);
-        assert(inst->alu.add.op == V3D_QPU_A_NOP);
-        assert(inst->alu.mul.op == V3D_QPU_M_NOP);
+        assert(inst->qpu.type == V3D_QPU_INSTR_TYPE_ALU);
+        assert(inst->qpu.alu.add.op == V3D_QPU_A_NOP);
+        assert(inst->qpu.alu.mul.op == V3D_QPU_M_NOP);
+
+        /* Find how far back into previous instructions we can put the THRSW. */
+        int slots_filled = 0;
+        struct qinst *merge_inst = NULL;
+        vir_for_each_inst_rev(prev_inst, block) {
+                struct v3d_qpu_sig sig = prev_inst->qpu.sig;
+                sig.thrsw = true;
+                uint32_t packed_sig;
+
+                if (!v3d_qpu_sig_pack(c->devinfo, &sig, &packed_sig))
+                        break;
 
-        /* Try to find an earlier scheduled instruction that we can merge the
-         * thrsw into.
-         */
-        int thrsw_ip = c->qpu_inst_count;
-        for (int i = 1; i <= MIN2(c->qpu_inst_count, 3); i++) {
-                uint64_t prev_instr = c->qpu_insts[c->qpu_inst_count - i];
-                uint32_t prev_sig = QPU_GET_FIELD(prev_instr, QPU_SIG);
+                if (!valid_thrend_sequence(c, prev_inst, slots_filled + 1))
+                        break;
 
-                if (prev_sig == QPU_SIG_NONE)
-                        thrsw_ip = c->qpu_inst_count - i;
+                merge_inst = prev_inst;
+                if (++slots_filled == 3)
+                        break;
         }
 
-        if (thrsw_ip != c->qpu_inst_count) {
-                /* Merge the thrsw into the existing instruction. */
-                c->qpu_insts[thrsw_ip] =
-                        QPU_UPDATE_FIELD(c->qpu_insts[thrsw_ip], sig, QPU_SIG);
+        if (merge_inst) {
+                merge_inst->qpu.sig.thrsw = true;
         } else {
-                qpu_serialize_one_inst(c, inst);
-                update_scoreboard_for_chosen(scoreboard, inst);
+                insert_scheduled_instruction(c, block, scoreboard, inst);
+                time++;
+                slots_filled++;
         }
 
-        /* Fill the delay slots. */
-        while (c->qpu_inst_count < thrsw_ip + 3) {
-                update_scoreboard_for_chosen(scoreboard, v3d_qpu_nop());
-                qpu_serialize_one_inst(c, v3d_qpu_nop());
+        /* Insert any extra delay slot NOPs we need. */
+        for (int i = 0; i < 3 - slots_filled; i++) {
+                emit_nop(c, block, scoreboard);
+                time++;
         }
+
+        /* If we put our THRSW into another instruction, free up the
+         * instruction that didn't end up scheduled into the list.
+         */
+        if (merge_inst)
+                free(inst);
+
+        return time;
 }
-#endif
 
 static uint32_t
 schedule_instructions(struct v3d_compile *c,
@@ -1337,6 +1434,8 @@ uint32_t
 v3d_qpu_schedule_instructions(struct v3d_compile *c)
 {
         const struct v3d_device_info *devinfo = c->devinfo;
+        struct qblock *end_block = list_last_entry(&c->blocks,
+                                                   struct qblock, link);
 
         /* We reorder the uniforms as we schedule instructions, so save the
          * old data off and replace it.
@@ -1386,6 +1485,11 @@ v3d_qpu_schedule_instructions(struct v3d_compile *c)
                 block->end_qpu_ip = c->qpu_inst_count - 1;
         }
 
+        /* Emit the program-end THRSW instruction. */
+        struct qinst *thrsw = vir_nop();
+        thrsw->qpu.sig.thrsw = true;
+        emit_thrsw(c, end_block, &scoreboard, thrsw);
+
         qpu_set_branch_targets(c);
 
         assert(next_uniform == c->num_uniforms);
diff --git a/src/broadcom/compiler/vir_to_qpu.c b/src/broadcom/compiler/vir_to_qpu.c
index 525638df69..955eb96a87 100644
--- a/src/broadcom/compiler/vir_to_qpu.c
+++ b/src/broadcom/compiler/vir_to_qpu.c
@@ -322,8 +322,6 @@ void
 v3d_vir_to_qpu(struct v3d_compile *c)
 {
         struct qpu_reg *temp_registers = v3d_register_allocate(c);
-        struct qblock *end_block = list_last_entry(&c->blocks,
-                                                   struct qblock, link);
 
         /* Reset the uniform count to how many will be actually loaded by the
          * generated QPU code.
@@ -333,10 +331,6 @@ v3d_vir_to_qpu(struct v3d_compile *c)
         vir_for_each_block(block, c)
                 v3d_generate_code_block(c, block, temp_registers);
 
-        struct qinst *thrsw = vir_nop();
-        list_addtail(&thrsw->link, &end_block->instructions);
-        thrsw->qpu.sig.thrsw = true;
-
         uint32_t cycles = v3d_qpu_schedule_instructions(c);
 
         c->qpu_insts = rzalloc_array(c, uint64_t, c->qpu_inst_count);