author     Connor Abbott <cwabbott0@gmail.com>   2022-05-20 15:18:55 +0200
committer  Marge Bot <emma+marge@anholt.net>     2024-04-26 12:55:13 +0000
commit     876c5396a7893d2a55a2d6635085d70c03c216d4 (patch)
tree       5aae7fe0b54c19f4aa52c2df6ce30560747c828e
parent     f8ac16b4b98732153ea838e6cc4c1546551823af (diff)
ir3: Add support for "scalar ALU"
On a650 and later, there is a "scalar ALU" capable of executing cat2 instructions, a subset of cat3 instructions (csel but *not* mad), and cat4 instructions. There is also another copy of the scalar ALU embedded in HLSQ, which is responsible for executing preambles with the "early preamble" bit set.

The two new features are closely intertwined: the scalar ALU makes it possible for most preambles to use only shared registers, letting us optimistically use shared registers and fall back to a normal preamble only if we run out of shared registers. But the scalar ALU is also generally useful on its own, because moving calculations of uniform values like loop indices onto it reduces normal register pressure and increases parallelism: as with the SFU/EFU and texture units, different waves can execute ALU and scalar ALU instructions in parallel.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22075>
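The optimistic shared-register strategy described above boils down to a try-then-fall-back flow. A minimal C sketch, using hypothetical helper names rather than the actual Mesa API:

    #include <stdbool.h>

    struct preamble; /* opaque stand-in for the ir3 preamble IR */

    /* may fail if register allocation runs out of shared registers */
    bool compile_preamble_shared_regs(struct preamble *p);
    /* classic path using normal registers; always succeeds */
    void compile_preamble_normal(struct preamble *p);

    static void compile_preamble(struct preamble *p, bool has_scalar_alu)
    {
       /* Only GPUs with a scalar ALU can run an "early preamble"
        * entirely out of shared registers.
        */
       if (has_scalar_alu && compile_preamble_shared_regs(p))
          return;

       /* Ran out of shared registers (or no scalar ALU): fall back
        * to a normal preamble.
        */
       compile_preamble_normal(p);
    }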
-rw-r--r--  src/freedreno/common/freedreno_dev_info.h   3
-rw-r--r--  src/freedreno/common/freedreno_devices.py   3
-rw-r--r--  src/freedreno/ir3/ir3.c                     27
-rw-r--r--  src/freedreno/ir3/ir3.h                     25
-rw-r--r--  src/freedreno/ir3/ir3_compiler.c             3
-rw-r--r--  src/freedreno/ir3/ir3_compiler.h             9
-rw-r--r--  src/freedreno/ir3/ir3_delay.c               28
-rw-r--r--  src/freedreno/ir3/ir3_legalize.c            95
-rw-r--r--  src/freedreno/ir3/ir3_postsched.c            7
-rw-r--r--  src/freedreno/ir3/ir3_sched.c               34
10 files changed, 200 insertions(+), 34 deletions(-)
diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index e27bc1ddee2..83417e41e24 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -172,6 +172,9 @@ struct fd_dev_info {
bool broken_ds_ubwc_quirk;
+ /* See ir3_compiler::has_scalar_alu. */
+ bool has_scalar_alu;
+
struct {
uint32_t PC_POWER_CNTL;
uint32_t TPL1_DBG_ECO_CNTL;
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index 03505788024..5c8a81126d9 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -389,6 +389,7 @@ a6xx_gen3 = A6XXProps(
enable_lrz_fast_clear = True,
lrz_track_quirk = True,
has_per_view_viewport = True,
+ has_scalar_alu = True,
)
a6xx_gen4 = A6XXProps(
@@ -412,6 +413,7 @@ a6xx_gen4 = A6XXProps(
enable_lrz_fast_clear = True,
has_lrz_dir_tracking = True,
has_per_view_viewport = True,
+ has_scalar_alu = True,
)
a6xx_a690_quirk = A6XXProps(
@@ -790,6 +792,7 @@ a7xx_base = A6XXProps(
has_per_view_viewport = True,
line_width_min = 1.0,
line_width_max = 127.5,
+ has_scalar_alu = True,
)
a7xx_725 = A7XXProps(
diff --git a/src/freedreno/ir3/ir3.c b/src/freedreno/ir3/ir3.c
index 381136ddc57..713c7651854 100644
--- a/src/freedreno/ir3/ir3.c
+++ b/src/freedreno/ir3/ir3.c
@@ -792,6 +792,33 @@ ir3_instr_set_address(struct ir3_instruction *instr,
}
}
+/* Does this instruction use the scalar ALU?
+ */
+bool
+is_scalar_alu(struct ir3_instruction *instr,
+ const struct ir3_compiler *compiler)
+{
+ /* MOVMSK seems to always need (ss) even with other scalar ALU instructions
+ */
+ return instr->opc != OPC_MOVMSK &&
+ instr->opc != OPC_SCAN_CLUSTERS_MACRO &&
+ instr->opc != OPC_SCAN_MACRO &&
+ is_alu(instr) && (instr->dsts[0]->flags & IR3_REG_SHARED) &&
+ /* scalar->scalar mov instructions (but NOT cov) existed before the
+ * scalar ALU was introduced, but they required (ss), whereas on GPUs
+ * that have a scalar ALU they are executed on it and do not require
+ * (ss). We have to be careful to return false for these if the scalar
+ * ALU isn't supported, so that we treat them like vector->scalar mov
+ * instructions (i.e. requiring (ss)).
+ */
+ compiler->has_scalar_alu &&
+ /* moves from normal to shared seem to use a separate ALU as before and
+ * require a (ss) on dependent instructions.
+ */
+ ((instr->opc != OPC_MOV && !is_subgroup_cond_mov_macro(instr)) ||
+ (instr->srcs[0]->flags & (IR3_REG_SHARED | IR3_REG_IMMED | IR3_REG_CONST)));
+}
+
void
ir3_block_clear_mark(struct ir3_block *block)
{
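Distilling the predicate above into a self-contained sketch may make it easier to follow. All types and fields below are illustrative stand-ins for the real ir3_instruction fields, not actual Mesa API:

    #include <stdbool.h>

    struct simple_instr {
       bool is_alu;            /* cat2/cat3/cat4 */
       bool is_movmsk_or_scan; /* MOVMSK / SCAN*_MACRO: always need (ss) */
       bool is_mov_like;       /* plain mov or subgroup cond-mov macro */
       bool dst_shared;        /* destination is a shared register */
       bool src_uniform;       /* src is shared, immediate, or const */
    };

    static bool simple_is_scalar_alu(const struct simple_instr *i,
                                     bool has_scalar_alu)
    {
       if (!has_scalar_alu)      /* pre-a650: no scalar ALU at all */
          return false;
       if (i->is_movmsk_or_scan) /* exceptions that still need (ss) */
          return false;
       if (!i->is_alu || !i->dst_shared)
          return false;
       /* movs from a normal register into a shared register go through
        * a separate path and still require (ss) on dependent
        * instructions.
        */
       if (i->is_mov_like && !i->src_uniform)
          return false;
       return true;
    }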
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 2c510382f3f..3d0607b32b3 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -1908,9 +1908,11 @@ struct log_stream;
void ir3_print_instr_stream(struct log_stream *stream, struct ir3_instruction *instr);
/* delay calculation: */
-int ir3_delayslots(struct ir3_instruction *assigner,
+int ir3_delayslots(struct ir3_compiler *compiler,
+ struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned n, bool soft);
-unsigned ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
+unsigned ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
+ struct ir3_instruction *assigner,
struct ir3_instruction *consumer,
unsigned assigner_n, unsigned consumer_n);
@@ -1923,7 +1925,10 @@ is_local_mem_load(struct ir3_instruction *instr)
instr->opc == OPC_LDLW;
}
-/* Does this instruction need (ss) to wait for its result? */
+bool is_scalar_alu(struct ir3_instruction *instr,
+ const struct ir3_compiler *compiler);
+
+/* Does this instruction sometimes need (ss) to wait for its result? */
static inline bool
is_ss_producer(struct ir3_instruction *instr)
{
@@ -1931,9 +1936,23 @@ is_ss_producer(struct ir3_instruction *instr)
if (dst->flags & IR3_REG_SHARED)
return true;
}
+
return is_sfu(instr) || is_local_mem_load(instr);
}
+static inline bool
+needs_ss(const struct ir3_compiler *compiler, struct ir3_instruction *producer,
+ struct ir3_instruction *consumer)
+{
+ if (is_scalar_alu(producer, compiler) &&
+ is_scalar_alu(consumer, compiler) &&
+ (producer->dsts[0]->flags & IR3_REG_HALF) ==
+ (consumer->srcs[0]->flags & IR3_REG_HALF))
+ return false;
+
+ return is_ss_producer(producer);
+}
+
/* The soft delay for approximating the cost of (ss). */
static inline unsigned
soft_ss_delay(struct ir3_instruction *instr)
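Restating needs_ss() above as a self-contained truth function (illustrative boolean parameters stand in for the real ir3 types and flag tests):

    #include <stdbool.h>

    static bool simple_needs_ss(bool producer_scalar, bool consumer_scalar,
                                bool producer_half_dst,
                                bool consumer_half_src,
                                bool producer_is_ss_producer)
    {
       /* scalar ALU -> scalar ALU with matching register size rides the
        * in-order fast path and never needs (ss)
        */
       if (producer_scalar && consumer_scalar &&
           producer_half_dst == consumer_half_src)
          return false;

       /* everything else falls back to the usual (ss)-producer rule */
       return producer_is_ss_producer;
    }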
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 769e6c562d9..cd98feede54 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -223,6 +223,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->bitops_can_write_predicates = true;
compiler->has_branch_and_or = true;
compiler->has_predication = true;
+ compiler->has_scalar_alu = dev_info->a6xx.has_scalar_alu;
} else {
compiler->max_const_pipeline = 512;
compiler->max_const_geom = 512;
@@ -233,6 +234,8 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
* earlier gen's.
*/
compiler->max_const_safe = 256;
+
+ compiler->has_scalar_alu = false;
}
/* This is just a guess for a4xx. */
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index cd86462e291..77d36767deb 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -267,6 +267,15 @@ struct ir3_compiler {
bool load_shader_consts_via_preamble;
bool load_inline_uniforms_via_preamble_ldgk;
+
+ /* True if there is a scalar ALU capable of executing a subset of
+ * cat2-cat4 instructions with a shared register destination. This also
+ * implies expanded MOV/COV capability when writing to shared registers,
+ * since MOV/COV is now executed on the scalar ALU except when reading
+ * from a normal register. It additionally implies that ldc can write to
+ * a shared register.
+ */
+ bool has_scalar_alu;
};
void ir3_compiler_destroy(struct ir3_compiler *compiler);
diff --git a/src/freedreno/ir3/ir3_delay.c b/src/freedreno/ir3/ir3_delay.c
index d74e6faabba..2eed2af9e1b 100644
--- a/src/freedreno/ir3/ir3_delay.c
+++ b/src/freedreno/ir3/ir3_delay.c
@@ -26,6 +26,8 @@
#include "ir3.h"
+#include "ir3_compiler.h"
+
/* The maximum number of nop's we may need to insert between two instructions.
*/
#define MAX_NOPS 6
@@ -43,7 +45,8 @@
* assigns a value and the one that consumes
*/
int
-ir3_delayslots(struct ir3_instruction *assigner,
+ir3_delayslots(struct ir3_compiler *compiler,
+ struct ir3_instruction *assigner,
struct ir3_instruction *consumer, unsigned n, bool soft)
{
/* generally don't count false dependencies, since this can just be
@@ -63,12 +66,26 @@ ir3_delayslots(struct ir3_instruction *assigner,
if (writes_addr0(assigner) || writes_addr1(assigner))
return 6;
- if (soft && is_ss_producer(assigner))
+ if (soft && needs_ss(compiler, assigner, consumer))
return soft_ss_delay(assigner);
/* handled via sync flags: */
- if (is_ss_producer(assigner) || is_sy_producer(assigner))
+ if (needs_ss(compiler, assigner, consumer) ||
+ is_sy_producer(assigner))
+ return 0;
+
+ /* scalar ALU -> scalar ALU dependencies where the source and destination
+ * register sizes match don't require any nops.
+ */
+ if (is_scalar_alu(assigner, compiler)) {
+ assert(is_scalar_alu(consumer, compiler));
+ /* If the sizes don't match then we need (ss) and needs_ss() should've
+ * returned above.
+ */
+ assert((assigner->dsts[0]->flags & IR3_REG_HALF) ==
+ (consumer->srcs[n]->flags & IR3_REG_HALF));
return 0;
+ }
/* As far as we know, shader outputs don't need any delay. */
if (consumer->opc == OPC_END || consumer->opc == OPC_CHMASK)
@@ -96,11 +113,12 @@ ir3_delayslots(struct ir3_instruction *assigner,
}
unsigned
-ir3_delayslots_with_repeat(struct ir3_instruction *assigner,
+ir3_delayslots_with_repeat(struct ir3_compiler *compiler,
+ struct ir3_instruction *assigner,
struct ir3_instruction *consumer,
unsigned assigner_n, unsigned consumer_n)
{
- unsigned delay = ir3_delayslots(assigner, consumer, consumer_n, false);
+ unsigned delay = ir3_delayslots(compiler, assigner, consumer, consumer_n, false);
struct ir3_register *src = consumer->srcs[consumer_n];
struct ir3_register *dst = assigner->dsts[assigner_n];
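In summary, the RaW delay decision after this change reduces to a short cascade. An exposition-only mirror of that order (the authoritative logic is ir3_delayslots() above):

    /* Exposition-only sketch of the new RaW delay decision. */
    static unsigned sketch_raw_delay(bool producer_needs_ss_or_sy,
                                     bool scalar_to_scalar_same_size,
                                     unsigned vector_alu_delay)
    {
       if (producer_needs_ss_or_sy)
          return 0; /* handled via the (ss)/(sy) sync flags */
       if (scalar_to_scalar_same_size)
          return 0; /* in-order scalar fast path: no nops needed */
       return vector_alu_delay; /* usual vector ALU nop counts apply */
    }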
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index 5b592a5e234..f25e2f448f4 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -61,7 +61,10 @@ struct ir3_nop_state {
struct ir3_legalize_state {
regmask_t needs_ss;
+ regmask_t needs_ss_scalar_full; /* half scalar ALU producer -> full scalar ALU consumer */
+ regmask_t needs_ss_scalar_half; /* full scalar ALU producer -> half scalar ALU consumer */
regmask_t needs_ss_war; /* write after read */
+ regmask_t needs_ss_scalar_war; /* write after scalar ALU read */
regmask_t needs_sy;
bool needs_ss_for_const;
@@ -101,6 +104,9 @@ apply_ss(struct ir3_instruction *instr,
instr->flags |= IR3_INSTR_SS;
regmask_init(&state->needs_ss_war, mergedregs);
regmask_init(&state->needs_ss, mergedregs);
+ regmask_init(&state->needs_ss_scalar_war, mergedregs);
+ regmask_init(&state->needs_ss_scalar_full, mergedregs);
+ regmask_init(&state->needs_ss_scalar_half, mergedregs);
state->needs_ss_for_const = false;
}
@@ -114,14 +120,14 @@ apply_sy(struct ir3_instruction *instr,
}
static bool
-count_instruction(struct ir3_instruction *n)
+count_instruction(struct ir3_instruction *n, struct ir3_compiler *compiler)
{
/* NOTE: don't count branch/jump since we don't know yet if they will
* be eliminated later in resolve_jumps().. really should do that
* earlier so we don't have this constraint.
*/
- return is_alu(n) ||
- (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
+ return (is_alu(n) && !is_scalar_alu(n, compiler)) ||
+ (is_flow(n) && (n->opc != OPC_JUMP) && (n->opc != OPC_BR) &&
(n->opc != OPC_BRAA) && (n->opc != OPC_BRAO));
}
@@ -363,6 +369,14 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
struct ir3_legalize_state *pstate = &pbd->state;
regmask_or_shared(&state->needs_ss, &state->needs_ss, &pstate->needs_ss);
+ regmask_or_shared(&state->needs_ss_scalar_full,
+ &state->needs_ss_scalar_full,
+ &pstate->needs_ss_scalar_full);
+ regmask_or_shared(&state->needs_ss_scalar_half,
+ &state->needs_ss_scalar_half,
+ &pstate->needs_ss_scalar_half);
+ regmask_or_shared(&state->needs_ss_scalar_war, &state->needs_ss_scalar_war,
+ &pstate->needs_ss_scalar_war);
}
memcpy(&bd->state, state, sizeof(*state));
@@ -419,6 +433,8 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
apply_ss(n, state, mergedregs);
}
+ bool n_is_scalar_alu = is_scalar_alu(n, ctx->compiler);
+
/* NOTE: consider dst register too.. it could happen that
* texture sample instruction (for example) writes some
* components which are unused. A subsequent instruction
@@ -443,6 +459,34 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
last_input_needs_ss = false;
}
+ /* There is a fast feedback path for scalar ALU instructions which
+ * only takes 1 cycle of latency, analogous to the fixed 3 cycle
+ * latency path for normal ALU instructions. To hit this fast path,
+ * the producer and consumer must use the same register size (i.e. no
+ * writing a full register and then reading half of it, or vice
+ * versa). If we don't hit this path, either because of a mismatched
+ * size or a read via the regular ALU, then the write latency is
+ * variable and we must use (ss) to wait for the scalar ALU. This is
+ * different from the fixed 6 cycle latency for mismatched vector
+ * ALU accesses.
+ */
+ if (n_is_scalar_alu) {
+ /* Check if we have a mismatched size RaW dependency */
+ if (regmask_get((reg->flags & IR3_REG_HALF) ?
+ &state->needs_ss_scalar_half :
+ &state->needs_ss_scalar_full, reg)) {
+ apply_ss(n, state, mergedregs);
+ last_input_needs_ss = false;
+ }
+ } else {
+ /* check if we have a scalar -> vector RaW dependency */
+ if (regmask_get(&state->needs_ss_scalar_half, reg) ||
+ regmask_get(&state->needs_ss_scalar_full, reg)) {
+ apply_ss(n, state, mergedregs);
+ last_input_needs_ss = false;
+ }
+ }
+
if (regmask_get(&state->needs_sy, reg)) {
apply_sy(n, state, mergedregs);
}
@@ -455,7 +499,9 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
}
foreach_dst (reg, n) {
- if (regmask_get(&state->needs_ss_war, reg)) {
+ if (regmask_get(&state->needs_ss_war, reg) ||
+ (!n_is_scalar_alu &&
+ regmask_get(&state->needs_ss_scalar_war, reg))) {
apply_ss(n, state, mergedregs);
last_input_needs_ss = false;
}
@@ -483,6 +529,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
*/
if ((delay > 0) && (ctx->compiler->gen >= 6) && last_n &&
+ !n_is_scalar_alu &&
((opc_cat(last_n->opc) == 2) || (opc_cat(last_n->opc) == 3)) &&
(last_n->repeat == 0)) {
/* the previous cat2/cat3 instruction can encode at most 3 nop's: */
@@ -528,8 +575,16 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
regmask_set(&state->needs_ss, n->dsts[0]);
foreach_dst (dst, n) {
- if (dst->flags & IR3_REG_SHARED)
- regmask_set(&state->needs_ss, dst);
+ if (dst->flags & IR3_REG_SHARED) {
+ if (n_is_scalar_alu) {
+ if (dst->flags & IR3_REG_HALF)
+ regmask_set(&state->needs_ss_scalar_full, dst);
+ else
+ regmask_set(&state->needs_ss_scalar_half, dst);
+ } else {
+ regmask_set(&state->needs_ss, dst);
+ }
+ }
}
if (is_tex_or_prefetch(n)) {
@@ -566,17 +621,31 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
* their src register(s):
*/
if (is_tex(n) || is_mem(n) || is_ss_producer(n)) {
- foreach_src (reg, n) {
- regmask_set(&state->needs_ss_war, reg);
+ if (n_is_scalar_alu) {
+ /* Scalar ALU also does not immediately read its source because it
+ * is not executed right away, but scalar ALU instructions are
+ * executed in-order so subsequent scalar ALU instructions don't
+ * need to wait for previous ones.
+ */
+ foreach_src (reg, n) {
+ if (reg->flags & IR3_REG_SHARED) {
+ regmask_set(&state->needs_ss_scalar_war, reg);
+ }
+ }
+ } else {
+ foreach_src (reg, n) {
+ regmask_set(&state->needs_ss_war, reg);
+ }
}
}
- if (count_instruction(n))
+ bool count = count_instruction(n, ctx->compiler);
+ if (count)
cycle += 1;
delay_update(state, n, cycle, mergedregs);
- if (count_instruction(n))
+ if (count)
cycle += n->repeat;
if (ctx->early_input_release && is_input(n)) {
@@ -1496,9 +1565,15 @@ ir3_legalize(struct ir3 *ir, struct ir3_shader_variant *so, int *max_bary)
rzalloc(ctx, struct ir3_legalize_block_data);
regmask_init(&bd->state.needs_ss_war, mergedregs);
+ regmask_init(&bd->state.needs_ss_scalar_war, mergedregs);
+ regmask_init(&bd->state.needs_ss_scalar_full, mergedregs);
+ regmask_init(&bd->state.needs_ss_scalar_half, mergedregs);
regmask_init(&bd->state.needs_ss, mergedregs);
regmask_init(&bd->state.needs_sy, mergedregs);
regmask_init(&bd->begin_state.needs_ss_war, mergedregs);
+ regmask_init(&bd->begin_state.needs_ss_scalar_war, mergedregs);
+ regmask_init(&bd->begin_state.needs_ss_scalar_full, mergedregs);
+ regmask_init(&bd->begin_state.needs_ss_scalar_half, mergedregs);
regmask_init(&bd->begin_state.needs_ss, mergedregs);
regmask_init(&bd->begin_state.needs_sy, mergedregs);
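The interplay of the two scalar RaW masks above can be confusing, since each mask is named after the *consumer* size it guards: a half-width scalar write lands in needs_ss_scalar_full (full readers must sync), and a full-width write lands in needs_ss_scalar_half. A distilled sketch of that tracking (exposition only; the real code uses per-register regmasks):

    #include <stdbool.h>

    struct scalar_raw_state {
       bool half_write_pending; /* shared reg written half-width */
       bool full_write_pending; /* shared reg written full-width */
    };

    static bool read_needs_ss(const struct scalar_raw_state *s,
                              bool consumer_is_scalar, bool read_is_half)
    {
       if (consumer_is_scalar) {
          /* scalar->scalar: only a size mismatch leaves the fast path */
          return read_is_half ? s->full_write_pending
                              : s->half_write_pending;
       }
       /* scalar->vector: the vector ALU must always wait with (ss) */
       return s->half_write_pending || s->full_write_pending;
    }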
diff --git a/src/freedreno/ir3/ir3_postsched.c b/src/freedreno/ir3/ir3_postsched.c
index c56829e9025..7ad22504896 100644
--- a/src/freedreno/ir3/ir3_postsched.c
+++ b/src/freedreno/ir3/ir3_postsched.c
@@ -406,14 +406,15 @@ add_single_reg_dep(struct ir3_postsched_deps_state *state,
unsigned d = 0;
if (src_n >= 0 && dep && state->direction == F) {
+ struct ir3_compiler *compiler = state->ctx->ir->compiler;
/* get the dst_n this corresponds to */
unsigned dst_n = state->dst_n[num];
- unsigned d_soft = ir3_delayslots(dep->instr, node->instr, src_n, true);
- d = ir3_delayslots_with_repeat(dep->instr, node->instr, dst_n, src_n);
+ unsigned d_soft = ir3_delayslots(compiler, dep->instr, node->instr, src_n, true);
+ d = ir3_delayslots_with_repeat(compiler, dep->instr, node->instr, dst_n, src_n);
node->delay = MAX2(node->delay, d_soft);
if (is_sy_producer(dep->instr))
node->has_sy_src = true;
- if (is_ss_producer(dep->instr))
+ if (needs_ss(compiler, dep->instr, node->instr))
node->has_ss_src = true;
}
diff --git a/src/freedreno/ir3/ir3_sched.c b/src/freedreno/ir3/ir3_sched.c
index 7fb3f53ca76..a0089e78fed 100644
--- a/src/freedreno/ir3/ir3_sched.c
+++ b/src/freedreno/ir3/ir3_sched.c
@@ -90,6 +90,7 @@
*/
struct ir3_sched_ctx {
+ struct ir3_compiler *compiler;
struct ir3_block *block; /* the current block */
struct dag *dag;
@@ -173,7 +174,8 @@ struct ir3_sched_node {
static void sched_node_init(struct ir3_sched_ctx *ctx,
struct ir3_instruction *instr);
-static void sched_node_add_dep(struct ir3_instruction *instr,
+static void sched_node_add_dep(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *instr,
struct ir3_instruction *src, int i);
static bool
@@ -182,10 +184,11 @@ is_scheduled(struct ir3_instruction *instr)
return !!(instr->flags & IR3_INSTR_MARK);
}
-/* check_src_cond() passing a ir3_sched_ctx. */
+/* check_src_cond() passing the user and ir3_sched_ctx. */
static bool
sched_check_src_cond(struct ir3_instruction *instr,
bool (*cond)(struct ir3_instruction *,
+ struct ir3_instruction *,
struct ir3_sched_ctx *),
struct ir3_sched_ctx *ctx)
{
@@ -197,7 +200,7 @@ sched_check_src_cond(struct ir3_instruction *instr,
if (sched_check_src_cond(src, cond, ctx))
return true;
} else {
- if (cond(src, ctx))
+ if (cond(src, instr, ctx))
return true;
}
}
@@ -208,7 +211,8 @@ sched_check_src_cond(struct ir3_instruction *instr,
/* Is this a sy producer that hasn't been waited on yet? */
static bool
-is_outstanding_sy(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
+is_outstanding_sy(struct ir3_instruction *instr, struct ir3_instruction *use,
+ struct ir3_sched_ctx *ctx)
{
if (!is_sy_producer(instr))
return false;
@@ -224,9 +228,10 @@ is_outstanding_sy(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
}
static bool
-is_outstanding_ss(struct ir3_instruction *instr, struct ir3_sched_ctx *ctx)
+is_outstanding_ss(struct ir3_instruction *instr, struct ir3_instruction *use,
+ struct ir3_sched_ctx *ctx)
{
- if (!is_ss_producer(instr))
+ if (!needs_ss(ctx->compiler, instr, use))
return false;
/* The sched node is only valid within the same block, we cannot
@@ -932,7 +937,7 @@ split_addr(struct ir3_sched_ctx *ctx, struct ir3_instruction **addr,
/* don't need to remove old dag edge since old addr is
* already scheduled:
*/
- sched_node_add_dep(indirect, new_addr, 0);
+ sched_node_add_dep(ctx, indirect, new_addr, 0);
di(indirect, "new address");
}
}
@@ -955,7 +960,8 @@ sched_node_init(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
}
static void
-sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src,
+sched_node_add_dep(struct ir3_sched_ctx *ctx,
+ struct ir3_instruction *instr, struct ir3_instruction *src,
int i)
{
/* don't consider dependencies in other blocks: */
@@ -978,8 +984,8 @@ sched_node_add_dep(struct ir3_instruction *instr, struct ir3_instruction *src,
if (instr->opc == OPC_META_COLLECT)
sn->collect = instr;
- unsigned d_soft = ir3_delayslots(src, instr, i, true);
- unsigned d = ir3_delayslots(src, instr, i, false);
+ unsigned d_soft = ir3_delayslots(ctx->compiler, src, instr, i, true);
+ unsigned d = ir3_delayslots(ctx->compiler, src, instr, i, false);
/* delays from (ss) and (sy) are considered separately and more accurately in
* the scheduling heuristic, so ignore it when calculating the ip of
@@ -1036,7 +1042,7 @@ is_output_only(struct ir3_instruction *instr)
}
static void
-sched_node_add_deps(struct ir3_instruction *instr)
+sched_node_add_deps(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
{
/* There's nothing to do for phi nodes, since they always go first. And
* phi nodes can reference sources later in the same block, so handling
@@ -1049,7 +1055,7 @@ sched_node_add_deps(struct ir3_instruction *instr)
* the DAG easily in a single pass.
*/
foreach_ssa_src_n (src, i, instr) {
- sched_node_add_dep(instr, src, i);
+ sched_node_add_dep(ctx, instr, src, i);
}
/* NOTE that all inputs must be scheduled before a kill, so
@@ -1098,7 +1104,7 @@ sched_dag_init(struct ir3_sched_ctx *ctx)
dag_validate(ctx->dag, sched_dag_validate_cb, NULL);
foreach_instr (instr, &ctx->unscheduled_list)
- sched_node_add_deps(instr);
+ sched_node_add_deps(ctx, instr);
dag_traverse_bottom_up(ctx->dag, sched_dag_max_delay_cb, NULL);
}
@@ -1234,6 +1240,8 @@ ir3_sched(struct ir3 *ir)
{
struct ir3_sched_ctx *ctx = rzalloc(NULL, struct ir3_sched_ctx);
+ ctx->compiler = ir->compiler;
+
foreach_block (block, &ir->block_list) {
foreach_instr (instr, &block->instr_list) {
instr->data = NULL;