author    Connor Abbott <cwabbott0@gmail.com>    2018-01-11 18:35:58 -0500
committer Connor Abbott <cwabbott0@gmail.com>    2018-03-02 12:52:50 -0500
commit    490754058bca093dd27732c8aff62f74efcd8e49 (patch)
tree      8f8763dc60a08fe18f3c99a027a70aa4d8ec4e96
parent    1991338973991492654ede2f311c5214c07be2db (diff)

lima/gpir: Rework the scheduler (cwabbott-lima-2)
Now, we do scheduling at the same time as value register allocation. The
ready list now acts similarly to the array of registers in value_regalloc,
keeping us from running out of slots. Before this, the value register
allocator wasn't aware of the scheduling constraints of the actual machine,
which meant that it sometimes chose the wrong false dependencies to insert.
Now, we assign value registers at the same time as we actually schedule
instructions, making its choices reflect reality much better.

It was also conservative in some cases where the new scheme doesn't have to
be. For example, in something like:

1 = ld_att
2 = ld_uni
3 = add 1, 2

it's possible that one of 1 and 2 can't be scheduled in the same instruction
as 3, meaning that a move needs to be inserted, so the value register
allocator needs to assume that this sequence requires two registers. But
when actually scheduling, we could discover that 1, 2, and 3 can all be
scheduled together, so that they only require one register.

The new scheduler speculatively inserts the instruction under consideration,
as well as all of its child load instructions, and then counts the number of
live value registers after all is said and done. This lets us be more
aggressive with scheduling when we're close to the limit.
-rw-r--r--   src/gallium/drivers/lima/ir/gp/scheduler.c | 462
1 file changed, 316 insertions(+), 146 deletions(-)
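The speculative pattern the message describes can be sketched as follows.
This is a toy model, not the driver's API: toy_place(), toy_remove(), and
all of the slot counts are made up for illustration; only the shape (place,
measure, always revert) mirrors the new gpir_get_ready_list_slots().

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define VALUE_REG_NUM 11        /* stand-in for GPIR_VALUE_REG_NUM */

    typedef struct { int used; } toy_instr;

    static bool toy_place(toy_instr *in, int cost)
    {
       if (in->used + cost > 6)     /* pretend 6 slots per instruction */
          return false;
       in->used += cost;
       return true;
    }

    static void toy_remove(toy_instr *in, int cost)
    {
       in->used -= cost;
    }

    /* Speculatively place a node, measure the ready-list footprint that
     * would result, then always revert before deciding. */
    static int slots_if_scheduled(toy_instr *in, int ready_slots,
                                  int node_cost, int preds_added)
    {
       if (!toy_place(in, node_cost))
          return INT_MAX;           /* node doesn't fit at all */
       int total = ready_slots - node_cost + preds_added;
       toy_remove(in, node_cost);   /* undo the speculation */
       return total;
    }

    int main(void)
    {
       toy_instr in = { .used = 4 };
       /* "3 = add 1, 2": if both loads co-issue with the add, nothing
        * new joins the ready list, so the footprint can even shrink. */
       int slots = slots_if_scheduled(&in, 9, 1, 0);
       printf("footprint would be %d of %d\n", slots, VALUE_REG_NUM);
       return slots <= VALUE_REG_NUM ? 0 : 1;
    }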
diff --git a/src/gallium/drivers/lima/ir/gp/scheduler.c b/src/gallium/drivers/lima/ir/gp/scheduler.c
index 2a11bdfd1c..75adf9ca44 100644
--- a/src/gallium/drivers/lima/ir/gp/scheduler.c
+++ b/src/gallium/drivers/lima/ir/gp/scheduler.c
@@ -79,6 +79,13 @@
* scheduled total, below the limit. So the algorithm will always succeed.
*/
+typedef struct {
+ struct list_head ready_list;
+ int ready_list_slots;
+ gpir_instr *instr;
+ gpir_block *block;
+} sched_ctx;
+
static int gpir_min_dist_alu(gpir_dep *dep)
{
switch (dep->pred->op) {
@@ -230,7 +237,7 @@ static void schedule_update_distance(gpir_node *node)
if (pred->sched.dist < 0)
schedule_update_distance(pred);
- int dist = pred->sched.dist + 1;
+ int dist = pred->sched.dist + gpir_min_dist_alu(dep);
if (node->sched.dist < dist)
node->sched.dist = dist;
}
@@ -378,162 +385,357 @@ static gpir_node *schedule_create_move_node(gpir_node *node)
return &move->node;
}
-static gpir_node *gpir_sched_node(gpir_instr *instr, gpir_node *node)
+static bool gpir_is_input_node(gpir_node *node)
{
- if (node->op == gpir_op_mov) {
- gpir_node *child = gpir_node_to_alu(node)->children[0];
- gpir_node_foreach_succ_safe(node, dep) {
- gpir_node *succ = dep->succ;
- if (succ->sched.instr < 0 ||
- instr->index < succ->sched.instr + gpir_get_min_dist(dep)) {
- gpir_node_replace_pred(dep, child);
- if (dep->type == GPIR_DEP_INPUT)
- gpir_node_replace_child(succ, node, child);
- }
+ gpir_node_foreach_succ(node, dep) {
+ if (dep->type == GPIR_DEP_INPUT)
+ return true;
+ }
+ return false;
+}
+
+/* Get the number of slots required for a node on the ready list.
+ */
+static int gpir_get_slots_required(gpir_node *node)
+{
+ if (!gpir_is_input_node(node))
+ return 0;
+
+ if (gpir_op_infos[node->op].may_consume_two_slots && node->sched.ready) {
+ /* If the node is fully ready, we have to assume that it may get
+ * scheduled at any time, at which point it takes up two slots.
+ */
+ return 2;
+ }
+
+ return 1;
+}
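A worked restatement of that rule, with a stand-in struct rather than the
real gpir_node (all values hypothetical):

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy restatement of gpir_get_slots_required(). */
    typedef struct {
       bool has_input_succ;  /* is its value actually read by someone? */
       bool two_slot;        /* op with may_consume_two_slots set */
       bool ready;           /* all successors already scheduled */
    } toy_node;

    static int slots_required(const toy_node *n)
    {
       if (!n->has_input_succ)
          return 0;                 /* holds no value register */
       if (n->two_slot && n->ready)
          return 2;                 /* may get scheduled at any time */
       return 1;
    }

    int main(void)
    {
       toy_node complex1 = { true, true,  true  };
       toy_node add      = { true, false, false };
       printf("complex1: %d, add: %d\n",
              slots_required(&complex1), slots_required(&add));
       return 0;
    }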
+
+/* Once we schedule the successor, would the predecessor be fully ready? */
+static bool pred_almost_ready(gpir_dep *dep)
+{
+ bool fully_ready = true;
+ gpir_node_foreach_succ(dep->pred, other_dep) {
+ gpir_node *succ = other_dep->succ;
+ if (succ->sched.instr < 0 && dep->succ != other_dep->succ) {
+ fully_ready = false;
+ break;
}
- MAYBE_UNUSED bool result = schedule_try_place_node(instr, node);
- assert(result);
- return node;
}
- else {
- gpir_node *move = schedule_create_move_node(node);
- list_del(&node->list);
- node->sched.ready = false;
- node->sched.inserted = false;
- gpir_node_replace_succ(move, node);
- gpir_node_add_dep(move, node, GPIR_DEP_INPUT);
- return move;
+
+ return fully_ready;
+}
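Put differently, pred is "almost ready" when the successor being scheduled
is its only remaining unscheduled successor. A toy version over plain
arrays, purely for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy pred_almost_ready(): pred becomes fully ready once the
     * successor being scheduled is its only unscheduled one. */
    static bool almost_ready(const int *succ_sched, int n_succs,
                             int scheduling)
    {
       for (int i = 0; i < n_succs; i++) {
          /* succ_sched[i] < 0 means "not scheduled yet" */
          if (succ_sched[i] < 0 && i != scheduling)
             return false;
       }
       return true;
    }

    int main(void)
    {
       int succs[] = { 3, -1, -1 };               /* two still pending */
       printf("%d\n", almost_ready(succs, 3, 1)); /* 0: succ 2 pending */
       succs[2] = 4;
       printf("%d\n", almost_ready(succs, 3, 1)); /* 1: only succ 1    */
       return 0;
    }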
+
+/* Speculatively schedule a dep, and count how many slots it consumes.
+ */
+static int gpir_get_dep_slots(sched_ctx *ctx, gpir_dep *dep)
+{
+ gpir_node *pred = dep->pred;
+ if (!gpir_is_input_node(pred))
+ return 0;
+
+ /* Try and speculatively schedule any loads. */
+ if (pred->type == gpir_node_type_load && pred_almost_ready(dep) &&
+ schedule_try_place_node(ctx->instr, pred)) {
+ return 0;
+ }
+
+ int total = 0;
+ if (!pred->sched.inserted)
+ total++;
+
+ if (gpir_op_infos[pred->op].may_consume_two_slots) {
+ /* If pred goes from partially ready or not ready to fully ready, then
+ * it takes up two slots as per gpir_get_slots_required(), so we need to
+ * add an extra slot.
+ */
+ if (pred_almost_ready(dep))
+ total++;
}
+
+ return total;
}
-static bool gpir_is_input_node(gpir_node *node)
+static void cleanup_speculative_loads(sched_ctx *ctx, gpir_node *node)
{
- gpir_node_foreach_succ(node, dep) {
- if (dep->type == GPIR_DEP_INPUT)
- return true;
+ gpir_node_foreach_pred(node, dep) {
+ gpir_node *pred = dep->pred;
+ if (pred->sched.instr >= 0) {
+ pred->sched.instr = -1;
+ gpir_instr_remove_node(ctx->instr, pred);
+ }
}
- return false;
}
-static int gpir_get_min_scheduled_succ(gpir_node *node)
+/* Get the total number of slots on the ready list if this node were to be
+ * scheduled.
+ */
+static int gpir_get_ready_list_slots(sched_ctx *ctx, gpir_node *node)
+{
+ assert(ctx->ready_list_slots <= GPIR_VALUE_REG_NUM);
+ int total = ctx->ready_list_slots - gpir_get_slots_required(node);
+
+ if (!schedule_try_place_node(ctx->instr, node))
+ return INT_MAX;
+
+ gpir_node_foreach_pred(node, dep) {
+ int slots = gpir_get_dep_slots(ctx, dep);
+ if (node->op == gpir_op_mov && slots != 0) {
+ /* At this stage, we only insert moves for loads that we couldn't
+ * schedule immediately. If we couldn't schedule the load, there's
+ * no point scheduling the move.
+ */
+ cleanup_speculative_loads(ctx, node);
+ gpir_instr_remove_node(ctx->instr, node);
+ return INT_MAX;
+ }
+ total += slots;
+ }
+
+ /* Cleanup any speculatively scheduled loads. */
+ cleanup_speculative_loads(ctx, node);
+ gpir_instr_remove_node(ctx->instr, node);
+
+ return total;
+}
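So the net footprint works out to: current slots, minus what the node
itself held on the ready list, plus one per predecessor newly inserted,
plus one extra per two-slot predecessor that this schedule would make
fully ready. With hypothetical numbers:

    #include <stdio.h>

    int main(void)
    {
       int current   = 9;  /* ctx->ready_list_slots before scheduling */
       int node_cost = 1;  /* gpir_get_slots_required(node)           */
       int new_preds = 2;  /* predecessors not previously inserted    */
       int newly_two = 1;  /* two-slot preds becoming fully ready     */

       int total = current - node_cost + new_preds + newly_two;
       printf("ready list would hold %d slots\n", total);  /* 11 */
       return 0;
    }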
+
+static bool try_place_node(sched_ctx *ctx, gpir_node *node)
+{
+ if (!schedule_try_place_node(ctx->instr, node)) {
+ gpir_debug("failed to place %d\n", node->index);
+ return false;
+ }
+
+ gpir_debug("placed node %d\n", node->index);
+
+ list_del(&node->list);
+ list_add(&node->list, &ctx->block->node_list);
+ gpir_node_foreach_pred(node, dep) {
+ gpir_node *pred = dep->pred;
+ schedule_insert_ready_list(&ctx->ready_list, pred);
+ }
+
+ return true;
+}
+
+static gpir_node *create_move(sched_ctx *ctx, gpir_node *node)
+{
+ gpir_node *move = schedule_create_move_node(node);
+ list_del(&node->list);
+ node->sched.ready = false;
+ node->sched.inserted = false;
+ gpir_node_replace_succ(move, node);
+ gpir_node_add_dep(move, node, GPIR_DEP_INPUT);
+ schedule_insert_ready_list(&ctx->ready_list, move);
+ return move;
+}
+
+static bool try_schedule_node(sched_ctx *ctx, gpir_node *node)
+{
+ if (!try_place_node(ctx, node))
+ return false;
+
+ gpir_node_foreach_pred(node, dep) {
+ gpir_node *pred = dep->pred;
+ if (dep->type == GPIR_DEP_INPUT && pred->type == gpir_node_type_load) {
+ if (!try_place_node(ctx, pred)) {
+ create_move(ctx, pred);
+ }
+ }
+ }
+
+ return true;
+}
+
+static int gpir_get_curr_ready_list_slots(sched_ctx *ctx)
+{
+ int total = 0;
+ list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+ total += gpir_get_slots_required(node);
+ }
+
+ return total;
+}
+
+/* What gpir_get_min_end() would return if node were replaced with a move
+ * instruction not in the complex slot. Normally this is 2 + min_end, except
+ * for some store instructions which must have the move node in the same
+ * instruction.
+ */
+static int gpir_get_min_end_move(gpir_node *node)
{
int min = INT_MAX;
gpir_node_foreach_succ(node, dep) {
gpir_node *succ = dep->succ;
if (succ->sched.instr >= 0 && dep->type == GPIR_DEP_INPUT) {
- if (min > succ->sched.instr)
- min = succ->sched.instr;
+ int dist = 2;
+ switch (succ->op) {
+ case gpir_op_store_temp:
+ case gpir_op_store_reg:
+ case gpir_op_store_varying:
+ dist = 0;
+ break;
+ default:
+ break;
+ }
+ if (min > succ->sched.instr + dist)
+ min = succ->sched.instr + dist;
}
}
return min;
}
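A toy calculation of that deadline, with hypothetical instruction indices
(the scheduler fills instructions bottom-up, so the deadline is the index
at which the move must be placed at the latest):

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Toy gpir_get_min_end_move(): deadline index for a move feeding
     * each scheduled successor; stores pin the move to their own
     * instruction. */
    struct toy_succ { int instr; bool is_store; };

    static int min_end_move(const struct toy_succ *succs, int n)
    {
       int min = INT_MAX;
       for (int i = 0; i < n; i++) {
          int dist = succs[i].is_store ? 0 : 2;
          if (min > succs[i].instr + dist)
             min = succs[i].instr + dist;
       }
       return min;
    }

    int main(void)
    {
       struct toy_succ succs[] = { { 4, false }, { 5, true } };
       /* The add at index 4 allows the move up to index 6; the store
        * at 5 pins it to index 5, so 5 is the tighter deadline. */
       printf("min_end = %d\n", min_end_move(succs, 2));
       return 0;
    }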
-static gpir_node *gpir_sched_instr_pass(gpir_instr *instr,
- struct list_head *ready_list)
+static bool try_node(sched_ctx *ctx, bool max_only)
{
- /* fully ready node reach its max dist with any of its successor */
- list_for_each_entry_safe(gpir_node, node, ready_list, list) {
+ gpir_node *min_node = NULL;
+ int min_slots = INT_MAX;
+ list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
if (node->sched.ready) {
- int end = gpir_get_min_end(node);
- assert(end >= instr->index);
- if (instr->index < end)
- continue;
-
- gpir_debug("fully ready max node %d\n", node->index);
+ /* In the max_only pass, only consider nodes that must be placed now. */
+ if (max_only) {
+ int end = gpir_get_min_end(node);
+ if (end != ctx->instr->index)
+ continue;
+ }
- if (schedule_try_place_node(instr, node))
- return node;
+ int slots = gpir_get_ready_list_slots(ctx, node);
+ if (slots == INT_MAX)
+ continue;
- return gpir_sched_node(instr, node);
+ if (min_node == NULL) {
+ min_slots = slots;
+ min_node = node;
+ continue;
+ }
+ if (min_slots <= GPIR_VALUE_REG_NUM) {
+ if (node->sched.dist <= min_node->sched.dist)
+ break;
+ }
+ if (slots < min_slots) {
+ min_slots = slots;
+ min_node = node;
+ }
}
}
- /* partially ready node reach its max dist with any of its successor */
- list_for_each_entry_safe(gpir_node, node, ready_list, list) {
- if (!node->sched.ready) {
- int end = gpir_get_min_end(node);
- assert(end >= instr->index);
- if (instr->index < end)
- continue;
+ if (min_node && min_slots <= GPIR_VALUE_REG_NUM) {
+ gpir_debug("trying to schedule %d (slots = %d)%s\n", min_node->index,
+ min_slots, max_only ? " (max)" : "");
+ if (try_schedule_node(ctx, min_node))
+ ctx->ready_list_slots = min_slots;
+ return true;
+ }
- gpir_debug("partially ready max node %d\n", node->index);
+ return false;
+}
+
+static void place_move(sched_ctx *ctx, gpir_node *node)
+{
+ gpir_node *move = create_move(ctx, node);
+ gpir_node_foreach_succ_safe(move, dep) {
+ gpir_node *succ = dep->succ;
+ if (succ->sched.instr < 0 ||
+ ctx->instr->index < succ->sched.instr + gpir_get_min_dist(dep)) {
+ gpir_node_replace_pred(dep, node);
+ if (dep->type == GPIR_DEP_INPUT)
+ gpir_node_replace_child(succ, move, node);
+ }
+ }
+ MAYBE_UNUSED bool result = try_place_node(ctx, move);
+ assert(result);
+}
- return gpir_sched_node(instr, node);
+static bool sched_move(sched_ctx *ctx)
+{
+ list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+ if (gpir_is_input_node(node) &&
+ gpir_get_min_end_move(node) == ctx->instr->index) {
+ place_move(ctx, node);
+ return true;
}
}
- /* schedule node used by previous instr when count > 5 */
int count = 0;
- list_for_each_entry(gpir_node, node, ready_list, list) {
- if (gpir_is_input_node(node)) {
- int min = gpir_get_min_scheduled_succ(node);
- assert(min >= instr->index - 1);
- if (min == instr->index - 1)
- count += gpir_op_infos[node->op].may_consume_two_slots ? 2 : 1;
+ list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+ if (gpir_is_input_node(node) &&
+ gpir_get_min_end_move(node) == ctx->instr->index + 1) {
+ count += gpir_get_slots_required(node);
}
}
if (count > 5) {
- /* schedule fully ready node first */
- list_for_each_entry(gpir_node, node, ready_list, list) {
- if (gpir_is_input_node(node)) {
- int min = gpir_get_min_scheduled_succ(node);
- if (min == instr->index - 1 && node->sched.ready) {
- gpir_debug(">5 ready node %d\n", node->index);
-
- if (schedule_try_place_node(instr, node))
- return node;
+ /* This is a bit tricky... if a two-slot instruction becomes ready, then
+ * it could go from counting one slot to counting two. If there was
+ * another use one instruction ago, then it would add to "count",
+ * possibly making it large enough that we can't get it below 5 without
+ * running out of slots unless we evict the two-slot instruction itself
+ * (that decreases count by 2 while only taking up one slot).
+ */
+ if (count - 5 > ctx->instr->alu_num_slot_free) {
+ list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+ if (gpir_get_min_end_move(node) == ctx->instr->index + 1 &&
+ node->op == gpir_op_complex1 && node->sched.ready) {
+ gpir_debug("count > 5\n");
+ place_move(ctx, node);
+ return true;
}
}
}
- /* no fully ready node be scheduled, schedule partially ready node */
- list_for_each_entry_safe(gpir_node, node, ready_list, list) {
- if (gpir_is_input_node(node)) {
- int min = gpir_get_min_scheduled_succ(node);
- if (min == instr->index - 1 && !node->sched.ready) {
- gpir_debug(">5 partially ready node %d\n", node->index);
-
- return gpir_sched_node(instr, node);
- }
+ list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+ /* complex1 has a latency of two cycles, so if it is ready, we want
+ * to try not to insert a mov during the cycle after it becomes
+ * ready, since this delays when it becomes available by an extra
+ * cycle. Other opcodes don't have this problem, i.e. inserting a
+ * move won't hurt anything.
+ */
+ if (gpir_get_min_end_move(node) == ctx->instr->index + 1 &&
+ !(node->op == gpir_op_complex1 && node->sched.ready)) {
+ gpir_debug("count > 5\n");
+ place_move(ctx, node);
+ return true;
}
}
- /* finally schedule move for fully ready node */
- list_for_each_entry_safe(gpir_node, node, ready_list, list) {
- if (gpir_is_input_node(node)) {
- int min = gpir_get_min_scheduled_succ(node);
- if (min == instr->index - 1 && node->sched.ready) {
- gpir_debug(">5 fully ready move node %d\n", node->index);
-
- return gpir_sched_node(instr, node);
- }
+ /* In the case where everything was complex1, we need to try again. */
+ list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+ if (gpir_get_min_end_move(node) == ctx->instr->index + 1) {
+ gpir_debug("count > 5\n");
+ place_move(ctx, node);
+ return true;
}
}
}
- /* schedule remain fully ready nodes */
- list_for_each_entry(gpir_node, node, ready_list, list) {
- if (node->sched.ready) {
- gpir_debug("remain fully ready node %d\n", node->index);
+ return false;
+}
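The eviction arithmetic behind that "count > 5" path, worked through with
hypothetical numbers: an ordinary move reduces count by 1 per free ALU
slot consumed, so it can never close a gap larger than the free slots,
while moving a fully ready two-slot node reduces count by 2 per slot.

    #include <stdio.h>

    int main(void)
    {
       /* Hypothetical state mirroring the count > 5 check. */
       int count    = 9;  /* slots demanded by nodes due next instr */
       int free_alu = 3;  /* ctx->instr->alu_num_slot_free          */

       /* Ordinary moves: -1 count per slot used, so 3 free slots only
        * reach 6, still over the limit of 5. */
       printf("after normal moves: %d\n", count - free_alu);

       /* Moving a fully ready two-slot node (e.g. complex1) drops count
        * by 2 while using one slot: 9 -> 7 with two slots left, then
        * two normal moves reach 5. */
       if (count - 5 > free_alu)
          printf("must evict a two-slot node first\n");
       return 0;
    }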
- if (schedule_try_place_node(instr, node))
- return node;
- }
- }
+static bool gpir_sched_instr_pass(sched_ctx *ctx)
+{
+ /* First, schedule max nodes */
+ if (try_node(ctx, true))
+ return true;
- return NULL;
+ /* TODO: try to spill max nodes */
+
+ /* Schedule moves for max nodes if we couldn't schedule them. */
+ if (sched_move(ctx))
+ return true;
+
+ /* Try and schedule the rest of the nodes now. */
+ return try_node(ctx, false);
}
-static void schedule_print_pre_one_instr(gpir_instr *instr,
- struct list_head *ready_list)
+static void schedule_print_pre_one_instr(sched_ctx *ctx)
{
if (!lima_shader_debug_gp)
return;
- printf("instr %d for ready list:", instr->index);
- list_for_each_entry(gpir_node, node, ready_list, list) {
- printf(" %d/%c", node->index, node->sched.ready ? 'r' : 'p');
+ printf("instr %d for ready list:", ctx->instr->index);
+ list_for_each_entry(gpir_node, node, &ctx->ready_list, list) {
+ printf(" %d/%c (%d, %d, %d)", node->index, node->sched.ready ? 'r' : 'p',
+ node->sched.dist, gpir_get_slots_required(node),
+ gpir_get_min_end_move(node));
}
printf("\n");
}
@@ -552,31 +754,17 @@ static void schedule_print_post_one_instr(gpir_instr *instr)
}
-static bool schedule_one_instr(gpir_block *block, struct list_head *ready_list)
+static bool schedule_one_instr(sched_ctx *ctx)
{
- gpir_instr *instr = gpir_instr_create(block);
+ gpir_instr *instr = gpir_instr_create(ctx->block);
if (unlikely(!instr))
return false;
+ ctx->instr = instr;
- schedule_print_pre_one_instr(instr, ready_list);
+ schedule_print_pre_one_instr(ctx);
- while (true) {
- gpir_node *node = gpir_sched_instr_pass(instr, ready_list);
- if (!node)
- break;
-
- if (node->sched.instr < 0)
- schedule_insert_ready_list(ready_list, node);
- else {
- list_del(&node->list);
- list_add(&node->list, &block->node_list);
-
- gpir_node_foreach_pred(node, dep) {
- gpir_node *pred = dep->pred;
- schedule_insert_ready_list(ready_list, pred);
- }
- }
- }
+ while (gpir_sched_instr_pass(ctx))
+ assert(ctx->ready_list_slots == gpir_get_curr_ready_list_slots(ctx));
schedule_print_post_one_instr(instr);
return true;
@@ -590,18 +778,20 @@ static bool schedule_block(gpir_block *block)
schedule_update_distance(node);
}
- struct list_head ready_list;
- list_inithead(&ready_list);
+ sched_ctx ctx;
+ list_inithead(&ctx.ready_list);
+ ctx.block = block;
+ ctx.ready_list_slots = 0;
/* construct the ready list from root nodes */
list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
if (gpir_node_is_root(node))
- schedule_insert_ready_list(&ready_list, node);
+ schedule_insert_ready_list(&ctx.ready_list, node);
}
list_inithead(&block->node_list);
- while (!list_empty(&ready_list)) {
- if (!schedule_one_instr(block, &ready_list))
+ while (!list_empty(&ctx.ready_list)) {
+ if (!schedule_one_instr(&ctx))
return false;
}
@@ -610,26 +800,6 @@ static bool schedule_block(gpir_block *block)
static void schedule_build_vreg_dependency(gpir_block *block)
{
- gpir_node *regs[GPIR_VALUE_REG_NUM] = {0};
- list_for_each_entry(gpir_node, node, &block->node_list, list) {
- /* store node has no value reg assigned */
- if (node->value_reg < 0)
- continue;
-
- gpir_node *reg = regs[node->value_reg];
- if (reg) {
- gpir_node_foreach_succ(reg, dep) {
- /* write after read dep should only apply to real 'read' */
- if (dep->type != GPIR_DEP_INPUT)
- continue;
-
- gpir_node *succ = dep->succ;
- gpir_node_add_dep(node, succ, GPIR_DEP_VREG_WRITE_AFTER_READ);
- }
- }
- regs[node->value_reg] = node;
- }
-
/* merge dummy_f/m to the node created from */
list_for_each_entry_safe(gpir_node, node, &block->node_list, list) {
if (node->op == gpir_op_dummy_m) {