i965/sched: switch to bottom-up schedulingsched

Previously, our scheduler was doing top-down scheduling, scheduling from the beginning to the end. But top-down scheduling isn't good for scheduling things with many nodes with no input dependencies (i.e. many reads) and few nodes with no output dependencies (i.e. few writes), which is generally true of shaders, because it isn't able to determine when to schedule the inputs. With bottom-up scheduling, we're better able to schedule inputs when we need to in order to reduce register pressure, while in other cases still doing aggressive scheduling. After this commit, there are no more shaders in my copy of shader-db that spill. total instructions in shared programs: 7403011 -> 7374802 (-0.38%) instructions in affected programs: 86700 -> 58491 (-32.54%) helped: 55 HURT: 0 total cycles in shared programs: 48350374 -> 48781484 (0.89%) cycles in affected programs: 36812664 -> 37243774 (1.17%) helped: 22829 HURT: 19197 LOST: 6 GAINED: 528
author: Connor Abbott <cwabbott0@gmail.com> 2015-10-02 21:39:41 -0400
committer: Matt Turner <mattst88@gmail.com> 2016-04-28 16:11:21 -0700
commit: 26812f052ee051a8e16e7376c97fb8acb55f3231 (patch)
tree: 4bfbad92b171ca5410afefb22037e3ef8cf40c97
parent: 8a851a77724f64310569f8455f166556a26a471e (diff)
1 files changed, 78 insertions, 88 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 90419a3735e..1e92f680b24 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -407,7 +407,7 @@ public:
       this->mode = mode;
       this->time = 0;
       if (!post_reg_alloc) {
-         this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);
+         this->reg_pressure_out = rzalloc_array(mem_ctx, int, block_count);
 
          this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
          for (int i = 0; i < block_count; i++)
@@ -424,21 +424,21 @@ public:
             this->hw_liveout[i] = rzalloc_array(mem_ctx, BITSET_WORD,
                                                 BITSET_WORDS(hw_reg_count));
 
-         this->written = rzalloc_array(mem_ctx, bool, grf_count);
+         this->read = rzalloc_array(mem_ctx, bool, grf_count);
 
-         this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);
+         this->writes_remaining = rzalloc_array(mem_ctx, int, grf_count);
 
-         this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
+         this->hw_read = rzalloc_array(mem_ctx, bool, hw_reg_count);
 
          this->max_reg_pressure = 0;
       } else {
-         this->reg_pressure_in = NULL;
+         this->reg_pressure_out = NULL;
          this->livein = NULL;
          this->liveout = NULL;
          this->hw_liveout = NULL;
-         this->written = NULL;
-         this->reads_remaining = NULL;
-         this->hw_reads_remaining = NULL;
+         this->read = NULL;
+         this->writes_remaining = NULL;
+         this->hw_read = NULL;
       }
    }
 
@@ -465,7 +465,7 @@ public:
     */
    virtual int issue_time(backend_instruction *inst) = 0;
 
-   virtual void count_reads_remaining(backend_instruction *inst) = 0;
+   virtual void count_writes_remaining(backend_instruction *inst) = 0;
    virtual void setup_liveness(cfg_t *cfg) = 0;
    virtual void update_register_pressure(backend_instruction *inst) = 0;
    virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
@@ -492,10 +492,10 @@ public:
    int max_reg_pressure;
 
    /*
-    * The register pressure at the beginning of each basic block.
+    * The register pressure at the end of each basic block.
     */
 
-   int *reg_pressure_in;
+   int *reg_pressure_out;
 
    /*
     * The virtual GRF's whose range overlaps the beginning of each basic block.
@@ -516,22 +516,22 @@ public:
    BITSET_WORD **hw_liveout;
 
    /*
-    * Whether we've scheduled a write for this virtual GRF yet.
+    * Whether we've scheduled a read for this virtual GRF yet.
     */
 
-   bool *written;
+   bool *read;
 
    /*
-    * How many reads we haven't scheduled for this virtual GRF yet.
+    * How many writes we haven't scheduled for this virtual GRF yet.
     */
 
-   int *reads_remaining;
+   int *writes_remaining;
 
    /*
-    * How many reads we haven't scheduled for this hardware GRF yet.
+    * Whether we've scheduled a read for this hardware GRF yet.
     */
 
-   int *hw_reads_remaining;
+   bool *hw_read;
 };
 
 class fs_instruction_scheduler : public instruction_scheduler
@@ -546,7 +546,7 @@ public:
    int issue_time(backend_instruction *inst);
    fs_visitor *v;
 
-   void count_reads_remaining(backend_instruction *inst);
+   void count_writes_remaining(backend_instruction *inst);
    void setup_liveness(cfg_t *cfg);
    void update_register_pressure(backend_instruction *inst);
    int get_register_pressure_benefit(backend_instruction *inst);
@@ -574,27 +574,15 @@ is_src_duplicate(fs_inst *inst, int src)
 }
 
 void
-fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+fs_instruction_scheduler::count_writes_remaining(backend_instruction *be)
 {
    fs_inst *inst = (fs_inst *)be;
 
-   if (!reads_remaining)
+   if (!writes_remaining)
       return;
 
-   for (int i = 0; i < inst->sources; i++) {
-      if (is_src_duplicate(inst, i))
-         continue;
-
-      if (inst->src[i].file == VGRF) {
-         reads_remaining[inst->src[i].nr]++;
-      } else if (inst->src[i].file == FIXED_GRF) {
-         if (inst->src[i].nr >= hw_reg_count)
-            continue;
-
-         for (int j = 0; j < inst->regs_read(i); j++)
-            hw_reads_remaining[inst->src[i].nr + j]++;
-      }
-   }
+   if (inst->dst.file == VGRF)
+      writes_remaining[inst->dst.nr]++;
 }
 
 void
@@ -607,14 +595,16 @@ fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
       for (int i = 0; i < v->live_intervals->num_vars; i++) {
          if (BITSET_TEST(v->live_intervals->block_data[block].livein, i)) {
             int vgrf = v->live_intervals->vgrf_from_var[i];
-            if (!BITSET_TEST(livein[block], vgrf)) {
-               reg_pressure_in[block] += v->alloc.sizes[vgrf];
-               BITSET_SET(livein[block], vgrf);
-            }
+            BITSET_SET(livein[block], vgrf);
          }
 
-         if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i))
-            BITSET_SET(liveout[block], v->live_intervals->vgrf_from_var[i]);
+         if (BITSET_TEST(v->live_intervals->block_data[block].liveout, i)) {
+            int vgrf = v->live_intervals->vgrf_from_var[i];
+            if (!BITSET_TEST(liveout[block], vgrf)) {
+               reg_pressure_out[block] += v->alloc.sizes[vgrf];
+               BITSET_SET(liveout[block], vgrf);
+            }
+         }
       }
    }
 
@@ -626,12 +616,12 @@ fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
       for (int i = 0; i < grf_count; i++) {
          if (v->virtual_grf_start[i] <= cfg->blocks[block]->end_ip &&
              v->virtual_grf_end[i] >= cfg->blocks[block + 1]->start_ip) {
-            if (!BITSET_TEST(livein[block + 1], i)) {
-                reg_pressure_in[block + 1] += v->alloc.sizes[i];
-                BITSET_SET(livein[block + 1], i);
+            if (!BITSET_TEST(liveout[block], i)) {
+                reg_pressure_out[block] += v->alloc.sizes[i];
+                BITSET_SET(liveout[block], i);
             }
 
-            BITSET_SET(liveout[block], i);
+            BITSET_SET(livein[block + 1], i);
          }
       }
    }
@@ -645,10 +635,11 @@ fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
 
       for (int block = 0; block < cfg->num_blocks; block++) {
          if (cfg->blocks[block]->start_ip <= payload_last_use_ip[i])
-            reg_pressure_in[block]++;
 
-         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i])
+         if (cfg->blocks[block]->end_ip <= payload_last_use_ip[i]) {
+            reg_pressure_out[block]++;
             BITSET_SET(hw_liveout[block], i);
+         }
       }
    }
 }
@@ -658,11 +649,11 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
 {
    fs_inst *inst = (fs_inst *)be;
 
-   if (!reads_remaining)
+   if (!writes_remaining)
       return;
 
    if (inst->dst.file == VGRF) {
-      written[inst->dst.nr] = true;
+      writes_remaining[inst->dst.nr]--;
    }
 
    for (int i = 0; i < inst->sources; i++) {
@@ -670,11 +661,11 @@ fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
           continue;
 
       if (inst->src[i].file == VGRF) {
-         reads_remaining[inst->src[i].nr]--;
+         read[inst->src[i].nr] = true;
       } else if (inst->src[i].file == FIXED_GRF &&
                  inst->src[i].nr < hw_reg_count) {
          for (int off = 0; off < inst->regs_read(i); off++)
-            hw_reads_remaining[inst->src[i].nr + off]--;
+            hw_read[inst->src[i].nr + off] = true;
       }
    }
 }
@@ -687,8 +678,8 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
 
    if (inst->dst.file == VGRF) {
       if (!BITSET_TEST(livein[block_idx], inst->dst.nr) &&
-          !written[inst->dst.nr])
-         benefit -= v->alloc.sizes[inst->dst.nr];
+          writes_remaining[inst->dst.nr] == 1)
+         benefit += v->alloc.sizes[inst->dst.nr];
    }
 
    for (int i = 0; i < inst->sources; i++) {
@@ -697,16 +688,16 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
 
       if (inst->src[i].file == VGRF &&
           !BITSET_TEST(liveout[block_idx], inst->src[i].nr) &&
-          reads_remaining[inst->src[i].nr] == 1)
-         benefit += v->alloc.sizes[inst->src[i].nr];
+          !read[inst->src[i].nr])
+         benefit -= v->alloc.sizes[inst->src[i].nr];
 
       if (inst->src[i].file == FIXED_GRF &&
           inst->src[i].nr < hw_reg_count) {
          for (int off = 0; off < inst->regs_read(i); off++) {
             int reg = inst->src[i].nr + off;
             if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
-                hw_reads_remaining[reg] == 1) {
-               benefit++;
+                !hw_read[reg]) {
+               benefit--;
             }
          }
       }
@@ -724,7 +715,7 @@ public:
    int issue_time(backend_instruction *inst);
    vec4_visitor *v;
 
-   void count_reads_remaining(backend_instruction *inst);
+   void count_writes_remaining(backend_instruction *inst);
    void setup_liveness(cfg_t *cfg);
    void update_register_pressure(backend_instruction *inst);
    int get_register_pressure_benefit(backend_instruction *inst);
@@ -738,7 +729,7 @@ vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
 }
 
 void
-vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+vec4_instruction_scheduler::count_writes_remaining(backend_instruction *be)
 {
 }
 
@@ -821,30 +812,30 @@ instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
 
    assert(before != after);
 
-   for (int i = 0; i < before->child_count; i++) {
-      if (before->children[i] == after) {
-         before->child_latency[i] = MAX2(before->child_latency[i], latency);
+   for (int i = 0; i < after->child_count; i++) {
+      if (after->children[i] == before) {
+         after->child_latency[i] = MAX2(after->child_latency[i], latency);
          return;
       }
    }
 
-   if (before->child_array_size <= before->child_count) {
-      if (before->child_array_size < 16)
-         before->child_array_size = 16;
+   if (after->child_array_size <= after->child_count) {
+      if (after->child_array_size < 16)
+         after->child_array_size = 16;
       else
-         before->child_array_size *= 2;
+         after->child_array_size *= 2;
 
-      before->children = reralloc(mem_ctx, before->children,
-                                  schedule_node *,
-                                  before->child_array_size);
-      before->child_latency = reralloc(mem_ctx, before->child_latency,
-                                       int, before->child_array_size);
+      after->children = reralloc(mem_ctx, after->children,
+                                 schedule_node *,
+                                 after->child_array_size);
+      after->child_latency = reralloc(mem_ctx, after->child_latency,
+                                      int, after->child_array_size);
    }
 
-   before->children[before->child_count] = after;
-   before->child_latency[before->child_count] = latency;
-   before->child_count++;
-   after->parent_count++;
+   after->children[after->child_count] = before;
+   after->child_latency[after->child_count] = latency;
+   after->child_count++;
+   before->parent_count++;
 }
 
 void
@@ -1379,10 +1370,10 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
           * tree of lowered ubo loads, which appear reversed in the
           * instruction stream with respect to when they can be consumed).
           */
-         if (n->delay > chosen->delay) {
+         if (n->delay < chosen->delay) {
             chosen = n;
             continue;
-         } else if (n->delay < chosen->delay) {
+         } else if (n->delay > chosen->delay) {
             continue;
          }
 
@@ -1437,7 +1428,7 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
    int instructions_to_schedule = block->end_ip - block->start_ip + 1;
    time = 0;
    if (!post_reg_alloc) {
-      reg_pressure = reg_pressure_in[block->num];
+      reg_pressure = reg_pressure_out[block->num];
       max_reg_pressure = MAX2(max_reg_pressure, reg_pressure);
    }
    block_idx = block->num;
@@ -1455,7 +1446,7 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
       assert(chosen);
       chosen->remove();
       chosen->inst->exec_node::remove();
-      block->instructions.push_tail(chosen->inst);
+      block->instructions.push_head(chosen->inst);
       instructions_to_schedule--;
 
       if (!post_reg_alloc) {
@@ -1467,13 +1458,12 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
       /* If we expected a delay for scheduling, then bump the clock to reflect
        * that.  In reality, the hardware will switch to another hyperthread
        * and may not return to dispatching our thread for a while even after
-       * we're unblocked.  After this, we have the time when the chosen
+       * we're unblocked.  Before this, we have the time when the chosen
        * instruction will start executing.
        */
       time = MAX2(time, chosen->unblocked_time);
 
-      /* Update the clock for how soon an instruction could start after the
-       * chosen one.
+      /* Update the clock for the actual execution of the instruction.
        */
       time += issue_time(chosen->inst);
 
@@ -1560,15 +1550,15 @@ instruction_scheduler::run(cfg_t *cfg)
       if (block->end_ip - block->start_ip <= 1)
          continue;
 
-      if (reads_remaining) {
-         memset(reads_remaining, 0,
-                grf_count * sizeof(*reads_remaining));
-         memset(hw_reads_remaining, 0,
-                hw_reg_count * sizeof(*hw_reads_remaining));
-         memset(written, 0, grf_count * sizeof(*written));
+      if (writes_remaining) {
+         memset(writes_remaining, 0,
+                grf_count * sizeof(*writes_remaining));
+         memset(hw_read, 0,
+                hw_reg_count * sizeof(*hw_read));
+         memset(read, 0, grf_count * sizeof(*read));
 
          foreach_inst_in_block(fs_inst, inst, block)
-            count_reads_remaining(inst);
+            count_writes_remaining(inst);
       }
 
       add_insts_from_block(block);
author	Connor Abbott <cwabbott0@gmail.com>	2015-10-02 21:39:41 -0400
committer	Matt Turner <mattst88@gmail.com>	2016-04-28 16:11:21 -0700
commit	26812f052ee051a8e16e7376c97fb8acb55f3231 (patch)
tree	4bfbad92b171ca5410afefb22037e3ef8cf40c97
parent	8a851a77724f64310569f8455f166556a26a471e (diff)