nir: Allow opt_peephole_sel to be more aggressive in flattening IFs.

VC4 was running into a major performance regression from enabling control flow in the glmark2 conditionals test, because of short if statements containing an ffract. This pass seems like it was was trying to ensure that we only flattened IFs that should be entirely a win by guaranteeing that there would be fewer bcsels than there were MOVs otherwise. However, if the number of ALU ops is small, we can avoid the overhead of branching (which itself costs cycles) and still get a win, even if it means moving real instructions out of the THEN/ELSE blocks. For now, just turn on aggressive flattening on vc4. i965 will need some tuning to avoid regressions. It does looks like this may be useful to replace freedreno code. Improves glmark2 -b conditionals:fragment-steps=5:vertex-steps=0 from 47 fps to 95 fps on vc4. vc4 shader-db: total instructions in shared programs: 101282 -> 99543 (-1.72%) instructions in affected programs: 17365 -> 15626 (-10.01%) total uniforms in shared programs: 31295 -> 31172 (-0.39%) uniforms in affected programs: 3580 -> 3457 (-3.44%) total estimated cycles in shared programs: 225182 -> 223746 (-0.64%) estimated cycles in affected programs: 26085 -> 24649 (-5.51%) v2: Update shader-db output. Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> (v1)
author: Eric Anholt <eric@anholt.net> 2016-09-06 19:45:51 -0700
committer: Eric Anholt <eric@anholt.net> 2016-09-22 11:10:21 +0300
commit: 36f0f0318275f65f8744ec6f9471702e2f58e6d5 (patch)
tree: 1b6a4fb819a5d7a7527635edda690d9f53e5e535
parent: 6c648cdac80233765861bfebb74a6e2725ddbf26 (diff)
4 files changed, 57 insertions, 31 deletions
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index aac247c79c..8d1afb98cb 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2600,7 +2600,7 @@ bool nir_opt_dead_cf(nir_shader *shader);
 
 bool nir_opt_gcm(nir_shader *shader, bool value_number);
 
-bool nir_opt_peephole_select(nir_shader *shader);
+bool nir_opt_peephole_select(nir_shader *shader, unsigned limit);
 
 bool nir_opt_remove_phis(nir_shader *shader);
 
diff --git a/src/compiler/nir/nir_opt_peephole_select.c b/src/compiler/nir/nir_opt_peephole_select.c
index 633e9f486c..6a73d73707 100644
--- a/src/compiler/nir/nir_opt_peephole_select.c
+++ b/src/compiler/nir/nir_opt_peephole_select.c
@@ -32,23 +32,33 @@
  * Implements a small peephole optimization that looks for
  *
  * if (cond) {
- *    <empty>
+ *    <then SSA defs>
  * } else {
- *    <empty>
+ *    <else SSA defs>
  * }
  * phi
  * ...
  * phi
  *
- * and replaces it with a series of selects.  It can also handle the case
- * where, instead of being empty, the if may contain some move operations
- * whose only use is one of the following phi nodes.  This happens all the
- * time when the SSA form comes from a conditional assignment with a
- * swizzle.
+ * and replaces it with:
+ *
+ * <then SSA defs>
+ * <else SSA defs>
+ * bcsel
+ * ...
+ * bcsel
+ *
+ * where the SSA defs are ALU operations or other cheap instructions (not
+ * texturing, for example).
+ *
+ * If the number of ALU operations in the branches is greater than the limit
+ * parameter, then the optimization is skipped.  In limit=0 mode, the SSA defs
+ * must only be MOVs which we expect to get copy-propagated away once they're
+ * out of the inner blocks.
  */
 
 static bool
-block_check_for_allowed_instrs(nir_block *block)
+block_check_for_allowed_instrs(nir_block *block, unsigned *count, bool alu_ok)
 {
    nir_foreach_instr(instr, block) {
       switch (instr->type) {
@@ -67,6 +77,11 @@ block_check_for_allowed_instrs(nir_block *block)
             }
             break;
 
+         case nir_intrinsic_load_uniform:
+            if (!alu_ok)
+               return false;
+            break;
+
          default:
             return false;
          }
@@ -89,29 +104,36 @@ block_check_for_allowed_instrs(nir_block *block)
          case nir_op_vec2:
          case nir_op_vec3:
          case nir_op_vec4:
-            /* It must be a move-like operation. */
             break;
          default:
-            return false;
+            if (!alu_ok) {
+               /* It must be a move-like operation. */
+               return false;
+            }
+            break;
          }
 
-         /* Can't handle saturate */
-         if (mov->dest.saturate)
-            return false;
-
          /* It must be SSA */
          if (!mov->dest.dest.is_ssa)
             return false;
 
-         /* It cannot have any if-uses */
-         if (!list_empty(&mov->dest.dest.ssa.if_uses))
-            return false;
+         if (alu_ok) {
+            (*count)++;
+         } else {
+            /* Can't handle saturate */
+            if (mov->dest.saturate)
+               return false;
 
-         /* The only uses of this definition must be phi's in the successor */
-         nir_foreach_use(use, &mov->dest.dest.ssa) {
-            if (use->parent_instr->type != nir_instr_type_phi ||
-                use->parent_instr->block != block->successors[0])
+            /* It cannot have any if-uses */
+            if (!list_empty(&mov->dest.dest.ssa.if_uses))
                return false;
+
+            /* The only uses of this definition must be phi's in the successor */
+            nir_foreach_use(use, &mov->dest.dest.ssa) {
+               if (use->parent_instr->type != nir_instr_type_phi ||
+                   use->parent_instr->block != block->successors[0])
+                  return false;
+            }
          }
          break;
       }
@@ -125,7 +147,7 @@ block_check_for_allowed_instrs(nir_block *block)
 }
 
 static bool
-nir_opt_peephole_select_block(nir_block *block, void *mem_ctx)
+nir_opt_peephole_select_block(nir_block *block, void *mem_ctx, unsigned limit)
 {
    if (nir_cf_node_is_first(&block->cf_node))
       return false;
@@ -147,8 +169,12 @@ nir_opt_peephole_select_block(nir_block *block, void *mem_ctx)
    nir_block *else_block = nir_cf_node_as_block(else_node);
 
    /* ... and those blocks must only contain "allowed" instructions. */
-   if (!block_check_for_allowed_instrs(then_block) ||
-       !block_check_for_allowed_instrs(else_block))
+   unsigned count = 0;
+   if (!block_check_for_allowed_instrs(then_block, &count, limit != 0) ||
+       !block_check_for_allowed_instrs(else_block, &count, limit != 0))
+      return false;
+
+   if (count > limit)
       return false;
 
    /* At this point, we know that the previous CFG node is an if-then
@@ -212,13 +238,13 @@ nir_opt_peephole_select_block(nir_block *block, void *mem_ctx)
 }
 
 static bool
-nir_opt_peephole_select_impl(nir_function_impl *impl)
+nir_opt_peephole_select_impl(nir_function_impl *impl, unsigned limit)
 {
    void *mem_ctx = ralloc_parent(impl);
    bool progress = false;
 
    nir_foreach_block_safe(block, impl) {
-      progress |= nir_opt_peephole_select_block(block, mem_ctx);
+      progress |= nir_opt_peephole_select_block(block, mem_ctx, limit);
    }
 
    if (progress)
@@ -228,13 +254,13 @@ nir_opt_peephole_select_impl(nir_function_impl *impl)
 }
 
 bool
-nir_opt_peephole_select(nir_shader *shader)
+nir_opt_peephole_select(nir_shader *shader, unsigned limit)
 {
    bool progress = false;
 
    nir_foreach_function(function, shader) {
       if (function->impl)
-         progress |= nir_opt_peephole_select_impl(function->impl);
+         progress |= nir_opt_peephole_select_impl(function->impl, limit);
    }
 
    return progress;
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 986a1ffc64..81c67168c1 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -1430,7 +1430,7 @@ vc4_optimize_nir(struct nir_shader *s)
                 NIR_PASS(progress, s, nir_opt_dce);
                 NIR_PASS(progress, s, nir_opt_dead_cf);
                 NIR_PASS(progress, s, nir_opt_cse);
-                NIR_PASS(progress, s, nir_opt_peephole_select);
+                NIR_PASS(progress, s, nir_opt_peephole_select, 8);
                 NIR_PASS(progress, s, nir_opt_algebraic);
                 NIR_PASS(progress, s, nir_opt_constant_folding);
                 NIR_PASS(progress, s, nir_opt_undef);
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index fbc84c474f..744865bd4c 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -416,7 +416,7 @@ nir_optimize(nir_shader *nir, bool is_scalar)
       OPT(nir_copy_prop);
       OPT(nir_opt_dce);
       OPT(nir_opt_cse);
-      OPT(nir_opt_peephole_select);
+      OPT(nir_opt_peephole_select, 0);
       OPT(nir_opt_algebraic);
       OPT(nir_opt_constant_folding);
       OPT(nir_opt_dead_cf);
author	Eric Anholt <eric@anholt.net>	2016-09-06 19:45:51 -0700
committer	Eric Anholt <eric@anholt.net>	2016-09-22 11:10:21 +0300
commit	36f0f0318275f65f8744ec6f9471702e2f58e6d5 (patch)
tree	1b6a4fb819a5d7a7527635edda690d9f53e5e535
parent	6c648cdac80233765861bfebb74a6e2725ddbf26 (diff)