intel/fs: Implement performance analysis-based SIMD32 heuristic for fragment shaders.

The heuristic enables the SIMD32 fragment shader based on whether the IR performance modeling pass predicts it to have greater throughput than the SIMD16 and SIMD8 variants of the same shader. It would be straightforward to do the same thing in order to control whether SIMD16 dispatch is enabled, but it's pending additional performance evaluation. The INTEL_DEBUG=do32 option is left around in order to force the SIMD32 shader to be used regardless of the result of the heuristic, since it's useful as a debugging aid e.g. in order to identify SIMD32-specific codegen issues which may be masked by the SIMD32 heuristic, or cases where the heuristic is incorrectly disabling SIMD32 shaders that offer a performance advantage. Currently this is only enabled on Gen6+, since SIMD32 codegen support is incomplete on earlier platforms. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
author: Francisco Jerez <currojerez@riseup.net> 2020-04-02 17:30:06 -0700
committer: Francisco Jerez <currojerez@riseup.net> 2020-04-28 23:01:27 -0700
commit: 14f0a5cf64f6b8725ebe8ae68b19b096995ea0fe (patch)
tree: ee5af7db13ef0c86a65a8999b9d05c1d792dc256
parent: d6aa0c261f2d9ccacaa6579432c16c61ca4cb073 (diff)
1 files changed, 17 insertions, 7 deletions
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 010519ae565..c65bb204242 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -8646,8 +8646,7 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                char **error_str)
 {
    const struct gen_device_info *devinfo = compiler->devinfo;
-
-   unsigned max_subgroup_size = unlikely(INTEL_DEBUG & DEBUG_DO32) ? 32 : 16;
+   const unsigned max_subgroup_size = compiler->devinfo->gen >= 6 ? 32 : 16;
 
    brw_nir_apply_key(shader, compiler, &key->base, max_subgroup_size, true);
    brw_nir_lower_fs_inputs(shader, devinfo, key);
@@ -8707,6 +8706,7 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
 
    fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
    cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
+   float throughput = 0;
 
    v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                        &prog_data->base, shader, 8, shader_time_index8);
@@ -8720,6 +8720,8 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
       simd8_cfg = v8->cfg;
       prog_data->base.dispatch_grf_start_reg = v8->payload.num_regs;
       prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);
+      const performance &perf = v8->performance_analysis.require();
+      throughput = MAX2(throughput, perf.throughput);
    }
 
    /* Limit dispatch width to simd8 with dual source blending on gen8.
@@ -8746,13 +8748,14 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
          simd16_cfg = v16->cfg;
          prog_data->dispatch_grf_start_reg_16 = v16->payload.num_regs;
          prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used);
+         const performance &perf = v16->performance_analysis.require();
+         throughput = MAX2(throughput, perf.throughput);
       }
    }
 
    /* Currently, the compiler only supports SIMD32 on SNB+ */
    if (v8->max_dispatch_width >= 32 && !use_rep_send &&
-       compiler->devinfo->gen >= 6 &&
-       unlikely(INTEL_DEBUG & DEBUG_DO32)) {
+       devinfo->gen >= 6 && simd16_cfg) {
       /* Try a SIMD32 compile */
       v32 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                            &prog_data->base, shader, 32, shader_time_index32);
@@ -8762,9 +8765,16 @@ brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
                                    "SIMD32 shader failed to compile: %s",
                                    v32->fail_msg);
       } else {
-         simd32_cfg = v32->cfg;
-         prog_data->dispatch_grf_start_reg_32 = v32->payload.num_regs;
-         prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used);
+         const performance &perf = v32->performance_analysis.require();
+
+         if (!(INTEL_DEBUG & DEBUG_DO32) && throughput >= perf.throughput) {
+            compiler->shader_perf_log(log_data, "SIMD32 shader inefficient\n");
+         } else {
+            simd32_cfg = v32->cfg;
+            prog_data->dispatch_grf_start_reg_32 = v32->payload.num_regs;
+            prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used);
+            throughput = MAX2(throughput, perf.throughput);
+         }
       }
    }
author	Francisco Jerez <currojerez@riseup.net>	2020-04-02 17:30:06 -0700
committer	Francisco Jerez <currojerez@riseup.net>	2020-04-28 23:01:27 -0700
commit	14f0a5cf64f6b8725ebe8ae68b19b096995ea0fe (patch)
tree	ee5af7db13ef0c86a65a8999b9d05c1d792dc256
parent	d6aa0c261f2d9ccacaa6579432c16c61ca4cb073 (diff)