vc4: Lazily emit our FS/VS input loads.

This reduces register pressure in both types of shaders, by reordering the input loads from the var->data.driver_location order to whatever order they first appear in the NIR shader. These instructions aren't reorderable at our QIR scheduling level because the FS takes two in lockstep to do an interpolation, and the VS takes multiple read instructions in a row to get a whole vec4-level attribute read.

shader-db impact:
total instructions in shared programs: 76666 -> 76590 (-0.10%)
instructions in affected programs: 42945 -> 42869 (-0.18%)
total max temps in shared programs: 9395 -> 9208 (-1.99%)
max temps in affected programs: 2951 -> 2764 (-6.34%)

Some programs get their max temps hurt, depending on the order that the load_input intrinsics appear, because we end up being unable to copy propagate an older VPM read into its only use.
author: Eric Anholt <eric@anholt.net> 2017-02-24 12:57:03 -0800
committer: Eric Anholt <eric@anholt.net> 2017-02-24 17:01:29 -0800
commit: 292c24ddac5acc35676424f05291c101fcd47b3e (patch)
tree: 1cc326dc2c1dd5c8abd664dae0b4e1fcfa4bf373 /src/gallium
parent: f06915d7b71eb955cc0db4b5555f5c6474926a01 (diff)
4 files changed, 93 insertions, 75 deletions
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index 6bd2424ec7..f346474abe 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -174,10 +174,10 @@ struct vc4_compiled_shader {
 
         uint8_t num_inputs;
 
-        /* Byte offsets for the start of the vertex attributes 0-7, and the
-         * total size as "attribute" 8.
-         */
-        uint8_t vattr_offsets[9];
+        /** Byte offsets for the start of the vertex attributes. */
+        uint8_t vattr_offsets[8];
+        /** Total size of the vertex inputs, in bytes. */
+        uint8_t vattr_total_size;
         uint8_t vattrs_live;
 
         const struct vc4_fs_inputs *fs_inputs;
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index ebd080298a..9f3765db1a 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -170,14 +170,14 @@ vc4_emit_gl_shader_state(struct vc4_context *vc4,
         /* VC4_DIRTY_COMPILED_VS */
         cl_u16(&shader_rec, 0); /* vs num uniforms */
         cl_u8(&shader_rec, vc4->prog.vs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.vs->vattr_offsets[8]);
+        cl_u8(&shader_rec, vc4->prog.vs->vattr_total_size);
         cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.vs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
         /* VC4_DIRTY_COMPILED_CS */
         cl_u16(&shader_rec, 0); /* cs num uniforms */
         cl_u8(&shader_rec, vc4->prog.cs->vattrs_live);
-        cl_u8(&shader_rec, vc4->prog.cs->vattr_offsets[8]);
+        cl_u8(&shader_rec, vc4->prog.cs->vattr_total_size);
         cl_reloc(job, &job->shader_rec, &shader_rec, vc4->prog.cs->bo, 0);
         cl_u32(&shader_rec, 0); /* UBO offset written by kernel */
 
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 05e596e733..21753439cf 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -733,11 +733,14 @@ emit_vertex_input(struct vc4_compile *c, int attr)
 {
         enum pipe_format format = c->vs_key->attr_formats[attr];
         uint32_t attr_size = util_format_get_blocksize(format);
+        uint32_t vpm_attr = c->next_vpm_input++;
 
-        c->vattr_sizes[attr] = align(attr_size, 4);
+        c->vpm_input_order[vpm_attr] = attr;
+
+        c->vattr_sizes[vpm_attr] = align(attr_size, 4);
         for (int i = 0; i < align(attr_size, 4) / 4; i++) {
                 c->inputs[attr * 4 + i] =
-                        qir_MOV(c, qir_reg(QFILE_VPM, attr * 4 + i));
+                        qir_MOV(c, qir_reg(QFILE_VPM, vpm_attr * 4 + i));
                 c->num_inputs++;
         }
 }
@@ -1466,6 +1469,7 @@ emit_stub_vpm_read(struct vc4_compile *c)
         if (c->num_inputs)
                 return;
 
+        c->next_vpm_input++;
         c->vattr_sizes[0] = 4;
         (void)qir_MOV(c, qir_reg(QFILE_VPM, 0));
         c->num_inputs++;
@@ -1552,64 +1556,6 @@ vc4_optimize_nir(struct nir_shader *s)
         } while (progress);
 }
 
-static int
-driver_location_compare(const void *in_a, const void *in_b)
-{
-        const nir_variable *const *a = in_a;
-        const nir_variable *const *b = in_b;
-
-        return (*a)->data.driver_location - (*b)->data.driver_location;
-}
-
-static void
-ntq_setup_inputs(struct vc4_compile *c)
-{
-        unsigned num_entries = 0;
-        nir_foreach_variable(var, &c->s->inputs)
-                num_entries++;
-
-        nir_variable *vars[num_entries];
-
-        unsigned i = 0;
-        nir_foreach_variable(var, &c->s->inputs)
-                vars[i++] = var;
-
-        /* Sort the variables so that we emit the input setup in
-         * driver_location order.  This is required for VPM reads, whose data
-         * is fetched into the VPM in driver_location (TGSI register index)
-         * order.
-         */
-        qsort(&vars, num_entries, sizeof(*vars), driver_location_compare);
-
-        for (unsigned i = 0; i < num_entries; i++) {
-                nir_variable *var = vars[i];
-                unsigned array_len = MAX2(glsl_get_length(var->type), 1);
-                unsigned loc = var->data.driver_location;
-
-                assert(array_len == 1);
-                (void)array_len;
-                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
-                                  (loc + 1) * 4);
-
-                if (c->stage == QSTAGE_FRAG) {
-                        if (var->data.location == VARYING_SLOT_POS) {
-                                emit_fragcoord_input(c, loc);
-                        } else if (var->data.location == VARYING_SLOT_PNTC ||
-                                   (var->data.location >= VARYING_SLOT_VAR0 &&
-                                    (c->fs_key->point_sprite_mask &
-                                     (1 << (var->data.location -
-                                            VARYING_SLOT_VAR0))))) {
-                                c->inputs[loc * 4 + 0] = c->point_x;
-                                c->inputs[loc * 4 + 1] = c->point_y;
-                        } else {
-                                emit_fragment_input(c, loc, var->data.location);
-                        }
-                } else {
-                        emit_vertex_input(c, loc);
-                }
-        }
-}
-
 static void
 ntq_setup_outputs(struct vc4_compile *c)
 {
@@ -1740,10 +1686,73 @@ ntq_emit_load_input(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 return;
         }
 
-        uint32_t offset = nir_intrinsic_base(instr) + const_offset->u32[0];
+        /* Size our inputs array as far as this input.  Input arrays are
+         * small, and we don't have a shader_info field that tells us up front
+         * what the maximum driver_location is.
+         */
+        uint32_t loc = nir_intrinsic_base(instr) + const_offset->u32[0];
+        if ((loc + 1) * 4 > c->inputs_array_size) {
+                resize_qreg_array(c, &c->inputs, &c->inputs_array_size,
+                                  (loc + 1) * 4);
+        }
+
+        /* If we've already loaded this input, just return it.  This would
+         * happen for VPM loads, where we load an entire vertex attribute at
+         * once, or possibly also in the FS if we haven't CSEed away repeated
+         * loads.
+         */
         int comp = nir_intrinsic_component(instr);
+        if (c->inputs[loc * 4 + comp].file != QFILE_NULL) {
+                ntq_store_dest(c, &instr->dest, 0,
+                               qir_MOV(c, c->inputs[loc * 4 + comp]));
+                return;
+        }
+
+        /* In the FS, we always have to fully drain our FS FIFO before
+         * terminating the shader.  For the VS we only have to drain whatever
+         * VPM setup we configure, but vc4_qpu_emit.c configures it for the
+         * entire vertex attribute space.  Because of this, we emit our lazy
+         * varying/VPM loads at the last top level basic block.
+         */
+        struct qblock *saved_cur_block = c->cur_block;
+        c->cur_block = c->last_top_block;
+
+        /* Look up the NIR variable for this input, so we can see how big the
+         * input is, or what sort of interpolation is necessary.
+         */
+        nir_variable *var = NULL;
+        nir_foreach_variable(search_var, &c->s->inputs) {
+                unsigned search_len = MAX2(glsl_get_length(search_var->type), 1);
+                unsigned search_loc = search_var->data.driver_location;
+
+                if (loc >= search_loc && loc < search_loc + search_len) {
+                        var = search_var;
+                        break;
+                }
+        }
+        assert(var);
+
+        if (c->stage == QSTAGE_FRAG) {
+                if (var->data.location == VARYING_SLOT_POS) {
+                        emit_fragcoord_input(c, loc);
+                } else if (var->data.location == VARYING_SLOT_PNTC ||
+                           (var->data.location >= VARYING_SLOT_VAR0 &&
+                            (c->fs_key->point_sprite_mask &
+                             (1 << (var->data.location -
+                                    VARYING_SLOT_VAR0))))) {
+                        c->inputs[loc * 4 + 0] = c->point_x;
+                        c->inputs[loc * 4 + 1] = c->point_y;
+                } else {
+                        emit_fragment_input(c, loc, var->data.location);
+                }
+        } else {
+                emit_vertex_input(c, loc);
+        }
+
+        c->cur_block = saved_cur_block;
+
         ntq_store_dest(c, &instr->dest, 0,
-                       qir_MOV(c, c->inputs[offset * 4 + comp]));
+                       qir_MOV(c, c->inputs[loc * 4 + comp]));
 }
 
 static void
@@ -2161,7 +2170,6 @@ nir_to_qir(struct vc4_compile *c)
         if (c->stage == QSTAGE_FRAG && c->s->info->fs.uses_discard)
                 c->discard = qir_MOV(c, qir_uniform_ui(c, 0));
 
-        ntq_setup_inputs(c);
         ntq_setup_outputs(c);
         ntq_setup_uniforms(c);
         ntq_setup_registers(c, &c->s->registers);
@@ -2587,14 +2595,17 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
         } else {
                 shader->num_inputs = c->num_inputs;
 
-                shader->vattr_offsets[0] = 0;
-                for (int i = 0; i < 8; i++) {
-                        shader->vattr_offsets[i + 1] =
-                                shader->vattr_offsets[i] + c->vattr_sizes[i];
+                uint8_t next_vattr_offset = 0;
+                for (int i = 0; i < c->next_vpm_input; i++) {
+                        if (!c->vattr_sizes[i])
+                                continue;
 
-                        if (c->vattr_sizes[i])
-                                shader->vattrs_live |= (1 << i);
+                        uint32_t nir_attr = c->vpm_input_order[i];
+                        shader->vattr_offsets[nir_attr] = next_vattr_offset;
+                        next_vattr_offset += c->vattr_sizes[i];
+                        shader->vattrs_live |= (1 << nir_attr);
                 }
+                shader->vattr_total_size = next_vattr_offset;
         }
 
         shader->failed = c->failed;
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index 6469e51b05..fe86232aeb 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -462,6 +462,13 @@ struct vc4_compile {
         uint8_t vattr_sizes[8];
 
         /**
+         * Order in which the vattrs were loaded by the program, to arrange
+         * vattr_offsets[] in the program data appropriately.
+         */
+        uint8_t vpm_input_order[8];
+        uint8_t next_vpm_input;
+
+        /**
          * Array of the VARYING_SLOT_* of all FS QFILE_VARY reads.
          *
          * This includes those that aren't part of the VPM varyings, like
author	Eric Anholt <eric@anholt.net>	2017-02-24 12:57:03 -0800
committer	Eric Anholt <eric@anholt.net>	2017-02-24 17:01:29 -0800
commit	292c24ddac5acc35676424f05291c101fcd47b3e (patch)
tree	1cc326dc2c1dd5c8abd664dae0b4e1fcfa4bf373 /src/gallium
parent	f06915d7b71eb955cc0db4b5555f5c6474926a01 (diff)