diff options
author | Ian Romanick <ian.d.romanick@intel.com> | 2020-08-03 10:20:15 -0700 |
---|---|---|
committer | Ian Romanick <ian.d.romanick@intel.com> | 2021-12-14 13:28:01 -0800 |
commit | 1cdf309f43bdfd294a65d4462d539ec01c6c4c51 (patch) | |
tree | baef8bc4d6e685556008f049288c10718f560a6b | |
parent | 9a9f78db810413c8495b4e3b7e1f19242a1c4733 (diff) |
intel/fs: Switch to using util_combine_constants
At this point in the series, most platforms have the same results with
the new implementation. Gen7 platforms see a significant number of
"small" changes. Due to the coissue optimization on Gen7, each shader
is likely to have most constants affected by constant combining.
If a shader has only a signle basic block, constants are packed into
registers in the order produced by the constant combining process.
Since each constant has a different live range in the shader, even
slightly different packing orders can have dramatic effects on the live
range of a register. Even in cases where this does not affect register
pressure in a meaningful way, it can cause the scheduler to make very
different choices about the ordering of instructions.
From my analysis (using the `if (debug) { ... }` block at the end of
fs_visitor::opt_combine_constants), the old implementation and the new
implementation pick the same set of constants, but the order produced
may be slightly different. For the smaller number of values in non-Gen7
shaders, the orders are similar enough to not matter.
v2: Redact the changes to the sorting for shaders that have a single
block. This was an added 'else' clause to the 'if (cfg->num_blocks !=
1) qsort(...);'
No shader-db or fossil-db changes on any non-Gen7 platforms.
Haswell
total instructions in shared programs: 16362521 -> 16362541 (<.01%)
instructions in affected programs: 12985 -> 13005 (0.15%)
helped: 9
HURT: 20
helped stats (abs) min: 1 max: 3 x̄: 1.44 x̃: 1
helped stats (rel) min: 0.13% max: 0.53% x̄: 0.27% x̃: 0.25%
HURT stats (abs) min: 1 max: 5 x̄: 1.65 x̃: 1
HURT stats (rel) min: 0.22% max: 0.90% x̄: 0.42% x̃: 0.41%
95% mean confidence interval for instructions value: 0.05 1.33
95% mean confidence interval for instructions %-change: 0.06% 0.35%
Instructions are HURT.
total cycles in shared programs: 1034767222 -> 1034901336 (0.01%)
cycles in affected programs: 21184138 -> 21318252 (0.63%)
helped: 1712
HURT: 2142
helped stats (abs) min: 1 max: 7892 x̄: 247.42 x̃: 44
helped stats (rel) min: <.01% max: 50.82% x̄: 4.04% x̃: 0.90%
HURT stats (abs) min: 1 max: 6430 x̄: 260.37 x̃: 51
HURT stats (rel) min: <.01% max: 167.19% x̄: 5.63% x̃: 1.33%
95% mean confidence interval for cycles value: 16.32 53.28
95% mean confidence interval for cycles %-change: 0.99% 1.68%
Cycles are HURT.
total spills in shared programs: 17672 -> 17673 (<.01%)
spills in affected programs: 7 -> 8 (14.29%)
helped: 0
HURT: 1
total fills in shared programs: 20752 -> 20753 (<.01%)
fills in affected programs: 5 -> 6 (20.00%)
helped: 0
HURT: 1
LOST: 84
GAINED: 111
Ivy Bridge
total instructions in shared programs: 15471157 -> 15471168 (<.01%)
instructions in affected programs: 12148 -> 12159 (0.09%)
helped: 6
HURT: 17
helped stats (abs) min: 1 max: 4 x̄: 2.17 x̃: 2
helped stats (rel) min: 0.13% max: 0.83% x̄: 0.38% x̃: 0.32%
HURT stats (abs) min: 1 max: 2 x̄: 1.41 x̃: 1
HURT stats (rel) min: 0.16% max: 0.60% x̄: 0.31% x̃: 0.22%
95% mean confidence interval for instructions value: -0.28 1.24
95% mean confidence interval for instructions %-change: -0.03% 0.28%
Inconclusive result (value mean confidence interval includes 0).
total cycles in shared programs: 584135157 -> 584250366 (0.02%)
cycles in affected programs: 18638458 -> 18753667 (0.62%)
helped: 1505
HURT: 1917
helped stats (abs) min: 1 max: 6290 x̄: 243.52 x̃: 48
helped stats (rel) min: <.01% max: 49.90% x̄: 4.02% x̃: 0.90%
HURT stats (abs) min: 1 max: 5377 x̄: 251.28 x̃: 52
HURT stats (rel) min: 0.01% max: 95.39% x̄: 5.14% x̃: 1.26%
95% mean confidence interval for cycles value: 15.01 52.32
95% mean confidence interval for cycles %-change: 0.78% 1.44%
Cycles are HURT.
LOST: 61
GAINED: 71
-rw-r--r-- | src/intel/compiler/brw_fs_combine_constants.cpp | 395 |
1 files changed, 235 insertions, 160 deletions
diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp index 6ed44312470..323d23a3dd8 100644 --- a/src/intel/compiler/brw_fs_combine_constants.cpp +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -37,6 +37,7 @@ #include "brw_fs.h" #include "brw_cfg.h" #include "util/half_float.h" +#include "util/combine_constants.h" using namespace brw; @@ -66,20 +67,38 @@ could_coissue(const struct intel_device_info *devinfo, const fs_inst *inst) inst->src[0].type == BRW_REGISTER_TYPE_F; } +/** + * Box for storing fs_inst and some other necessary data + * + * The \c fs_inst_box is used as the \c abstract_instruction for + * \c util_combine_constants. + * + * \sa box_instruction + */ +struct fs_inst_box { + fs_inst *inst; + unsigned ip; + bblock_t *block; + bool must_promote; +}; + /** A box for putting fs_regs in a linked list. */ struct reg_link { DECLARE_RALLOC_CXX_OPERATORS(reg_link) - reg_link(fs_reg *reg) : reg(reg) {} + reg_link(fs_inst *inst, unsigned src, bool negate) + : inst(inst), src(src), negate(negate) {} struct exec_node link; - fs_reg *reg; + fs_inst *inst; + uint8_t src; + bool negate; }; static struct exec_node * -link(void *mem_ctx, fs_reg *reg) +link(void *mem_ctx, fs_inst *inst, unsigned src, bool negate) { - reg_link *l = new(mem_ctx) reg_link(reg); + reg_link *l = new(mem_ctx) reg_link(inst, src, negate); return &l->link; } @@ -138,31 +157,66 @@ struct imm { /** The working set of information about immediates. */ struct table { - struct imm *imm; + struct value *values; int size; + int num_values; + + struct imm *imm; int len; + + struct fs_inst_box *boxes; + unsigned num_boxes; + unsigned size_boxes; }; -static struct imm * -find_imm(struct table *table, void *data, uint8_t size) +static struct value * +new_value(struct table *table, void *mem_ctx) { - for (int i = 0; i < table->len; i++) { - if (table->imm[i].size == size && - !memcmp(table->imm[i].bytes, data, size)) { - return &table->imm[i]; - } + if (table->num_values == table->size) { + table->size *= 2; + table->values = reralloc(mem_ctx, table->values, struct value, table->size); } - return NULL; + return &table->values[table->num_values++]; } -static struct imm * -new_imm(struct table *table, void *mem_ctx) +/** + * Store an instruction with some other data in a table. + * + * \returns the index into the dynamic array of boxes for the instruction. + */ +static unsigned +box_instruction(struct table *table, void *mem_ctx, fs_inst *inst, + unsigned ip, bblock_t *block, bool must_promote) { - if (table->len == table->size) { - table->size *= 2; - table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size); + /* It is common for box_instruction to be called consecutively for each + * source of an instruction. As a result, the most common case for finding + * an instruction in the table is when that instruction was the last one + * added. Search the list back to front. + */ + for (unsigned i = table->num_boxes; i > 0; /* empty */) { + i--; + + if (table->boxes[i].inst == inst) + return i; + } + + if (table->num_boxes == table->size_boxes) { + table->size_boxes *= 2; + table->boxes = reralloc(mem_ctx, table->boxes, fs_inst_box, + table->size_boxes); } - return &table->imm[table->len++]; + + assert(table->num_boxes < table->size_boxes); + + const unsigned idx = table->num_boxes++; + fs_inst_box *ib = &table->boxes[idx]; + + ib->inst = inst; + ib->block = block; + ib->ip = ip; + ib->must_promote = must_promote; + + return idx; } /** @@ -190,67 +244,6 @@ compare(const void *_a, const void *_b) return a->first_use_ip - b->first_use_ip; } -static bool -get_constant_value(const struct intel_device_info *devinfo, - const fs_inst *inst, uint32_t src_idx, - void *out, brw_reg_type *out_type) -{ - const bool can_do_source_mods = inst->can_do_source_mods(devinfo); - const fs_reg *src = &inst->src[src_idx]; - - *out_type = src->type; - - switch (*out_type) { - case BRW_REGISTER_TYPE_DF: { - double val = !can_do_source_mods ? src->df : fabs(src->df); - memcpy(out, &val, 8); - break; - } - case BRW_REGISTER_TYPE_F: { - float val = !can_do_source_mods ? src->f : fabsf(src->f); - memcpy(out, &val, 4); - break; - } - case BRW_REGISTER_TYPE_HF: { - uint16_t val = src->d & 0xffffu; - if (can_do_source_mods) - val = _mesa_float_to_half(fabsf(_mesa_half_to_float(val))); - memcpy(out, &val, 2); - break; - } - case BRW_REGISTER_TYPE_Q: { - int64_t val = !can_do_source_mods ? src->d64 : llabs(src->d64); - memcpy(out, &val, 8); - break; - } - case BRW_REGISTER_TYPE_UQ: - memcpy(out, &src->u64, 8); - break; - case BRW_REGISTER_TYPE_D: { - int32_t val = !can_do_source_mods ? src->d : abs(src->d); - memcpy(out, &val, 4); - break; - } - case BRW_REGISTER_TYPE_UD: - memcpy(out, &src->ud, 4); - break; - case BRW_REGISTER_TYPE_W: { - int16_t val = src->d & 0xffffu; - if (can_do_source_mods) - val = abs(val); - memcpy(out, &val, 2); - break; - } - case BRW_REGISTER_TYPE_UW: - memcpy(out, &src->ud, 2); - break; - default: - return false; - }; - - return true; -} - static struct brw_reg build_imm_reg_for_copy(struct imm *imm) { @@ -276,31 +269,6 @@ get_alignment_for_imm(const struct imm *imm) } static bool -needs_negate(const fs_reg *reg, const struct imm *imm) -{ - switch (reg->type) { - case BRW_REGISTER_TYPE_DF: - return signbit(reg->df) != signbit(imm->df); - case BRW_REGISTER_TYPE_F: - return signbit(reg->f) != signbit(imm->f); - case BRW_REGISTER_TYPE_Q: - return (reg->d64 < 0) != (imm->d64 < 0); - case BRW_REGISTER_TYPE_D: - return (reg->d < 0) != (imm->d < 0); - case BRW_REGISTER_TYPE_HF: - return (reg->d & 0x8000u) != (imm->w & 0x8000u); - case BRW_REGISTER_TYPE_W: - return ((int16_t)reg->d < 0) != (imm->w < 0); - case BRW_REGISTER_TYPE_UQ: - case BRW_REGISTER_TYPE_UD: - case BRW_REGISTER_TYPE_UW: - return false; - default: - unreachable("not implemented"); - }; -} - -static bool representable_as_hf(float f, uint16_t *hf) { union fi u; @@ -403,43 +371,49 @@ static void add_candidate_immediate(struct table *table, fs_inst *inst, unsigned ip, unsigned i, bool must_promote, - const brw::idom_tree &idom, bblock_t *block, - const struct intel_device_info *devinfo, + bblock_t *block, + ASSERTED const struct intel_device_info *devinfo, void *const_ctx) { - char data[8]; - brw_reg_type type; - if (!get_constant_value(devinfo, inst, i, data, &type)) - return; - - uint8_t size = type_sz(type); - - struct imm *imm = find_imm(table, data, size); - - if (imm) { - bblock_t *intersection = idom.intersect(block, imm->block); - if (intersection != imm->block) - imm->inst = NULL; - imm->block = intersection; - imm->uses->push_tail(link(const_ctx, &inst->src[i])); - imm->uses_by_coissue += int(!must_promote); - imm->must_promote = imm->must_promote || must_promote; - imm->last_use_ip = ip; - if (type == BRW_REGISTER_TYPE_HF) - imm->is_half_float = true; - } else { - imm = new_imm(table, const_ctx); - imm->block = block; - imm->inst = inst; - imm->uses = new(const_ctx) exec_list(); - imm->uses->push_tail(link(const_ctx, &inst->src[i])); - memcpy(imm->bytes, data, size); - imm->size = size; - imm->is_half_float = type == BRW_REGISTER_TYPE_HF; - imm->uses_by_coissue = int(!must_promote); - imm->must_promote = must_promote; - imm->first_use_ip = ip; - imm->last_use_ip = ip; + struct value *v = new_value(table, const_ctx); + + unsigned box_idx = box_instruction(table, const_ctx, inst, ip, block, + must_promote); + + /* Just for now... */ + assert(inst->can_do_source_mods(devinfo)); + + v->value.u64 = inst->src[i].d64; + v->bit_size = 8 * type_sz(inst->src[i].type); + v->instr = (struct abstract_instruction *)(uintptr_t) box_idx; + v->src = i; + v->allow_one_constant = false; + v->no_negations = false; + + switch (inst->src[i].type) { + case BRW_REGISTER_TYPE_DF: + case BRW_REGISTER_TYPE_NF: + case BRW_REGISTER_TYPE_F: + case BRW_REGISTER_TYPE_HF: + v->type = float_only; + break; + + case BRW_REGISTER_TYPE_UQ: + case BRW_REGISTER_TYPE_Q: + case BRW_REGISTER_TYPE_UD: + case BRW_REGISTER_TYPE_D: + case BRW_REGISTER_TYPE_UW: + case BRW_REGISTER_TYPE_W: + v->type = integer_only; + break; + + case BRW_REGISTER_TYPE_VF: + case BRW_REGISTER_TYPE_UV: + case BRW_REGISTER_TYPE_V: + case BRW_REGISTER_TYPE_UB: + case BRW_REGISTER_TYPE_B: + default: + unreachable("not reached"); } } @@ -449,9 +423,20 @@ fs_visitor::opt_combine_constants() void *const_ctx = ralloc_context(NULL); struct table table; - table.size = 8; - table.len = 0; - table.imm = ralloc_array(const_ctx, struct imm, table.size); + + /* For each of the dynamic arrays in the table, allocate about a page of + * memory. On LP64 systems, this gives 126 value objects 169 fs_inst_box + * objects. Even larger shaders that have been obverved rarely need more + * than 20 or 30 values. Most smaller shaders, which is most shaders, need + * at most a couple dozen fs_inst_box. + */ + table.size = (4096 - (5 * sizeof(void *))) / sizeof(struct value); + table.num_values = 0; + table.values = ralloc_array(const_ctx, struct value, table.size); + + table.size_boxes = (4096 - (5 * sizeof(void *))) / sizeof(struct fs_inst_box); + table.num_boxes = 0; + table.boxes = ralloc_array(const_ctx, fs_inst_box, table.size_boxes); const brw::idom_tree &idom = idom_analysis.require(); unsigned ip = -1; @@ -468,7 +453,7 @@ fs_visitor::opt_combine_constants() assert(inst->src[0].file != IMM); if (inst->src[1].file == IMM && devinfo->ver < 8) { - add_candidate_immediate(&table, inst, ip, 1, true, idom, block, + add_candidate_immediate(&table, inst, ip, 1, true, block, devinfo, const_ctx); } @@ -483,7 +468,7 @@ fs_visitor::opt_combine_constants() if (can_promote_src_as_imm(devinfo, inst, i)) continue; - add_candidate_immediate(&table, inst, ip, i, true, idom, block, + add_candidate_immediate(&table, inst, ip, i, true, block, devinfo, const_ctx); } @@ -495,7 +480,7 @@ fs_visitor::opt_combine_constants() if (inst->src[i].file != IMM) continue; - add_candidate_immediate(&table, inst, ip, i, true, idom, block, + add_candidate_immediate(&table, inst, ip, i, true, block, devinfo, const_ctx); } @@ -503,7 +488,7 @@ fs_visitor::opt_combine_constants() case BRW_OPCODE_MOV: if (could_coissue(devinfo, inst) && inst->src[0].file == IMM) { - add_candidate_immediate(&table, inst, ip, 0, false, idom, block, + add_candidate_immediate(&table, inst, ip, 0, false, block, devinfo, const_ctx); } break; @@ -514,7 +499,7 @@ fs_visitor::opt_combine_constants() assert(inst->src[0].file != IMM); if (could_coissue(devinfo, inst) && inst->src[1].file == IMM) { - add_candidate_immediate(&table, inst, ip, 1, false, idom, block, + add_candidate_immediate(&table, inst, ip, 1, false, block, devinfo, const_ctx); } break; @@ -524,19 +509,106 @@ fs_visitor::opt_combine_constants() } } - /* Remove constants from the table that don't have enough uses to make them - * profitable to store in a register. - */ - for (int i = 0; i < table.len;) { - struct imm *imm = &table.imm[i]; + if (table.num_values == 0) { + ralloc_free(const_ctx); + return false; + } + + struct combine_constants_result *result = + util_combine_constants(table.values, table.num_values); + + table.imm = ralloc_array(const_ctx, struct imm, result->num_values_to_emit); + table.len = 0; + + for (unsigned i = 0; i < result->num_values_to_emit; i++) { + struct imm *imm = &table.imm[table.len]; + + imm->block = NULL; + imm->inst = NULL; + imm->d64 = result->values_to_emit[i].value.u64; + imm->size = result->values_to_emit[i].bit_size / 8; + + imm->uses_by_coissue = 0; + imm->must_promote = false; + imm->is_half_float = false; + + imm->first_use_ip = UINT16_MAX; + imm->last_use_ip = 0; + + imm->uses = new(const_ctx) exec_list; + + const unsigned first_user = result->values_to_emit[i].first_user; + const unsigned last_user = first_user + + result->values_to_emit[i].num_users; + + for (unsigned j = first_user; j < last_user; j++) { + const unsigned idx = + (unsigned)(uintptr_t) table.values[result->user_map[j].index].instr; + fs_inst_box *const ib = &table.boxes[idx]; + + const unsigned src = table.values[result->user_map[j].index].src; + + imm->uses->push_tail(link(const_ctx, ib->inst, src, + result->user_map[j].negate)); + + if (ib->must_promote) + imm->must_promote = true; + else + imm->uses_by_coissue++; + + if (imm->block == NULL) { + /* Block should only be NULL on the first pass. On the first + * pass, inst should also be NULL. + */ + assert(imm->inst == NULL); + + imm->inst = ib->inst; + imm->block = ib->block; + imm->first_use_ip = ib->ip; + imm->last_use_ip = ib->ip; + } else { + bblock_t *intersection = idom.intersect(ib->block, + imm->block); + + if (imm->first_use_ip > ib->ip) { + imm->first_use_ip = ib->ip; + + /* If the first-use instruction is to be tracked, block must be + * the block that contains it. The old block was read in the + * idom.intersect call above, so it is safe to overwrite it + * here. + */ + imm->inst = ib->inst; + imm->block = ib->block; + } + + if (imm->last_use_ip < ib->ip) + imm->last_use_ip = ib->ip; + + /* The common dominator is not the block that contains the + * first-use instruction, so don't track that instruction. The + * load instruction will be added in the common dominator block + * instead of before the first-use instruction. + */ + if (intersection != imm->block) + imm->inst = NULL; + + imm->block = intersection; + } - if (!imm->must_promote && imm->uses_by_coissue < 4) { - table.imm[i] = table.imm[table.len - 1]; - table.len--; - continue; + if (ib->inst->src[src].type == BRW_REGISTER_TYPE_HF) + imm->is_half_float = true; } - i++; + + /* Remove constants from the table that don't have enough uses to make + * them profitable to store in a register. + */ + if (imm->must_promote || imm->uses_by_coissue >= 4) + table.len++; } + + util_combine_constants_result_dtor(result); + if (table.len == 0) { ralloc_free(const_ctx); return false; @@ -594,7 +666,7 @@ fs_visitor::opt_combine_constants() /* Rewrite the immediate sources to refer to the new GRFs. */ for (int i = 0; i < table.len; i++) { foreach_list_typed(reg_link, link, link, table.imm[i].uses) { - fs_reg *reg = link->reg; + fs_reg *reg = &link->inst->src[link->src]; #ifdef DEBUG switch (reg->type) { case BRW_REGISTER_TYPE_DF: @@ -615,18 +687,21 @@ fs_visitor::opt_combine_constants() assert(abs(reg->d64) == abs(table.imm[i].d64)); break; case BRW_REGISTER_TYPE_UQ: + assert(!link->negate); assert(reg->d64 == table.imm[i].d64); break; case BRW_REGISTER_TYPE_D: assert(abs(reg->d) == abs(table.imm[i].d)); break; case BRW_REGISTER_TYPE_UD: + assert(!link->negate); assert(reg->d == table.imm[i].d); break; case BRW_REGISTER_TYPE_W: assert(abs((int16_t) (reg->d & 0xffff)) == table.imm[i].w); break; case BRW_REGISTER_TYPE_UW: + assert(!link->negate); assert((reg->ud & 0xffffu) == (uint16_t) table.imm[i].w); break; default: @@ -637,7 +712,7 @@ fs_visitor::opt_combine_constants() reg->file = VGRF; reg->offset = table.imm[i].subreg_offset; reg->stride = 0; - reg->negate = needs_negate(reg, &table.imm[i]); + reg->negate = link->negate; reg->nr = table.imm[i].nr; } } |