summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatt Turner <mattst88@gmail.com>2015-05-11 09:29:56 -0700
committerMatt Turner <mattst88@gmail.com>2015-05-18 10:11:36 -0700
commit1e4e17fbd9296cc5064aabdb351a894d10190cb6 (patch)
tree6bf569a6938dc4330ad9d12d012394504c0e416f
parentae405d429ff62e279cb4bb84d29581d4f7467b52 (diff)
i965/fs: Lower integer multiplication after optimizations.
32-bit x 32-bit integer multiplication requires multiple instructions until Broadwell. This patch just lets us treat the MUL instruction in the FS backend like it operates on Broadwell, and after optimizations we lower it into a sequence of instructions on older platforms. Doing this will allow us to some extra optimization on integer multiplies. Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.cpp66
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs.h1
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_nir.cpp36
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_visitor.cpp31
4 files changed, 70 insertions, 64 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index b63ca23e3d..cb13fcb1cc 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -3523,6 +3523,71 @@ fs_visitor::lower_load_payload()
return progress;
}
+bool
+fs_visitor::lower_integer_multiplication()
+{
+ bool progress = false;
+
+ /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
+ * directly, but Cherryview cannot.
+ */
+ if (devinfo->gen >= 8 && !devinfo->is_cherryview)
+ return false;
+
+ foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+ if (inst->opcode != BRW_OPCODE_MUL ||
+ inst->dst.is_accumulator() ||
+ (inst->dst.type != BRW_REGISTER_TYPE_D &&
+ inst->dst.type != BRW_REGISTER_TYPE_UD))
+ continue;
+
+#define insert(instr) inst->insert_before(block, instr)
+
+ /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+ * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+ * src1 are used.
+ *
+ * If multiplying by an immediate value that fits in 16-bits, do a
+ * single MUL instruction with that value in the proper location.
+ */
+ if (inst->src[1].file == IMM &&
+ inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+ if (devinfo->gen < 7) {
+ fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+ inst->dst.type, dispatch_width);
+ insert(MOV(imm, inst->src[1]));
+ insert(MUL(inst->dst, imm, inst->src[0]));
+ } else {
+ insert(MUL(inst->dst, inst->src[0], inst->src[1]));
+ }
+ } else {
+ if (devinfo->gen >= 7)
+ no16("SIMD16 integer multiply unsupported\n");
+
+ const unsigned channels = dispatch_width;
+ const enum brw_reg_type type = inst->dst.type;
+ const fs_reg acc(retype(brw_acc_reg(channels), type));
+ const fs_reg null(retype(brw_null_vec(channels), type));
+
+ const fs_reg &src0 = inst->src[0];
+ const fs_reg &src1 = inst->src[1];
+
+ insert(MUL(acc, src0, src1));
+ insert(MACH(null, src0, src1));
+ insert(MOV(inst->dst, acc));
+ }
+#undef insert
+
+ inst->remove(block);
+ progress = true;
+ }
+
+ if (progress)
+ invalidate_live_intervals();
+
+ return progress;
+}
+
void
fs_visitor::dump_instructions()
{
@@ -4001,6 +4066,7 @@ fs_visitor::optimize()
}
OPT(opt_combine_constants);
+ OPT(lower_integer_multiplication);
lower_uniform_pull_constant_loads();
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 991cff9632..f2aa0ae957 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -241,6 +241,7 @@ public:
void no16(const char *msg, ...);
void lower_uniform_pull_constant_loads();
bool lower_load_payload();
+ bool lower_integer_multiplication();
bool opt_combine_constants();
void emit_dummy_fs();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 9cfd0e792a..5dd8363b91 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -780,41 +780,9 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
inst->saturate = instr->dest.saturate;
break;
- case nir_op_imul: {
- if (devinfo->gen >= 8) {
- emit(MUL(result, op[0], op[1]));
- break;
- } else {
- nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
- nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
-
- if (value0 && value0->u[0] < (1 << 16)) {
- if (devinfo->gen < 7) {
- emit(MUL(result, op[0], op[1]));
- } else {
- emit(MUL(result, op[1], op[0]));
- }
- break;
- } else if (value1 && value1->u[0] < (1 << 16)) {
- if (devinfo->gen < 7) {
- emit(MUL(result, op[1], op[0]));
- } else {
- emit(MUL(result, op[0], op[1]));
- }
- break;
- }
- }
-
- if (devinfo->gen >= 7)
- no16("SIMD16 explicit accumulator operands unsupported\n");
-
- struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
-
- emit(MUL(acc, op[0], op[1]));
- emit(MACH(reg_null_d, op[0], op[1]));
- emit(MOV(result, fs_reg(acc)));
+ case nir_op_imul:
+ emit(MUL(result, op[0], op[1]));
break;
- }
case nir_op_imul_high:
case nir_op_umul_high: {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index abaea5f4e1..ead7768664 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -873,36 +873,7 @@ fs_visitor::visit(ir_expression *ir)
unreachable("not reached: should be handled by ir_sub_to_add_neg");
case ir_binop_mul:
- if (devinfo->gen < 8 && ir->type->is_integer()) {
- /* For integer multiplication, the MUL uses the low 16 bits
- * of one of the operands (src0 on gen6, src1 on gen7). The
- * MACH accumulates in the contribution of the upper 16 bits
- * of that operand.
- */
- if (ir->operands[0]->is_uint16_constant()) {
- if (devinfo->gen < 7)
- emit(MUL(this->result, op[0], op[1]));
- else
- emit(MUL(this->result, op[1], op[0]));
- } else if (ir->operands[1]->is_uint16_constant()) {
- if (devinfo->gen < 7)
- emit(MUL(this->result, op[1], op[0]));
- else
- emit(MUL(this->result, op[0], op[1]));
- } else {
- if (devinfo->gen >= 7)
- no16("SIMD16 explicit accumulator operands unsupported\n");
-
- struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
- this->result.type);
-
- emit(MUL(acc, op[0], op[1]));
- emit(MACH(reg_null_d, op[0], op[1]));
- emit(MOV(this->result, fs_reg(acc)));
- }
- } else {
- emit(MUL(this->result, op[0], op[1]));
- }
+ emit(MUL(this->result, op[0], op[1]));
break;
case ir_binop_imul_high: {
if (devinfo->gen >= 7)