summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
author: Caio Oliveira <caio.oliveira@intel.com> 2024-01-19 14:59:28 -0800
committer: Marge Bot <emma+marge@anholt.net> 2024-02-24 00:24:31 +0000
commit: b02712efc2d7803be64be946b46e76ae243cd8cc (patch)
tree: a010d9c60834bd0413d8d494c411152e57634ae2 /src
parent: b743ab7acc86ee1346e13e6aecad03ca34a64b40 (diff)
intel/elk: Remove DPAS lowering
The DPAS lowering is meant for Gfx9+.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27563>
Diffstat (limited to 'src')
-rw-r--r-- src/intel/compiler/elk/brw_fs.cpp 3
-rw-r--r-- src/intel/compiler/elk/brw_fs_lower_dpas.cpp 306
-rw-r--r-- src/intel/compiler/elk/meson.build 1
3 files changed, 0 insertions, 310 deletions
diff --git a/src/intel/compiler/elk/brw_fs.cpp b/src/intel/compiler/elk/brw_fs.cpp
index deddc83d456..77daed6b3a5 100644
--- a/src/intel/compiler/elk/brw_fs.cpp
+++ b/src/intel/compiler/elk/brw_fs.cpp
@@ -5990,9 +5990,6 @@ fs_visitor::optimize()
validate();
- if (compiler->lower_dpas)
- OPT(brw_lower_dpas, *this);
-
OPT(split_virtual_grfs);
/* Before anything else, eliminate dead code. The results of some NIR
diff --git a/src/intel/compiler/elk/brw_fs_lower_dpas.cpp b/src/intel/compiler/elk/brw_fs_lower_dpas.cpp
deleted file mode 100644
index 306731722af..00000000000
--- a/src/intel/compiler/elk/brw_fs_lower_dpas.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- * Copyright 2023 Intel Corporation
- * SPDX-License-Identifier: MIT
- */
-
-#include "brw_fs.h"
-#include "brw_fs_builder.h"
-
-using namespace brw;
-
-static void
-f16_using_mac(const fs_builder &bld, fs_inst *inst)
-{
- /* We only intend to support configurations where the destination and
- * accumulator have the same type.
- */
- if (!inst->src[0].is_null())
- assert(inst->dst.type == inst->src[0].type);
-
- assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
- assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);
-
- const brw_reg_type src0_type = inst->dst.type;
- const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
- const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;
-
- const fs_reg dest = inst->dst;
- fs_reg src0 = inst->src[0];
- const fs_reg src1 = retype(inst->src[1], src1_type);
- const fs_reg src2 = retype(inst->src[2], src2_type);
-
- const unsigned dest_stride =
- dest.type == BRW_REGISTER_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;
-
- for (unsigned r = 0; r < inst->rcount; r++) {
- fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);
-
- for (unsigned subword = 0; subword < 2; subword++) {
- for (unsigned s = 0; s < inst->sdepth; s++) {
- /* The first multiply of the dot-product operation has to
- * explicitly write the accumulator register. The successive MAC
- * instructions will implicitly read *and* write the
- * accumulator. Those MAC instructions can also optionally
- * explicitly write some other register.
- *
- * FINISHME: The accumulator can actually hold 16 HF values. On
- * Gfx12 there are two accumulators. It should be possible to do
- * this in SIMD16 or even SIMD32. I was unable to get this to work
- * properly.
- */
- if (s == 0 && subword == 0) {
- const unsigned acc_width = 8;
- fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
- inst->group % acc_width);
-
- if (bld.shader->devinfo->verx10 >= 125) {
- acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
- } else {
- acc = retype(acc, BRW_REGISTER_TYPE_HF);
- }
-
- bld.MUL(acc,
- subscript(retype(byte_offset(src1, s * REG_SIZE),
- BRW_REGISTER_TYPE_UD),
- BRW_REGISTER_TYPE_HF, subword),
- component(retype(byte_offset(src2, r * REG_SIZE),
- BRW_REGISTER_TYPE_HF),
- s * 2 + subword))
- ->writes_accumulator = true;
-
- } else {
- fs_reg result;
-
- /* As mentioned above, the MAC had an optional, explicit
- * destination register. Various optimization passes are not
- * clever enough to understand the intricacies of this
- * instruction, so only write the result register on the final
- * MAC in the sequence.
- */
- if ((s + 1) == inst->sdepth && subword == 1)
- result = temp;
- else
- result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);
-
- bld.MAC(result,
- subscript(retype(byte_offset(src1, s * REG_SIZE),
- BRW_REGISTER_TYPE_UD),
- BRW_REGISTER_TYPE_HF, subword),
- component(retype(byte_offset(src2, r * REG_SIZE),
- BRW_REGISTER_TYPE_HF),
- s * 2 + subword))
- ->writes_accumulator = true;
- }
- }
- }
-
- if (!src0.is_null()) {
- if (src0_type != BRW_REGISTER_TYPE_HF) {
- fs_reg temp2 = bld.vgrf(src0_type, 1);
-
- bld.MOV(temp2, temp);
-
- bld.ADD(byte_offset(dest, r * dest_stride),
- temp2,
- byte_offset(src0, r * dest_stride));
- } else {
- bld.ADD(byte_offset(dest, r * dest_stride),
- temp,
- byte_offset(src0, r * dest_stride));
- }
- } else {
- bld.MOV(byte_offset(dest, r * dest_stride), temp);
- }
- }
-}
-
-static void
-int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
-{
- /* We only intend to support configurations where the destination and
- * accumulator have the same type.
- */
- if (!inst->src[0].is_null())
- assert(inst->dst.type == inst->src[0].type);
-
- assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
- inst->src[1].type == BRW_REGISTER_TYPE_UB);
- assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
- inst->src[2].type == BRW_REGISTER_TYPE_UB);
-
- const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
- ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
-
- const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
- ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
-
- fs_reg dest = inst->dst;
- fs_reg src0 = inst->src[0];
- const fs_reg src1 = retype(inst->src[1], src1_type);
- const fs_reg src2 = retype(inst->src[2], src2_type);
-
- const unsigned dest_stride = REG_SIZE;
-
- for (unsigned r = 0; r < inst->rcount; r++) {
- if (!src0.is_null()) {
- bld.MOV(dest, src0);
- src0 = byte_offset(src0, dest_stride);
- } else {
- bld.MOV(dest, retype(brw_imm_d(0), dest.type));
- }
-
- for (unsigned s = 0; s < inst->sdepth; s++) {
- bld.DP4A(dest,
- dest,
- byte_offset(src1, s * REG_SIZE),
- component(byte_offset(src2, r * REG_SIZE), s))
- ->saturate = inst->saturate;
- }
-
- dest = byte_offset(dest, dest_stride);
- }
-}
-
-static void
-int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
-{
- /* We only intend to support configurations where the destination and
- * accumulator have the same type.
- */
- if (!inst->src[0].is_null())
- assert(inst->dst.type == inst->src[0].type);
-
- assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
- inst->src[1].type == BRW_REGISTER_TYPE_UB);
- assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
- inst->src[2].type == BRW_REGISTER_TYPE_UB);
-
- const brw_reg_type src0_type = inst->dst.type;
-
- const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
- ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
-
- const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
- ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
-
- fs_reg dest = inst->dst;
- fs_reg src0 = inst->src[0];
- const fs_reg src1 = retype(inst->src[1], src1_type);
- const fs_reg src2 = retype(inst->src[2], src2_type);
-
- const unsigned dest_stride = REG_SIZE;
-
- for (unsigned r = 0; r < inst->rcount; r++) {
- if (!src0.is_null()) {
- bld.MOV(dest, src0);
- src0 = byte_offset(src0, dest_stride);
- } else {
- bld.MOV(dest, retype(brw_imm_d(0), dest.type));
- }
-
- for (unsigned s = 0; s < inst->sdepth; s++) {
- fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
- fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
- const brw_reg_type temp_type =
- (inst->src[1].type == BRW_REGISTER_TYPE_B ||
- inst->src[2].type == BRW_REGISTER_TYPE_B)
- ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;
-
- /* Expand 8 dwords of packed bytes into 16 dwords of packed
- * words.
- *
- * FINISHME: Gfx9 should not need this work around. Gfx11
- * may be able to use integer MAD. Both platforms may be
- * able to use MAC.
- */
- bld.group(32, 0).MOV(retype(temp3, temp_type),
- retype(byte_offset(src2, r * REG_SIZE),
- inst->src[2].type));
-
- bld.MUL(subscript(temp1, temp_type, 0),
- subscript(retype(byte_offset(src1, s * REG_SIZE),
- BRW_REGISTER_TYPE_UD),
- inst->src[1].type, 0),
- subscript(component(retype(temp3,
- BRW_REGISTER_TYPE_UD),
- s * 2),
- temp_type, 0));
-
- bld.MUL(subscript(temp1, temp_type, 1),
- subscript(retype(byte_offset(src1, s * REG_SIZE),
- BRW_REGISTER_TYPE_UD),
- inst->src[1].type, 1),
- subscript(component(retype(temp3,
- BRW_REGISTER_TYPE_UD),
- s * 2),
- temp_type, 1));
-
- bld.MUL(subscript(temp2, temp_type, 0),
- subscript(retype(byte_offset(src1, s * REG_SIZE),
- BRW_REGISTER_TYPE_UD),
- inst->src[1].type, 2),
- subscript(component(retype(temp3,
- BRW_REGISTER_TYPE_UD),
- s * 2 + 1),
- temp_type, 0));
-
- bld.MUL(subscript(temp2, temp_type, 1),
- subscript(retype(byte_offset(src1, s * REG_SIZE),
- BRW_REGISTER_TYPE_UD),
- inst->src[1].type, 3),
- subscript(component(retype(temp3,
- BRW_REGISTER_TYPE_UD),
- s * 2 + 1),
- temp_type, 1));
-
- bld.ADD(subscript(temp1, src0_type, 0),
- subscript(temp1, temp_type, 0),
- subscript(temp1, temp_type, 1));
-
- bld.ADD(subscript(temp2, src0_type, 0),
- subscript(temp2, temp_type, 0),
- subscript(temp2, temp_type, 1));
-
- bld.ADD(retype(temp1, src0_type),
- retype(temp1, src0_type),
- retype(temp2, src0_type));
-
- bld.ADD(dest, dest, retype(temp1, src0_type))
- ->saturate = inst->saturate;
- }
-
- dest = byte_offset(dest, dest_stride);
- }
-}
-
-bool
-brw_lower_dpas(fs_visitor &v)
-{
- bool progress = false;
-
- foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
- if (inst->opcode != BRW_OPCODE_DPAS)
- continue;
-
- const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all();
-
- if (brw_reg_type_is_floating_point(inst->dst.type)) {
- f16_using_mac(bld, inst);
- } else {
- if (v.devinfo->ver >= 12) {
- int8_using_dp4a(bld, inst);
- } else {
- int8_using_mul_add(bld, inst);
- }
- }
-
- inst->remove(block);
- progress = true;
- }
-
- if (progress)
- v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
-
- return progress;
-}
diff --git a/src/intel/compiler/elk/meson.build b/src/intel/compiler/elk/meson.build
index e755ffbb285..d9ea33b080e 100644
--- a/src/intel/compiler/elk/meson.build
+++ b/src/intel/compiler/elk/meson.build
@@ -58,7 +58,6 @@ libintel_compiler_elk_files = files(
'brw_fs.h',
'brw_fs_live_variables.cpp',
'brw_fs_live_variables.h',
- 'brw_fs_lower_dpas.cpp',
'brw_fs_lower_pack.cpp',
'brw_fs_lower_regioning.cpp',
'brw_fs_nir.cpp',