intel/elk: Remove DPAS lowering

The DPAS lowering pass is meant for Gfx9+, so remove it from ELK.

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27563>
author: Caio Oliveira <caio.oliveira@intel.com> 2024-01-19 14:59:28 -0800
committer: Marge Bot <emma+marge@anholt.net> 2024-02-24 00:24:31 +0000
commit: b02712efc2d7803be64be946b46e76ae243cd8cc (patch)
tree: a010d9c60834bd0413d8d494c411152e57634ae2 /src
parent: b743ab7acc86ee1346e13e6aecad03ca34a64b40 (diff)
3 files changed, 0 insertions, 310 deletions
diff --git a/src/intel/compiler/elk/brw_fs.cpp b/src/intel/compiler/elk/brw_fs.cpp
index deddc83d456..77daed6b3a5 100644
--- a/src/intel/compiler/elk/brw_fs.cpp
+++ b/src/intel/compiler/elk/brw_fs.cpp
@@ -5990,9 +5990,6 @@ fs_visitor::optimize()
 
    validate();
 
-   if (compiler->lower_dpas)
-      OPT(brw_lower_dpas, *this);
-
    OPT(split_virtual_grfs);
 
    /* Before anything else, eliminate dead code.  The results of some NIR
diff --git a/src/intel/compiler/elk/brw_fs_lower_dpas.cpp b/src/intel/compiler/elk/brw_fs_lower_dpas.cpp
deleted file mode 100644
index 306731722af..00000000000
--- a/src/intel/compiler/elk/brw_fs_lower_dpas.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-/*
- * Copyright 2023 Intel Corporation
- * SPDX-License-Identifier: MIT
- */
-
-#include "brw_fs.h"
-#include "brw_fs_builder.h"
-
-using namespace brw;
-
-static void
-f16_using_mac(const fs_builder &bld, fs_inst *inst)
-{
-   /* We only intend to support configurations where the destination and
-    * accumulator have the same type.
-    */
-   if (!inst->src[0].is_null())
-      assert(inst->dst.type == inst->src[0].type);
-
-   assert(inst->src[1].type == BRW_REGISTER_TYPE_HF);
-   assert(inst->src[2].type == BRW_REGISTER_TYPE_HF);
-
-   const brw_reg_type src0_type = inst->dst.type;
-   const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF;
-   const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF;
-
-   const fs_reg dest = inst->dst;
-   fs_reg src0 = inst->src[0];
-   const fs_reg src1 = retype(inst->src[1], src1_type);
-   const fs_reg src2 = retype(inst->src[2], src2_type);
-
-   const unsigned dest_stride =
-      dest.type == BRW_REGISTER_TYPE_HF ? REG_SIZE / 2 : REG_SIZE;
-
-   for (unsigned r = 0; r < inst->rcount; r++) {
-      fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1);
-
-      for (unsigned subword = 0; subword < 2; subword++) {
-         for (unsigned s = 0; s < inst->sdepth; s++) {
-            /* The first multiply of the dot-product operation has to
-             * explicitly write the accumulator register. The successive MAC
-             * instructions will implicitly read *and* write the
-             * accumulator. Those MAC instructions can also optionally
-             * explicitly write some other register.
-             *
-             * FINISHME: The accumulator can actually hold 16 HF values. On
-             * Gfx12 there are two accumulators. It should be possible to do
-             * this in SIMD16 or even SIMD32. I was unable to get this to work
-             * properly.
-             */
-            if (s == 0 && subword == 0) {
-               const unsigned acc_width = 8;
-               fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
-                                      inst->group % acc_width);
-
-               if (bld.shader->devinfo->verx10 >= 125) {
-                  acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword);
-               } else {
-                  acc = retype(acc, BRW_REGISTER_TYPE_HF);
-               }
-
-               bld.MUL(acc,
-                       subscript(retype(byte_offset(src1, s * REG_SIZE),
-                                        BRW_REGISTER_TYPE_UD),
-                                 BRW_REGISTER_TYPE_HF, subword),
-                       component(retype(byte_offset(src2, r * REG_SIZE),
-                                        BRW_REGISTER_TYPE_HF),
-                                 s * 2 + subword))
-                  ->writes_accumulator = true;
-
-            } else {
-               fs_reg result;
-
-               /* As mentioned above, the MAC had an optional, explicit
-                * destination register. Various optimization passes are not
-                * clever enough to understand the intricacies of this
-                * instruction, so only write the result register on the final
-                * MAC in the sequence.
-                */
-               if ((s + 1) == inst->sdepth && subword == 1)
-                  result = temp;
-               else
-                  result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF);
-
-               bld.MAC(result,
-                       subscript(retype(byte_offset(src1, s * REG_SIZE),
-                                        BRW_REGISTER_TYPE_UD),
-                                 BRW_REGISTER_TYPE_HF, subword),
-                       component(retype(byte_offset(src2, r * REG_SIZE),
-                                        BRW_REGISTER_TYPE_HF),
-                                 s * 2 + subword))
-                  ->writes_accumulator = true;
-            }
-         }
-      }
-
-      if (!src0.is_null()) {
-         if (src0_type != BRW_REGISTER_TYPE_HF) {
-            fs_reg temp2 = bld.vgrf(src0_type, 1);
-
-            bld.MOV(temp2, temp);
-
-            bld.ADD(byte_offset(dest, r * dest_stride),
-                    temp2,
-                    byte_offset(src0, r * dest_stride));
-         } else {
-            bld.ADD(byte_offset(dest, r * dest_stride),
-                    temp,
-                    byte_offset(src0, r * dest_stride));
-         }
-      } else {
-         bld.MOV(byte_offset(dest, r * dest_stride), temp);
-      }
-   }
-}
-
-static void
-int8_using_dp4a(const fs_builder &bld, fs_inst *inst)
-{
-   /* We only intend to support configurations where the destination and
-    * accumulator have the same type.
-    */
-   if (!inst->src[0].is_null())
-      assert(inst->dst.type == inst->src[0].type);
-
-   assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
-          inst->src[1].type == BRW_REGISTER_TYPE_UB);
-   assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
-          inst->src[2].type == BRW_REGISTER_TYPE_UB);
-
-   const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
-      ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
-
-   const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
-      ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
-
-   fs_reg dest = inst->dst;
-   fs_reg src0 = inst->src[0];
-   const fs_reg src1 = retype(inst->src[1], src1_type);
-   const fs_reg src2 = retype(inst->src[2], src2_type);
-
-   const unsigned dest_stride = REG_SIZE;
-
-   for (unsigned r = 0; r < inst->rcount; r++) {
-      if (!src0.is_null()) {
-         bld.MOV(dest, src0);
-         src0 = byte_offset(src0, dest_stride);
-      } else {
-         bld.MOV(dest, retype(brw_imm_d(0), dest.type));
-      }
-
-      for (unsigned s = 0; s < inst->sdepth; s++) {
-         bld.DP4A(dest,
-                  dest,
-                  byte_offset(src1, s * REG_SIZE),
-                  component(byte_offset(src2, r * REG_SIZE), s))
-            ->saturate = inst->saturate;
-      }
-
-      dest = byte_offset(dest, dest_stride);
-   }
-}
-
-static void
-int8_using_mul_add(const fs_builder &bld, fs_inst *inst)
-{
-   /* We only intend to support configurations where the destination and
-    * accumulator have the same type.
-    */
-   if (!inst->src[0].is_null())
-      assert(inst->dst.type == inst->src[0].type);
-
-   assert(inst->src[1].type == BRW_REGISTER_TYPE_B ||
-          inst->src[1].type == BRW_REGISTER_TYPE_UB);
-   assert(inst->src[2].type == BRW_REGISTER_TYPE_B ||
-          inst->src[2].type == BRW_REGISTER_TYPE_UB);
-
-   const brw_reg_type src0_type = inst->dst.type;
-
-   const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB
-      ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
-
-   const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB
-      ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D;
-
-   fs_reg dest = inst->dst;
-   fs_reg src0 = inst->src[0];
-   const fs_reg src1 = retype(inst->src[1], src1_type);
-   const fs_reg src2 = retype(inst->src[2], src2_type);
-
-   const unsigned dest_stride = REG_SIZE;
-
-   for (unsigned r = 0; r < inst->rcount; r++) {
-      if (!src0.is_null()) {
-         bld.MOV(dest, src0);
-         src0 = byte_offset(src0, dest_stride);
-      } else {
-         bld.MOV(dest, retype(brw_imm_d(0), dest.type));
-      }
-
-      for (unsigned s = 0; s < inst->sdepth; s++) {
-         fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-         fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
-         const brw_reg_type temp_type =
-            (inst->src[1].type == BRW_REGISTER_TYPE_B ||
-             inst->src[2].type == BRW_REGISTER_TYPE_B)
-            ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;
-
-         /* Expand 8 dwords of packed bytes into 16 dwords of packed
-          * words.
-          *
-          * FINISHME: Gfx9 should not need this work around. Gfx11
-          * may be able to use integer MAD. Both platforms may be
-          * able to use MAC.
-          */
-         bld.group(32, 0).MOV(retype(temp3, temp_type),
-                              retype(byte_offset(src2, r * REG_SIZE),
-                                     inst->src[2].type));
-
-         bld.MUL(subscript(temp1, temp_type, 0),
-                 subscript(retype(byte_offset(src1, s * REG_SIZE),
-                                  BRW_REGISTER_TYPE_UD),
-                           inst->src[1].type, 0),
-                 subscript(component(retype(temp3,
-                                            BRW_REGISTER_TYPE_UD),
-                                     s * 2),
-                           temp_type, 0));
-
-         bld.MUL(subscript(temp1, temp_type, 1),
-                 subscript(retype(byte_offset(src1, s * REG_SIZE),
-                                  BRW_REGISTER_TYPE_UD),
-                           inst->src[1].type, 1),
-                 subscript(component(retype(temp3,
-                                            BRW_REGISTER_TYPE_UD),
-                                     s * 2),
-                           temp_type, 1));
-
-         bld.MUL(subscript(temp2, temp_type, 0),
-                 subscript(retype(byte_offset(src1, s * REG_SIZE),
-                                  BRW_REGISTER_TYPE_UD),
-                           inst->src[1].type, 2),
-                 subscript(component(retype(temp3,
-                                            BRW_REGISTER_TYPE_UD),
-                                     s * 2 + 1),
-                           temp_type, 0));
-
-         bld.MUL(subscript(temp2, temp_type, 1),
-                 subscript(retype(byte_offset(src1, s * REG_SIZE),
-                                  BRW_REGISTER_TYPE_UD),
-                           inst->src[1].type, 3),
-                 subscript(component(retype(temp3,
-                                            BRW_REGISTER_TYPE_UD),
-                                     s * 2 + 1),
-                           temp_type, 1));
-
-         bld.ADD(subscript(temp1, src0_type, 0),
-                 subscript(temp1, temp_type, 0),
-                 subscript(temp1, temp_type, 1));
-
-         bld.ADD(subscript(temp2, src0_type, 0),
-                 subscript(temp2, temp_type, 0),
-                 subscript(temp2, temp_type, 1));
-
-         bld.ADD(retype(temp1, src0_type),
-                 retype(temp1, src0_type),
-                 retype(temp2, src0_type));
-
-         bld.ADD(dest, dest, retype(temp1, src0_type))
-            ->saturate = inst->saturate;
-      }
-
-      dest = byte_offset(dest, dest_stride);
-   }
-}
-
-bool
-brw_lower_dpas(fs_visitor &v)
-{
-   bool progress = false;
-
-   foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) {
-      if (inst->opcode != BRW_OPCODE_DPAS)
-         continue;
-
-      const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all();
-
-      if (brw_reg_type_is_floating_point(inst->dst.type)) {
-         f16_using_mac(bld, inst);
-      } else {
-         if (v.devinfo->ver >= 12) {
-            int8_using_dp4a(bld, inst);
-         } else {
-            int8_using_mul_add(bld, inst);
-         }
-      }
-
-      inst->remove(block);
-      progress = true;
-   }
-
-   if (progress)
-      v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
-
-   return progress;
-}
diff --git a/src/intel/compiler/elk/meson.build b/src/intel/compiler/elk/meson.build
index e755ffbb285..d9ea33b080e 100644
--- a/src/intel/compiler/elk/meson.build
+++ b/src/intel/compiler/elk/meson.build
@@ -58,7 +58,6 @@ libintel_compiler_elk_files = files(
   'brw_fs.h',
   'brw_fs_live_variables.cpp',
   'brw_fs_live_variables.h',
-  'brw_fs_lower_dpas.cpp',
   'brw_fs_lower_pack.cpp',
   'brw_fs_lower_regioning.cpp',
   'brw_fs_nir.cpp',
author	Caio Oliveira <caio.oliveira@intel.com>	2024-01-19 14:59:28 -0800
committer	Marge Bot <emma+marge@anholt.net>	2024-02-24 00:24:31 +0000
commit	b02712efc2d7803be64be946b46e76ae243cd8cc (patch)
tree	a010d9c60834bd0413d8d494c411152e57634ae2 /src
parent	b743ab7acc86ee1346e13e6aecad03ca34a64b40 (diff)