From b02712efc2d7803be64be946b46e76ae243cd8cc Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Fri, 19 Jan 2024 14:59:28 -0800 Subject: intel/elk: Remove DPAS lowering This is meant for Gfx9+. Reviewed-by: Ian Romanick Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/elk/brw_fs.cpp | 3 - src/intel/compiler/elk/brw_fs_lower_dpas.cpp | 306 --------------------------- src/intel/compiler/elk/meson.build | 1 - 3 files changed, 310 deletions(-) delete mode 100644 src/intel/compiler/elk/brw_fs_lower_dpas.cpp (limited to 'src') diff --git a/src/intel/compiler/elk/brw_fs.cpp b/src/intel/compiler/elk/brw_fs.cpp index deddc83d456..77daed6b3a5 100644 --- a/src/intel/compiler/elk/brw_fs.cpp +++ b/src/intel/compiler/elk/brw_fs.cpp @@ -5990,9 +5990,6 @@ fs_visitor::optimize() validate(); - if (compiler->lower_dpas) - OPT(brw_lower_dpas, *this); - OPT(split_virtual_grfs); /* Before anything else, eliminate dead code. The results of some NIR diff --git a/src/intel/compiler/elk/brw_fs_lower_dpas.cpp b/src/intel/compiler/elk/brw_fs_lower_dpas.cpp deleted file mode 100644 index 306731722af..00000000000 --- a/src/intel/compiler/elk/brw_fs_lower_dpas.cpp +++ /dev/null @@ -1,306 +0,0 @@ -/* - * Copyright 2023 Intel Corporation - * SPDX-License-Identifier: MIT - */ - -#include "brw_fs.h" -#include "brw_fs_builder.h" - -using namespace brw; - -static void -f16_using_mac(const fs_builder &bld, fs_inst *inst) -{ - /* We only intend to support configurations where the destination and - * accumulator have the same type. - */ - if (!inst->src[0].is_null()) - assert(inst->dst.type == inst->src[0].type); - - assert(inst->src[1].type == BRW_REGISTER_TYPE_HF); - assert(inst->src[2].type == BRW_REGISTER_TYPE_HF); - - const brw_reg_type src0_type = inst->dst.type; - const brw_reg_type src1_type = BRW_REGISTER_TYPE_HF; - const brw_reg_type src2_type = BRW_REGISTER_TYPE_HF; - - const fs_reg dest = inst->dst; - fs_reg src0 = inst->src[0]; - const fs_reg src1 = retype(inst->src[1], src1_type); - const fs_reg src2 = retype(inst->src[2], src2_type); - - const unsigned dest_stride = - dest.type == BRW_REGISTER_TYPE_HF ? REG_SIZE / 2 : REG_SIZE; - - for (unsigned r = 0; r < inst->rcount; r++) { - fs_reg temp = bld.vgrf(BRW_REGISTER_TYPE_HF, 1); - - for (unsigned subword = 0; subword < 2; subword++) { - for (unsigned s = 0; s < inst->sdepth; s++) { - /* The first multiply of the dot-product operation has to - * explicitly write the accumulator register. The successive MAC - * instructions will implicitly read *and* write the - * accumulator. Those MAC instructions can also optionally - * explicitly write some other register. - * - * FINISHME: The accumulator can actually hold 16 HF values. On - * Gfx12 there are two accumulators. It should be possible to do - * this in SIMD16 or even SIMD32. I was unable to get this to work - * properly. - */ - if (s == 0 && subword == 0) { - const unsigned acc_width = 8; - fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD), - inst->group % acc_width); - - if (bld.shader->devinfo->verx10 >= 125) { - acc = subscript(acc, BRW_REGISTER_TYPE_HF, subword); - } else { - acc = retype(acc, BRW_REGISTER_TYPE_HF); - } - - bld.MUL(acc, - subscript(retype(byte_offset(src1, s * REG_SIZE), - BRW_REGISTER_TYPE_UD), - BRW_REGISTER_TYPE_HF, subword), - component(retype(byte_offset(src2, r * REG_SIZE), - BRW_REGISTER_TYPE_HF), - s * 2 + subword)) - ->writes_accumulator = true; - - } else { - fs_reg result; - - /* As mentioned above, the MAC had an optional, explicit - * destination register. Various optimization passes are not - * clever enough to understand the intricacies of this - * instruction, so only write the result register on the final - * MAC in the sequence. - */ - if ((s + 1) == inst->sdepth && subword == 1) - result = temp; - else - result = retype(bld.null_reg_ud(), BRW_REGISTER_TYPE_HF); - - bld.MAC(result, - subscript(retype(byte_offset(src1, s * REG_SIZE), - BRW_REGISTER_TYPE_UD), - BRW_REGISTER_TYPE_HF, subword), - component(retype(byte_offset(src2, r * REG_SIZE), - BRW_REGISTER_TYPE_HF), - s * 2 + subword)) - ->writes_accumulator = true; - } - } - } - - if (!src0.is_null()) { - if (src0_type != BRW_REGISTER_TYPE_HF) { - fs_reg temp2 = bld.vgrf(src0_type, 1); - - bld.MOV(temp2, temp); - - bld.ADD(byte_offset(dest, r * dest_stride), - temp2, - byte_offset(src0, r * dest_stride)); - } else { - bld.ADD(byte_offset(dest, r * dest_stride), - temp, - byte_offset(src0, r * dest_stride)); - } - } else { - bld.MOV(byte_offset(dest, r * dest_stride), temp); - } - } -} - -static void -int8_using_dp4a(const fs_builder &bld, fs_inst *inst) -{ - /* We only intend to support configurations where the destination and - * accumulator have the same type. - */ - if (!inst->src[0].is_null()) - assert(inst->dst.type == inst->src[0].type); - - assert(inst->src[1].type == BRW_REGISTER_TYPE_B || - inst->src[1].type == BRW_REGISTER_TYPE_UB); - assert(inst->src[2].type == BRW_REGISTER_TYPE_B || - inst->src[2].type == BRW_REGISTER_TYPE_UB); - - const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB - ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; - - const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB - ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; - - fs_reg dest = inst->dst; - fs_reg src0 = inst->src[0]; - const fs_reg src1 = retype(inst->src[1], src1_type); - const fs_reg src2 = retype(inst->src[2], src2_type); - - const unsigned dest_stride = REG_SIZE; - - for (unsigned r = 0; r < inst->rcount; r++) { - if (!src0.is_null()) { - bld.MOV(dest, src0); - src0 = byte_offset(src0, dest_stride); - } else { - bld.MOV(dest, retype(brw_imm_d(0), dest.type)); - } - - for (unsigned s = 0; s < inst->sdepth; s++) { - bld.DP4A(dest, - dest, - byte_offset(src1, s * REG_SIZE), - component(byte_offset(src2, r * REG_SIZE), s)) - ->saturate = inst->saturate; - } - - dest = byte_offset(dest, dest_stride); - } -} - -static void -int8_using_mul_add(const fs_builder &bld, fs_inst *inst) -{ - /* We only intend to support configurations where the destination and - * accumulator have the same type. - */ - if (!inst->src[0].is_null()) - assert(inst->dst.type == inst->src[0].type); - - assert(inst->src[1].type == BRW_REGISTER_TYPE_B || - inst->src[1].type == BRW_REGISTER_TYPE_UB); - assert(inst->src[2].type == BRW_REGISTER_TYPE_B || - inst->src[2].type == BRW_REGISTER_TYPE_UB); - - const brw_reg_type src0_type = inst->dst.type; - - const brw_reg_type src1_type = inst->src[1].type == BRW_REGISTER_TYPE_UB - ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; - - const brw_reg_type src2_type = inst->src[2].type == BRW_REGISTER_TYPE_UB - ? BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_D; - - fs_reg dest = inst->dst; - fs_reg src0 = inst->src[0]; - const fs_reg src1 = retype(inst->src[1], src1_type); - const fs_reg src2 = retype(inst->src[2], src2_type); - - const unsigned dest_stride = REG_SIZE; - - for (unsigned r = 0; r < inst->rcount; r++) { - if (!src0.is_null()) { - bld.MOV(dest, src0); - src0 = byte_offset(src0, dest_stride); - } else { - bld.MOV(dest, retype(brw_imm_d(0), dest.type)); - } - - for (unsigned s = 0; s < inst->sdepth; s++) { - fs_reg temp1 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); - fs_reg temp2 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1); - fs_reg temp3 = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); - const brw_reg_type temp_type = - (inst->src[1].type == BRW_REGISTER_TYPE_B || - inst->src[2].type == BRW_REGISTER_TYPE_B) - ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW; - - /* Expand 8 dwords of packed bytes into 16 dwords of packed - * words. - * - * FINISHME: Gfx9 should not need this work around. Gfx11 - * may be able to use integer MAD. Both platforms may be - * able to use MAC. - */ - bld.group(32, 0).MOV(retype(temp3, temp_type), - retype(byte_offset(src2, r * REG_SIZE), - inst->src[2].type)); - - bld.MUL(subscript(temp1, temp_type, 0), - subscript(retype(byte_offset(src1, s * REG_SIZE), - BRW_REGISTER_TYPE_UD), - inst->src[1].type, 0), - subscript(component(retype(temp3, - BRW_REGISTER_TYPE_UD), - s * 2), - temp_type, 0)); - - bld.MUL(subscript(temp1, temp_type, 1), - subscript(retype(byte_offset(src1, s * REG_SIZE), - BRW_REGISTER_TYPE_UD), - inst->src[1].type, 1), - subscript(component(retype(temp3, - BRW_REGISTER_TYPE_UD), - s * 2), - temp_type, 1)); - - bld.MUL(subscript(temp2, temp_type, 0), - subscript(retype(byte_offset(src1, s * REG_SIZE), - BRW_REGISTER_TYPE_UD), - inst->src[1].type, 2), - subscript(component(retype(temp3, - BRW_REGISTER_TYPE_UD), - s * 2 + 1), - temp_type, 0)); - - bld.MUL(subscript(temp2, temp_type, 1), - subscript(retype(byte_offset(src1, s * REG_SIZE), - BRW_REGISTER_TYPE_UD), - inst->src[1].type, 3), - subscript(component(retype(temp3, - BRW_REGISTER_TYPE_UD), - s * 2 + 1), - temp_type, 1)); - - bld.ADD(subscript(temp1, src0_type, 0), - subscript(temp1, temp_type, 0), - subscript(temp1, temp_type, 1)); - - bld.ADD(subscript(temp2, src0_type, 0), - subscript(temp2, temp_type, 0), - subscript(temp2, temp_type, 1)); - - bld.ADD(retype(temp1, src0_type), - retype(temp1, src0_type), - retype(temp2, src0_type)); - - bld.ADD(dest, dest, retype(temp1, src0_type)) - ->saturate = inst->saturate; - } - - dest = byte_offset(dest, dest_stride); - } -} - -bool -brw_lower_dpas(fs_visitor &v) -{ - bool progress = false; - - foreach_block_and_inst_safe(block, fs_inst, inst, v.cfg) { - if (inst->opcode != BRW_OPCODE_DPAS) - continue; - - const fs_builder bld = fs_builder(&v, block, inst).group(8, 0).exec_all(); - - if (brw_reg_type_is_floating_point(inst->dst.type)) { - f16_using_mac(bld, inst); - } else { - if (v.devinfo->ver >= 12) { - int8_using_dp4a(bld, inst); - } else { - int8_using_mul_add(bld, inst); - } - } - - inst->remove(block); - progress = true; - } - - if (progress) - v.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} diff --git a/src/intel/compiler/elk/meson.build b/src/intel/compiler/elk/meson.build index e755ffbb285..d9ea33b080e 100644 --- a/src/intel/compiler/elk/meson.build +++ b/src/intel/compiler/elk/meson.build @@ -58,7 +58,6 @@ libintel_compiler_elk_files = files( 'brw_fs.h', 'brw_fs_live_variables.cpp', 'brw_fs_live_variables.h', - 'brw_fs_lower_dpas.cpp', 'brw_fs_lower_pack.cpp', 'brw_fs_lower_regioning.cpp', 'brw_fs_nir.cpp', -- cgit v1.2.3