diff options
author | Vincent Lejeune <vljn@ovi.com> | 2013-02-24 16:31:32 +0100 |
---|---|---|
committer | Vincent Lejeune <vljn@ovi.com> | 2013-04-10 16:14:44 +0200 |
commit | cfeb9a335b5423ab9a60467d8f57562c965c24a0 (patch) | |
tree | 1a8d103b38a5340b6b2474ec4aefaed0648d465d | |
parent | 49c8138cf0e8e635b9797bd1b56bf63697aabe02 (diff) |
R600: Relax some vector constraints on Dot4.
Dot4 now uses 8 scalar operands instead of 2 vector ones, which allows the
register coalescer to remove some unneeded COPYs.
This patch also defines some structures/functions that can be used to handle
every vector instruction (CUBE, Cayman special instructions...) in a similar
fashion.
-rw-r--r-- | lib/Target/R600/AMDGPUISelLowering.h | 1 | ||||
-rw-r--r-- | lib/Target/R600/R600Defines.h | 74 | ||||
-rw-r--r-- | lib/Target/R600/R600EmitClauseMarkers.cpp | 6 | ||||
-rw-r--r-- | lib/Target/R600/R600ExpandSpecialInstrs.cpp | 25 | ||||
-rw-r--r-- | lib/Target/R600/R600ISelLowering.cpp | 21 | ||||
-rw-r--r-- | lib/Target/R600/R600InstrInfo.cpp | 88 | ||||
-rw-r--r-- | lib/Target/R600/R600InstrInfo.h | 5 | ||||
-rw-r--r-- | lib/Target/R600/R600Instructions.td | 51 | ||||
-rw-r--r-- | lib/Target/R600/R600MachineScheduler.cpp | 2 |
9 files changed, 268 insertions, 5 deletions
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index c2a79ea999..acb7e2526a 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -126,6 +126,7 @@ enum { SMIN, UMIN, URECIP, + DOT4, EXPORT, CONST_ADDRESS, REGISTER_LOAD, diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h index 16cfcf59eb..72d83b0bd5 100644 --- a/lib/Target/R600/R600Defines.h +++ b/lib/Target/R600/R600Defines.h @@ -92,6 +92,80 @@ namespace R600Operands { {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17} }; + enum VecOps { + UPDATE_EXEC_MASK_X, + UPDATE_PREDICATE_X, + WRITE_X, + OMOD_X, + DST_REL_X, + CLAMP_X, + SRC0_X, + SRC0_NEG_X, + SRC0_REL_X, + SRC0_ABS_X, + SRC0_SEL_X, + SRC1_X, + SRC1_NEG_X, + SRC1_REL_X, + SRC1_ABS_X, + SRC1_SEL_X, + PRED_SEL_X, + UPDATE_EXEC_MASK_Y, + UPDATE_PREDICATE_Y, + WRITE_Y, + OMOD_Y, + DST_REL_Y, + CLAMP_Y, + SRC0_Y, + SRC0_NEG_Y, + SRC0_REL_Y, + SRC0_ABS_Y, + SRC0_SEL_Y, + SRC1_Y, + SRC1_NEG_Y, + SRC1_REL_Y, + SRC1_ABS_Y, + SRC1_SEL_Y, + PRED_SEL_Y, + UPDATE_EXEC_MASK_Z, + UPDATE_PREDICATE_Z, + WRITE_Z, + OMOD_Z, + DST_REL_Z, + CLAMP_Z, + SRC0_Z, + SRC0_NEG_Z, + SRC0_REL_Z, + SRC0_ABS_Z, + SRC0_SEL_Z, + SRC1_Z, + SRC1_NEG_Z, + SRC1_REL_Z, + SRC1_ABS_Z, + SRC1_SEL_Z, + PRED_SEL_Z, + UPDATE_EXEC_MASK_W, + UPDATE_PREDICATE_W, + WRITE_W, + OMOD_W, + DST_REL_W, + CLAMP_W, + SRC0_W, + SRC0_NEG_W, + SRC0_REL_W, + SRC0_ABS_W, + SRC0_SEL_W, + SRC1_W, + SRC1_NEG_W, + SRC1_REL_W, + SRC1_ABS_W, + SRC1_SEL_W, + PRED_SEL_W, + IMM_0, + IMM_1, + VEC_COUNT + }; + } #endif // R600DEFINES_H_ diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index 3fdc678b9e..6e9a15a7a2 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -36,8 +36,7 @@ private: case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: - case 
AMDGPU::DOT4_eg_pseudo: - case AMDGPU::DOT4_r600_pseudo: + case AMDGPU::DOT_4: return 4; case AMDGPU::KILL: return 0; @@ -71,8 +70,7 @@ private: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: case AMDGPU::COPY: - case AMDGPU::DOT4_eg_pseudo: - case AMDGPU::DOT4_r600_pseudo: + case AMDGPU::DOT_4: return true; default: return false; diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index f8c900f727..993bdadcbc 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -182,6 +182,31 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); continue; } + case AMDGPU::DOT_4: { + + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + + for (unsigned Chan = 0; Chan < 4; ++Chan) { + bool Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned SubDstReg = + AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + MachineInstr *BMI = + TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); + if (Chan > 0) { + BMI->bundleWithPred(); + } + if (Mask) { + TII->addFlag(BMI, 0, MO_FLAG_MASK); + } + if (Chan != 3) + TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); + } + MI.eraseFromParent(); + continue; + } } bool IsReduction = TII->isReductionOp(MI.getOpcode()); diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 53e6e51dd2..ff6f0c8fb3 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -391,6 +391,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return SDValue(interp, slot % 2); } + case AMDGPUIntrinsic::AMDGPU_dp4: { + SDValue Args[8] = { + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(0, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, 
DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(0, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(1, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(1, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(2, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(2, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), + DAG.getConstant(3, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), + DAG.getConstant(3, MVT::i32)) + }; + return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8); + } case r600_read_ngroups_x: return LowerImplicitParameter(DAG, VT, DL, 0); diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index b232188a26..0a8a994b02 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -689,6 +689,94 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB return MIB; } +#define OPERAND_CASE(Label) \ + case Label: { \ + static const R600Operands::VecOps Ops[] = \ + { \ + Label##_X, \ + Label##_Y, \ + Label##_Z, \ + Label##_W \ + }; \ + return Ops[Slot]; \ + } + +static R600Operands::VecOps +getSlotedOps(R600Operands::Ops Op, unsigned Slot) { + switch (Op) { + OPERAND_CASE(R600Operands::UPDATE_EXEC_MASK) + OPERAND_CASE(R600Operands::UPDATE_PREDICATE) + OPERAND_CASE(R600Operands::WRITE) + OPERAND_CASE(R600Operands::OMOD) + OPERAND_CASE(R600Operands::DST_REL) + OPERAND_CASE(R600Operands::CLAMP) + OPERAND_CASE(R600Operands::SRC0) + OPERAND_CASE(R600Operands::SRC0_NEG) + OPERAND_CASE(R600Operands::SRC0_REL) + OPERAND_CASE(R600Operands::SRC0_ABS) + OPERAND_CASE(R600Operands::SRC0_SEL) + OPERAND_CASE(R600Operands::SRC1) + OPERAND_CASE(R600Operands::SRC1_NEG) + OPERAND_CASE(R600Operands::SRC1_REL) + 
OPERAND_CASE(R600Operands::SRC1_ABS) + OPERAND_CASE(R600Operands::SRC1_SEL) + OPERAND_CASE(R600Operands::PRED_SEL) + default: + llvm_unreachable("Wrong Operand"); + } +} + +#undef OPERAND_CASE + +static int +getVecOperandIdx(R600Operands::VecOps Op) { + return 1 + Op; +} + + +MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( + MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) + const { + assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); + unsigned Opcode; + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX) + Opcode = AMDGPU::DOT4_r600_real; + else + Opcode = AMDGPU::DOT4_eg_real; + MachineBasicBlock::iterator I = MI; + MachineOperand &Src0 = MI->getOperand( + getVecOperandIdx(getSlotedOps(R600Operands::SRC0, Slot))); + MachineOperand &Src1 = MI->getOperand( + getVecOperandIdx(getSlotedOps(R600Operands::SRC1, Slot))); + MachineInstr *MIB = buildDefaultInstruction( + MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); + static const R600Operands::Ops Operands[14] = { + R600Operands::UPDATE_EXEC_MASK, + R600Operands::UPDATE_PREDICATE, + R600Operands::WRITE, + R600Operands::OMOD, + R600Operands::DST_REL, + R600Operands::CLAMP, + R600Operands::SRC0_NEG, + R600Operands::SRC0_REL, + R600Operands::SRC0_ABS, + R600Operands::SRC0_SEL, + R600Operands::SRC1_NEG, + R600Operands::SRC1_REL, + R600Operands::SRC1_ABS, + R600Operands::SRC1_SEL, + }; + + for (unsigned i = 0; i < 14; i++) { + MachineOperand &MO = MI->getOperand( + getVecOperandIdx(getSlotedOps(Operands[i], Slot))); + assert (MO.isImm()); + setImmOperand(MIB, Operands[i], MO.getImm()); + } + return MIB; +} + MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB, MachineBasicBlock::iterator I, unsigned DstReg, diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index dbae90013d..b051698712 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ 
b/lib/Target/R600/R600InstrInfo.h @@ -161,6 +161,11 @@ namespace llvm { unsigned Src0Reg, unsigned Src1Reg = 0) const; + MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB, + MachineInstr *MI, + unsigned Slot, + unsigned DstReg) const; + MachineInstr *buildMovImm(MachineBasicBlock &BB, MachineBasicBlock::iterator I, unsigned DstReg, diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index b4c45e18fc..53e82a605c 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -610,6 +610,13 @@ def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", [SDNPVariadic] >; +def DOT4 : SDNode<"AMDGPUISD::DOT4", + SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, + SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, + SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, + [] +>; + //===----------------------------------------------------------------------===// // Interpolation Instructions //===----------------------------------------------------------------------===// @@ -1257,12 +1264,54 @@ class CNDGE_Common <bits<5> inst> : R600_3OP < COND_GE))] >; + +let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { +class R600_VEC2OP<list<dag> pattern> : InstR600 <0, (outs R600_Reg32:$dst), (ins +// Slot X + UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, + OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, + R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X, + R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X, + R600_Pred:$pred_sel_X, +// Slot Y + UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y, + OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y, + R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y, + R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y, + R600_Pred:$pred_sel_Y, +// Slot Z + 
UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z, + OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z, + R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z, + R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z, + R600_Pred:$pred_sel_Z, +// Slot W + UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W, + OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W, + R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W, + R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W, + R600_Pred:$pred_sel_W, + LITERAL:$literal0, LITERAL:$literal1), + "", + pattern, + AnyALU> {} +} + +def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4 + R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X, + R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y, + R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z, + R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>; + + + + multiclass DOT4_Common <bits<11> inst> { def _pseudo : R600_REDUCTION <inst, (ins R600_Reg128:$src0, R600_Reg128:$src1), "DOT4 $dst $src0, $src1", - [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))] + [] >; def _real : R600_2OP <inst, "DOT4", []>; diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index 8d4fa3502a..deb6d73868 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -407,6 +407,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: + case AMDGPU::DOT_4: return AluT_XYZW; case AMDGPU::COPY: if (MI->getOperand(1).isUndef()) { @@ -471,6 +472,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) { case AMDGPU::INTERP_VEC_LOAD: case AMDGPU::DOT4_eg_pseudo: case AMDGPU::DOT4_r600_pseudo: + case AMDGPU::DOT_4: return IDAlu; case AMDGPU::TEX_VTX_CONSTBUF: case 
AMDGPU::TEX_VTX_TEXBUF: |