diff options
author | Vincent Lejeune <vljn@ovi.com> | 2013-11-07 00:14:07 +0100 |
---|---|---|
committer | Vincent Lejeune <vljn@ovi.com> | 2013-12-11 19:28:36 +0100 |
commit | 2b4b5221be8c9f591fe1117dfe5f5e455dde6a64 (patch) | |
tree | 6ec9ba1295df1ce7e8cd5104715e307a9f021397 | |
parent | 5d7deeb54b1472c36e78b96744bba1dd481448d7 (diff) |
R600: Generalize dot4 optimizations to cube instructions [radeonsi]
-rw-r--r-- | lib/Target/R600/AMDGPUISelLowering.h | 1 | ||||
-rw-r--r-- | lib/Target/R600/R600EmitClauseMarkers.cpp | 11 | ||||
-rw-r--r-- | lib/Target/R600/R600ExpandSpecialInstrs.cpp | 79 | ||||
-rw-r--r-- | lib/Target/R600/R600ISelLowering.cpp | 86 | ||||
-rw-r--r-- | lib/Target/R600/R600InstrInfo.cpp | 73 | ||||
-rw-r--r-- | lib/Target/R600/R600InstrInfo.h | 6 | ||||
-rw-r--r-- | lib/Target/R600/R600Instructions.td | 66 | ||||
-rw-r--r-- | lib/Target/R600/R600MachineScheduler.cpp | 4 | ||||
-rw-r--r-- | test/CodeGen/R600/pv.ll | 2 |
9 files changed, 205 insertions, 123 deletions
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h index 2dfd3cf492..d3393d40cb 100644 --- a/lib/Target/R600/AMDGPUISelLowering.h +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -159,7 +159,6 @@ enum { SMIN, UMIN, URECIP, - DOT4, TEXTURE_FETCH, EXPORT, CONST_ADDRESS, diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp index 1bbfd2b68f..e3ffc0c14c 100644 --- a/lib/Target/R600/R600EmitClauseMarkers.cpp +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp @@ -39,7 +39,7 @@ private: case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case AMDGPU::DOT4: return 4; case AMDGPU::KILL: return 0; @@ -78,7 +78,7 @@ private: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: case AMDGPU::COPY: - case AMDGPU::DOT_4: + case AMDGPU::DOT4: return true; default: return false; @@ -115,13 +115,13 @@ private: bool UpdateInstr = true) const { std::vector<std::pair<unsigned, unsigned> > UsedKCache; - if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4) + if (!TII->isALUInstr(MI->getOpcode()) && !TII->uses4Slots(*MI)) return true; const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts = TII->getSrcs(MI); assert((TII->isALUInstr(MI->getOpcode()) || - MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const"); + TII->uses4Slots(*MI)) && "Can't assign Const"); for (unsigned i = 0, n = Consts.size(); i < n; ++i) { if (Consts[i].first->getReg() != AMDGPU::ALU_CONST) continue; @@ -265,6 +265,9 @@ private: if (!SubstituteKCacheBank(I, KCacheBanks)) break; + if (TII->uses4Slots(I->getOpcode()) && + !SubstituteKCacheBank(I, KCacheBanks)) + break; AluInstCount += OccupiedDwords(I); } unsigned Opcode = PushBeforeModifier ? 
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp index 0be491c304..5b528b5b34 100644 --- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -194,46 +194,65 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { MI.eraseFromParent(); continue; } - case AMDGPU::DOT_4: { + } + if (TII->uses4Slots(MI)) { const R600RegisterInfo &TRI = TII->getRegisterInfo(); - - unsigned DstReg = MI.getOperand(0).getReg(); - unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; - + unsigned SlotOpcode; + switch (MI.getOpcode()) { + case AMDGPU::DOT4: + if (true)//ST.getGeneration() <= AMDGPUSubtarget::R700) + SlotOpcode = AMDGPU::DOT4_r600; + else + SlotOpcode = AMDGPU::DOT4_eg; + break; + case AMDGPU::CUBE: + if (true) + SlotOpcode = AMDGPU::CUBE_eg_real; + else + SlotOpcode = AMDGPU::CUBE_r600_real; + break; + default: + llvm_unreachable_internal("Unknow Vector Op"); + } + //const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); for (unsigned Chan = 0; Chan < 4; ++Chan) { - bool Mask = (Chan != TRI.getHWRegChan(DstReg)); - unsigned SubDstReg = - AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + unsigned SubDstReg; + unsigned DstReg = MI.getOperand(0).getReg(); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + bool Mask = (Chan != TRI.getHWRegChan(DstReg)) && MI.getOpcode() == AMDGPU::DOT4; + if (MI.getOpcode() == AMDGPU::DOT4) + SubDstReg = + AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + else + SubDstReg = MI.getOperand(Chan).getReg(); MachineInstr *BMI = - TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg); - if (Chan > 0) { + TII->buildSlotOfVectorInstruction(MBB, &MI, SlotOpcode, Chan, + SubDstReg); + if (Chan > 0) BMI->bundleWithPred(); - } - if (Mask) { + if (Mask) TII->addFlag(BMI, 0, MO_FLAG_MASK); - } if (Chan != 3) TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST); unsigned Opcode = 
BMI->getOpcode(); - // While not strictly necessary from hw point of view, we force - // all src operands of a dot4 inst to belong to the same slot. - unsigned Src0 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) - .getReg(); - unsigned Src1 = BMI->getOperand( - TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) - .getReg(); - (void) Src0; - (void) Src1; - if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && - (TRI.getEncodingValue(Src1) & 0xff) < 127) - assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1)); - } + // While not strictly necessary from hw point of view, we force + // all src operands of a dot4 inst to belong to the same slot. + /* unsigned Src0 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src0)) + .getReg(); + unsigned Src1 = BMI->getOperand( + TII->getOperandIdx(Opcode, AMDGPU::OpName::src1)) + .getReg(); + (void) Src0; + (void) Src1; + if ((TRI.getEncodingValue(Src0) & 0xff) < 127 && + (TRI.getEncodingValue(Src1) & 0xff) < 127) + assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));*/ + } MI.eraseFromParent(); continue; } - } bool IsReduction = TII->isReductionOp(MI.getOpcode()); bool IsVector = TII->isVector(MI); @@ -312,7 +331,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { // Add the new instruction unsigned Opcode = MI.getOpcode(); - switch (Opcode) { +/* switch (Opcode) { case AMDGPU::CUBE_r600_pseudo: Opcode = AMDGPU::CUBE_r600_real; break; @@ -321,7 +340,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) { break; default: break; - } + }*/ MachineInstr *NewMI = TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp index 21a2b0dd17..bb0f6aab14 100644 --- a/lib/Target/R600/R600ISelLowering.cpp +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -505,6 +505,25 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( // Custom DAG 
Lowering Operations //===----------------------------------------------------------------------===// +static void getVector2OpArgs(SDValue *Arg, SelectionDAG &DAG, + SDValue SlotX0, SDValue SlotY0, SDValue SlotZ0, SDValue SlotW0, + SDValue SlotX1, SDValue SlotY1, SDValue SlotZ1, SDValue SlotW1) { + for (unsigned i = 0; i < 70; i++) + Arg[i] = DAG.getTargetConstant(0, MVT::i32); + // WriteMask + Arg[2] = Arg[19] = Arg[36] = Arg[53] = DAG.getTargetConstant(1, MVT::i32); + Arg[16] = Arg[33] = Arg[50] = Arg[67] = + DAG.getRegister(AMDGPU::PRED_SEL_OFF, MVT::i32); + Arg[6] = SlotX0; + Arg[11] = SlotX1; + Arg[23] = SlotY0; + Arg[28] = SlotY1; + Arg[40] = SlotZ0; + Arg[45] = SlotZ1; + Arg[57] = SlotW0; + Arg[62] = SlotW1; +} + SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>(); @@ -692,25 +711,52 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19); } case AMDGPUIntrinsic::AMDGPU_dp4: { - SDValue Args[8] = { - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(0, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(0, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(1, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(1, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(2, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(2, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1), - DAG.getConstant(3, MVT::i32)), - DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2), - DAG.getConstant(3, MVT::i32)) - 
}; - return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8); + SDValue Op0 = Op.getOperand(1), Op1 = Op.getOperand(2); + SDValue Args[70]; + getVector2OpArgs(Args, DAG, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(0, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(1, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(2, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(3, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op1), MVT::f32, Op1, + DAG.getConstant(0, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op1), MVT::f32, Op1, + DAG.getConstant(1, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op1), MVT::f32, Op1, + DAG.getConstant(2, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op1), MVT::f32, Op1, + DAG.getConstant(3, MVT::i32))); + MachineSDNode *N = DAG.getMachineNode(AMDGPU::DOT4, DL, MVT::f32, Args); + return SDValue(N, 0); + } + case AMDGPUIntrinsic::AMDGPU_cube: { + SDValue Op0 = Op.getOperand(1); + SDValue Args[70]; + getVector2OpArgs(Args, DAG, + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(2, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(2, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(0, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(1, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(1, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(0, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(2, MVT::i32)), + DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0, + DAG.getConstant(2, MVT::i32))); + 
MachineSDNode *N = DAG.getMachineNode(AMDGPU::CUBE, DL, + MVT::f32, MVT::f32, MVT::f32, MVT::f32, Args); + return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, SDValue(N, 0), + SDValue(N, 1), SDValue(N, 2), SDValue(N, 3)); } case Intrinsic::r600_read_ngroups_x: @@ -1854,7 +1900,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node, I != E; ++I) Ops.push_back(*I); - if (Opcode == AMDGPU::DOT_4) { + if (TII->uses4Slots(Opcode)) { int OperandIdx[] = { TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X), TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y), diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp index 2eca6cf432..7607623392 100644 --- a/lib/Target/R600/R600InstrInfo.cpp +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -42,6 +42,18 @@ bool R600InstrInfo::isTrig(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; } +bool R600InstrInfo::uses4Slots(unsigned Opcode) const { + switch (Opcode) { + case AMDGPU::CUBE: + case AMDGPU::DOT4: return true; + default: return false; + } +} + +bool R600InstrInfo::uses4Slots(const MachineInstr &MI) const { + return uses4Slots(MI.getOpcode()); +} + bool R600InstrInfo::isVector(const MachineInstr &MI) const { return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; } @@ -123,11 +135,6 @@ bool R600InstrInfo::isReductionOp(unsigned Opcode) const { bool R600InstrInfo::isCubeOp(unsigned Opcode) const { switch(Opcode) { default: return false; - case AMDGPU::CUBE_r600_pseudo: - case AMDGPU::CUBE_r600_real: - case AMDGPU::CUBE_eg_pseudo: - case AMDGPU::CUBE_eg_real: - return true; } } @@ -172,7 +179,7 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const { case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: case AMDGPU::COPY: - case AMDGPU::DOT_4: + case AMDGPU::DOT4: return true; default: return false; @@ -294,7 +301,7 @@ SmallVector<std::pair<MachineOperand *, int64_t>, 3> R600InstrInfo::getSrcs(MachineInstr *MI) const { 
SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result; - if (MI->getOpcode() == AMDGPU::DOT_4) { + if (uses4Slots(*MI)) { static const unsigned OpTable[8][2] = { {AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X}, {AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y}, @@ -307,8 +314,10 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const { }; for (unsigned j = 0; j < 8; j++) { - MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), - OpTable[j][0])); + int OperandIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]); + if (OperandIdx < 0) + continue; + MachineOperand &MO = MI->getOperand(OperandIdx); unsigned Reg = MO.getReg(); if (Reg == AMDGPU::ALU_CONST) { unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(), @@ -927,8 +936,8 @@ R600InstrInfo::isPredicable(MachineInstr *MI) const { if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0) return false; return true; - } else if (isVector(*MI)) { - return false; + } else if (uses4Slots(*MI)) { + return true; } else { return AMDGPUInstrInfo::isPredicable(MI); } @@ -1027,7 +1036,7 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI, return true; } - if (MI->getOpcode() == AMDGPU::DOT_4) { + if (uses4Slots(*MI)) { MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X)) .setReg(Pred[2].getReg()); MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y)) @@ -1216,22 +1225,26 @@ static unsigned getSlotedOps(unsigned Op, unsigned Slot) { #undef OPERAND_CASE MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( - MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg) + MachineBasicBlock &MBB, + MachineInstr *MI, + unsigned SlotOpcode, + unsigned Slot, + unsigned DstReg) const { - assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented"); - unsigned Opcode; - const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); - if (ST.getGeneration() <= AMDGPUSubtarget::R700) - Opcode = AMDGPU::DOT4_r600; - else - Opcode = AMDGPU::DOT4_eg; + 
MachineInstr *MIB; MachineBasicBlock::iterator I = MI; MachineOperand &Src0 = MI->getOperand( getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot))); - MachineOperand &Src1 = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot))); - MachineInstr *MIB = buildDefaultInstruction( - MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg()); + int Src1Idx = getOperandIdx(MI->getOpcode(), + getSlotedOps(AMDGPU::OpName::src1, Slot)); + if (Src1Idx > -1) { + MachineOperand &Src1 = MI->getOperand(Src1Idx); + MIB = buildDefaultInstruction( MBB, I, SlotOpcode, DstReg, Src0.getReg(), + Src1.getReg()); + } else + MIB = buildDefaultInstruction( MBB, I, SlotOpcode, DstReg, Src0.getReg(), + 0); + static const unsigned Operands[14] = { AMDGPU::OpName::update_exec_mask, AMDGPU::OpName::update_pred, @@ -1251,16 +1264,18 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction( MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::pred_sel, Slot))); - MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel)) + MIB->getOperand(getOperandIdx(SlotOpcode, AMDGPU::OpName::pred_sel)) .setReg(MO.getReg()); for (unsigned i = 0; i < 14; i++) { - MachineOperand &MO = MI->getOperand( - getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot))); - assert (MO.isImm()); + int OperandIdx = getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], + Slot)); + if (OperandIdx < 0) + continue; + MachineOperand &MO = MI->getOperand(OperandIdx); setImmOperand(MIB, Operands[i], MO.getImm()); } - MIB->getOperand(20).setImm(0); +// MIB->getOperand(20).setImm(0); //literal return MIB; } diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h index 13d981094e..f0086b0684 100644 --- a/lib/Target/R600/R600InstrInfo.h +++ b/lib/Target/R600/R600InstrInfo.h @@ -138,7 +138,10 @@ namespace llvm { /// Same but using const index set instead of MI set. 
bool fitsConstReadLimitations(const std::vector<unsigned>&) const; - /// \breif Vector instructions are instructions that must fill all + /// If an instruction uses 4 independent slot like dot4 or cube, returns true. + bool uses4Slots(unsigned Opcode) const; + bool uses4Slots(const MachineInstr &MI) const; + /// \brief Vector instructions are instructions that must fill all /// instruction slots within an instruction group. bool isVector(const MachineInstr &MI) const; @@ -234,6 +237,7 @@ namespace llvm { MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB, MachineInstr *MI, + unsigned SlotOpcode, unsigned Slot, unsigned DstReg) const; diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td index 2249ceec3c..b3a002ade9 100644 --- a/lib/Target/R600/R600Instructions.td +++ b/lib/Target/R600/R600Instructions.td @@ -371,13 +371,6 @@ def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS", [SDNPVariadic] >; -def DOT4 : SDNode<"AMDGPUISD::DOT4", - SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>, - SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>, - SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>, - [] ->; - def COS_HW : SDNode<"AMDGPUISD::COS_HW", SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]> >; @@ -942,8 +935,31 @@ class CNDGE_Common <bits<5> inst> : R600_3OP < } -let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in { -class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins +let isCodeGenOnly = 1, isPseudo = 1, isVector = 1, Namespace = "AMDGPU", + UseNamedOperandTable = 1 in { +class R600_VEC1OP<dag outs, list<dag> pattern> : InstR600 <outs, (ins +// Slot X + WRITE:$write_X, OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, + R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X, + R600_Pred:$pred_sel_X, +// Slot Y + WRITE:$write_Y, OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y, + R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, 
REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y, + R600_Pred:$pred_sel_Y, +// Slot Z + WRITE:$write_Z, OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z, + R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z, + R600_Pred:$pred_sel_Z, +// Slot W + WRITE:$write_W, OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W, + R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W, + R600_Pred:$pred_sel_W, + LITERAL:$literal0, LITERAL:$literal1), + "", + pattern, + AnyALU>; + +class R600_VEC2OP<dag outs, list<dag> pattern> : InstR600 <outs, (ins // Slot X UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X, OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X, @@ -971,37 +987,18 @@ class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins LITERAL:$literal0, LITERAL:$literal1), "", pattern, - AnyALU> { - - let UseNamedOperandTable = 1; - -} -} - -def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4 - R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X, - R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y, - R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z, - R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>; - + AnyALU>; +} // End isCodeGenOnly = 1, isPseudo = 1, isVector = 1, Namespace = "AMDGPU", +// UseNamedOperandTable = 1 +def DOT4 : R600_VEC2OP<(outs R600_Reg32:$dst), []>; +def CUBE : R600_VEC2OP<(outs R600_TReg32_X:$dst_X, R600_TReg32_Y:$dst_Y, + R600_TReg32_Z:$dst_Z, R600_TReg32_W:$dst_W), []>; class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>; let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { multiclass CUBE_Common <bits<11> inst> { - - def _pseudo : InstR600 < - (outs R600_Reg128:$dst), - (ins R600_Reg128:$src0), - "CUBE $dst $src0", - [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))], - VecALU - > { - let isPseudo = 1; - let UseNamedOperandTable = 1; - } - def _real : R600_2OP <inst, "CUBE", []>; } } // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 @@ -1835,7 +1832,6 @@ def 
MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24", let isVector = 1 in { def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>; - def MULLO_INT_cm : MULLO_INT_Common<0x8F>; def MULHI_INT_cm : MULHI_INT_Common<0x90>; def MULLO_UINT_cm : MULLO_UINT_Common<0x91>; diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp index da2a4d862e..e8b5f927ff 100644 --- a/lib/Target/R600/R600MachineScheduler.cpp +++ b/lib/Target/R600/R600MachineScheduler.cpp @@ -228,7 +228,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const { case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case AMDGPU::DOT4: return AluT_XYZW; case AMDGPU::COPY: if (MI->getOperand(1).isUndef()) { @@ -307,7 +307,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) { case AMDGPU::INTERP_PAIR_XY: case AMDGPU::INTERP_PAIR_ZW: case AMDGPU::INTERP_VEC_LOAD: - case AMDGPU::DOT_4: + case AMDGPU::DOT4: return IDAlu; default: return IDOther; diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll index 5a930b2926..9fe746d1f3 100644 --- a/test/CodeGen/R600/pv.ll +++ b/test/CodeGen/R600/pv.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=r600 | FileCheck %s ;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED) -;CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X +;CHECK: MAX * T{{[0-9].[XYZW]}}, 0.0, PV.X define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 { main_body: |