author    Vincent Lejeune <vljn@ovi.com>  2013-11-07 00:14:07 +0100
committer Vincent Lejeune <vljn@ovi.com>  2013-12-11 19:28:36 +0100
commit    2b4b5221be8c9f591fe1117dfe5f5e455dde6a64 (patch)
tree      6ec9ba1295df1ce7e8cd5104715e307a9f021397
parent    5d7deeb54b1472c36e78b96744bba1dd481448d7 (diff)

R600: Generalize dot4 optimizations to cube instructions (radeonsi)
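
Lower both the dp4 and cube intrinsics to a single 4-slot pseudo instruction
(DOT4 resp. CUBE) and let R600ExpandSpecialInstrs/R600InstrInfo split any such
pseudo into per-channel slot instructions, instead of special-casing DOT_4
throughout the backend.

A rough sketch of the affected IR, with the intrinsic signatures as consumed
by the R600 lowering code (declarations reconstructed here for illustration):

    declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>)
    declare <4 x float> @llvm.AMDGPU.cube(<4 x float>)

Both intrinsics now build a 70-operand machine node via getVector2OpArgs();
the expansion pass then picks the per-slot opcode (DOT4_r600/DOT4_eg or
CUBE_r600_real/CUBE_eg_real) from the subtarget generation.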
-rw-r--r--  lib/Target/R600/AMDGPUISelLowering.h        |  1
-rw-r--r--  lib/Target/R600/R600EmitClauseMarkers.cpp   | 11
-rw-r--r--  lib/Target/R600/R600ExpandSpecialInstrs.cpp | 79
-rw-r--r--  lib/Target/R600/R600ISelLowering.cpp        | 86
-rw-r--r--  lib/Target/R600/R600InstrInfo.cpp           | 73
-rw-r--r--  lib/Target/R600/R600InstrInfo.h             |  6
-rw-r--r--  lib/Target/R600/R600Instructions.td         | 66
-rw-r--r--  lib/Target/R600/R600MachineScheduler.cpp    |  4
-rw-r--r--  test/CodeGen/R600/pv.ll                     |  2
9 files changed, 205 insertions(+), 123 deletions(-)
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index 2dfd3cf492..d3393d40cb 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -159,7 +159,6 @@ enum {
SMIN,
UMIN,
URECIP,
- DOT4,
TEXTURE_FETCH,
EXPORT,
CONST_ADDRESS,
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index 1bbfd2b68f..e3ffc0c14c 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -39,7 +39,7 @@ private:
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+  case AMDGPU::DOT4:
+  case AMDGPU::CUBE:
return 4;
case AMDGPU::KILL:
return 0;
@@ -78,7 +78,7 @@ private:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
case AMDGPU::COPY:
- case AMDGPU::DOT_4:
+  case AMDGPU::DOT4:
+  case AMDGPU::CUBE:
return true;
default:
return false;
@@ -115,13 +115,13 @@ private:
bool UpdateInstr = true) const {
std::vector<std::pair<unsigned, unsigned> > UsedKCache;
- if (!TII->isALUInstr(MI->getOpcode()) && MI->getOpcode() != AMDGPU::DOT_4)
+ if (!TII->isALUInstr(MI->getOpcode()) && !TII->uses4Slots(*MI))
return true;
const SmallVectorImpl<std::pair<MachineOperand *, int64_t> > &Consts =
TII->getSrcs(MI);
assert((TII->isALUInstr(MI->getOpcode()) ||
- MI->getOpcode() == AMDGPU::DOT_4) && "Can't assign Const");
+ TII->uses4Slots(*MI)) && "Can't assign Const");
for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
if (Consts[i].first->getReg() != AMDGPU::ALU_CONST)
continue;
@@ -265,6 +265,9 @@ private:
if (!SubstituteKCacheBank(I, KCacheBanks))
break;
+ if (TII->uses4Slots(I->getOpcode()) &&
+ !SubstituteKCacheBank(I, KCacheBanks))
+ break;
AluInstCount += OccupiedDwords(I);
}
unsigned Opcode = PushBeforeModifier ?
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index 0be491c304..5b528b5b34 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -194,46 +194,65 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
continue;
}
- case AMDGPU::DOT_4: {
+ }
+ if (TII->uses4Slots(MI)) {
const R600RegisterInfo &TRI = TII->getRegisterInfo();
-
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
-
+      const AMDGPUSubtarget &ST = MF.getTarget().getSubtarget<AMDGPUSubtarget>();
+      unsigned SlotOpcode;
+      switch (MI.getOpcode()) {
+      case AMDGPU::DOT4:
+        if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+          SlotOpcode = AMDGPU::DOT4_r600;
+        else
+          SlotOpcode = AMDGPU::DOT4_eg;
+        break;
+      case AMDGPU::CUBE:
+        if (ST.getGeneration() <= AMDGPUSubtarget::R700)
+          SlotOpcode = AMDGPU::CUBE_r600_real;
+        else
+          SlotOpcode = AMDGPU::CUBE_eg_real;
+        break;
+      default:
+        llvm_unreachable("Unknown 4-slot vector instruction");
+      }
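+      // The 4-slot pseudo expands into one real slot instruction per channel:
+      // DOT4 computes a single scalar, so every channel except the one that
+      // matches the destination register gets masked, while CUBE writes a
+      // distinct per-channel destination operand and nothing is masked.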
for (unsigned Chan = 0; Chan < 4; ++Chan) {
- bool Mask = (Chan != TRI.getHWRegChan(DstReg));
- unsigned SubDstReg =
- AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ unsigned SubDstReg;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+      bool Mask = (Chan != TRI.getHWRegChan(DstReg)) &&
+                  MI.getOpcode() == AMDGPU::DOT4;
+ if (MI.getOpcode() == AMDGPU::DOT4)
+ SubDstReg =
+ AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ else
+ SubDstReg = MI.getOperand(Chan).getReg();
MachineInstr *BMI =
- TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
- if (Chan > 0) {
+ TII->buildSlotOfVectorInstruction(MBB, &MI, SlotOpcode, Chan,
+ SubDstReg);
+ if (Chan > 0)
BMI->bundleWithPred();
- }
- if (Mask) {
+ if (Mask)
TII->addFlag(BMI, 0, MO_FLAG_MASK);
- }
if (Chan != 3)
TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
unsigned Opcode = BMI->getOpcode();
- // While not strictly necessary from hw point of view, we force
- // all src operands of a dot4 inst to belong to the same slot.
- unsigned Src0 = BMI->getOperand(
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
- .getReg();
- unsigned Src1 = BMI->getOperand(
- TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
- .getReg();
- (void) Src0;
- (void) Src1;
- if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
- (TRI.getEncodingValue(Src1) & 0xff) < 127)
- assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));
- }
+        // We used to force all src operands of a dot4 slot to read the same
+        // channel; CUBE reads a different channel pair in each slot, so the
+        // check below is disabled:
+ /* unsigned Src0 = BMI->getOperand(
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src0))
+ .getReg();
+ unsigned Src1 = BMI->getOperand(
+ TII->getOperandIdx(Opcode, AMDGPU::OpName::src1))
+ .getReg();
+ (void) Src0;
+ (void) Src1;
+ if ((TRI.getEncodingValue(Src0) & 0xff) < 127 &&
+ (TRI.getEncodingValue(Src1) & 0xff) < 127)
+ assert(TRI.getHWRegChan(Src0) == TRI.getHWRegChan(Src1));*/
+ }
MI.eraseFromParent();
continue;
}
- }
bool IsReduction = TII->isReductionOp(MI.getOpcode());
bool IsVector = TII->isVector(MI);
@@ -312,7 +331,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
// Add the new instruction
unsigned Opcode = MI.getOpcode();
-  switch (Opcode) {
-  case AMDGPU::CUBE_r600_pseudo:
-    Opcode = AMDGPU::CUBE_r600_real;
-    break;
-  case AMDGPU::CUBE_eg_pseudo:
-    Opcode = AMDGPU::CUBE_eg_real;
-    break;
-  default:
-    break;
-  }
MachineInstr *NewMI =
TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1);
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index 21a2b0dd17..bb0f6aab14 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -505,6 +505,25 @@ MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter(
// Custom DAG Lowering Operations
//===----------------------------------------------------------------------===//
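+// Operand layout of the 4-slot pseudos (DOT4, CUBE): 17 operands per slot
+// (update_exec_mask, update_pred, write, omod, dst_rel, clamp, src0 plus its
+// four modifiers, src1 plus its four modifiers, pred_sel) followed by two
+// literals, i.e. 4 * 17 + 2 = 70 operands.  The indices used below are the
+// slot base (0, 17, 34, 51) plus the in-slot offset (write = 2, src0 = 6,
+// src1 = 11, pred_sel = 16).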
+static void getVector2OpArgs(SDValue *Arg, SelectionDAG &DAG,
+ SDValue SlotX0, SDValue SlotY0, SDValue SlotZ0, SDValue SlotW0,
+ SDValue SlotX1, SDValue SlotY1, SDValue SlotZ1, SDValue SlotW1) {
+ for (unsigned i = 0; i < 70; i++)
+ Arg[i] = DAG.getTargetConstant(0, MVT::i32);
+ // WriteMask
+ Arg[2] = Arg[19] = Arg[36] = Arg[53] = DAG.getTargetConstant(1, MVT::i32);
+ Arg[16] = Arg[33] = Arg[50] = Arg[67] =
+ DAG.getRegister(AMDGPU::PRED_SEL_OFF, MVT::i32);
+ Arg[6] = SlotX0;
+ Arg[11] = SlotX1;
+ Arg[23] = SlotY0;
+ Arg[28] = SlotY1;
+ Arg[40] = SlotZ0;
+ Arg[45] = SlotZ1;
+ Arg[57] = SlotW0;
+ Arg[62] = SlotW1;
+}
+
SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
@@ -692,25 +711,52 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return DAG.getNode(AMDGPUISD::TEXTURE_FETCH, DL, MVT::v4f32, TexArgs, 19);
}
case AMDGPUIntrinsic::AMDGPU_dp4: {
- SDValue Args[8] = {
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
- DAG.getConstant(0, MVT::i32)),
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
- DAG.getConstant(0, MVT::i32)),
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
- DAG.getConstant(1, MVT::i32)),
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
- DAG.getConstant(1, MVT::i32)),
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
- DAG.getConstant(2, MVT::i32)),
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
- DAG.getConstant(2, MVT::i32)),
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
- DAG.getConstant(3, MVT::i32)),
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
- DAG.getConstant(3, MVT::i32))
- };
- return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
+ SDValue Op0 = Op.getOperand(1), Op1 = Op.getOperand(2);
+ SDValue Args[70];
+ getVector2OpArgs(Args, DAG,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(0, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(1, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(2, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(3, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op1), MVT::f32, Op1,
+ DAG.getConstant(0, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op1), MVT::f32, Op1,
+ DAG.getConstant(1, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op1), MVT::f32, Op1,
+ DAG.getConstant(2, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op1), MVT::f32, Op1,
+ DAG.getConstant(3, MVT::i32)));
+ MachineSDNode *N = DAG.getMachineNode(AMDGPU::DOT4, DL, MVT::f32, Args);
+ return SDValue(N, 0);
+ }
+ case AMDGPUIntrinsic::AMDGPU_cube: {
+ SDValue Op0 = Op.getOperand(1);
+ SDValue Args[70];
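+    // CUBE reads a single vector through fixed swizzles, src0 = v.zzxy and
+    // src1 = v.yxzz, so the same source element may feed several slots below.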
+ getVector2OpArgs(Args, DAG,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(2, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(2, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(0, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(1, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(1, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(0, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(2, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Op0), MVT::f32, Op0,
+ DAG.getConstant(2, MVT::i32)));
+ MachineSDNode *N = DAG.getMachineNode(AMDGPU::CUBE, DL,
+ MVT::f32, MVT::f32, MVT::f32, MVT::f32, Args);
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, SDValue(N, 0),
+ SDValue(N, 1), SDValue(N, 2), SDValue(N, 3));
}
case Intrinsic::r600_read_ngroups_x:
@@ -1854,7 +1900,7 @@ SDNode *R600TargetLowering::PostISelFolding(MachineSDNode *Node,
I != E; ++I)
Ops.push_back(*I);
- if (Opcode == AMDGPU::DOT_4) {
+ if (TII->uses4Slots(Opcode)) {
int OperandIdx[] = {
TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_X),
TII->getOperandIdx(Opcode, AMDGPU::OpName::src0_Y),
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 2eca6cf432..7607623392 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -42,6 +42,18 @@ bool R600InstrInfo::isTrig(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG;
}
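+// A "4-slot" instruction is a pseudo that expands into one ALU instruction
+// in each of the X/Y/Z/W slots of an instruction group.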
+bool R600InstrInfo::uses4Slots(unsigned Opcode) const {
+ switch (Opcode) {
+ case AMDGPU::CUBE:
+ case AMDGPU::DOT4: return true;
+ default: return false;
+ }
+}
+
+bool R600InstrInfo::uses4Slots(const MachineInstr &MI) const {
+ return uses4Slots(MI.getOpcode());
+}
+
bool R600InstrInfo::isVector(const MachineInstr &MI) const {
return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR;
}
@@ -123,11 +135,6 @@ bool R600InstrInfo::isReductionOp(unsigned Opcode) const {
bool R600InstrInfo::isCubeOp(unsigned Opcode) const {
switch(Opcode) {
default: return false;
- case AMDGPU::CUBE_r600_pseudo:
- case AMDGPU::CUBE_r600_real:
- case AMDGPU::CUBE_eg_pseudo:
- case AMDGPU::CUBE_eg_real:
- return true;
}
}
@@ -172,7 +179,7 @@ bool R600InstrInfo::canBeConsideredALU(const MachineInstr *MI) const {
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
case AMDGPU::COPY:
- case AMDGPU::DOT_4:
+  case AMDGPU::DOT4:
+  case AMDGPU::CUBE:
return true;
default:
return false;
@@ -294,7 +301,7 @@ SmallVector<std::pair<MachineOperand *, int64_t>, 3>
R600InstrInfo::getSrcs(MachineInstr *MI) const {
SmallVector<std::pair<MachineOperand *, int64_t>, 3> Result;
- if (MI->getOpcode() == AMDGPU::DOT_4) {
+ if (uses4Slots(*MI)) {
static const unsigned OpTable[8][2] = {
{AMDGPU::OpName::src0_X, AMDGPU::OpName::src0_sel_X},
{AMDGPU::OpName::src0_Y, AMDGPU::OpName::src0_sel_Y},
@@ -307,8 +314,10 @@ R600InstrInfo::getSrcs(MachineInstr *MI) const {
};
for (unsigned j = 0; j < 8; j++) {
- MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
- OpTable[j][0]));
+ int OperandIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
+ if (OperandIdx < 0)
+ continue;
+ MachineOperand &MO = MI->getOperand(OperandIdx);
unsigned Reg = MO.getReg();
if (Reg == AMDGPU::ALU_CONST) {
unsigned Sel = MI->getOperand(getOperandIdx(MI->getOpcode(),
@@ -927,8 +936,8 @@ R600InstrInfo::isPredicable(MachineInstr *MI) const {
if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0)
return false;
return true;
- } else if (isVector(*MI)) {
- return false;
+ } else if (uses4Slots(*MI)) {
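+    // 4-slot pseudos are predicable: PredicateInstruction below rewrites the
+    // pred_sel operand of each slot.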
+ return true;
} else {
return AMDGPUInstrInfo::isPredicable(MI);
}
@@ -1027,7 +1036,7 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
return true;
}
- if (MI->getOpcode() == AMDGPU::DOT_4) {
+ if (uses4Slots(*MI)) {
MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_X))
.setReg(Pred[2].getReg());
MI->getOperand(getOperandIdx(*MI, AMDGPU::OpName::pred_sel_Y))
@@ -1216,22 +1225,26 @@ static unsigned getSlotedOps(unsigned Op, unsigned Slot) {
#undef OPERAND_CASE
MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
- MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
+ MachineBasicBlock &MBB,
+ MachineInstr *MI,
+ unsigned SlotOpcode,
+ unsigned Slot,
+ unsigned DstReg)
const {
- assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
- unsigned Opcode;
- const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
- if (ST.getGeneration() <= AMDGPUSubtarget::R700)
- Opcode = AMDGPU::DOT4_r600;
- else
- Opcode = AMDGPU::DOT4_eg;
+ MachineInstr *MIB;
MachineBasicBlock::iterator I = MI;
MachineOperand &Src0 = MI->getOperand(
getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src0, Slot)));
- MachineOperand &Src1 = MI->getOperand(
- getOperandIdx(MI->getOpcode(), getSlotedOps(AMDGPU::OpName::src1, Slot)));
- MachineInstr *MIB = buildDefaultInstruction(
- MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
+ int Src1Idx = getOperandIdx(MI->getOpcode(),
+ getSlotedOps(AMDGPU::OpName::src1, Slot));
+  if (Src1Idx > -1) {
+    MachineOperand &Src1 = MI->getOperand(Src1Idx);
+    MIB = buildDefaultInstruction(MBB, I, SlotOpcode, DstReg, Src0.getReg(),
+                                  Src1.getReg());
+  } else {
+    MIB = buildDefaultInstruction(MBB, I, SlotOpcode, DstReg, Src0.getReg(), 0);
+  }
+
static const unsigned Operands[14] = {
AMDGPU::OpName::update_exec_mask,
AMDGPU::OpName::update_pred,
@@ -1251,16 +1264,18 @@ MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
MachineOperand &MO = MI->getOperand(getOperandIdx(MI->getOpcode(),
getSlotedOps(AMDGPU::OpName::pred_sel, Slot)));
- MIB->getOperand(getOperandIdx(Opcode, AMDGPU::OpName::pred_sel))
+ MIB->getOperand(getOperandIdx(SlotOpcode, AMDGPU::OpName::pred_sel))
.setReg(MO.getReg());
for (unsigned i = 0; i < 14; i++) {
- MachineOperand &MO = MI->getOperand(
- getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i], Slot)));
- assert (MO.isImm());
+ int OperandIdx = getOperandIdx(MI->getOpcode(), getSlotedOps(Operands[i],
+ Slot));
+ if (OperandIdx < 0)
+ continue;
+ MachineOperand &MO = MI->getOperand(OperandIdx);
setImmOperand(MIB, Operands[i], MO.getImm());
}
-  MIB->getOperand(20).setImm(0);
+  // MIB->getOperand(20).setImm(0); // literal
return MIB;
}
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index 13d981094e..f0086b0684 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -138,7 +138,10 @@ namespace llvm {
/// Same but using const index set instead of MI set.
bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
- /// \breif Vector instructions are instructions that must fill all
+    /// \brief Returns true if the instruction uses 4 independent slots,
+    /// e.g. dot4 or cube.
+ bool uses4Slots(unsigned Opcode) const;
+ bool uses4Slots(const MachineInstr &MI) const;
+ /// \brief Vector instructions are instructions that must fill all
/// instruction slots within an instruction group.
bool isVector(const MachineInstr &MI) const;
@@ -234,6 +237,7 @@ namespace llvm {
MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
MachineInstr *MI,
+ unsigned SlotOpcode,
unsigned Slot,
unsigned DstReg) const;
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 2249ceec3c..b3a002ade9 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -371,13 +371,6 @@ def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
[SDNPVariadic]
>;
-def DOT4 : SDNode<"AMDGPUISD::DOT4",
- SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>,
- SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>,
- SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>,
- []
->;
-
def COS_HW : SDNode<"AMDGPUISD::COS_HW",
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>
>;
@@ -942,8 +935,31 @@ class CNDGE_Common <bits<5> inst> : R600_3OP <
}
-let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
-class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
+let isCodeGenOnly = 1, isPseudo = 1, isVector = 1, Namespace = "AMDGPU",
+ UseNamedOperandTable = 1 in {
+class R600_VEC1OP<dag outs, list<dag> pattern> : InstR600 <outs, (ins
+// Slot X
+ WRITE:$write_X, OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
+ R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
+ R600_Pred:$pred_sel_X,
+// Slot Y
+ WRITE:$write_Y, OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
+ R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
+ R600_Pred:$pred_sel_Y,
+// Slot Z
+ WRITE:$write_Z, OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
+ R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
+ R600_Pred:$pred_sel_Z,
+// Slot W
+ WRITE:$write_W, OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
+ R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
+ R600_Pred:$pred_sel_W,
+ LITERAL:$literal0, LITERAL:$literal1),
+ "",
+ pattern,
+ AnyALU>;
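+// R600_VEC1OP is the single-source counterpart of R600_VEC2OP; nothing in
+// this patch instantiates it yet.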
+
+class R600_VEC2OP<dag outs, list<dag> pattern> : InstR600 <outs, (ins
// Slot X
UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
@@ -971,37 +987,18 @@ class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
LITERAL:$literal0, LITERAL:$literal1),
"",
pattern,
- AnyALU> {
-
- let UseNamedOperandTable = 1;
-
-}
-}
-
-def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
- R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
- R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
- R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
- R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
-
+ AnyALU>;
+} // End isCodeGenOnly = 1, isPseudo = 1, isVector = 1, Namespace = "AMDGPU",
+// UseNamedOperandTable = 1
+def DOT4 : R600_VEC2OP<(outs R600_Reg32:$dst), []>;
+def CUBE : R600_VEC2OP<(outs R600_TReg32_X:$dst_X, R600_TReg32_Y:$dst_Y,
+ R600_TReg32_Z:$dst_Z, R600_TReg32_W:$dst_W), []>;
class DOT4_Common <bits<11> inst> : R600_2OP <inst, "DOT4", []>;
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
multiclass CUBE_Common <bits<11> inst> {
-
- def _pseudo : InstR600 <
- (outs R600_Reg128:$dst),
- (ins R600_Reg128:$src0),
- "CUBE $dst $src0",
- [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))],
- VecALU
- > {
- let isPseudo = 1;
- let UseNamedOperandTable = 1;
- }
-
def _real : R600_2OP <inst, "CUBE", []>;
}
} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
@@ -1835,7 +1832,6 @@ def MUL_INT24_cm : R600_2OP <0x5B, "MUL_INT24",
let isVector = 1 in {
def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
-
def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
def MULHI_INT_cm : MULHI_INT_Common<0x90>;
def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index da2a4d862e..e8b5f927ff 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -228,7 +228,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+  case AMDGPU::DOT4:
+  case AMDGPU::CUBE:
return AluT_XYZW;
case AMDGPU::COPY:
if (MI->getOperand(1).isUndef()) {
@@ -307,7 +307,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT_4:
+  case AMDGPU::DOT4:
+  case AMDGPU::CUBE:
return IDAlu;
default:
return IDOther;
diff --git a/test/CodeGen/R600/pv.ll b/test/CodeGen/R600/pv.ll
index 5a930b2926..9fe746d1f3 100644
--- a/test/CodeGen/R600/pv.ll
+++ b/test/CodeGen/R600/pv.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=r600 | FileCheck %s
;CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
-;CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
+;CHECK: MAX * T{{[0-9].[XYZW]}}, 0.0, PV.X
define void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) #0 {
main_body: