summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorVincent Lejeune <vljn@ovi.com>2013-02-24 16:31:32 +0100
committerVincent Lejeune <vljn@ovi.com>2013-05-04 20:19:54 +0200
commitb46e7f089e8adee4995b0bc05accc87a2977cc04 (patch)
treecbea05558bd94ca456e5f87ebe68296cefe86a83
parent06cfc8ddbd7b07d17ae674265cbaafc963d7376b (diff)
R600: Relax some vector constraints on Dot4.codesize6
Dot4 now uses 8 scalar operands instead of 2 vectors one which allows register coalescer to remove some unneeded COPY. This patch also defines some structures/functions that can be used to handle every vector instructions (CUBE, Cayman special instructions...) in a similar fashion.
-rw-r--r--lib/Target/R600/AMDGPUISelLowering.h1
-rw-r--r--lib/Target/R600/R600Defines.h74
-rw-r--r--lib/Target/R600/R600EmitClauseMarkers.cpp6
-rw-r--r--lib/Target/R600/R600ExpandSpecialInstrs.cpp25
-rw-r--r--lib/Target/R600/R600ISelLowering.cpp21
-rw-r--r--lib/Target/R600/R600InstrInfo.cpp89
-rw-r--r--lib/Target/R600/R600InstrInfo.h5
-rw-r--r--lib/Target/R600/R600Instructions.td51
-rw-r--r--lib/Target/R600/R600MachineScheduler.cpp2
9 files changed, 269 insertions, 5 deletions
diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
index c2a79ea999..acb7e2526a 100644
--- a/lib/Target/R600/AMDGPUISelLowering.h
+++ b/lib/Target/R600/AMDGPUISelLowering.h
@@ -126,6 +126,7 @@ enum {
SMIN,
UMIN,
URECIP,
+ DOT4,
EXPORT,
CONST_ADDRESS,
REGISTER_LOAD,
diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h
index 303ca739ec..ddb6167eb3 100644
--- a/lib/Target/R600/R600Defines.h
+++ b/lib/Target/R600/R600Defines.h
@@ -95,6 +95,80 @@ namespace R600Operands {
{0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8, 9,-1,10,11,12,13,14,15,16,17,18}
};
+ enum VecOps {
+ UPDATE_EXEC_MASK_X,
+ UPDATE_PREDICATE_X,
+ WRITE_X,
+ OMOD_X,
+ DST_REL_X,
+ CLAMP_X,
+ SRC0_X,
+ SRC0_NEG_X,
+ SRC0_REL_X,
+ SRC0_ABS_X,
+ SRC0_SEL_X,
+ SRC1_X,
+ SRC1_NEG_X,
+ SRC1_REL_X,
+ SRC1_ABS_X,
+ SRC1_SEL_X,
+ PRED_SEL_X,
+ UPDATE_EXEC_MASK_Y,
+ UPDATE_PREDICATE_Y,
+ WRITE_Y,
+ OMOD_Y,
+ DST_REL_Y,
+ CLAMP_Y,
+ SRC0_Y,
+ SRC0_NEG_Y,
+ SRC0_REL_Y,
+ SRC0_ABS_Y,
+ SRC0_SEL_Y,
+ SRC1_Y,
+ SRC1_NEG_Y,
+ SRC1_REL_Y,
+ SRC1_ABS_Y,
+ SRC1_SEL_Y,
+ PRED_SEL_Y,
+ UPDATE_EXEC_MASK_Z,
+ UPDATE_PREDICATE_Z,
+ WRITE_Z,
+ OMOD_Z,
+ DST_REL_Z,
+ CLAMP_Z,
+ SRC0_Z,
+ SRC0_NEG_Z,
+ SRC0_REL_Z,
+ SRC0_ABS_Z,
+ SRC0_SEL_Z,
+ SRC1_Z,
+ SRC1_NEG_Z,
+ SRC1_REL_Z,
+ SRC1_ABS_Z,
+ SRC1_SEL_Z,
+ PRED_SEL_Z,
+ UPDATE_EXEC_MASK_W,
+ UPDATE_PREDICATE_W,
+ WRITE_W,
+ OMOD_W,
+ DST_REL_W,
+ CLAMP_W,
+ SRC0_W,
+ SRC0_NEG_W,
+ SRC0_REL_W,
+ SRC0_ABS_W,
+ SRC0_SEL_W,
+ SRC1_W,
+ SRC1_NEG_W,
+ SRC1_REL_W,
+ SRC1_ABS_W,
+ SRC1_SEL_W,
+ PRED_SEL_W,
+ IMM_0,
+ IMM_1,
+ VEC_COUNT
+ };
+
}
#endif // R600DEFINES_H_
diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
index 3fdc678b9e..6e9a15a7a2 100644
--- a/lib/Target/R600/R600EmitClauseMarkers.cpp
+++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
@@ -36,8 +36,7 @@ private:
case AMDGPU::INTERP_PAIR_XY:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
- case AMDGPU::DOT4_eg_pseudo:
- case AMDGPU::DOT4_r600_pseudo:
+ case AMDGPU::DOT_4:
return 4;
case AMDGPU::KILL:
return 0;
@@ -71,8 +70,7 @@ private:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
case AMDGPU::COPY:
- case AMDGPU::DOT4_eg_pseudo:
- case AMDGPU::DOT4_r600_pseudo:
+ case AMDGPU::DOT_4:
return true;
default:
return false;
diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
index f8c900f727..993bdadcbc 100644
--- a/lib/Target/R600/R600ExpandSpecialInstrs.cpp
+++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp
@@ -182,6 +182,31 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
MI.eraseFromParent();
continue;
}
+ case AMDGPU::DOT_4: {
+
+ const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK;
+
+ for (unsigned Chan = 0; Chan < 4; ++Chan) {
+ bool Mask = (Chan != TRI.getHWRegChan(DstReg));
+ unsigned SubDstReg =
+ AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan);
+ MachineInstr *BMI =
+ TII->buildSlotOfVectorInstruction(MBB, &MI, Chan, SubDstReg);
+ if (Chan > 0) {
+ BMI->bundleWithPred();
+ }
+ if (Mask) {
+ TII->addFlag(BMI, 0, MO_FLAG_MASK);
+ }
+ if (Chan != 3)
+ TII->addFlag(BMI, 0, MO_FLAG_NOT_LAST);
+ }
+ MI.eraseFromParent();
+ continue;
+ }
}
bool IsReduction = TII->isReductionOp(MI.getOpcode());
diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
index a66bacaa5f..0fc5be4e94 100644
--- a/lib/Target/R600/R600ISelLowering.cpp
+++ b/lib/Target/R600/R600ISelLowering.cpp
@@ -392,6 +392,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return SDValue(interp, slot % 2);
}
+ case AMDGPUIntrinsic::AMDGPU_dp4: {
+ SDValue Args[8] = {
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(0, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(0, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(1, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(1, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(2, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(2, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(1),
+ DAG.getConstant(3, MVT::i32)),
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, Op.getOperand(2),
+ DAG.getConstant(3, MVT::i32))
+ };
+ return DAG.getNode(AMDGPUISD::DOT4, DL, MVT::f32, Args, 8);
+ }
case r600_read_ngroups_x:
return LowerImplicitParameter(DAG, VT, DL, 0);
diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
index 6470beda46..39cef73737 100644
--- a/lib/Target/R600/R600InstrInfo.cpp
+++ b/lib/Target/R600/R600InstrInfo.cpp
@@ -728,6 +728,95 @@ MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MB
return MIB;
}
+#define OPERAND_CASE(Label) \
+ case Label: { \
+ static const R600Operands::VecOps Ops[] = \
+ { \
+ Label##_X, \
+ Label##_Y, \
+ Label##_Z, \
+ Label##_W \
+ }; \
+ return Ops[Slot]; \
+ }
+
+static R600Operands::VecOps
+getSlotedOps(R600Operands::Ops Op, unsigned Slot) {
+ switch (Op) {
+ OPERAND_CASE(R600Operands::UPDATE_EXEC_MASK)
+ OPERAND_CASE(R600Operands::UPDATE_PREDICATE)
+ OPERAND_CASE(R600Operands::WRITE)
+ OPERAND_CASE(R600Operands::OMOD)
+ OPERAND_CASE(R600Operands::DST_REL)
+ OPERAND_CASE(R600Operands::CLAMP)
+ OPERAND_CASE(R600Operands::SRC0)
+ OPERAND_CASE(R600Operands::SRC0_NEG)
+ OPERAND_CASE(R600Operands::SRC0_REL)
+ OPERAND_CASE(R600Operands::SRC0_ABS)
+ OPERAND_CASE(R600Operands::SRC0_SEL)
+ OPERAND_CASE(R600Operands::SRC1)
+ OPERAND_CASE(R600Operands::SRC1_NEG)
+ OPERAND_CASE(R600Operands::SRC1_REL)
+ OPERAND_CASE(R600Operands::SRC1_ABS)
+ OPERAND_CASE(R600Operands::SRC1_SEL)
+ OPERAND_CASE(R600Operands::PRED_SEL)
+ default:
+ llvm_unreachable("Wrong Operand");
+ }
+}
+
+#undef OPERAND_CASE
+
+static int
+getVecOperandIdx(R600Operands::VecOps Op) {
+ return 1 + Op;
+}
+
+
+MachineInstr *R600InstrInfo::buildSlotOfVectorInstruction(
+ MachineBasicBlock &MBB, MachineInstr *MI, unsigned Slot, unsigned DstReg)
+ const {
+ assert (MI->getOpcode() == AMDGPU::DOT_4 && "Not Implemented");
+ unsigned Opcode;
+ const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>();
+ if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD4XXX)
+ Opcode = AMDGPU::DOT4_r600_real;
+ else
+ Opcode = AMDGPU::DOT4_eg_real;
+ MachineBasicBlock::iterator I = MI;
+ MachineOperand &Src0 = MI->getOperand(
+ getVecOperandIdx(getSlotedOps(R600Operands::SRC0, Slot)));
+ MachineOperand &Src1 = MI->getOperand(
+ getVecOperandIdx(getSlotedOps(R600Operands::SRC1, Slot)));
+ MachineInstr *MIB = buildDefaultInstruction(
+ MBB, I, Opcode, DstReg, Src0.getReg(), Src1.getReg());
+ static const R600Operands::Ops Operands[14] = {
+ R600Operands::UPDATE_EXEC_MASK,
+ R600Operands::UPDATE_PREDICATE,
+ R600Operands::WRITE,
+ R600Operands::OMOD,
+ R600Operands::DST_REL,
+ R600Operands::CLAMP,
+ R600Operands::SRC0_NEG,
+ R600Operands::SRC0_REL,
+ R600Operands::SRC0_ABS,
+ R600Operands::SRC0_SEL,
+ R600Operands::SRC1_NEG,
+ R600Operands::SRC1_REL,
+ R600Operands::SRC1_ABS,
+ R600Operands::SRC1_SEL,
+ };
+
+ for (unsigned i = 0; i < 14; i++) {
+ MachineOperand &MO = MI->getOperand(
+ getVecOperandIdx(getSlotedOps(Operands[i], Slot)));
+ assert (MO.isImm());
+ setImmOperand(MIB, Operands[i], MO.getImm());
+ }
+ MIB->getOperand(20).setImm(0);
+ return MIB;
+}
+
MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
unsigned DstReg,
diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
index babe4b8fe5..0c2518a5e5 100644
--- a/lib/Target/R600/R600InstrInfo.h
+++ b/lib/Target/R600/R600InstrInfo.h
@@ -170,6 +170,11 @@ namespace llvm {
unsigned Src0Reg,
unsigned Src1Reg = 0) const;
+ MachineInstr *buildSlotOfVectorInstruction(MachineBasicBlock &MBB,
+ MachineInstr *MI,
+ unsigned Slot,
+ unsigned DstReg) const;
+
MachineInstr *buildMovImm(MachineBasicBlock &BB,
MachineBasicBlock::iterator I,
unsigned DstReg,
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index cbe2a87bf6..de7d1ae2e1 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -618,6 +618,13 @@ def CONST_ADDRESS: SDNode<"AMDGPUISD::CONST_ADDRESS",
[SDNPVariadic]
>;
+def DOT4 : SDNode<"AMDGPUISD::DOT4",
+ SDTypeProfile<1, 8, [SDTCisFP<0>, SDTCisVT<1, f32>, SDTCisVT<2, f32>,
+ SDTCisVT<3, f32>, SDTCisVT<4, f32>, SDTCisVT<5, f32>,
+ SDTCisVT<6, f32>, SDTCisVT<7, f32>, SDTCisVT<8, f32>]>,
+ []
+>;
+
//===----------------------------------------------------------------------===//
// Interpolation Instructions
//===----------------------------------------------------------------------===//
@@ -1254,12 +1261,54 @@ class CNDGE_Common <bits<5> inst> : R600_3OP <
[(set f32:$dst, (selectcc f32:$src0, FP_ZERO, f32:$src1, f32:$src2, COND_GE))]
>;
+
+let isCodeGenOnly = 1, isPseudo = 1, Namespace = "AMDGPU" in {
+class R600_VEC2OP<list<dag> pattern> : InstR600 <(outs R600_Reg32:$dst), (ins
+// Slot X
+ UEM:$update_exec_mask_X, UP:$update_pred_X, WRITE:$write_X,
+ OMOD:$omod_X, REL:$dst_rel_X, CLAMP:$clamp_X,
+ R600_TReg32_X:$src0_X, NEG:$src0_neg_X, REL:$src0_rel_X, ABS:$src0_abs_X, SEL:$src0_sel_X,
+ R600_TReg32_X:$src1_X, NEG:$src1_neg_X, REL:$src1_rel_X, ABS:$src1_abs_X, SEL:$src1_sel_X,
+ R600_Pred:$pred_sel_X,
+// Slot Y
+ UEM:$update_exec_mask_Y, UP:$update_pred_Y, WRITE:$write_Y,
+ OMOD:$omod_Y, REL:$dst_rel_Y, CLAMP:$clamp_Y,
+ R600_TReg32_Y:$src0_Y, NEG:$src0_neg_Y, REL:$src0_rel_Y, ABS:$src0_abs_Y, SEL:$src0_sel_Y,
+ R600_TReg32_Y:$src1_Y, NEG:$src1_neg_Y, REL:$src1_rel_Y, ABS:$src1_abs_Y, SEL:$src1_sel_Y,
+ R600_Pred:$pred_sel_Y,
+// Slot Z
+ UEM:$update_exec_mask_Z, UP:$update_pred_Z, WRITE:$write_Z,
+ OMOD:$omod_Z, REL:$dst_rel_Z, CLAMP:$clamp_Z,
+ R600_TReg32_Z:$src0_Z, NEG:$src0_neg_Z, REL:$src0_rel_Z, ABS:$src0_abs_Z, SEL:$src0_sel_Z,
+ R600_TReg32_Z:$src1_Z, NEG:$src1_neg_Z, REL:$src1_rel_Z, ABS:$src1_abs_Z, SEL:$src1_sel_Z,
+ R600_Pred:$pred_sel_Z,
+// Slot W
+ UEM:$update_exec_mask_W, UP:$update_pred_W, WRITE:$write_W,
+ OMOD:$omod_W, REL:$dst_rel_W, CLAMP:$clamp_W,
+ R600_TReg32_W:$src0_W, NEG:$src0_neg_W, REL:$src0_rel_W, ABS:$src0_abs_W, SEL:$src0_sel_W,
+ R600_TReg32_W:$src1_W, NEG:$src1_neg_W, REL:$src1_rel_W, ABS:$src1_abs_W, SEL:$src1_sel_W,
+ R600_Pred:$pred_sel_W,
+ LITERAL:$literal0, LITERAL:$literal1),
+ "",
+ pattern,
+ AnyALU> {}
+}
+
+def DOT_4 : R600_VEC2OP<[(set R600_Reg32:$dst, (DOT4
+ R600_TReg32_X:$src0_X, R600_TReg32_X:$src1_X,
+ R600_TReg32_Y:$src0_Y, R600_TReg32_Y:$src1_Y,
+ R600_TReg32_Z:$src0_Z, R600_TReg32_Z:$src1_Z,
+ R600_TReg32_W:$src0_W, R600_TReg32_W:$src1_W))]>;
+
+
+
+
multiclass DOT4_Common <bits<11> inst> {
def _pseudo : R600_REDUCTION <inst,
(ins R600_Reg128:$src0, R600_Reg128:$src1),
"DOT4 $dst $src0, $src1",
- [(set f32:$dst, (int_AMDGPU_dp4 v4f32:$src0, v4f32:$src1))]
+ []
>;
def _real : R600_2OP <inst, "DOT4", []>;
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 5d4c4d7e73..343bb72a93 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -418,6 +418,7 @@ R600SchedStrategy::AluKind R600SchedStrategy::getAluKind(SUnit *SU) const {
case AMDGPU::PRED_X:
case AMDGPU::INTERP_PAIR_ZW:
case AMDGPU::INTERP_VEC_LOAD:
+ case AMDGPU::DOT_4:
return AluT_XYZW;
case AMDGPU::COPY:
if (MI->getOperand(1).isUndef()) {
@@ -485,6 +486,7 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
case AMDGPU::INTERP_VEC_LOAD:
case AMDGPU::DOT4_eg_pseudo:
case AMDGPU::DOT4_r600_pseudo:
+ case AMDGPU::DOT_4:
return IDAlu;
default:
return IDOther;