author     Vincent Lejeune <vljn@ovi.com>    2013-02-17 15:42:51 +0100
committer  Vincent Lejeune <vljn@ovi.com>    2013-02-18 13:44:58 +0100
commit     a9d587d5d0a1441cce20c78ead7404845ff5c2b0 (patch)
tree       b70d608a4d0cb41b65fbda5193b8ce5749e0ff5b
parent     ed7cd57044d4310e8a431f060bc2bcbf3a1c5344 (diff)
R600: Recompute schedule graph non-order dependencies.
-rw-r--r--  lib/Target/R600/R600MachineScheduler.cpp | 120
-rw-r--r--  test/CodeGen/R600/schedule1.ll            | 116
-rw-r--r--  test/CodeGen/R600/schedule2.ll            | 102
3 files changed, 337 insertions(+), 1 deletion(-)
diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 0aa9997151..2fc8b34e5c 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -24,7 +24,123 @@
 #include <iostream>

 using namespace llvm;

+/// \brief Recompute Output and Anti dependencies of the incoming DAG.
+/// ScheduleDAGInstrs has a conservative policy about subregister dependencies:
+/// all subreg writes to the same superreg are chained by Output/Anti deps.
+/// These artificial deps delay the release of MIs and thus reduce parallelism
+/// opportunities. This function recomputes the ScheduleDAG to produce proper
+/// subreg-aware dependencies.
+static
+void RecomputeScheduleDAGMI(ScheduleDAGMI *dag) {
+
+  // Remove all Output/Anti deps
+  for (unsigned i = 0; i < dag->SUnits.size(); ++i) {
+    SUnit &SU = dag->SUnits[i];
+    for (SUnit::pred_iterator SUIt = SU.Preds.begin(), SUE = SU.Preds.end();
+        SUIt != SUE; ++SUIt) {
+      SDep &SD = *SUIt;
+      if (SD.getKind() == SDep::Output || SD.getKind() == SDep::Data) {
+        SU.removePred(SD);
+      }
+    }
+  }
+
+  // Now recompute output/anti dependencies
+  for (unsigned i = 0; i < dag->SUnits.size(); ++i) {
+    SUnit &SU = dag->SUnits[i];
+    MachineOperand &DestMO = SU.getInstr()->getOperand(0);
+    unsigned DestReg = SU.getInstr()->getOperand(0).getReg();
+    // Using LiveIntervals would make this a lot more efficient, but we
+    // can't access them inside a MachineSchedStrategy.
+    // Scheduling occurs on a per-MBB basis, so it is sufficient to compute
+    // deps inside an MBB.
+    MachineBasicBlock *MBB = SU.getInstr()->getParent();
+    MachineBasicBlock::iterator SUPos = SU.getInstr();
+    // We walk the MIs from the MBB's start to the SU's instruction; deps on
+    // later instructions are caught when visiting later SUs, and addPred
+    // takes care of both ends of an SDep.
+    for (MachineBasicBlock::iterator It = MBB->begin(), E = SUPos; It != E;
+        ++It) {
+      MachineInstr &MI = *It;
+      for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
+        MachineOperand &MO = MI.getOperand(i);
+        if (!MO.isReg() || MO.getReg() != DestReg)
+          continue;
+        if (MO.isUse() &&
+            (DestMO.getSubReg() == AMDGPU::NoSubRegister ||
+            MO.getSubReg() == DestMO.getSubReg())
+            ) {
+          SUnit *Predecessor = dag->getSUnit(&MI);
+          SU.addPred(SDep(Predecessor, SDep::Anti, DestReg));
+        }
+        if (MO.isDef() &&
+            (MO.getSubReg() == AMDGPU::NoSubRegister ||
+            MO.getSubReg() == DestMO.getSubReg())
+            ) {
+          SUnit *Predecessor = dag->getSUnit(&MI);
+          SU.addPred(SDep(Predecessor, SDep::Output, DestReg));
+        }
+      }
+    }
+    // Compute data dependencies
+    std::set<unsigned> DefRegs;
+    for (MachineInstr::mop_iterator MOI = SU.getInstr()->operands_begin(), MOE = SU.getInstr()->operands_end();
+        MOI != MOE; ++MOI) {
+      MachineOperand &MO = *MOI;
+      if (MO.isReg() && MO.isDef())
+        DefRegs.insert(MO.getReg());
+    }
+
+    for (MachineBasicBlock::iterator It = llvm::next(SUPos), E = MBB->end(); It != E;
+        ++It) {
+      MachineInstr &MI = *It;
+      for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
+        MachineOperand &MO = MI.getOperand(i);
+        if (!MO.isReg() || !MO.isUse())
+          continue;
+        std::set<unsigned>::iterator OtherDef = std::find(DefRegs.begin(), DefRegs.end(), MO.getReg());
+        if (OtherDef != DefRegs.end()) {
+          SUnit *Successor = dag->getSUnit(&MI);
+          if (Successor)
+            Successor->addPred(SDep(&SU, SDep::Data, DestReg));
+        }
+        if (MO.getReg() != DestMO.getReg())
+          continue;
+        if (MO.getSubReg() == AMDGPU::NoSubRegister ||
+            MO.getSubReg() == DestMO.getSubReg()) {
+          SUnit *Successor = dag->getSUnit(&MI);
+          if (Successor)
+            Successor->addPred(SDep(&SU, SDep::Data, DestReg));
+        }
+      }
+    }
+  }
+
+  DEBUG(
+    dbgs() << "\n\n Recomputed DAG is :";
+    for (unsigned i = 0; i < dag->SUnits.size(); ++i) {
+      SUnit &SU = dag->SUnits[i];
+      dbgs() << "\n\n";
+      dag->SUnits[i].dump(dag);
+      dbgs() << "\nSuccs (" << SU.NumSuccsLeft << "):\n";
+      for (unsigned j = 0; j < SU.Succs.size(); j++) {
+        dbgs() << "- (" << SU.Succs[j].getKind() << ") ";
+        SU.Succs[j].getSUnit()->dump(dag);
+        dbgs() << "\n";
+      }
+      dbgs() << " and Preds (" << SU.NumPredsLeft << ") :\n";
+      for (unsigned j = 0; j < SU.Preds.size(); j++) {
+        dbgs() << "- (" << SU.Preds[j].getKind() << ") ";
+        SU.Preds[j].getSUnit()->dump(dag);
+        dbgs() << "\n";
+      }
+    }
+  );
+
+}
+
 void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
+  RecomputeScheduleDAGMI(dag);
   DAG = dag;
   TII = static_cast<const R600InstrInfo*>(DAG->TII);
@@ -75,10 +191,12 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
       (!AllowSwitchFromAlu && CurInstKind == IDAlu)) {
     // try to pick ALU
     SU = pickAlu();
-    if (SU)
+    if (SU) {
+      SU->getInstr()->getOperand(0).setIsUndef(false);
       if (CurEmitted > InstKindLimit[IDAlu])
         CurEmitted = 0;
       NextInstKind = IDAlu;
+    }
   }

   if (!SU) {
diff --git a/test/CodeGen/R600/schedule1.ll b/test/CodeGen/R600/schedule1.ll
new file mode 100644
index 0000000000..14fad97a15
--- /dev/null
+++ b/test/CodeGen/R600/schedule1.ll
@@ -0,0 +1,116 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+
+;CHECK: MOV_SAT T{{[0-9]+\.[X]}}
+;CHECK: MOV_SAT T{{[0-9]+\.[Y]}}
+;CHECK: MOV_SAT T{{[0-9]+\.[Z]}}
+;CHECK: MOV_SAT T{{[0-9]+\.[W]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[X]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[Y]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[Z]}}
+;CHECK: MUL_IEEE T{{[0-9]+\.[W]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[X]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[Y]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[Z]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[W]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[X]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[Y]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[Z]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[W]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[X]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[Y]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[Z]}}
+;CHECK: MULADD_IEEE T{{[0-9]+\.[W]}}
+
+define void @main() {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = call float @llvm.R600.load.input(i32 8)
+  %5 = call float @llvm.R600.load.input(i32 9)
+  %6 = call float @llvm.R600.load.input(i32 10)
+  %7 = call float @llvm.R600.load.input(i32 11)
+  %8 = load <4 x float> addrspace(9)* null
+  %9 = extractelement <4 x float> %8, i32 0
+  %10 = fmul float %0, %9
+  %11 = load <4 x float> addrspace(9)* null
+  %12 = extractelement <4 x float> %11, i32 1
+  %13 = fmul float %0, %12
+  %14 = load <4 x float> addrspace(9)* null
+  %15 = extractelement <4 x float> %14, i32 2
+  %16 = fmul float %0, %15
+  %17 = load <4 x float> addrspace(9)* null
+  %18 = extractelement <4 x float> %17, i32 3
+  %19 = fmul float %0, %18
+  %20 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %21 = extractelement <4 x float> %20, i32 0
+  %22 = fmul float %1, %21
+  %23 = fadd float %22, %10
+  %24 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %25 = extractelement <4 x float> %24, i32 1
+  %26 = fmul float %1, %25
+  %27 = fadd float %26, %13
+  %28 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %29 = extractelement <4 x float> %28, i32 2
+  %30 = fmul float %1, %29
+  %31 = fadd float %30, %16
+  %32 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 1)
+  %33 = extractelement <4 x float> %32, i32 3
+  %34 = fmul float %1, %33
+  %35 = fadd float %34, %19
+  %36 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %37 = extractelement <4 x float> %36, i32 0
+  %38 = fmul float %2, %37
+  %39 = fadd float %38, %23
+  %40 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %41 = extractelement <4 x float> %40, i32 1
+  %42 = fmul float %2, %41
+  %43 = fadd float %42, %27
+  %44 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %45 = extractelement <4 x float> %44, i32 2
+  %46 = fmul float %2, %45
+  %47 = fadd float %46, %31
+  %48 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 2)
+  %49 = extractelement <4 x float> %48, i32 3
+  %50 = fmul float %2, %49
+  %51 = fadd float %50, %35
+  %52 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %53 = extractelement <4 x float> %52, i32 0
+  %54 = fmul float %3, %53
+  %55 = fadd float %54, %39
+  %56 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %57 = extractelement <4 x float> %56, i32 1
+  %58 = fmul float %3, %57
+  %59 = fadd float %58, %43
+  %60 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %61 = extractelement <4 x float> %60, i32 2
+  %62 = fmul float %3, %61
+  %63 = fadd float %62, %47
+  %64 = load <4 x float> addrspace(9)* getelementptr ([1024 x <4 x float>] addrspace(9)* null, i64 0, i32 3)
+  %65 = extractelement <4 x float> %64, i32 3
+  %66 = fmul float %3, %65
+  %67 = fadd float %66, %51
+  %68 = call float @llvm.AMDIL.clamp.(float %4, float 0.000000e+00, float 1.000000e+00)
+  %69 = call float @llvm.AMDIL.clamp.(float %5, float 0.000000e+00, float 1.000000e+00)
+  %70 = call float @llvm.AMDIL.clamp.(float %6, float 0.000000e+00, float 1.000000e+00)
+  %71 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
+  %72 = insertelement <4 x float> undef, float %55, i32 0
+  %73 = insertelement <4 x float> %72, float %59, i32 1
+  %74 = insertelement <4 x float> %73, float %63, i32 2
+  %75 = insertelement <4 x float> %74, float %67, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %75, i32 60, i32 1)
+  %76 = insertelement <4 x float> undef, float %68, i32 0
+  %77 = insertelement <4 x float> %76, float %69, i32 1
+  %78 = insertelement <4 x float> %77, float %70, i32 2
+  %79 = insertelement <4 x float> %78, float %71, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %79, i32 0, i32 2)
+  ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare float @llvm.AMDIL.clamp.(float, float, float) readnone
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
diff --git a/test/CodeGen/R600/schedule2.ll b/test/CodeGen/R600/schedule2.ll
new file mode 100644
index 0000000000..9cd46fcf70
--- /dev/null
+++ b/test/CodeGen/R600/schedule2.ll
@@ -0,0 +1,102 @@
+;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s
+
+;CHECK: MOV_SAT T{{[0-9]+\.[X]}}
+;CHECK-NEXT: MOV_SAT T{{[0-9]+\.[Y]}}
+;CHECK-NEXT: MOV_SAT T{{[0-9]+\.[Z]}}
+;CHECK-NEXT: MOV_SAT T{{[0-9]+\.[W]}}
+
+define void @main() {
+main_body:
+  %0 = call float @llvm.R600.load.input(i32 4)
+  %1 = call float @llvm.R600.load.input(i32 5)
+  %2 = call float @llvm.R600.load.input(i32 6)
+  %3 = call float @llvm.R600.load.input(i32 7)
+  %4 = call float @llvm.R600.load.input(i32 8)
+  %5 = call float @llvm.R600.load.input(i32 9)
+  %6 = call float @llvm.R600.load.input(i32 10)
+  %7 = call float @llvm.R600.load.input(i32 11)
+  %8 = call float @llvm.R600.load.input(i32 12)
+  %9 = call float @llvm.R600.load.input(i32 13)
+  %10 = call float @llvm.R600.load.input(i32 14)
+  %11 = call float @llvm.R600.load.input(i32 15)
+  %12 = call float @llvm.R600.load.input(i32 16)
+  %13 = call float @llvm.R600.load.input(i32 17)
+  %14 = call float @llvm.R600.load.input(i32 18)
+  %15 = call float @llvm.R600.load.input(i32 19)
+  %16 = call float @llvm.R600.load.input(i32 20)
+  %17 = call float @llvm.R600.load.input(i32 21)
+  %18 = call float @llvm.R600.load.input(i32 22)
+  %19 = call float @llvm.R600.load.input(i32 23)
+  %20 = call float @llvm.R600.load.input(i32 24)
+  %21 = call float @llvm.R600.load.input(i32 25)
+  %22 = call float @llvm.R600.load.input(i32 26)
+  %23 = call float @llvm.R600.load.input(i32 27)
+  %24 = call float @llvm.R600.load.input(i32 28)
+  %25 = call float @llvm.R600.load.input(i32 29)
+  %26 = call float @llvm.R600.load.input(i32 30)
+  %27 = call float @llvm.R600.load.input(i32 31)
+  %28 = insertelement <4 x float> undef, float %12, i32 0
+  %29 = insertelement <4 x float> %28, float %13, i32 1
+  %30 = insertelement <4 x float> %29, float %14, i32 2
+  %31 = insertelement <4 x float> %30, float %15, i32 3
+  %32 = insertelement <4 x float> undef, float %0, i32 0
+  %33 = insertelement <4 x float> %32, float %1, i32 1
+  %34 = insertelement <4 x float> %33, float %2, i32 2
+  %35 = insertelement <4 x float> %34, float %3, i32 3
+  %36 = call float @llvm.AMDGPU.dp4(<4 x float> %31, <4 x float> %35)
+  %37 = insertelement <4 x float> undef, float %16, i32 0
+  %38 = insertelement <4 x float> %37, float %17, i32 1
+  %39 = insertelement <4 x float> %38, float %18, i32 2
+  %40 = insertelement <4 x float> %39, float %19, i32 3
+  %41 = insertelement <4 x float> undef, float %0, i32 0
+  %42 = insertelement <4 x float> %41, float %1, i32 1
+  %43 = insertelement <4 x float> %42, float %2, i32 2
+  %44 = insertelement <4 x float> %43, float %3, i32 3
+  %45 = call float @llvm.AMDGPU.dp4(<4 x float> %40, <4 x float> %44)
+  %46 = insertelement <4 x float> undef, float %20, i32 0
+  %47 = insertelement <4 x float> %46, float %21, i32 1
+  %48 = insertelement <4 x float> %47, float %22, i32 2
+  %49 = insertelement <4 x float> %48, float %23, i32 3
+  %50 = insertelement <4 x float> undef, float %0, i32 0
+  %51 = insertelement <4 x float> %50, float %1, i32 1
+  %52 = insertelement <4 x float> %51, float %2, i32 2
+  %53 = insertelement <4 x float> %52, float %3, i32 3
+  %54 = call float @llvm.AMDGPU.dp4(<4 x float> %49, <4 x float> %53)
+  %55 = insertelement <4 x float> undef, float %24, i32 0
+  %56 = insertelement <4 x float> %55, float %25, i32 1
+  %57 = insertelement <4 x float> %56, float %26, i32 2
+  %58 = insertelement <4 x float> %57, float %27, i32 3
+  %59 = insertelement <4 x float> undef, float %0, i32 0
+  %60 = insertelement <4 x float> %59, float %1, i32 1
+  %61 = insertelement <4 x float> %60, float %2, i32 2
+  %62 = insertelement <4 x float> %61, float %3, i32 3
+  %63 = call float @llvm.AMDGPU.dp4(<4 x float> %58, <4 x float> %62)
+  %64 = call float @llvm.AMDIL.clamp.(float %8, float 0.000000e+00, float 1.000000e+00)
+  %65 = call float @llvm.AMDIL.clamp.(float %9, float 0.000000e+00, float 1.000000e+00)
+  %66 = call float @llvm.AMDIL.clamp.(float %10, float 0.000000e+00, float 1.000000e+00)
+  %67 = call float @llvm.AMDIL.clamp.(float %11, float 0.000000e+00, float 1.000000e+00)
+  %68 = insertelement <4 x float> undef, float %36, i32 0
+  %69 = insertelement <4 x float> %68, float %45, i32 1
+  %70 = insertelement <4 x float> %69, float %54, i32 2
+  %71 = insertelement <4 x float> %70, float %63, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %71, i32 60, i32 1)
+  %72 = insertelement <4 x float> undef, float %64, i32 0
+  %73 = insertelement <4 x float> %72, float %65, i32 1
+  %74 = insertelement <4 x float> %73, float %66, i32 2
+  %75 = insertelement <4 x float> %74, float %67, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %75, i32 0, i32 2)
+  %76 = insertelement <4 x float> undef, float %4, i32 0
+  %77 = insertelement <4 x float> %76, float %5, i32 1
+  %78 = insertelement <4 x float> %77, float %6, i32 2
+  %79 = insertelement <4 x float> %78, float %7, i32 3
+  call void @llvm.R600.store.swizzle(<4 x float> %79, i32 1, i32 2)
+  ret void
+}
+
+declare float @llvm.R600.load.input(i32) readnone
+
+declare float @llvm.AMDGPU.dp4(<4 x float>, <4 x float>) readnone
+
+declare float @llvm.AMDIL.clamp.(float, float, float) readnone
+
+declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
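
Editorial note (not part of the commit): the edge-stripping loop at the top of RecomputeScheduleDAGMI calls SU.removePred(SD) while iterating over SU.Preds, which mutates the container being traversed. Below is a minimal sketch of the same step written in a collect-then-remove form. It assumes the LLVM API of this era (ScheduleDAGMI, SUnit::Preds, SUnit::removePred, SDep::getKind) and simply strips the dependence kinds that the patch later recomputes; it is an illustration of the pattern, not the committed implementation.

// Sketch only: remove the order-like edges (here Output/Anti) from every
// SUnit without invalidating the iteration over SU.Preds.
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/ScheduleDAG.h"
#include <vector>

static void stripRecomputedDeps(llvm::ScheduleDAGMI *DAG) {
  for (unsigned i = 0, e = DAG->SUnits.size(); i != e; ++i) {
    llvm::SUnit &SU = DAG->SUnits[i];
    // First copy the edges to drop; removePred() modifies SU.Preds.
    std::vector<llvm::SDep> ToRemove;
    for (unsigned j = 0, pe = SU.Preds.size(); j != pe; ++j) {
      const llvm::SDep &SD = SU.Preds[j];
      if (SD.getKind() == llvm::SDep::Output ||
          SD.getKind() == llvm::SDep::Anti)
        ToRemove.push_back(SD);
    }
    // Then remove them in a separate pass.
    for (unsigned j = 0, re = ToRemove.size(); j != re; ++j)
      SU.removePred(ToRemove[j]);
  }
}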