diff options
author | Christian König <deathsimple@vodafone.de> | 2012-12-11 18:43:06 +0100 |
---|---|---|
committer | Tom Stellard <thomas.stellard@amd.com> | 2012-12-14 14:25:59 +0000 |
commit | f705444c37cad234394ba64bdf0008acb6664b1e (patch) | |
tree | 6eb26a0ede0f464357b8e4cffa97684089eea6a4 | |
parent | c527b5b6c03ea0d47a66b7523b716ea81c47cb2c (diff) |
R600: New control flow for SI v2
This patch replaces the control flow handling with a new
pass which structurizes the graph before transforming it to
machine instructions. This has a couple of different advantages
and currently fixes 20 piglit tests without a single regression.
It is now a general-purpose transformation that could be used not
only for SI/R6xx, but also for other hardware
implementations that use a form of structurized control flow.
v2: further cleanup, fixes and documentation
Signed-off-by: Christian König <deathsimple@vodafone.de>
Reviewed-by: Tom Stellard <thomas.stellard@amd.com>
Tested-by: Michel Dänzer <michel.daenzer@amd.com>
-rw-r--r-- | lib/Target/AMDGPU/AMDGPU.h | 3 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUStructurizeCFG.cpp | 732 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 13 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDILCFGStructurizer.cpp | 4 | ||||
-rw-r--r-- | lib/Target/AMDGPU/AMDILInstrInfo.td | 65 | ||||
-rw-r--r-- | lib/Target/AMDGPU/R600Instructions.td | 65 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 337 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIFixSGPRLiveness.cpp | 179 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIISelLowering.cpp | 118 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIISelLowering.h | 2 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIInstructions.td | 75 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SIIntrinsics.td | 10 | ||||
-rw-r--r-- | lib/Target/AMDGPU/SILowerControlFlow.cpp | 279 |
13 files changed, 1498 insertions, 384 deletions
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 40864b09dd..0f5125d39b 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -25,13 +25,14 @@ FunctionPass* createR600KernelParametersPass(const DataLayout *TD); FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); // SI Passes +FunctionPass *createSIAnnotateControlFlowPass(); FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); -FunctionPass *createSIFixSGPRLivenessPass(TargetMachine &tm); // Passes common to R600 and SI +Pass *createAMDGPUStructurizeCFGPass(); FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); } // End namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUStructurizeCFG.cpp b/lib/Target/AMDGPU/AMDGPUStructurizeCFG.cpp new file mode 100644 index 0000000000..686c3dccd3 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUStructurizeCFG.cpp @@ -0,0 +1,732 @@ +//===-- AMDGPUStructurizeCFG.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// The pass implemented in this file transforms the programs control flow +/// graph into a form that's suitable for code generation on hardware that +/// implements control flow by execution masking. This currently includes all +/// AMD GPUs but may as well be useful for other types of hardware. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/Module.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/Analysis/RegionIterator.h" +#include "llvm/Analysis/RegionInfo.h" +#include "llvm/Analysis/RegionPass.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; + +namespace { + +// Definition of the complex types used in this pass. + +typedef std::pair<BasicBlock *, Value *> BBValuePair; +typedef ArrayRef<BasicBlock*> BBVecRef; + +typedef SmallVector<RegionNode*, 8> RNVector; +typedef SmallVector<BasicBlock*, 8> BBVector; +typedef SmallVector<BBValuePair, 2> BBValueVector; + +typedef DenseMap<PHINode *, BBValueVector> PhiMap; +typedef DenseMap<BasicBlock *, PhiMap> BBPhiMap; +typedef DenseMap<BasicBlock *, Value *> BBPredicates; +typedef DenseMap<BasicBlock *, BBPredicates> PredMap; +typedef DenseMap<BasicBlock *, unsigned> VisitedMap; + +// The name for newly created blocks. + +static const char *FlowBlockName = "Flow"; + +/// @brief Transforms the control flow graph on one single entry/exit region +/// at a time. +/// +/// After the transform all "If"/"Then"/"Else" style control flow looks like +/// this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 | +/// | / +/// |/ +/// 3 +/// || Where: +/// | | 1 = "If" block, calculates the condition +/// 4 | 2 = "Then" subregion, runs if the condition is true +/// | / 3 = "Flow" blocks, newly inserted flow blocks, rejoins the flow +/// |/ 4 = "Else" optional subregion, runs if the condition is false +/// 5 5 = "End" block, also rejoins the control flow +/// \endverbatim +/// +/// Control flow is expressed as a branch where the true exit goes into the +/// "Then"/"Else" region, while the false exit skips the region +/// The condition for the optional "Else" region is expressed as a PHI node. +/// The incomming values of the PHI node are true for the "If" edge and false +/// for the "Then" edge. 
+/// +/// Additionally to that even complicated loops look like this: +/// +/// \verbatim +/// 1 +/// || +/// | | +/// 2 ^ Where: +/// | / 1 = "Entry" block +/// |/ 2 = "Loop" optional subregion, with all exits at "Flow" block +/// 3 3 = "Flow" block, with back edge to entry block +/// | +/// \endverbatim +/// +/// The back edge of the "Flow" block is always on the false side of the branch +/// while the true side continues the general flow. So the loop condition +/// consist of a network of PHI nodes where the true incoming values expresses +/// breaks and the false values expresses continue states. +class AMDGPUStructurizeCFG : public RegionPass { + + static char ID; + + Type *Boolean; + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + + Function *Func; + Region *ParentRegion; + + DominatorTree *DT; + + RNVector Order; + VisitedMap Visited; + PredMap Predicates; + BBPhiMap DeletedPhis; + BBVector FlowsInserted; + + BasicBlock *LoopStart; + BasicBlock *LoopEnd; + BBPredicates LoopPred; + + void orderNodes(); + + void buildPredicate(BranchInst *Term, unsigned Idx, + BBPredicates &Pred, bool Invert); + + void analyzeBlock(BasicBlock *BB); + + void analyzeLoop(BasicBlock *BB, unsigned &LoopIdx); + + void collectInfos(); + + bool dominatesPredicates(BasicBlock *A, BasicBlock *B); + + void killTerminator(BasicBlock *BB); + + RegionNode *skipChained(RegionNode *Node); + + void delPhiValues(BasicBlock *From, BasicBlock *To); + + void addPhiValues(BasicBlock *From, BasicBlock *To); + + BasicBlock *getNextFlow(BasicBlock *Prev); + + bool isPredictableTrue(BasicBlock *Prev, BasicBlock *Node); + + BasicBlock *wireFlowBlock(BasicBlock *Prev, RegionNode *Node); + + void createFlow(); + + void insertConditions(); + + void rebuildSSA(); + +public: + AMDGPUStructurizeCFG(): + RegionPass(ID) { + + initializeRegionInfoPass(*PassRegistry::getPassRegistry()); + } + + virtual bool doInitialization(Region *R, RGPassManager &RGM); + + virtual bool 
runOnRegion(Region *R, RGPassManager &RGM); + + virtual const char *getPassName() const { + return "AMDGPU simplify control flow"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const { + + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + RegionPass::getAnalysisUsage(AU); + } + +}; + +} // end anonymous namespace + +char AMDGPUStructurizeCFG::ID = 0; + +/// \brief Initialize the types and constants used in the pass +bool AMDGPUStructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) { + + LLVMContext &Context = R->getEntry()->getContext(); + + Boolean = Type::getInt1Ty(Context); + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + + return false; +} + +/// \brief Build up the general order of nodes +void AMDGPUStructurizeCFG::orderNodes() { + + scc_iterator<Region *> I = scc_begin(ParentRegion), + E = scc_end(ParentRegion); + for (Order.clear(); I != E; ++I) { + std::vector<RegionNode *> &Nodes = *I; + Order.append(Nodes.begin(), Nodes.end()); + } +} + +/// \brief Build blocks and loop predicates +void AMDGPUStructurizeCFG::buildPredicate(BranchInst *Term, unsigned Idx, + BBPredicates &Pred, bool Invert) { + + Value *True = Invert ? BoolFalse : BoolTrue; + Value *False = Invert ? BoolTrue : BoolFalse; + + RegionInfo *RI = ParentRegion->getRegionInfo(); + BasicBlock *BB = Term->getParent(); + + // Handle the case where multiple regions start at the same block + Region *R = BB != ParentRegion->getEntry() ? 
+ RI->getRegionFor(BB) : ParentRegion; + + if (R == ParentRegion) { + // It's a top level block in our region + Value *Cond = True; + if (Term->isConditional()) { + BasicBlock *Other = Term->getSuccessor(!Idx); + + if (Visited.count(Other)) { + if (!Pred.count(Other)) + Pred[Other] = False; + + if (!Pred.count(BB)) + Pred[BB] = True; + return; + } + Cond = Term->getCondition(); + + if (Idx != Invert) + Cond = BinaryOperator::CreateNot(Cond, "", Term); + } + + Pred[BB] = Cond; + + } else if (ParentRegion->contains(R)) { + // It's a block in a sub region + while(R->getParent() != ParentRegion) + R = R->getParent(); + + Pred[R->getEntry()] = True; + + } else { + // It's a branch from outside into our parent region + Pred[BB] = True; + } +} + +/// \brief Analyze the successors of each block and build up predicates +void AMDGPUStructurizeCFG::analyzeBlock(BasicBlock *BB) { + + pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + BBPredicates &Pred = Predicates[BB]; + + for (; PI != PE; ++PI) { + BranchInst *Term = cast<BranchInst>((*PI)->getTerminator()); + + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + if (Succ != BB) + continue; + buildPredicate(Term, i, Pred, false); + } + } +} + +/// \brief Analyze the conditions leading to loop to a previous block +void AMDGPUStructurizeCFG::analyzeLoop(BasicBlock *BB, unsigned &LoopIdx) { + + BranchInst *Term = cast<BranchInst>(BB->getTerminator()); + + for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) { + BasicBlock *Succ = Term->getSuccessor(i); + + // Ignore it if it's not a back edge + if (!Visited.count(Succ)) + continue; + + buildPredicate(Term, i, LoopPred, true); + + LoopEnd = BB; + if (Visited[Succ] < LoopIdx) { + LoopIdx = Visited[Succ]; + LoopStart = Succ; + } + } +} + +/// \brief Collect various loop and predicate infos +void AMDGPUStructurizeCFG::collectInfos() { + + unsigned Number = 0, LoopIdx = ~0; + + // Reset predicate + 
Predicates.clear(); + + // and loop infos + LoopStart = LoopEnd = 0; + LoopPred.clear(); + + RNVector::reverse_iterator OI = Order.rbegin(), OE = Order.rend(); + for (Visited.clear(); OI != OE; Visited[(*OI++)->getEntry()] = ++Number) { + + // Analyze all the conditions leading to a node + analyzeBlock((*OI)->getEntry()); + + if ((*OI)->isSubRegion()) + continue; + + // Find the first/last loop nodes and loop predicates + analyzeLoop((*OI)->getNodeAs<BasicBlock>(), LoopIdx); + } +} + +/// \brief Does A dominate all the predicates of B ? +bool AMDGPUStructurizeCFG::dominatesPredicates(BasicBlock *A, BasicBlock *B) { + + BBPredicates &Preds = Predicates[B]; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + if (!DT->dominates(A, PI->first)) + return false; + } + return true; +} + +/// \brief Remove phi values from all successors and the remove the terminator. +void AMDGPUStructurizeCFG::killTerminator(BasicBlock *BB) { + + TerminatorInst *Term = BB->getTerminator(); + if (!Term) + return; + + for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); + SI != SE; ++SI) { + + delPhiValues(BB, *SI); + } + + Term->eraseFromParent(); +} + +/// First: Skip forward to the first region node that either isn't a subregion or not +/// dominating it's exit, remove all the skipped nodes from the node order. 
+/// +/// Second: Handle the first successor directly if the resulting nodes successor +/// predicates are still dominated by the original entry +RegionNode *AMDGPUStructurizeCFG::skipChained(RegionNode *Node) { + + BasicBlock *Entry = Node->getEntry(); + + // Skip forward as long as it is just a linear flow + while (true) { + BasicBlock *Entry = Node->getEntry(); + BasicBlock *Exit; + + if (Node->isSubRegion()) { + Exit = Node->getNodeAs<Region>()->getExit(); + } else { + TerminatorInst *Term = Entry->getTerminator(); + if (Term->getNumSuccessors() != 1) + break; + Exit = Term->getSuccessor(0); + } + + // It's a back edge, break here so we can insert a loop node + if (!Visited.count(Exit)) + return Node; + + // More than node edges are pointing to exit + if (!DT->dominates(Entry, Exit)) + return Node; + + RegionNode *Next = ParentRegion->getNode(Exit); + RNVector::iterator I = std::find(Order.begin(), Order.end(), Next); + assert(I != Order.end()); + + Visited.erase(Next->getEntry()); + Order.erase(I); + Node = Next; + } + + BasicBlock *BB = Node->getEntry(); + TerminatorInst *Term = BB->getTerminator(); + if (Term->getNumSuccessors() != 2) + return Node; + + // Our node has exactly two succesors, check if we can handle + // any of them directly + BasicBlock *Succ = Term->getSuccessor(0); + if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) { + Succ = Term->getSuccessor(1); + if (!Visited.count(Succ) || !dominatesPredicates(Entry, Succ)) + return Node; + } else { + BasicBlock *Succ2 = Term->getSuccessor(1); + if (Visited.count(Succ2) && Visited[Succ] > Visited[Succ2] && + dominatesPredicates(Entry, Succ2)) + Succ = Succ2; + } + + RegionNode *Next = ParentRegion->getNode(Succ); + RNVector::iterator E = Order.end(); + RNVector::iterator I = std::find(Order.begin(), E, Next); + assert(I != E); + + killTerminator(BB); + FlowsInserted.push_back(BB); + Visited.erase(Succ); + Order.erase(I); + return ParentRegion->getNode(wireFlowBlock(BB, Next)); +} + +/// 
\brief Remove all PHI values coming from "From" into "To" and remember +/// them in DeletedPhis +void AMDGPUStructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) { + + PhiMap &Map = DeletedPhis[To]; + for (BasicBlock::iterator I = To->begin(), E = To->end(); + I != E && isa<PHINode>(*I);) { + + PHINode &Phi = cast<PHINode>(*I++); + while (Phi.getBasicBlockIndex(From) != -1) { + Value *Deleted = Phi.removeIncomingValue(From, false); + Map[&Phi].push_back(std::make_pair(From, Deleted)); + } + } +} + +/// \brief Add the PHI values back once we knew the new predecessor +void AMDGPUStructurizeCFG::addPhiValues(BasicBlock *From, BasicBlock *To) { + + if (!DeletedPhis.count(To)) + return; + + PhiMap &Map = DeletedPhis[To]; + SSAUpdater Updater; + + for (PhiMap::iterator I = Map.begin(), E = Map.end(); I != E; ++I) { + + PHINode *Phi = I->first; + Updater.Initialize(Phi->getType(), ""); + BasicBlock *Fallback = To; + bool HaveFallback = false; + + for (BBValueVector::iterator VI = I->second.begin(), VE = I->second.end(); + VI != VE; ++VI) { + + Updater.AddAvailableValue(VI->first, VI->second); + BasicBlock *Dom = DT->findNearestCommonDominator(Fallback, VI->first); + if (Dom == VI->first) + HaveFallback = true; + else if (Dom != Fallback) + HaveFallback = false; + Fallback = Dom; + } + if (!HaveFallback) { + Value *Undef = UndefValue::get(Phi->getType()); + Updater.AddAvailableValue(Fallback, Undef); + } + + Phi->addIncoming(Updater.GetValueAtEndOfBlock(From), From); + } + DeletedPhis.erase(To); +} + +/// \brief Create a new flow node and update dominator tree and region info +BasicBlock *AMDGPUStructurizeCFG::getNextFlow(BasicBlock *Prev) { + + LLVMContext &Context = Func->getContext(); + BasicBlock *Insert = Order.empty() ? 
ParentRegion->getExit() : + Order.back()->getEntry(); + BasicBlock *Flow = BasicBlock::Create(Context, FlowBlockName, + Func, Insert); + DT->addNewBlock(Flow, Prev); + ParentRegion->getRegionInfo()->setRegionFor(Flow, ParentRegion); + FlowsInserted.push_back(Flow); + return Flow; +} + +/// \brief Can we predict that this node will always be called? +bool AMDGPUStructurizeCFG::isPredictableTrue(BasicBlock *Prev, + BasicBlock *Node) { + + BBPredicates &Preds = Predicates[Node]; + bool Dominated = false; + + for (BBPredicates::iterator I = Preds.begin(), E = Preds.end(); + I != E; ++I) { + + if (I->second != BoolTrue) + return false; + + if (!Dominated && DT->dominates(I->first, Prev)) + Dominated = true; + } + return Dominated; +} + +/// \brief Wire up the new control flow by inserting or updating the branch +/// instructions at node exits +BasicBlock *AMDGPUStructurizeCFG::wireFlowBlock(BasicBlock *Prev, + RegionNode *Node) { + + BasicBlock *Entry = Node->getEntry(); + + if (LoopStart == Entry) { + LoopStart = Prev; + LoopPred[Prev] = BoolTrue; + } + + // Wire it up temporary, skipChained may recurse into us + BranchInst::Create(Entry, Prev); + DT->changeImmediateDominator(Entry, Prev); + addPhiValues(Prev, Entry); + + Node = skipChained(Node); + + BasicBlock *Next = getNextFlow(Prev); + if (!isPredictableTrue(Prev, Entry)) { + // Let Prev point to entry and next block + Prev->getTerminator()->eraseFromParent(); + BranchInst::Create(Entry, Next, BoolUndef, Prev); + } else { + DT->changeImmediateDominator(Next, Entry); + } + + // Let node exit(s) point to next block + if (Node->isSubRegion()) { + Region *SubRegion = Node->getNodeAs<Region>(); + BasicBlock *Exit = SubRegion->getExit(); + + // Find all the edges from the sub region to the exit + BBVector ToDo; + for (pred_iterator I = pred_begin(Exit), E = pred_end(Exit); I != E; ++I) { + if (SubRegion->contains(*I)) + ToDo.push_back(*I); + } + + // Modify the edges to point to the new flow block + for 
(BBVector::iterator I = ToDo.begin(), E = ToDo.end(); I != E; ++I) { + delPhiValues(*I, Exit); + TerminatorInst *Term = (*I)->getTerminator(); + Term->replaceUsesOfWith(Exit, Next); + } + + // Update the region info + SubRegion->replaceExit(Next); + + } else { + BasicBlock *BB = Node->getNodeAs<BasicBlock>(); + killTerminator(BB); + BranchInst::Create(Next, BB); + + if (BB == LoopEnd) + LoopEnd = 0; + } + + return Next; +} + +/// Destroy node order and visited map, build up flow order instead. +/// After this function control flow looks like it should be, but +/// branches only have undefined conditions. +void AMDGPUStructurizeCFG::createFlow() { + + DeletedPhis.clear(); + + BasicBlock *Prev = Order.pop_back_val()->getEntry(); + assert(Prev == ParentRegion->getEntry() && "Incorrect node order!"); + Visited.erase(Prev); + + if (LoopStart == Prev) { + // Loop starts at entry, split entry so that we can predicate it + BasicBlock::iterator Insert = Prev->getFirstInsertionPt(); + BasicBlock *Split = Prev->splitBasicBlock(Insert, FlowBlockName); + DT->addNewBlock(Split, Prev); + ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); + Predicates[Split] = Predicates[Prev]; + Order.push_back(ParentRegion->getBBNode(Split)); + LoopPred[Prev] = BoolTrue; + + } else if (LoopStart == Order.back()->getEntry()) { + // Loop starts behind entry, split entry so that we can jump to it + Instruction *Term = Prev->getTerminator(); + BasicBlock *Split = Prev->splitBasicBlock(Term, FlowBlockName); + DT->addNewBlock(Split, Prev); + ParentRegion->getRegionInfo()->setRegionFor(Split, ParentRegion); + Prev = Split; + } + + killTerminator(Prev); + FlowsInserted.clear(); + FlowsInserted.push_back(Prev); + + while (!Order.empty()) { + RegionNode *Node = Order.pop_back_val(); + Visited.erase(Node->getEntry()); + Prev = wireFlowBlock(Prev, Node); + if (LoopStart && !LoopEnd) { + // Create an extra loop end node + LoopEnd = Prev; + Prev = getNextFlow(LoopEnd); + 
BranchInst::Create(Prev, LoopStart, BoolUndef, LoopEnd); + addPhiValues(LoopEnd, LoopStart); + } + } + + BasicBlock *Exit = ParentRegion->getExit(); + BranchInst::Create(Exit, Prev); + addPhiValues(Prev, Exit); + if (DT->dominates(ParentRegion->getEntry(), Exit)) + DT->changeImmediateDominator(Exit, Prev); + + if (LoopStart && LoopEnd) { + BBVector::iterator FI = std::find(FlowsInserted.begin(), + FlowsInserted.end(), + LoopStart); + for (; *FI != LoopEnd; ++FI) { + addPhiValues(*FI, (*FI)->getTerminator()->getSuccessor(0)); + } + } + + assert(Order.empty()); + assert(Visited.empty()); + assert(DeletedPhis.empty()); +} + +/// \brief Insert the missing branch conditions +void AMDGPUStructurizeCFG::insertConditions() { + + SSAUpdater PhiInserter; + + for (BBVector::iterator FI = FlowsInserted.begin(), FE = FlowsInserted.end(); + FI != FE; ++FI) { + + BranchInst *Term = cast<BranchInst>((*FI)->getTerminator()); + if (Term->isUnconditional()) + continue; + + PhiInserter.Initialize(Boolean, ""); + PhiInserter.AddAvailableValue(&Func->getEntryBlock(), BoolFalse); + + BasicBlock *Succ = Term->getSuccessor(0); + BBPredicates &Preds = (*FI == LoopEnd) ? LoopPred : Predicates[Succ]; + for (BBPredicates::iterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + + PhiInserter.AddAvailableValue(PI->first, PI->second); + } + + Term->setCondition(PhiInserter.GetValueAtEndOfBlock(*FI)); + } +} + +/// Handle a rare case where the disintegrated nodes instructions +/// no longer dominate all their uses. 
Not sure if this is really nessasary +void AMDGPUStructurizeCFG::rebuildSSA() { + + SSAUpdater Updater; + for (Region::block_iterator I = ParentRegion->block_begin(), + E = ParentRegion->block_end(); + I != E; ++I) { + + BasicBlock *BB = *I; + for (BasicBlock::iterator II = BB->begin(), IE = BB->end(); + II != IE; ++II) { + + bool Initialized = false; + for (Use *I = &II->use_begin().getUse(), *Next; I; I = Next) { + + Next = I->getNext(); + + Instruction *User = cast<Instruction>(I->getUser()); + if (User->getParent() == BB) { + continue; + + } else if (PHINode *UserPN = dyn_cast<PHINode>(User)) { + if (UserPN->getIncomingBlock(*I) == BB) + continue; + } + + if (DT->dominates(II, User)) + continue; + + if (!Initialized) { + Value *Undef = UndefValue::get(II->getType()); + Updater.Initialize(II->getType(), ""); + Updater.AddAvailableValue(&Func->getEntryBlock(), Undef); + Updater.AddAvailableValue(BB, II); + Initialized = true; + } + Updater.RewriteUseAfterInsertions(*I); + } + } + } +} + +/// \brief Run the transformation for each region found +bool AMDGPUStructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) { + + if (R->isTopLevelRegion()) + return false; + + Func = R->getEntry()->getParent(); + ParentRegion = R; + + DT = &getAnalysis<DominatorTree>(); + + orderNodes(); + collectInfos(); + createFlow(); + insertConditions(); + rebuildSSA(); + + Order.clear(); + Visited.clear(); + Predicates.clear(); + DeletedPhis.clear(); + FlowsInserted.clear(); + + return true; +} + +/// \brief Create the pass +Pass *llvm::createAMDGPUStructurizeCFGPass() { + return new AMDGPUStructurizeCFG(); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index e42fa8abb2..098d42e79a 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -91,6 +91,11 @@ TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { bool AMDGPUPassConfig::addPreISel() { + const 
AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createAMDGPUStructurizeCFGPass()); + addPass(createSIAnnotateControlFlowPass()); + } return false; } @@ -107,9 +112,6 @@ bool AMDGPUPassConfig::addPreRegAlloc() { addPass(createSIAssignInterpRegsPass(*TM)); } addPass(createAMDGPUConvertToISAPass(*TM)); - if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { - addPass(createSIFixSGPRLivenessPass(*TM)); - } return false; } @@ -124,11 +126,10 @@ bool AMDGPUPassConfig::addPreSched2() { } bool AMDGPUPassConfig::addPreEmitPass() { - addPass(createAMDGPUCFGPreparationPass(*TM)); - addPass(createAMDGPUCFGStructurizerPass(*TM)); - const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + addPass(createAMDGPUCFGPreparationPass(*TM)); + addPass(createAMDGPUCFGStructurizerPass(*TM)); addPass(createR600ExpandSpecialInstrsPass(*TM)); addPass(&FinalizeMachineBundlesID); } else { diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp index 1f276dc570..568d281e63 100644 --- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp +++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp @@ -2596,7 +2596,6 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> { case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; case AMDGPU::BRANCH_COND_i32: case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; - case AMDGPU::SI_IF_NZ: return AMDGPU::SI_IF_NZ; default: assert(0 && "internal error"); } @@ -2608,7 +2607,6 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> { case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; case AMDGPU::BRANCH_COND_i32: case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; - case AMDGPU::SI_IF_Z: return AMDGPU::SI_IF_Z; default: assert(0 && "internal error"); } @@ -2658,8 +2656,6 @@ struct CFGStructTraits<AMDGPUCFGStructurizer> { return 
instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0; case AMDGPU::BRANCH_COND_i32: case AMDGPU::BRANCH_COND_f32: - case AMDGPU::SI_IF_NZ: - case AMDGPU::SI_IF_Z: break; default: return false; diff --git a/lib/Target/AMDGPU/AMDILInstrInfo.td b/lib/Target/AMDGPU/AMDILInstrInfo.td index ac6745148e..e969bbf8ca 100644 --- a/lib/Target/AMDGPU/AMDILInstrInfo.td +++ b/lib/Target/AMDGPU/AMDILInstrInfo.td @@ -206,68 +206,3 @@ multiclass BranchInstr2<string name> { // Intrinsics support //===--------------------------------------------------------------------===// include "AMDILIntrinsics.td" - -//===--------------------------------------------------------------------===// -// Instructions support -//===--------------------------------------------------------------------===// -//===---------------------------------------------------------------------===// -// Custom Inserter for Branches and returns, this eventually will be a -// seperate pass -//===---------------------------------------------------------------------===// -let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { - def BRANCH : ILFormat<(outs), (ins brtarget:$target), - "; Pseudo unconditional branch instruction", - [(br bb:$target)]>; - defm BRANCH_COND : BranchConditional<IL_brcond>; -} - -//===---------------------------------------------------------------------===// -// Flow and Program control Instructions -//===---------------------------------------------------------------------===// -let isTerminator=1 in { - def SWITCH : ILFormat< (outs), (ins GPRI32:$src), - !strconcat("SWITCH", " $src"), []>; - def CASE : ILFormat< (outs), (ins GPRI32:$src), - !strconcat("CASE", " $src"), []>; - def BREAK : ILFormat< (outs), (ins), - "BREAK", []>; - def CONTINUE : ILFormat< (outs), (ins), - "CONTINUE", []>; - def DEFAULT : ILFormat< (outs), (ins), - "DEFAULT", []>; - def ELSE : ILFormat< (outs), (ins), - "ELSE", []>; - def ENDSWITCH : ILFormat< (outs), (ins), - "ENDSWITCH", []>; 
- def ENDMAIN : ILFormat< (outs), (ins), - "ENDMAIN", []>; - def END : ILFormat< (outs), (ins), - "END", []>; - def ENDFUNC : ILFormat< (outs), (ins), - "ENDFUNC", []>; - def ENDIF : ILFormat< (outs), (ins), - "ENDIF", []>; - def WHILELOOP : ILFormat< (outs), (ins), - "WHILE", []>; - def ENDLOOP : ILFormat< (outs), (ins), - "ENDLOOP", []>; - def FUNC : ILFormat< (outs), (ins), - "FUNC", []>; - def RETDYN : ILFormat< (outs), (ins), - "RET_DYN", []>; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; - // This opcode has custom swizzle pattern encoded in Swizzle Encoder - defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; - defm IFC : BranchInstr2<"IFC">; - defm BREAKC : BranchInstr2<"BREAKC">; - defm CONTINUEC : BranchInstr2<"CONTINUEC">; -} diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td index 105822066c..5900794421 100644 --- a/lib/Target/AMDGPU/R600Instructions.td +++ b/lib/Target/AMDGPU/R600Instructions.td @@ -1545,6 +1545,71 @@ let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in { "RETURN", [(IL_retflag)]>; } +//===--------------------------------------------------------------------===// +// Instructions support +//===--------------------------------------------------------------------===// +//===---------------------------------------------------------------------===// +// Custom Inserter for Branches and returns, this 
eventually will be a +// seperate pass +//===---------------------------------------------------------------------===// +let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in { + def BRANCH : ILFormat<(outs), (ins brtarget:$target), + "; Pseudo unconditional branch instruction", + [(br bb:$target)]>; + defm BRANCH_COND : BranchConditional<IL_brcond>; +} + +//===---------------------------------------------------------------------===// +// Flow and Program control Instructions +//===---------------------------------------------------------------------===// +let isTerminator=1 in { + def SWITCH : ILFormat< (outs), (ins GPRI32:$src), + !strconcat("SWITCH", " $src"), []>; + def CASE : ILFormat< (outs), (ins GPRI32:$src), + !strconcat("CASE", " $src"), []>; + def BREAK : ILFormat< (outs), (ins), + "BREAK", []>; + def CONTINUE : ILFormat< (outs), (ins), + "CONTINUE", []>; + def DEFAULT : ILFormat< (outs), (ins), + "DEFAULT", []>; + def ELSE : ILFormat< (outs), (ins), + "ELSE", []>; + def ENDSWITCH : ILFormat< (outs), (ins), + "ENDSWITCH", []>; + def ENDMAIN : ILFormat< (outs), (ins), + "ENDMAIN", []>; + def END : ILFormat< (outs), (ins), + "END", []>; + def ENDFUNC : ILFormat< (outs), (ins), + "ENDFUNC", []>; + def ENDIF : ILFormat< (outs), (ins), + "ENDIF", []>; + def WHILELOOP : ILFormat< (outs), (ins), + "WHILE", []>; + def ENDLOOP : ILFormat< (outs), (ins), + "ENDLOOP", []>; + def FUNC : ILFormat< (outs), (ins), + "FUNC", []>; + def RETDYN : ILFormat< (outs), (ins), + "RET_DYN", []>; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm BREAK_LOGICALZ : 
BranchInstr<"BREAK_LOGICALZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">; + // This opcode has custom swizzle pattern encoded in Swizzle Encoder + defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">; + defm IFC : BranchInstr2<"IFC">; + defm BREAKC : BranchInstr2<"BREAKC">; + defm CONTINUEC : BranchInstr2<"CONTINUEC">; +} + //===----------------------------------------------------------------------===// // ISel Patterns //===----------------------------------------------------------------------===// diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp new file mode 100644 index 0000000000..d13183557d --- /dev/null +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -0,0 +1,337 @@ +//===-- SIAnnotateControlFlow.cpp - ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Annotates the control flow with hardware specific intrinsics. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" + +#include "llvm/Pass.h" +#include "llvm/Module.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/Transforms/Utils/SSAUpdater.h" + +using namespace llvm; + +namespace { + +// Complex types used in this pass +typedef std::pair<BasicBlock *, Value *> StackEntry; +typedef SmallVector<StackEntry, 16> StackVector; + +// Intrinsic names the control flow is annotated with +static const char *IfIntrinsic = "llvm.SI.if"; +static const char *ElseIntrinsic = "llvm.SI.else"; +static const char *BreakIntrinsic = "llvm.SI.break"; +static const char *IfBreakIntrinsic = "llvm.SI.if.break"; +static const char *ElseBreakIntrinsic = "llvm.SI.else.break"; +static const char *LoopIntrinsic = "llvm.SI.loop"; +static const char *EndCfIntrinsic = "llvm.SI.end.cf"; + +class SIAnnotateControlFlow : public FunctionPass { + + static char ID; + + Type *Boolean; + Type *Void; + Type *Int64; + Type *ReturnStruct; + + ConstantInt *BoolTrue; + ConstantInt *BoolFalse; + UndefValue *BoolUndef; + Constant *Int64Zero; + + Constant *If; + Constant *Else; + Constant *Break; + Constant *IfBreak; + Constant *ElseBreak; + Constant *Loop; + Constant *EndCf; + + DominatorTree *DT; + StackVector Stack; + SSAUpdater PhiInserter; + + bool isTopOfStack(BasicBlock *BB); + + Value *popSaved(); + + void push(BasicBlock *BB, Value *Saved); + + bool isElse(PHINode *Phi); + + void eraseIfUnused(PHINode *Phi); + + void openIf(BranchInst *Term); + + void insertElse(BranchInst *Term); + + void handleLoopCondition(Value *Cond); + + void handleLoop(BranchInst *Term); + + void closeControlFlow(BasicBlock *BB); + +public: + SIAnnotateControlFlow(): + FunctionPass(ID) { } + + virtual bool doInitialization(Module &M); + + virtual bool runOnFunction(Function &F); + + virtual const char *getPassName() const { + return 
"SI annotate control flow"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + + AU.addRequired<DominatorTree>(); + AU.addPreserved<DominatorTree>(); + FunctionPass::getAnalysisUsage(AU); + } + +}; + +} // end anonymous namespace + +char SIAnnotateControlFlow::ID = 0; + +/// \brief Initialize all the types and constants used in the pass +bool SIAnnotateControlFlow::doInitialization(Module &M) { + + LLVMContext &Context = M.getContext(); + + Void = Type::getVoidTy(Context); + Boolean = Type::getInt1Ty(Context); + Int64 = Type::getInt64Ty(Context); + ReturnStruct = StructType::get(Boolean, Int64, (Type *)0); + + BoolTrue = ConstantInt::getTrue(Context); + BoolFalse = ConstantInt::getFalse(Context); + BoolUndef = UndefValue::get(Boolean); + Int64Zero = ConstantInt::get(Int64, 0); + + If = M.getOrInsertFunction( + IfIntrinsic, ReturnStruct, Boolean, (Type *)0); + + Else = M.getOrInsertFunction( + ElseIntrinsic, ReturnStruct, Int64, (Type *)0); + + Break = M.getOrInsertFunction( + BreakIntrinsic, Int64, Int64, (Type *)0); + + IfBreak = M.getOrInsertFunction( + IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)0); + + ElseBreak = M.getOrInsertFunction( + ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)0); + + Loop = M.getOrInsertFunction( + LoopIntrinsic, Boolean, Int64, (Type *)0); + + EndCf = M.getOrInsertFunction( + EndCfIntrinsic, Void, Int64, (Type *)0); + + return false; +} + +/// \brief Is BB the last block saved on the stack ? +bool SIAnnotateControlFlow::isTopOfStack(BasicBlock *BB) { + return Stack.back().first == BB; +} + +/// \brief Pop the last saved value from the control flow stack +Value *SIAnnotateControlFlow::popSaved() { + return Stack.pop_back_val().second; +} + +/// \brief Push a BB and saved value to the control flow stack +void SIAnnotateControlFlow::push(BasicBlock *BB, Value *Saved) { + Stack.push_back(std::make_pair(BB, Saved)); +} + +/// \brief Can the condition represented by this PHI node treated like +/// an "Else" block? 
+bool SIAnnotateControlFlow::isElse(PHINode *Phi) { + + BasicBlock *IDom = DT->getNode(Phi->getParent())->getIDom()->getBlock(); + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + if (Phi->getIncomingBlock(i) == IDom) { + + if (Phi->getIncomingValue(i) != BoolTrue) + return false; + + } else { + if (Phi->getIncomingValue(i) != BoolFalse) + return false; + + } + } + return true; +} + +// \brief Erase "Phi" if it is not used any more +void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) { + if (!Phi->hasNUsesOrMore(1)) + Phi->eraseFromParent(); +} + +/// \brief Open a new "If" block +void SIAnnotateControlFlow::openIf(BranchInst *Term) { + Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Close the last "If" block and open a new "Else" block +void SIAnnotateControlFlow::insertElse(BranchInst *Term) { + Value *Ret = CallInst::Create(Else, popSaved(), "", Term); + Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term)); + push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term)); +} + +/// \brief Recursively handle the condition leading to a loop +void SIAnnotateControlFlow::handleLoopCondition(Value *Cond) { + + if (PHINode *Phi = dyn_cast<PHINode>(Cond)) { + + // Handle all non constant incoming values first + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + Value *Incoming = Phi->getIncomingValue(i); + if (isa<ConstantInt>(Incoming)) + continue; + + Phi->setIncomingValue(i, BoolFalse); + handleLoopCondition(Incoming); + } + + BasicBlock *Parent = Phi->getParent(); + BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock(); + + for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) { + + Value *Incoming = Phi->getIncomingValue(i); + if (Incoming != BoolTrue) + continue; + + BasicBlock *From = 
Phi->getIncomingBlock(i); + if (From == IDom) { + CallInst *OldEnd = dyn_cast<CallInst>(Parent->getFirstInsertionPt()); + if (OldEnd && OldEnd->getCalledFunction() == EndCf) { + Value *Args[] = { + OldEnd->getArgOperand(0), + PhiInserter.GetValueAtEndOfBlock(Parent) + }; + Value *Ret = CallInst::Create(ElseBreak, Args, "", OldEnd); + PhiInserter.AddAvailableValue(Parent, Ret); + continue; + } + } + + TerminatorInst *Insert = From->getTerminator(); + Value *Arg = PhiInserter.GetValueAtEndOfBlock(From); + Value *Ret = CallInst::Create(Break, Arg, "", Insert); + PhiInserter.AddAvailableValue(From, Ret); + } + eraseIfUnused(Phi); + + } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) { + BasicBlock *Parent = Inst->getParent(); + TerminatorInst *Insert = Parent->getTerminator(); + Value *Args[] = { Cond, PhiInserter.GetValueAtEndOfBlock(Parent) }; + Value *Ret = CallInst::Create(IfBreak, Args, "", Insert); + PhiInserter.AddAvailableValue(Parent, Ret); + + } else { + assert(0 && "Unhandled loop condition!"); + } +} + +/// \brief Handle a back edge (loop) +void SIAnnotateControlFlow::handleLoop(BranchInst *Term) { + + BasicBlock *Target = Term->getSuccessor(1); + PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front()); + + PhiInserter.Initialize(Int64, ""); + PhiInserter.AddAvailableValue(Target, Broken); + + Value *Cond = Term->getCondition(); + Term->setCondition(BoolTrue); + handleLoopCondition(Cond); + + BasicBlock *BB = Term->getParent(); + Value *Arg = PhiInserter.GetValueAtEndOfBlock(BB); + for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target); + PI != PE; ++PI) { + + Broken->addIncoming(*PI == BB ? 
Arg : Int64Zero, *PI); + } + + Term->setCondition(CallInst::Create(Loop, Arg, "", Term)); + push(Term->getSuccessor(0), Arg); +} + +/// \brief Close the last opened control flow +void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { + CallInst::Create(EndCf, popSaved(), "", BB->getFirstInsertionPt()); +} + +/// \brief Annotate the control flow with intrinsics so the backend can +/// recognize if/then/else and loops. +bool SIAnnotateControlFlow::runOnFunction(Function &F) { + + DT = &getAnalysis<DominatorTree>(); + + for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()), + E = df_end(&F.getEntryBlock()); I != E; ++I) { + + BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator()); + + if (!Term || Term->isUnconditional()) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + continue; + } + + if (I.nodeVisited(Term->getSuccessor(1))) { + if (isTopOfStack(*I)) + closeControlFlow(*I); + handleLoop(Term); + continue; + } + + if (isTopOfStack(*I)) { + PHINode *Phi = dyn_cast<PHINode>(Term->getCondition()); + if (Phi && Phi->getParent() == *I && isElse(Phi)) { + insertElse(Term); + eraseIfUnused(Phi); + continue; + } + closeControlFlow(*I); + } + openIf(Term); + } + + assert(Stack.empty()); + return true; +} + +/// \brief Create the annotation pass +FunctionPass *llvm::createSIAnnotateControlFlowPass() { + + return new SIAnnotateControlFlow(); +} diff --git a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp b/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp deleted file mode 100644 index 0fecd7a28e..0000000000 --- a/lib/Target/AMDGPU/SIFixSGPRLiveness.cpp +++ /dev/null @@ -1,179 +0,0 @@ -//===-- SIFixSGPRLiveness.cpp - SGPR liveness adjustment ------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// -// -/// \file -/// -/// SGPRs are not affected by control flow. This pass adjusts SGPR liveness in -/// so that the register allocator can still correctly allocate them. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/MachineDominators.h" -#include "llvm/CodeGen/MachinePostDominators.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" - -using namespace llvm; - -namespace { - -class SIFixSGPRLiveness : public MachineFunctionPass { -private: - static char ID; - - const TargetInstrInfo *TII; - MachineRegisterInfo *MRI; - MachineDominatorTree *MD; - MachinePostDominatorTree *MPD; - - bool isSGPR(const TargetRegisterClass *RegClass) { - return RegClass == &AMDGPU::SReg_1RegClass || - RegClass == &AMDGPU::SReg_32RegClass || - RegClass == &AMDGPU::SReg_64RegClass || - RegClass == &AMDGPU::SReg_128RegClass || - RegClass == &AMDGPU::SReg_256RegClass; - } - - void addKill(MachineBasicBlock::iterator I, unsigned Reg); - MachineBasicBlock *handleUses(unsigned VirtReg, MachineBasicBlock *Begin); - void handlePreds(MachineBasicBlock *Begin, MachineBasicBlock *End, - unsigned VirtReg); - - bool handleVirtReg(unsigned VirtReg); - -public: - SIFixSGPRLiveness(TargetMachine &tm); - - virtual bool runOnMachineFunction(MachineFunction &MF); - - virtual const char *getPassName() const { - return "SI fix SGPR liveness pass"; - } - - virtual void getAnalysisUsage(AnalysisUsage &AU) const; -}; - -} // end anonymous namespace - -char SIFixSGPRLiveness::ID = 0; - -SIFixSGPRLiveness::SIFixSGPRLiveness(TargetMachine &tm): - MachineFunctionPass(ID), - TII(tm.getInstrInfo()) { - initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); -} - -void SIFixSGPRLiveness::getAnalysisUsage(AnalysisUsage &AU) const { - 
AU.addRequired<MachineDominatorTree>(); - AU.addRequired<MachinePostDominatorTree>(); - AU.setPreservesCFG(); - MachineFunctionPass::getAnalysisUsage(AU); -} - -void SIFixSGPRLiveness::addKill(MachineBasicBlock::iterator I, unsigned Reg) { - MachineBasicBlock *MBB = I->getParent(); - - BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL)).addReg(Reg); -} - -// Find the common post dominator of all uses -MachineBasicBlock *SIFixSGPRLiveness::handleUses(unsigned VirtReg, - MachineBasicBlock *Begin) { - MachineBasicBlock *LastUse = Begin, *End = Begin; - bool EndUsesReg = true; - - MachineRegisterInfo::use_iterator i, e; - for (i = MRI->use_begin(VirtReg), e = MRI->use_end(); i != e; ++i) { - MachineBasicBlock *MBB = i->getParent(); - if (LastUse == MBB) - continue; - - LastUse = MBB; - MBB = MPD->findNearestCommonDominator(End, MBB); - - if (MBB == LastUse) - EndUsesReg = true; - else if (MBB != End) - EndUsesReg = false; - - End = MBB; - } - - return EndUsesReg ? Begin : End; -} - -// Handles predecessors separately, only add KILLs to dominated ones -void SIFixSGPRLiveness::handlePreds(MachineBasicBlock *Begin, - MachineBasicBlock *End, - unsigned VirtReg) { - MachineBasicBlock::pred_iterator i, e; - for (i = End->pred_begin(), e = End->pred_end(); i != e; ++i) { - - if (MD->dominates(End, *i)) - continue; // ignore loops - - if (MD->dominates(*i, Begin)) - continue; // too far up, abort search - - if (MD->dominates(Begin, *i)) { - // found end of livetime - addKill((*i)->getFirstTerminator(), VirtReg); - continue; - } - - handlePreds(Begin, *i, VirtReg); - } -} - -bool SIFixSGPRLiveness::handleVirtReg(unsigned VirtReg) { - - MachineInstr *Def = MRI->getVRegDef(VirtReg); - if (!Def || MRI->use_empty(VirtReg)) - return false; // No definition or not used - - MachineBasicBlock *Begin = Def->getParent(); - MachineBasicBlock *End = handleUses(VirtReg, Begin); - if (Begin == End) - return false; // Defined and only used in the same block - - if (MD->dominates(Begin, 
End)) { - // Lifetime dominate the end node, just kill it here - addKill(End->getFirstNonPHI(), VirtReg); - } else { - // only some predecessors are dominate, handle them separately - handlePreds(Begin, End, VirtReg); - } - return true; -} - -bool SIFixSGPRLiveness::runOnMachineFunction(MachineFunction &MF) { - bool Changes = false; - - MRI = &MF.getRegInfo(); - MD = &getAnalysis<MachineDominatorTree>(); - MPD = &getAnalysis<MachinePostDominatorTree>(); - - for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { - unsigned VirtReg = TargetRegisterInfo::index2VirtReg(i); - - const TargetRegisterClass *RegClass = MRI->getRegClass(VirtReg); - if (!isSGPR(RegClass)) - continue; - - Changes |= handleVirtReg(VirtReg); - } - - return Changes; -} - -FunctionPass *llvm::createSIFixSGPRLivenessPass(TargetMachine &tm) { - return new SIFixSGPRLiveness(tm); -} diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 292ce850f7..cd6e0e9916 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -44,8 +44,6 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) : setOperationAction(ISD::ADD, MVT::i64, Legal); setOperationAction(ISD::ADD, MVT::i32, Legal); - setOperationAction(ISD::BR_CC, MVT::i32, Custom); - setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); // We need to custom lower loads from the USER_SGPR address space, so we can @@ -254,7 +252,7 @@ EVT SITargetLowering::getSetCCResultType(EVT VT) const { SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { switch (Op.getOpcode()) { default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); - case ISD::BR_CC: return LowerBR_CC(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND); @@ -298,27 +296,99 @@ SDValue 
SITargetLowering::Loweri1ContextSwitch(SDValue Op, return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode); } -SDValue SITargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { - SDValue Chain = Op.getOperand(0); - SDValue CC = Op.getOperand(1); - SDValue LHS = Op.getOperand(2); - SDValue RHS = Op.getOperand(3); - SDValue JumpT = Op.getOperand(4); - SDValue CmpValue; - SDValue Result; - CmpValue = DAG.getNode( - ISD::SETCC, - Op.getDebugLoc(), - MVT::i1, - LHS, RHS, - CC); - - Result = DAG.getNode( - AMDGPUISD::BRANCH_COND, - CmpValue.getDebugLoc(), - MVT::Other, Chain, - JumpT, CmpValue); - return Result; +/// \brief Helper function for LowerBRCOND +static SDNode *findUser(SDValue Value, unsigned Opcode) { + + SDNode *Parent = Value.getNode(); + for (SDNode::use_iterator I = Parent->use_begin(), E = Parent->use_end(); + I != E; ++I) { + + if (I.getUse().get() != Value) + continue; + + if (I->getOpcode() == Opcode) + return *I; + } + return 0; +} + +/// This transforms the control flow intrinsics to get the branch destination as +/// last parameter, also switches branch target with BR if the need arise +SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND, + SelectionDAG &DAG) const { + + DebugLoc DL = BRCOND.getDebugLoc(); + + SDNode *Intr = BRCOND.getOperand(1).getNode(); + SDValue Target = BRCOND.getOperand(2); + SDNode *BR = 0; + + if (Intr->getOpcode() == ISD::SETCC) { + // As long as we negate the condition everything is fine + SDNode *SetCC = Intr; + assert(SetCC->getConstantOperandVal(1) == 1); + + CondCodeSDNode *CC = cast<CondCodeSDNode>(SetCC->getOperand(2).getNode()); + assert(CC->get() == ISD::SETNE); + Intr = SetCC->getOperand(0).getNode(); + + } else { + // Get the target from BR if we don't negate the condition + BR = findUser(BRCOND, ISD::BR); + Target = BR->getOperand(1); + } + + assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN); + + // Build the result and + SmallVector<EVT, 4> Res; + for (unsigned i = 1, e = Intr->getNumValues(); 
i != e; ++i) + Res.push_back(Intr->getValueType(i)); + + // operands of the new intrinsic call + SmallVector<SDValue, 4> Ops; + Ops.push_back(BRCOND.getOperand(0)); + for (unsigned i = 1, e = Intr->getNumOperands(); i != e; ++i) + Ops.push_back(Intr->getOperand(i)); + Ops.push_back(Target); + + // build the new intrinsic call + SDNode *Result = DAG.getNode( + Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL, + DAG.getVTList(Res.data(), Res.size()), Ops.data(), Ops.size()).getNode(); + + if (BR) { + // Give the branch instruction our target + SDValue Ops[] = { + BR->getOperand(0), + BRCOND.getOperand(2) + }; + DAG.MorphNodeTo(BR, ISD::BR, BR->getVTList(), Ops, 2); + } + + SDValue Chain = SDValue(Result, Result->getNumValues() - 1); + + // Copy the intrinsic results to registers + for (unsigned i = 1, e = Intr->getNumValues() - 1; i != e; ++i) { + SDNode *CopyToReg = findUser(SDValue(Intr, i), ISD::CopyToReg); + if (!CopyToReg) + continue; + + Chain = DAG.getCopyToReg( + Chain, DL, + CopyToReg->getOperand(1), + SDValue(Result, i - 1), + SDValue()); + + DAG.ReplaceAllUsesWith(SDValue(CopyToReg, 0), CopyToReg->getOperand(0)); + } + + // Remove the old intrinsic from the chain + DAG.ReplaceAllUsesOfValueWith( + SDValue(Intr, Intr->getNumValues() - 1), + Intr->getOperand(0)); + + return Chain; } SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 27c2a1c39a..c088112652 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -43,9 +43,9 @@ class SITargetLowering : public AMDGPUTargetLowering { SDValue Loweri1ContextSwitch(SDValue Op, SelectionDAG &DAG, unsigned VCCNode) const; - SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; 
public: SITargetLowering(TargetMachine &tm); diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 008652f55e..005be96645 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -696,8 +696,9 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", let isBranch = 1 in { def S_BRANCH : SOPP < 0x00000002, (ins brtarget:$target), "S_BRANCH", - [] ->; + [(br bb:$target)]> { + let isBarrier = 1; +} let DisableEncoding = "$scc" in { def S_CBRANCH_SCC0 : SOPP < @@ -1095,26 +1096,70 @@ def SI_WQM : InstSI < } // end usesCustomInserter -// SI Psuedo branch instructions. These are used by the CFG structurizer pass +// SI Psuedo instructions. These are used by the CFG structurizer pass // and should be lowered to ISA instructions prior to codegen. -let isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0, - hasSideEffects = 0 in { -def SI_IF_NZ : InstSI < +let mayLoad = 1, mayStore = 1, hasSideEffects = 1, + Uses = [EXEC], Defs = [EXEC] in { + +let isBranch = 1, isTerminator = 1 in { + +def SI_IF : InstSI < + (outs SReg_64:$dst), + (ins SReg_1:$vcc, brtarget:$target), + "SI_IF", + [(set SReg_64:$dst, (int_SI_if SReg_1:$vcc, bb:$target))] +>; + +def SI_ELSE : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src, brtarget:$target), + "SI_ELSE", + [(set SReg_64:$dst, (int_SI_else SReg_64:$src, bb:$target))]> { + + let Constraints = "$src = $dst"; +} + +def SI_LOOP : InstSI < (outs), - (ins brtarget:$target, SReg_1:$vcc), - "SI_BRANCH_NZ", - [(IL_brcond bb:$target, SReg_1:$vcc)] + (ins SReg_64:$saved, brtarget:$target), + "SI_LOOP", + [(int_SI_loop SReg_64:$saved, bb:$target)] >; -def SI_IF_Z : InstSI < +} // end isBranch = 1, isTerminator = 1 + +def SI_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src), + "SI_ELSE", + [(set SReg_64:$dst, (int_SI_break SReg_64:$src))] +>; + +def SI_IF_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_1:$vcc, SReg_64:$src), + "SI_IF_BREAK", + [(set 
SReg_64:$dst, (int_SI_if_break SReg_1:$vcc, SReg_64:$src))] +>; + +def SI_ELSE_BREAK : InstSI < + (outs SReg_64:$dst), + (ins SReg_64:$src0, SReg_64:$src1), + "SI_ELSE_BREAK", + [(set SReg_64:$dst, (int_SI_else_break SReg_64:$src0, SReg_64:$src1))] +>; + +def SI_END_CF : InstSI < (outs), - (ins brtarget:$target, SReg_1:$vcc), - "SI_BRANCH_Z", - [] + (ins SReg_64:$saved), + "SI_END_CF", + [(int_SI_end_cf SReg_64:$saved)] >; -} // end isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0, - // hasSideEffects = 0 + +} // end mayLoad = 1, mayStore = 1, hasSideEffects = 1 + // Uses = [EXEC], Defs = [EXEC] + } // end IsCodeGenOnly, isPseudo /* int_SI_vs_load_input */ diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td index 1008fc42cc..c322fef0fe 100644 --- a/lib/Target/AMDGPU/SIIntrinsics.td +++ b/lib/Target/AMDGPU/SIIntrinsics.td @@ -39,4 +39,14 @@ let TargetPrefix = "SI", isTarget = 1 in { def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>; def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + + /* Control flow Intrinsics */ + + def int_SI_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], []>; + def int_SI_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], []>; + def int_SI_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], []>; + def int_SI_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], []>; + def int_SI_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], []>; + def int_SI_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], []>; + def int_SI_end_cf : Intrinsic<[], [llvm_i64_ty], []>; } diff --git a/lib/Target/AMDGPU/SILowerControlFlow.cpp b/lib/Target/AMDGPU/SILowerControlFlow.cpp index 277b647f67..1abcb8805a 100644 --- a/lib/Target/AMDGPU/SILowerControlFlow.cpp +++ b/lib/Target/AMDGPU/SILowerControlFlow.cpp @@ -8,10 +8,10 @@ //===----------------------------------------------------------------------===// // /// \file -/// \brief This 
pass lowers the pseudo control flow instructions (SI_IF_NZ, ELSE, ENDIF) -/// to predicated instructions. +/// \brief This pass lowers the pseudo control flow instructions to real +/// machine instructions. /// -/// All control flow (except loops) is handled using predicated instructions and +/// All control flow is handled using predicated instructions and /// a predicate stack. Each Scalar ALU controls the operations of 64 Vector /// ALUs. The Scalar ALU can update the predicate for any of the Vector ALUs /// by writting to the 64-bit EXEC register (each bit corresponds to a @@ -22,17 +22,17 @@ /// /// For example: /// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2 -/// SI_IF_NZ %VCC +/// %SGPR0 = SI_IF %VCC /// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 -/// ELSE +/// %SGPR0 = SI_ELSE %SGPR0 /// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 -/// ENDIF +/// SI_END_CF %SGPR0 /// /// becomes: /// /// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC // Save and update the exec mask /// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC // Clear live bits from saved exec mask -/// S_CBRANCH_EXECZ label0 // This instruction is an +/// S_CBRANCH_EXECZ label0 // This instruction is an optional /// // optimization which allows us to /// // branch if all the bits of /// // EXEC are zero. @@ -45,7 +45,7 @@ /// // instruction again. 
/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR // Do the THEN block /// label1: -/// %EXEC = S_OR_B64 %EXEC, %SGPR2 // Re-enable saved exec mask bits +/// %EXEC = S_OR_B64 %EXEC, %SGPR0 // Re-enable saved exec mask bits //===----------------------------------------------------------------------===// #include "AMDGPU.h" @@ -65,11 +65,14 @@ class SILowerControlFlowPass : public MachineFunctionPass { private: static char ID; const TargetInstrInfo *TII; - std::vector<unsigned> PredicateStack; - std::vector<unsigned> UnusedRegisters; - unsigned allocReg(); - void freeReg(unsigned Reg); + void If(MachineInstr &MI); + void Else(MachineInstr &MI); + void Break(MachineInstr &MI); + void IfBreak(MachineInstr &MI); + void ElseBreak(MachineInstr &MI); + void Loop(MachineInstr &MI); + void EndCf(MachineInstr &MI); public: SILowerControlFlowPass(TargetMachine &tm) : @@ -91,101 +94,199 @@ FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) { return new SILowerControlFlowPass(tm); } +void SILowerControlFlowPass::If(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_SAVEEXEC_B64), Reg) + .addReg(Vcc); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), Reg) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Else(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_SAVEEXEC_B64), Dst) + .addReg(Src); // Saved EXEC + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_XOR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Dst); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Break(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc 
DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Src = MI.getOperand(1).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::IfBreak(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Vcc = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Vcc) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::ElseBreak(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + + unsigned Dst = MI.getOperand(0).getReg(); + unsigned Saved = MI.getOperand(1).getReg(); + unsigned Src = MI.getOperand(2).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_OR_B64), Dst) + .addReg(Saved) + .addReg(Src); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::Loop(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Src = MI.getOperand(0).getReg(); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ANDN2_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Src); + + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addOperand(MI.getOperand(1)) + .addReg(AMDGPU::EXEC); + + MI.eraseFromParent(); +} + +void SILowerControlFlowPass::EndCf(MachineInstr &MI) { + + MachineBasicBlock &MBB = *MI.getParent(); + DebugLoc DL = MI.getDebugLoc(); + unsigned Reg = MI.getOperand(0).getReg(); + + BuildMI(MBB, MBB.getFirstNonPHI(), DL, + TII->get(AMDGPU::S_OR_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + + MI.eraseFromParent(); +} + bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) { - // Find all the unused registers that can be used for the predicate stack. 
- for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(), - S = AMDGPU::SReg_64RegClass.end(); - I != S; ++I) { - unsigned Reg = *I; - if (!MF.getRegInfo().isPhysRegUsed(Reg)) { - UnusedRegisters.insert(UnusedRegisters.begin(), Reg); - } - } + bool HaveCf = false; - for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); - BB != BB_E; ++BB) { - MachineBasicBlock &MBB = *BB; + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { + + MachineBasicBlock &MBB = *BI; for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); - I != MBB.end(); I = Next) { + I != MBB.end(); I = Next) { + Next = llvm::next(I); MachineInstr &MI = *I; - unsigned Reg; switch (MI.getOpcode()) { default: break; - case AMDGPU::SI_IF_NZ: - Reg = allocReg(); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), - Reg) - .addOperand(MI.getOperand(0)); // VCC - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), - Reg) - .addReg(Reg) - .addReg(AMDGPU::EXEC); - MI.eraseFromParent(); - PredicateStack.push_back(Reg); + case AMDGPU::SI_IF: + If(MI); + break; + + case AMDGPU::SI_ELSE: + Else(MI); + break; + + case AMDGPU::SI_BREAK: + Break(MI); + break; + + case AMDGPU::SI_IF_BREAK: + IfBreak(MI); + break; + + case AMDGPU::SI_ELSE_BREAK: + ElseBreak(MI); break; - case AMDGPU::ELSE: - Reg = PredicateStack.back(); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_SAVEEXEC_B64), - Reg) - .addReg(Reg); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), - AMDGPU::EXEC) - .addReg(Reg) - .addReg(AMDGPU::EXEC); - MI.eraseFromParent(); + case AMDGPU::SI_LOOP: + Loop(MI); break; - case AMDGPU::ENDIF: - Reg = PredicateStack.back(); - PredicateStack.pop_back(); - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64), - AMDGPU::EXEC) - .addReg(AMDGPU::EXEC) - .addReg(Reg); - freeReg(Reg); - - if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL && - 
PredicateStack.empty()) { - // If the exec mask is non-zero, skip the next two instructions - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ)) - .addImm(3) - .addReg(AMDGPU::EXEC); - - // Exec mask is zero: Export to NULL target... - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP)) - .addImm(0) - .addImm(0x09) // V_008DFC_SQ_EXP_NULL - .addImm(0) - .addImm(1) - .addImm(1) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0) - .addReg(AMDGPU::SREG_LIT_0); - - // ... and terminate wavefront - BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM)); - } - MI.eraseFromParent(); + case AMDGPU::SI_END_CF: + HaveCf = true; + EndCf(MI); break; } } } - return true; -} -unsigned SILowerControlFlowPass::allocReg() { + // TODO: What is this good for? + unsigned ShaderType = MF.getInfo<SIMachineFunctionInfo>()->ShaderType; + if (HaveCf && ShaderType == ShaderType::PIXEL) { + for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); + BI != BE; ++BI) { - assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack"); - unsigned Reg = UnusedRegisters.back(); - UnusedRegisters.pop_back(); - return Reg; -} + MachineBasicBlock &MBB = *BI; + if (MBB.succ_empty()) { -void SILowerControlFlowPass::freeReg(unsigned Reg) { + MachineInstr &MI = *MBB.getFirstNonPHI(); + DebugLoc DL = MI.getDebugLoc(); - UnusedRegisters.push_back(Reg); + // If the exec mask is non-zero, skip the next two instructions + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(3) + .addReg(AMDGPU::EXEC); + + // Exec mask is zero: Export to NULL target... + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0); + + // ... 
and terminate wavefront + BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_ENDPGM)); + } + } + } + + return true; } |