1 files changed, 477 insertions, 0 deletions
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
new file mode 100644
index 00000000000..8cf515ae1aa
--- /dev/null
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -0,0 +1,477 @@
+//===-- SIMachineScheduler.h - SI Scheduler Interface -*- C++ -*-------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI Machine Scheduler interface
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_SIMACHINESCHEDULER_H
+
+#include "SIInstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+enum SIScheduleCandReason {
+  NoCand,
+  RegUsage,
+  Latency,
+  Successor,
+  Depth,
+  NodeOrder
+};
+
+struct SISchedulerCandidate {
+  // The reason for this candidate.
+  SIScheduleCandReason Reason;
+
+  // Set of reasons that apply to multiple candidates.
+  uint32_t RepeatReasonSet;
+
+  SISchedulerCandidate()
+    :  Reason(NoCand), RepeatReasonSet(0) {}
+
+  bool isRepeat(SIScheduleCandReason R) { return RepeatReasonSet & (1 << R); }
+  void setRepeat(SIScheduleCandReason R) { RepeatReasonSet |= (1 << R); }
+};
+
+class SIScheduleDAGMI;
+class SIScheduleBlockCreator;
+
+class SIScheduleBlock {
+  SIScheduleDAGMI *DAG;
+  SIScheduleBlockCreator *BC;
+
+  std::vector<SUnit*> SUnits;
+  std::map<unsigned, unsigned> NodeNum2Index;
+  std::vector<SUnit*> TopReadySUs;
+  std::vector<SUnit*> ScheduledSUnits;
+
+  /// The top of the unscheduled zone.
+  IntervalPressure TopPressure;
+  RegPressureTracker TopRPTracker;
+
+  // Pressure: number of said class of registers needed to
+  // store the live virtual and real registers.
+  // We do care only of SGPR32 and VGPR32 and do track only virtual registers.
+  // Pressure of additional registers required inside the block.
+  std::vector<unsigned> InternalAdditionnalPressure;
+  // Pressure of input and output registers
+  std::vector<unsigned> LiveInPressure;
+  std::vector<unsigned> LiveOutPressure;
+  // Registers required by the block, and outputs.
+  // We do track only virtual registers.
+  // Note that some registers are not 32 bits,
+  // and thus the pressure is not equal
+  // to the number of live registers.
+  std::set<unsigned> LiveInRegs;
+  std::set<unsigned> LiveOutRegs;
+
+  bool Scheduled;
+  bool HighLatencyBlock;
+
+  std::vector<unsigned> HasLowLatencyNonWaitedParent;
+
+public:
+  SIScheduleBlock(SIScheduleDAGMI *DAG, SIScheduleBlockCreator *BC,
+                  unsigned ID):
+    DAG(DAG), BC(BC), SUnits(), TopReadySUs(), ScheduledSUnits(),
+    TopRPTracker(TopPressure), Scheduled(false),
+    HighLatencyBlock(false), ID(ID),
+    Preds(), Succs() {};
+
+  ~SIScheduleBlock() {};
+
+  // Unique ID, the index of the Block in the SIScheduleDAGMI Blocks table.
+  unsigned ID;
+
+  /// Functions for Block construction.
+  void addUnit(SUnit *SU);
+
+  // When all SUs have been added.
+  void finalizeUnits();
+
+  // Add block pred, which has instruction predecessor of SU.
+  void addPred(SIScheduleBlock *Pred);
+  void addSucc(SIScheduleBlock *Succ);
+
+  std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
+  std::vector<SIScheduleBlock*> Succs;  // All blocks successors.
+
+  unsigned Height;  // Maximum topdown path length to block without outputs
+  unsigned Depth;   // Maximum bottomup path length to block without inputs
+
+  unsigned NumHighLatencySuccessors = 0;
+
+  bool isHighLatencyBlock() { return HighLatencyBlock; }
+
+  // This is approximative.
+  // Ideally should take into accounts some instructions (rcp, etc)
+  // are 4 times slower.
+  int getCost() { return SUnits.size(); }
+
+  // The block Predecessors and Successors must be all registered
+  // before fastSchedule().
+  // Fast schedule with no particular requirement.
+  void fastSchedule();
+
+  std::vector<SUnit*> getScheduledUnits() { return ScheduledSUnits; }
+
+  // Complete schedule that will try to minimize reg pressure and
+  // low latencies, and will fill liveins and liveouts.
+  // Needs all MIs to be groupped between BeginBlock and EndBlock.
+  // The MIs can be moved after the scheduling,
+  // it is just used to allow correct track of live registers.
+  void schedule(MachineBasicBlock::iterator BeginBlock,
+                MachineBasicBlock::iterator EndBlock);
+
+  bool isScheduled() { return Scheduled; }
+
+
+  // Needs the block to be scheduled inside
+  // TODO: find a way to compute it.
+  std::vector<unsigned> &getInternalAdditionnalRegUsage() {
+    return InternalAdditionnalPressure;
+  }
+
+  std::set<unsigned> &getInRegs() { return LiveInRegs; }
+  std::set<unsigned> &getOutRegs() { return LiveOutRegs; }
+
+  void printDebug(bool Full);
+
+private:
+  struct SISchedCandidate : SISchedulerCandidate {
+    // The best SUnit candidate.
+    SUnit *SU;
+
+    unsigned SGPRUsage;
+    unsigned VGPRUsage;
+    bool IsLowLatency;
+    unsigned LowLatencyOffset;
+    bool HasLowLatencyNonWaitedParent;
+
+    SISchedCandidate()
+      : SU(nullptr) {}
+
+    bool isValid() const { return SU; }
+
+    // Copy the status of another candidate without changing policy.
+    void setBest(SISchedCandidate &Best) {
+      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+      SU = Best.SU;
+      Reason = Best.Reason;
+      SGPRUsage = Best.SGPRUsage;
+      VGPRUsage = Best.VGPRUsage;
+      IsLowLatency = Best.IsLowLatency;
+      LowLatencyOffset = Best.LowLatencyOffset;
+      HasLowLatencyNonWaitedParent = Best.HasLowLatencyNonWaitedParent;
+    }
+  };
+
+  void undoSchedule();
+
+  // InOrOutBlock: restrict to links pointing inside the block (true),
+  // or restrict to links pointing outside the block (false).
+  void undoReleaseSucc(SUnit *SU, SDep *SuccEdge, bool InOrOutBlock);
+  void releaseSucc(SUnit *SU, SDep *SuccEdge, bool InOrOutBlock);
+  void releaseSuccessors(SUnit *SU, bool InOrOutBlock);
+
+  void NodeScheduled(SUnit *SU);
+  void tryCandidateTopDown(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+  void tryCandidateBottomUp(SISchedCandidate &Cand, SISchedCandidate &TryCand);
+  SUnit* pickNode();
+  void traceCandidate(const SISchedCandidate &Cand);
+  void initRegPressure(MachineBasicBlock::iterator BeginBlock,
+                       MachineBasicBlock::iterator EndBlock);
+};
+
+struct SIScheduleBlocks {
+  std::vector<SIScheduleBlock*> Blocks;
+  std::vector<int> TopDownIndex2Block;
+  std::vector<int> TopDownBlock2Index;
+};
+
+enum SISchedulerBlockCreatorVariant {
+    LatenciesAlone,
+    LatenciesGroupped,
+    LatenciesAlonePlusConsecutive
+};
+
+class SIScheduleBlockCreator {
+  SIScheduleDAGMI *DAG;
+  // unique_ptr handles freeing memory for us.
+  std::vector<std::unique_ptr<SIScheduleBlock>> BlockPtrs;
+  std::map<SISchedulerBlockCreatorVariant,
+           SIScheduleBlocks> Blocks;
+  std::vector<SIScheduleBlock*> CurrentBlocks;
+  std::vector<int> Node2CurrentBlock;
+
+  // Topological sort
+  // Maps topological index to the node number.
+  std::vector<int> TopDownIndex2Block;
+  std::vector<int> TopDownBlock2Index;
+  std::vector<int> BottomUpIndex2Block;
+
+  // 0 -> Color not given.
+  // 1 to SUnits.size() -> Reserved group (you should only add elements to them).
+  // Above -> Other groups.
+  int NextReservedID;
+  int NextNonReservedID;
+  std::vector<int> CurrentColoring;
+  std::vector<int> CurrentTopDownReservedDependencyColoring;
+  std::vector<int> CurrentBottomUpReservedDependencyColoring;
+
+public:
+  SIScheduleBlockCreator(SIScheduleDAGMI *DAG);
+  ~SIScheduleBlockCreator();
+
+  SIScheduleBlocks
+  getBlocks(SISchedulerBlockCreatorVariant BlockVariant);
+
+  bool isSUInBlock(SUnit *SU, unsigned ID);
+
+private:
+  // Give a Reserved color to every high latency.
+  void colorHighLatenciesAlone();
+
+  // Create groups of high latencies with a Reserved color.
+  void colorHighLatenciesGroups();
+
+  // Compute coloring for topdown and bottom traversals with
+  // different colors depending on dependencies on Reserved colors.
+  void colorComputeReservedDependencies();
+
+  // Give color to all non-colored SUs according to Reserved groups dependencies.
+  void colorAccordingToReservedDependencies();
+
+  // Divides Blocks having no bottom up or top down dependencies on Reserved groups.
+  // The new colors are computed according to the dependencies on the other blocks
+  // formed with colorAccordingToReservedDependencies.
+  void colorEndsAccordingToDependencies();
+
+  // Cut groups into groups with SUs in consecutive order (except for Reserved groups).
+  void colorForceConsecutiveOrderInGroup();
+
+  // Merge Constant loads that have all their users into another group to the group.
+  // (TODO: else if all their users depend on the same group, put them there)
+  void colorMergeConstantLoadsNextGroup();
+
+  // Merge SUs that have all their users into another group to the group
+  void colorMergeIfPossibleNextGroup();
+
+  // Merge SUs that have all their users into another group to the group,
+  // but only for Reserved groups.
+  void colorMergeIfPossibleNextGroupOnlyForReserved();
+
+  // Merge SUs that have all their users into another group to the group,
+  // but only if the group is no more than a few SUs.
+  void colorMergeIfPossibleSmallGroupsToNextGroup();
+
+  // Divides Blocks with important size.
+  // Idea of implementation: attribute new colors depending on topdown and
+  // bottom up links to other blocks.
+  void cutHugeBlocks();
+
+  // Put in one group all instructions with no users in this scheduling region
+  // (we'd want these groups be at the end).
+  void regroupNoUserInstructions();
+
+  void createBlocksForVariant(SISchedulerBlockCreatorVariant BlockVariant);
+
+  void topologicalSort();
+
+  void scheduleInsideBlocks();
+
+  void fillStats();
+};
+
+enum SISchedulerBlockSchedulerVariant {
+  BlockLatencyRegUsage,
+  BlockRegUsageLatency,
+  BlockRegUsage
+};
+
+class SIScheduleBlockScheduler {
+  SIScheduleDAGMI *DAG;
+  SISchedulerBlockSchedulerVariant Variant;
+  std::vector<SIScheduleBlock*> Blocks;
+
+  std::vector<std::map<unsigned, unsigned>> LiveOutRegsNumUsages;
+  std::set<unsigned> LiveRegs;
+  // Num of schedulable unscheduled blocks reading the register.
+  std::map<unsigned, unsigned> LiveRegsConsumers;
+
+  std::vector<unsigned> LastPosHighLatencyParentScheduled;
+  int LastPosWaitedHighLatency;
+
+  std::vector<SIScheduleBlock*> BlocksScheduled;
+  unsigned NumBlockScheduled;
+  std::vector<SIScheduleBlock*> ReadyBlocks;
+
+  unsigned VregCurrentUsage;
+  unsigned SregCurrentUsage;
+
+  // Currently is only approximation.
+  unsigned maxVregUsage;
+  unsigned maxSregUsage;
+
+  std::vector<unsigned> BlockNumPredsLeft;
+  std::vector<unsigned> BlockNumSuccsLeft;
+
+public:
+  SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
+                           SISchedulerBlockSchedulerVariant Variant,
+                           SIScheduleBlocks BlocksStruct);
+  ~SIScheduleBlockScheduler() {};
+
+  std::vector<SIScheduleBlock*> getBlocks() { return BlocksScheduled; };
+
+  unsigned getVGPRUsage() { return maxVregUsage; };
+  unsigned getSGPRUsage() { return maxSregUsage; };
+
+private:
+  struct SIBlockSchedCandidate : SISchedulerCandidate {
+    // The best Block candidate.
+    SIScheduleBlock *Block;
+
+    bool IsHighLatency;
+    int VGPRUsageDiff;
+    unsigned NumSuccessors;
+    unsigned NumHighLatencySuccessors;
+    unsigned LastPosHighLatParentScheduled;
+    unsigned Height;
+
+    SIBlockSchedCandidate()
+      : Block(nullptr) {}
+
+    bool isValid() const { return Block; }
+
+    // Copy the status of another candidate without changing policy.
+    void setBest(SIBlockSchedCandidate &Best) {
+      assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+      Block = Best.Block;
+      Reason = Best.Reason;
+      IsHighLatency = Best.IsHighLatency;
+      VGPRUsageDiff = Best.VGPRUsageDiff;
+      NumSuccessors = Best.NumSuccessors;
+      NumHighLatencySuccessors = Best.NumHighLatencySuccessors;
+      LastPosHighLatParentScheduled = Best.LastPosHighLatParentScheduled;
+      Height = Best.Height;
+    }
+  };
+
+  bool tryCandidateLatency(SIBlockSchedCandidate &Cand,
+                           SIBlockSchedCandidate &TryCand);
+  bool tryCandidateRegUsage(SIBlockSchedCandidate &Cand,
+                            SIBlockSchedCandidate &TryCand);
+  SIScheduleBlock *pickBlock();
+
+  void addLiveRegs(std::set<unsigned> &Regs);
+  void decreaseLiveRegs(SIScheduleBlock *Block, std::set<unsigned> &Regs);
+  void releaseBlockSuccs(SIScheduleBlock *Parent);
+  void blockScheduled(SIScheduleBlock *Block);
+
+  // Check register pressure change
+  // by scheduling a block with these LiveIn and LiveOut.
+  std::vector<int> checkRegUsageImpact(std::set<unsigned> &InRegs,
+                                       std::set<unsigned> &OutRegs);
+
+  void schedule();
+};
+
+struct SIScheduleBlockResult {
+  std::vector<unsigned> SUs;
+  unsigned MaxSGPRUsage;
+  unsigned MaxVGPRUsage;
+};
+
+class SIScheduler {
+  SIScheduleDAGMI *DAG;
+  SIScheduleBlockCreator BlockCreator;
+
+public:
+  SIScheduler(SIScheduleDAGMI *DAG) : DAG(DAG), BlockCreator(DAG) {};
+
+  ~SIScheduler() {};
+
+  struct SIScheduleBlockResult
+  scheduleVariant(SISchedulerBlockCreatorVariant BlockVariant,
+                  SISchedulerBlockSchedulerVariant ScheduleVariant);
+};
+
+class SIScheduleDAGMI : public ScheduleDAGMILive {
+  const SIInstrInfo *SITII;
+  const SIRegisterInfo *SITRI;
+
+  std::vector<SUnit> SUnitsLinksBackup;
+
+  // For moveLowLatencies. After all Scheduling variants are tested.
+  std::vector<unsigned> ScheduledSUnits;
+  std::vector<unsigned> ScheduledSUnitsInv;
+
+public:
+  SIScheduleDAGMI(MachineSchedContext *C);
+
+  ~SIScheduleDAGMI() override;
+
+  // Entry point for the schedule.
+  void schedule() override;
+
+  // To init Block's RPTracker.
+  void initRPTracker(RegPressureTracker &RPTracker) {
+    RPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin);
+  }
+
+  MachineBasicBlock *getBB() { return BB; }
+  MachineBasicBlock::iterator getCurrentTop() { return CurrentTop; };
+  MachineBasicBlock::iterator getCurrentBottom() { return CurrentBottom; };
+  LiveIntervals *getLIS() { return LIS; }
+  MachineRegisterInfo *getMRI() { return &MRI; }
+  const TargetRegisterInfo *getTRI() { return TRI; }
+  SUnit& getEntrySU() { return EntrySU; };
+  SUnit& getExitSU() { return ExitSU; };
+
+  void restoreSULinksLeft();
+
+  template<typename _Iterator> void fillVgprSgprCost(_Iterator First,
+                                                     _Iterator End,
+                                                     unsigned &VgprUsage,
+                                                     unsigned &SgprUsage);
+  std::set<unsigned> getInRegs() {
+    std::set<unsigned> InRegs (RPTracker.getPressure().LiveInRegs.begin(),
+                               RPTracker.getPressure().LiveInRegs.end());
+    return InRegs;
+  };
+
+private:
+  void topologicalSort();
+  // After scheduling is done, improve low latency placements.
+  void moveLowLatencies();
+
+public:
+  unsigned VGPRSetID;
+  unsigned SGPRSetID;
+  // Some stats for scheduling inside blocks.
+  std::vector<unsigned> IsLowLatencySU;
+  std::vector<unsigned> LowLatencyOffset;
+  std::vector<unsigned> IsHighLatencySU;
+  // Topological sort
+  // Maps topological index to the node number.
+  std::vector<int> TopDownIndex2SU;
+  std::vector<int> BottomUpIndex2SU;
+};
+
+} // namespace llvm
+
+#endif /* SIMACHINESCHEDULER_H_ */