XXX: Struct fixes [struct-divergence-v1]

author: Tom Stellard <thomas.stellard@amd.com> 2015-11-03 20:14:47 +0000
committer: Tom Stellard <thomas.stellard@amd.com> 2015-11-03 20:14:47 +0000
commit: b70e162b70e3bb112bc3fb7c4dec24c032aa6617 (patch)
tree: fc8770616062a699f2e812aa7990dff9ad3edc30
parent: 00c306b4f2ae9e970fa6f867d3cd193372c968ac (diff)
8 files changed, 111 insertions, 25 deletions
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 73b87b082e2..65f25f368e0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -149,6 +149,7 @@ private:
                    uint32_t Offset, uint32_t Width);
   SDNode *SelectS_BFEFromShifts(SDNode *N);
   SDNode *SelectS_BFE(SDNode *N);
+  SDNode *SelectBRCOND(SDNode *N);
 
   // Include the pieces autogenerated from the target description.
 #include "AMDGPUGenDAGISel.inc"
@@ -576,6 +577,8 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
       break;
 
     return SelectS_BFE(N);
+  case ISD::BRCOND:
+    return SelectBRCOND(N);
   }
 
   return SelectCode(N);
@@ -1449,6 +1452,44 @@ SDNode *AMDGPUDAGToDAGISel::SelectS_BFE(SDNode *N) {
   return SelectCode(N);
 }
 
+SDNode *AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) { // Custom BRCOND select.
+  SDValue Cond = N->getOperand(1); // Operand 1 is used as the branch condition.
+  if (Cond.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+    return SelectCode(N); // Not an intrinsic: defer to the generated matcher.
+
+  unsigned IntrinsicID =
+        cast<ConstantSDNode>(Cond.getOperand(0))->getZExtValue();
+  if (IntrinsicID != AMDGPUIntrinsic::AMDGPU_uniform_cond)
+    return SelectCode(N); // Only uniform_cond gets special handling here.
+
+  SDValue RealCond = Cond.getOperand(1); // Value wrapped by uniform_cond.
+  if (RealCond.hasOneUse() && isSALUCmp(RealCond)) {
+    // This brcond will use S_CBRANCH_SCC*, so let tablegen handle it.
+    return SelectCode(N);
+  }
+
+  // The result of VOPC instructions is OR'd against ~EXEC before it is
+  // written to VCC or another SGPR.  This means that the value '1' is always
+  // written to the corresponding bit for results that are masked.  In order
+  // to correctly check against vccz, we need to AND VCC with the EXEC
+  // register in order to clear the value from the masked bits.
+
+  SDLoc SL(N);
+
+  SDNode *MaskedCond =
+        CurDAG->getMachineNode(AMDGPU::S_AND_B64, SL, MVT::i1,
+                               CurDAG->getRegister(AMDGPU::EXEC, MVT::i1),
+                               RealCond); // MaskedCond = EXEC & RealCond.
+  SDValue VCC = CurDAG->getCopyToReg(N->getOperand(0), SL, AMDGPU::VCC,
+                                     SDValue(MaskedCond, 0),
+                                     SDValue()); // Passing SDValue() adds a
+                                                 // glue output.
+  return CurDAG->SelectNodeTo(N, AMDGPU::S_CBRANCH_VCCNZ, MVT::Other,
+                              N->getOperand(2), // Basic Block
+                              VCC.getValue(0),  // Chain
+                              VCC.getValue(1)); // Glue
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
 
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 6fd38ca9022..3d0456052bd 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -877,6 +877,33 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   SDNode *Intr = BRCOND.getOperand(1).getNode();
   SDValue Target = BRCOND.getOperand(2);
   SDNode *BR = nullptr;
+  bool InvertBranch = false;
+
+  if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+    unsigned IntrinsicID =
+        cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue();
+    if (IntrinsicID == AMDGPUIntrinsic::AMDGPU_uniform_cond)
+      return BRCOND;
+  }
+
+  if (Intr->getOperand(0).getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
+    unsigned IntrinsicID =
+        cast<ConstantSDNode>(Intr->getOperand(0)->getOperand(0))->getZExtValue();
+    if (IntrinsicID == AMDGPUIntrinsic::AMDGPU_uniform_cond) {
+      SDValue UniformCond = Intr->getOperand(0);
+      SDValue RealCond = UniformCond.getOperand(1);
+      SDValue Cond = BRCOND.getOperand(1);
+
+      //brcond <- cond <- uniformcond <- realcond
+      DAG.ReplaceAllUsesWith(UniformCond, RealCond);
+      //brcond <- cond <- realcond
+      SDValue NewUniformCond = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i1,
+                                           DAG.getTargetConstant(AMDGPUIntrinsic::AMDGPU_uniform_cond, DL, MVT::i32),
+                                           Cond);
+      return DAG.getNode(ISD::BRCOND, DL, MVT::Other,
+                         BRCOND.getOperand(0), NewUniformCond, Target);
+    }
+  }
 
   if (Intr->getOpcode() == ISD::SETCC) {
     // As long as we negate the condition everything is fine
@@ -885,6 +912,7 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
     assert(cast<CondCodeSDNode>(SetCC->getOperand(2).getNode())->get() ==
            ISD::SETNE);
     Intr = SetCC->getOperand(0).getNode();
+    InvertBranch = true;
 
   } else {
     // Get the target from BR if we don't negate the condition
@@ -892,20 +920,9 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
     Target = BR->getOperand(1);
   }
 
-  #if 1
-  if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
-    unsigned IntrinsicID = cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue();
-
-    if (IntrinsicID == AMDGPUIntrinsic::AMDGPU_uniform_cond) {
-      SDValue Cond = Intr->getOperand(1);
-      return DAG.getNode(AMDGPUISD::BRCOND_UNIFORM, DL, MVT::Other,
-                         BRCOND.getOperand(0), Cond, BRCOND.getOperand(2));
-    }
-  }
-
-  #endif
-
-  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
+//  DAG.dump();
+//  Intr->dump();
+//  assert(Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN);
 
   // Build the result and
   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index 52a23c8180a..0aa32c0be31 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -2298,6 +2298,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
       }
       break;
 
+    case AMDGPU::S_CBRANCH_SCC0:
+    case AMDGPU::S_CBRANCH_SCC1:
+      // Clear unused bits of vcc
+      BuildMI(*MBB, Inst, Inst->getDebugLoc(), get(AMDGPU::S_AND_B64), AMDGPU::VCC)
+              .addReg(AMDGPU::EXEC)
+              .addReg(AMDGPU::VCC);
+      break;
+
     case AMDGPU::S_BFE_U64:
     case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 7158dd5686a..257585fa32b 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -220,7 +220,6 @@ def si_setcc_uniform : PatFrag <
   (ops node:$lhs, node:$rhs, node:$cond),
   (setcc  node:$lhs, node:$rhs, node:$cond), [{
   for (SDNode *Use : N->uses()) {
-    Use->dump();
     if (Use->isMachineOpcode() || Use->getOpcode() != ISD::CopyToReg)
       return false;
 
@@ -231,14 +230,26 @@ def si_setcc_uniform : PatFrag <
   return true;
 }]>;
 
+def si_br_uniform : PatFrag < // brcond whose condition is a uniform_cond call.
+  (ops node:$cond, node:$bb),
+  (brcond (int_AMDGPU_uniform_cond node:$cond), node:$bb)
+>;
+
+/*
+def si_br_uniform_inverse : PatFrag <
+  (ops node:$cond, node:$bb),
+  (brcond (setne (int_AMDGPU_uniform_cond node:$cond), -1), node:$bb)
+>;
+*/
+
 def si_br_uniform_scc : PatFrag <
-  (ops node:$cond, node:$bb), (SIbr_uniform node:$cond, node:$bb), [{
-  SDValue SetCC = N->getOperand(1);
+  (ops node:$cond, node:$bb), (si_br_uniform node:$cond, node:$bb), [{
+  SDValue SetCC = N->getOperand(1)->getOperand(1);
   return SetCC.hasOneUse() && isSALUCmp(SetCC);
 }]>;
 
 def si_br_uniform_vcc : PatFrag <
-  (ops node:$cond, node:$bb), (SIbr_uniform node:$cond, node:$bb), [{
+  (ops node:$cond, node:$bb), (si_br_uniform node:$cond, node:$bb), [{
   SDValue SetCC = N->getOperand(1);
   return !SetCC.hasOneUse() || !isSALUCmp(SetCC);
 }]>;
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 145b7f3bff3..77d689e9f37 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -466,7 +466,7 @@ def S_CBRANCH_VCCZ : SOPP <
 def S_CBRANCH_VCCNZ : SOPP <
   0x00000007, (ins sopp_brtarget:$simm16),
   "s_cbranch_vccnz $simm16",
-  [(si_br_uniform_vcc (i1 VCC), bb:$simm16)]
+  []
 >;
 } // End Uses = [VCC]
 
diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll
index 267a323c506..c67095438ee 100644
--- a/test/CodeGen/AMDGPU/endcf-loop-header.ll
+++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -12,8 +12,9 @@
 ; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
 ; CHECK-NOT: s_or_b64 exec, exec
 ; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
-define void @test(i32 addrspace(1)* %out, i32 %cond) {
+define void @test(i32 addrspace(1)* %out) {
 entry:
+  %cond = call i32 @llvm.r600.read.tidig.x() #0
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %loop
 
@@ -32,3 +33,7 @@ done:
   store i32 %inc, i32 addrspace(1)* %tmp3
   ret void
 }
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { readnone }
diff --git a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
index b11a2113764..38cf93b85b2 100644
--- a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
@@ -4,9 +4,7 @@
 ; SILowerI1Copies was not handling IMPLICIT_DEF
 ; SI-LABEL: {{^}}br_implicit_def:
 ; SI: BB#0:
-; SI-NEXT: s_and_saveexec_b64
-; SI-NEXT: s_xor_b64
-; SI-NEXT: BB#1:
+; SI-NEXT: s_cbranch_vccnz
 define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
 bb:
   br i1 undef, label %bb1, label %bb2
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index 105cd06b330..e6129e62e34 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -10,9 +10,11 @@
 ; SI: s_and_saveexec_b64
 ; SI: s_xor_b64
 ; SI: s_endpgm
-define void @br_i1_phi(i32 %arg, i1 %arg1) #0 {
+define void @br_i1_phi(i32 %arg) {
 bb:
-  br i1 %arg1, label %bb2, label %bb3
+  %tidig = call i32 @llvm.r600.read.tidig.x() #0
+  %cmp = trunc i32 %tidig to i1
+  br i1 %cmp, label %bb2, label %bb3
 
 bb2:                                              ; preds = %bb
   br label %bb3
@@ -28,3 +30,7 @@ bb4:                                              ; preds = %bb3
 bb6:                                              ; preds = %bb4, %bb3
   ret void
 }
+
+declare i32 @llvm.r600.read.tidig.x() #0
+
+attributes #0 = { readnone }
author	Tom Stellard <thomas.stellard@amd.com>	2015-11-03 20:14:47 +0000
committer	Tom Stellard <thomas.stellard@amd.com>	2015-11-03 20:14:47 +0000
commit	b70e162b70e3bb112bc3fb7c4dec24c032aa6617 (patch)
tree	fc8770616062a699f2e812aa7990dff9ad3edc30
parent	00c306b4f2ae9e970fa6f867d3cd193372c968ac (diff)