author     Jun Bum Lim <junbuml@codeaurora.org>    2015-12-18 18:08:30 +0000
committer  Jun Bum Lim <junbuml@codeaurora.org>    2015-12-18 18:08:30 +0000
commit     cb2bad780bcdbe5a373237b71b31c92aa2736cf3 (patch)
tree       7761eb8225b83f9600b322d6f7bbf97386d265ca
parent     e8df234a6ea0dbc20e4da677f2f9f8c426f44246 (diff)
[AArch64] Promote loads from stores
This change promotes load instructions which directly read from stores by
replacing them with mov instructions. If the store is wider than the load,
the load will be replaced with a bitfield extract.
For example:
  STRWui %W1, %X0, 1
  %W0 = LDRHHui %X0, 3
becomes
  STRWui %W1, %X0, 1
  %W0 = UBFMWri %W1, 16, 31

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@256004 91177308-0d34-0410-b5e6-96231b3b80d8
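The UBFM immediates in the example above follow from simple byte-offset
arithmetic. Below is a minimal standalone sketch of that computation; the
helper name computeBitfieldImms is illustrative (not part of the patch),
offsets are in bytes from the common base register, and a little-endian layout
is assumed, matching the restriction in the patch itself.

  #include <cassert>
  #include <cstdio>

  // Given byte offsets from a common base register and access sizes in bytes,
  // compute the immr/imms pair for a UBFM-style bitfield extract that reads
  // the loaded slice back out of the stored register value.
  static void computeBitfieldImms(int StOffset, int StoreSize, int LdOffset,
                                  int LoadSize, int &Immr, int &Imms) {
    assert(StOffset <= LdOffset &&
           LdOffset + LoadSize <= StOffset + StoreSize &&
           "load must read within the stored bytes");
    int Width = LoadSize * 8;          // number of bits the load reads
    Immr = 8 * (LdOffset - StOffset);  // first bit of the slice (little-endian)
    Imms = Immr + Width - 1;           // last bit of the slice
  }

  int main() {
    int Immr, Imms;
    // STRWui %W1, %X0, 1 ; %W0 = LDRHHui %X0, 3  (byte offsets 4 and 6)
    computeBitfieldImms(4, 4, 6, 2, Immr, Imms);
    std::printf("UBFMWri immr=%d imms=%d\n", Immr, Imms); // prints 16 and 31
    return 0;
  }

When the load starts at the same byte as the store (Immr == 0), the extract
degenerates into a plain mask of the low bits, which is why the patch emits
ANDWri/ANDXri in that case and UBFMWri/UBFMXri otherwise.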
-rw-r--r--  lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp            | 283
-rw-r--r--  test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll   |   6
-rw-r--r--  test/CodeGen/AArch64/arm64-ld-from-st.ll                    | 666
-rw-r--r--  test/CodeGen/AArch64/regress-tblgen-chains.ll               |   4
4 files changed, 951 insertions, 8 deletions
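For context, a source-level pattern that can leave such a store-then-narrow-load
pair for the AArch64 load/store optimizer to clean up looks roughly like the
hypothetical C++ snippet below; whether the reload actually survives to the
back-end depends on what the earlier mid-level passes do with it.

  #include <cstring>

  // Store a 32-bit value, then read back only its upper half through memory.
  // With this patch the back-end can rewrite the narrow reload as
  // "lsr wN, w1, #16" instead of issuing an ldrh from the just-written bytes.
  unsigned short highHalfViaMemory(unsigned *p, unsigned v) {
    p[1] = v;                                        // str w1, [x0, #4]
    unsigned short h;
    std::memcpy(&h, reinterpret_cast<char *>(p + 1) + 2, sizeof(h));
    return h;                                        // ldrh wN, [x0, #6]
  }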
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 27d569d7043..05dce507e58 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -43,6 +43,7 @@ STATISTIC(NumUnscaledPairCreated,
"Number of load/store from unscaled generated");
STATISTIC(NumNarrowLoadsPromoted, "Number of narrow loads promoted");
STATISTIC(NumZeroStoresPromoted, "Number of narrow zero stores promoted");
+STATISTIC(NumLoadsFromStoresPromoted, "Number of loads from stores promoted");
static cl::opt<unsigned> ScanLimit("aarch64-load-store-scan-limit",
cl::init(20), cl::Hidden);
@@ -93,6 +94,12 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
MachineBasicBlock::iterator findMatchingInsn(MachineBasicBlock::iterator I,
LdStPairFlags &Flags,
unsigned Limit);
+
+ // Scan the instructions looking for a store that writes to the address from
+ // which the current load instruction reads. Return true if one is found.
+ bool findMatchingStore(MachineBasicBlock::iterator I, unsigned Limit,
+ MachineBasicBlock::iterator &StoreI);
+
// Merge the two instructions indicated into a single pair-wise instruction.
// If MergeForward is true, erase the first instruction and fold its
// operation into the second. If false, the reverse. Return the instruction
@@ -102,6 +109,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
MachineBasicBlock::iterator Paired,
const LdStPairFlags &Flags);
+ // Promote the load that reads directly from the address stored to.
+ MachineBasicBlock::iterator
+ promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+ MachineBasicBlock::iterator StoreI);
+
// Scan the instruction list to find a base register update that can
// be combined with the current instruction (a load or store) using
// pre or post indexed addressing with writeback. Scan forwards.
@@ -128,6 +140,9 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
// Find and merge foldable ldr/str instructions.
bool tryToMergeLdStInst(MachineBasicBlock::iterator &MBBI);
+ // Find and promote load instructions which read directly from a store.
+ bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
+
// Check if converting two narrow loads into a single wider load with
// bitfield extracts could be enabled.
bool enableNarrowLdMerge(MachineFunction &Fn);
@@ -399,6 +414,36 @@ static unsigned getMatchingPairOpcode(unsigned Opc) {
}
}
+static unsigned isMatchingStore(MachineInstr *LoadInst,
+ MachineInstr *StoreInst) {
+ unsigned LdOpc = LoadInst->getOpcode();
+ unsigned StOpc = StoreInst->getOpcode();
+ switch (LdOpc) {
+ default:
+ llvm_unreachable("Unsupported load instruction!");
+ case AArch64::LDRBBui:
+ return StOpc == AArch64::STRBBui || StOpc == AArch64::STRHHui ||
+ StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+ case AArch64::LDURBBi:
+ return StOpc == AArch64::STURBBi || StOpc == AArch64::STURHHi ||
+ StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+ case AArch64::LDRHHui:
+ return StOpc == AArch64::STRHHui || StOpc == AArch64::STRWui ||
+ StOpc == AArch64::STRXui;
+ case AArch64::LDURHHi:
+ return StOpc == AArch64::STURHHi || StOpc == AArch64::STURWi ||
+ StOpc == AArch64::STURXi;
+ case AArch64::LDRWui:
+ return StOpc == AArch64::STRWui || StOpc == AArch64::STRXui;
+ case AArch64::LDURWi:
+ return StOpc == AArch64::STURWi || StOpc == AArch64::STURXi;
+ case AArch64::LDRXui:
+ return StOpc == AArch64::STRXui;
+ case AArch64::LDURXi:
+ return StOpc == AArch64::STURXi;
+ }
+}
+
static unsigned getPreIndexedOpcode(unsigned Opc) {
switch (Opc) {
default:
@@ -553,6 +598,21 @@ static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
return MI->getOperand(Idx);
}
+static bool isLdOffsetInRangeOfSt(MachineInstr *LoadInst,
+ MachineInstr *StoreInst) {
+ assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
+ int LoadSize = getMemScale(LoadInst);
+ int StoreSize = getMemScale(StoreInst);
+ int UnscaledStOffset = isUnscaledLdSt(StoreInst)
+ ? getLdStOffsetOp(StoreInst).getImm()
+ : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
+ int UnscaledLdOffset = isUnscaledLdSt(LoadInst)
+ ? getLdStOffsetOp(LoadInst).getImm()
+ : getLdStOffsetOp(LoadInst).getImm() * LoadSize;
+ return (UnscaledStOffset <= UnscaledLdOffset) &&
+ (UnscaledLdOffset + LoadSize <= (UnscaledStOffset + StoreSize));
+}
+
// Copy MachineMemOperands from Op0 and Op1 to a new array assigned to MI.
static void concatenateMemOperands(MachineInstr *MI, MachineInstr *Op0,
MachineInstr *Op1) {
@@ -800,6 +860,106 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
return NextI;
}
+MachineBasicBlock::iterator
+AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
+ MachineBasicBlock::iterator StoreI) {
+ MachineBasicBlock::iterator NextI = LoadI;
+ ++NextI;
+
+ int LoadSize = getMemScale(LoadI);
+ int StoreSize = getMemScale(StoreI);
+ unsigned LdRt = getLdStRegOp(LoadI).getReg();
+ unsigned StRt = getLdStRegOp(StoreI).getReg();
+ bool IsStoreXReg = TRI->getRegClass(AArch64::GPR64RegClassID)->contains(StRt);
+
+ assert((IsStoreXReg ||
+ TRI->getRegClass(AArch64::GPR32RegClassID)->contains(StRt)) &&
+ "Unexpected RegClass");
+
+ MachineInstr *BitExtMI;
+ if (LoadSize == StoreSize) {
+ // Remove the load if the destination register of the load is the same
+ // register as the stored value.
+ if (StRt == LdRt) {
+ DEBUG(dbgs() << "Remove load instruction:\n ");
+ DEBUG(LoadI->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+ LoadI->eraseFromParent();
+ return NextI;
+ }
+ // Replace the load with a mov if the load and store are the same size.
+ BitExtMI =
+ BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+ TII->get(IsStoreXReg ? AArch64::ORRXrs : AArch64::ORRWrs), LdRt)
+ .addReg(IsStoreXReg ? AArch64::XZR : AArch64::WZR)
+ .addReg(StRt)
+ .addImm(AArch64_AM::getShifterImm(AArch64_AM::LSL, 0));
+ } else {
+ // FIXME: Currently we disable this transformation on big-endian targets
+ // because performance and correctness have only been verified on
+ // little-endian.
+ if (!Subtarget->isLittleEndian())
+ return NextI;
+ bool IsUnscaled = isUnscaledLdSt(LoadI);
+ assert(IsUnscaled == isUnscaledLdSt(StoreI) && "Unsupported ld/st match");
+ assert(LoadSize < StoreSize && "Invalid load size");
+ int UnscaledLdOffset = IsUnscaled
+ ? getLdStOffsetOp(LoadI).getImm()
+ : getLdStOffsetOp(LoadI).getImm() * LoadSize;
+ int UnscaledStOffset = IsUnscaled
+ ? getLdStOffsetOp(StoreI).getImm()
+ : getLdStOffsetOp(StoreI).getImm() * StoreSize;
+ int Width = LoadSize * 8;
+ int Immr = 8 * (UnscaledLdOffset - UnscaledStOffset);
+ int Imms = Immr + Width - 1;
+ unsigned DestReg = IsStoreXReg
+ ? TRI->getMatchingSuperReg(LdRt, AArch64::sub_32,
+ &AArch64::GPR64RegClass)
+ : LdRt;
+
+ assert(((UnscaledLdOffset) >= UnscaledStOffset &&
+ (UnscaledLdOffset + LoadSize) <= UnscaledStOffset + StoreSize) &&
+ "Invalid offset");
+
+ if (UnscaledLdOffset == UnscaledStOffset) {
+ uint32_t AndMaskEncoded = ((IsStoreXReg ? 1 : 0) << 12) // N
+ | ((Immr) << 6) // immr
+ | ((Imms) << 0) // imms
+ ;
+
+ BitExtMI =
+ BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+ TII->get(IsStoreXReg ? AArch64::ANDXri : AArch64::ANDWri),
+ DestReg)
+ .addReg(StRt)
+ .addImm(AndMaskEncoded);
+ } else {
+ BitExtMI =
+ BuildMI(*LoadI->getParent(), LoadI, LoadI->getDebugLoc(),
+ TII->get(IsStoreXReg ? AArch64::UBFMXri : AArch64::UBFMWri),
+ DestReg)
+ .addReg(StRt)
+ .addImm(Immr)
+ .addImm(Imms);
+ }
+ }
+
+ DEBUG(dbgs() << "Promoting load by replacing :\n ");
+ DEBUG(StoreI->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG(LoadI->print(dbgs()));
+ DEBUG(dbgs() << " with instructions:\n ");
+ DEBUG(StoreI->print(dbgs()));
+ DEBUG(dbgs() << " ");
+ DEBUG((BitExtMI)->print(dbgs()));
+ DEBUG(dbgs() << "\n");
+
+ // Erase the old instructions.
+ LoadI->eraseFromParent();
+ return NextI;
+}
+
/// trackRegDefsUses - Remember what registers the specified instruction uses
/// and modifies.
static void trackRegDefsUses(const MachineInstr *MI, BitVector &ModifiedRegs,
@@ -863,6 +1023,60 @@ static bool mayAlias(MachineInstr *MIa,
return false;
}
+bool AArch64LoadStoreOpt::findMatchingStore(
+ MachineBasicBlock::iterator I, unsigned Limit,
+ MachineBasicBlock::iterator &StoreI) {
+ MachineBasicBlock::iterator E = I->getParent()->begin();
+ MachineBasicBlock::iterator MBBI = I;
+ MachineInstr *FirstMI = I;
+ unsigned BaseReg = getLdStBaseOp(FirstMI).getReg();
+
+ // Track which registers have been modified and used between the first insn
+ // and the second insn.
+ BitVector ModifiedRegs, UsedRegs;
+ ModifiedRegs.resize(TRI->getNumRegs());
+ UsedRegs.resize(TRI->getNumRegs());
+
+ for (unsigned Count = 0; MBBI != E && Count < Limit;) {
+ --MBBI;
+ MachineInstr *MI = MBBI;
+ // Skip DBG_VALUE instructions. Otherwise debug info can affect the
+ // optimization by changing how far we scan.
+ if (MI->isDebugValue())
+ continue;
+ // Now that we know this is a real instruction, count it.
+ ++Count;
+
+ // If the load instruction reads directly from the address to which the
+ // store instruction writes and the stored value is not modified, we can
+ // promote the load. Since we do not handle stores with pre-/post-index,
+ // it's unnecessary to check if BaseReg is modified by the store itself.
+ if (MI->mayStore() && isMatchingStore(FirstMI, MI) &&
+ BaseReg == getLdStBaseOp(MI).getReg() &&
+ isLdOffsetInRangeOfSt(FirstMI, MI) &&
+ !ModifiedRegs[getLdStRegOp(MI).getReg()]) {
+ StoreI = MBBI;
+ return true;
+ }
+
+ if (MI->isCall())
+ return false;
+
+ // Update modified / uses register lists.
+ trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+
+ // Otherwise, if the base register is modified, we have no match, so
+ // return early.
+ if (ModifiedRegs[BaseReg])
+ return false;
+
+ // If we encounter a store aliased with the load, return early.
+ if (MI->mayStore() && mayAlias(FirstMI, MI, TII))
+ return false;
+ }
+ return false;
+}
+
/// findMatchingInsn - Scan the instructions looking for a load/store that can
/// be combined with the current instruction into a load/store pair.
MachineBasicBlock::iterator
@@ -1263,6 +1477,31 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
return E;
}
+bool AArch64LoadStoreOpt::tryToPromoteLoadFromStore(
+ MachineBasicBlock::iterator &MBBI) {
+ MachineInstr *MI = MBBI;
+ // If this is a volatile load, don't mess with it.
+ if (MI->hasOrderedMemoryRef())
+ return false;
+
+ // Make sure this is a reg+imm.
+ // FIXME: It is possible to extend it to handle reg+reg cases.
+ if (!getLdStOffsetOp(MI).isImm())
+ return false;
+
+ // Look backward up to ScanLimit instructions.
+ MachineBasicBlock::iterator StoreI;
+ if (findMatchingStore(MBBI, ScanLimit, StoreI)) {
+ ++NumLoadsFromStoresPromoted;
+ // Promote the load. Keeping the iterator straight is a
+ // pain, so we let the merge routine tell us what the next instruction
+ // is after it's done mucking about.
+ MBBI = promoteLoadFromStore(MBBI, StoreI);
+ return true;
+ }
+ return false;
+}
+
bool AArch64LoadStoreOpt::tryToMergeLdStInst(
MachineBasicBlock::iterator &MBBI) {
MachineInstr *MI = MBBI;
@@ -1307,7 +1546,16 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
bool enableNarrowLdOpt) {
bool Modified = false;
- // Three tranformations to do here:
+ // Four transformations to do here:
- // 1) Find narrow loads that can be converted into a single wider load
+ // 1) Find loads that directly read from stores and promote them by
+ // replacing them with mov instructions. If the store is wider than the load,
+ // the load will be replaced with a bitfield extract.
+ // e.g.,
+ // str w1, [x0, #4]
+ // ldrh w2, [x0, #6]
+ // ; becomes
+ // str w1, [x0, #4]
+ // lsr w2, w1, #16
+ // 2) Find narrow loads that can be converted into a single wider load
// with bitfield extract instructions.
// e.g.,
// ldrh w0, [x2]
@@ -1316,14 +1564,14 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// ldr w0, [x2]
// ubfx w1, w0, #16, #16
// and w0, w0, #ffff
- // 2) Find loads and stores that can be merged into a single load or store
+ // 3) Find loads and stores that can be merged into a single load or store
// pair instruction.
// e.g.,
// ldr x0, [x2]
// ldr x1, [x2, #8]
// ; becomes
// ldp x0, x1, [x2]
- // 3) Find base register updates that can be merged into the load or store
+ // 4) Find base register updates that can be merged into the load or store
// as a base-reg writeback.
// e.g.,
// ldr x0, [x2]
@@ -1332,6 +1580,35 @@ bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB,
// ldr x0, [x2], #4
for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E;) {
+ MachineInstr *MI = MBBI;
+ switch (MI->getOpcode()) {
+ default:
+ // Just move on to the next instruction.
+ ++MBBI;
+ break;
+ // Scaled instructions.
+ case AArch64::LDRBBui:
+ case AArch64::LDRHHui:
+ case AArch64::LDRWui:
+ case AArch64::LDRXui:
+ // Unscaled instructions.
+ case AArch64::LDURBBi:
+ case AArch64::LDURHHi:
+ case AArch64::LDURWi:
+ case AArch64::LDURXi: {
+ if (tryToPromoteLoadFromStore(MBBI)) {
+ Modified = true;
+ break;
+ }
+ ++MBBI;
+ break;
+ }
+ // FIXME: Do the other instructions.
+ }
+ }
+
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
enableNarrowLdOpt && MBBI != E;) {
MachineInstr *MI = MBBI;
switch (MI->getOpcode()) {
diff --git a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
index eb0cd3547bd..36424506bee 100644
--- a/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
+++ b/test/CodeGen/AArch64/arm64-alloca-frame-pointer-offset.ll
@@ -1,9 +1,9 @@
; RUN: llc -march=arm64 -mcpu=cyclone < %s | FileCheck %s
; CHECK: foo
-; CHECK: ldr w[[REG:[0-9]+]], [x19, #264]
-; CHECK: str w[[REG]], [x19, #132]
-; CHECK: ldr w{{[0-9]+}}, [x19, #264]
+; CHECK: str w[[REG0:[0-9]+]], [x19, #264]
+; CHECK: mov w[[REG1:[0-9]+]], w[[REG0]]
+; CHECK: str w[[REG1]], [x19, #132]
define i32 @foo(i32 %a) nounwind {
%retval = alloca i32, align 4
diff --git a/test/CodeGen/AArch64/arm64-ld-from-st.ll b/test/CodeGen/AArch64/arm64-ld-from-st.ll
new file mode 100644
index 00000000000..5013ce6c1d4
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-ld-from-st.ll
@@ -0,0 +1,666 @@
+; RUN: llc < %s -mtriple aarch64--none-eabi -verify-machineinstrs | FileCheck %s
+
+; CHECK-LABEL: Str64Ldr64
+; CHECK: mov x0, x1
+define i64 @Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i64*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 1
+ %1 = load i64, i64* %arrayidx1
+ ret i64 %1
+}
+
+; CHECK-LABEL: Str64Ldr32_0
+; CHECK: and x0, x1, #0xffffffff
+define i32 @Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i32*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 2
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Str64Ldr32_1
+; CHECK: lsr x0, x1, #32
+define i32 @Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i32*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 3
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_0
+; CHECK: and x0, x1, #0xffff
+define i16 @Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 4
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_1
+; CHECK: ubfx x0, x1, #16, #16
+define i16 @Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 5
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_2
+; CHECK: ubfx x0, x1, #32, #16
+define i16 @Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 6
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr16_3
+; CHECK: lsr x0, x1, #48
+define i16 @Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 7
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_0
+; CHECK: and x0, x1, #0xff
+define i8 @Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 8
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_1
+; CHECK: ubfx x0, x1, #8, #8
+define i8 @Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 9
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_2
+; CHECK: ubfx x0, x1, #16, #8
+define i8 @Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 10
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_3
+; CHECK: ubfx x0, x1, #24, #8
+define i8 @Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 11
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_4
+; CHECK: ubfx x0, x1, #32, #8
+define i8 @Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 12
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_5
+; CHECK: ubfx x0, x1, #40, #8
+define i8 @Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 13
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_6
+; CHECK: ubfx x0, x1, #48, #8
+define i8 @Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 14
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str64Ldr8_7
+; CHECK: lsr x0, x1, #56
+define i8 @Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 15
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr32
+; CHECK: mov w0, w1
+define i32 @Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i32*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 1
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Str32Ldr16_0
+; CHECK: and w0, w1, #0xffff
+define i16 @Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str32Ldr16_1
+; CHECK: lsr w0, w1, #16
+define i16 @Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 3
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 4
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 5
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_2
+; CHECK: ubfx w0, w1, #16, #8
+define i8 @Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 6
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str32Ldr8_3
+; CHECK: lsr w0, w1, #24
+define i8 @Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 7
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str16Ldr16
+; CHECK: mov w0, w1
+define i16 @Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i16*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Str16Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i8*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 2
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Str16Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i8*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 3
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+
+; CHECK-LABEL: Unscaled_Str64Ldr64
+; CHECK: mov x0, x1
+define i64 @Unscaled_Str64Ldr64(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i64*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i64, i64* %0, i64 -1
+ %1 = load i64, i64* %arrayidx1
+ ret i64 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr32_0
+; CHECK: and x0, x1, #0xffffffff
+define i32 @Unscaled_Str64Ldr32_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i32*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -2
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr32_1
+; CHECK: lsr x0, x1, #32
+define i32 @Unscaled_Str64Ldr32_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i32*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_0
+; CHECK: and x0, x1, #0xffff
+define i16 @Unscaled_Str64Ldr16_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -4
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_1
+; CHECK: ubfx x0, x1, #16, #16
+define i16 @Unscaled_Str64Ldr16_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_2
+; CHECK: ubfx x0, x1, #32, #16
+define i16 @Unscaled_Str64Ldr16_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr16_3
+; CHECK: lsr x0, x1, #48
+define i16 @Unscaled_Str64Ldr16_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i16*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_0
+; CHECK: and x0, x1, #0xff
+define i8 @Unscaled_Str64Ldr8_0(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -8
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_1
+; CHECK: ubfx x0, x1, #8, #8
+define i8 @Unscaled_Str64Ldr8_1(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -7
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_2
+; CHECK: ubfx x0, x1, #16, #8
+define i8 @Unscaled_Str64Ldr8_2(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -6
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_3
+; CHECK: ubfx x0, x1, #24, #8
+define i8 @Unscaled_Str64Ldr8_3(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -5
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_4
+; CHECK: ubfx x0, x1, #32, #8
+define i8 @Unscaled_Str64Ldr8_4(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_5
+; CHECK: ubfx x0, x1, #40, #8
+define i8 @Unscaled_Str64Ldr8_5(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_6
+; CHECK: ubfx x0, x1, #48, #8
+define i8 @Unscaled_Str64Ldr8_6(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str64Ldr8_7
+; CHECK: lsr x0, x1, #56
+define i8 @Unscaled_Str64Ldr8_7(i64* nocapture %P, i64 %v, i64 %n) {
+entry:
+ %0 = bitcast i64* %P to i8*
+ %arrayidx0 = getelementptr inbounds i64, i64* %P, i64 -1
+ store i64 %v, i64* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr32
+; CHECK: mov w0, w1
+define i32 @Unscaled_Str32Ldr32(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i32*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i32, i32* %0, i64 -1
+ %1 = load i32, i32* %arrayidx1
+ ret i32 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr16_0
+; CHECK: and w0, w1, #0xffff
+define i16 @Unscaled_Str32Ldr16_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -2
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr16_1
+; CHECK: lsr w0, w1, #16
+define i16 @Unscaled_Str32Ldr16_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Unscaled_Str32Ldr8_0(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -4
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Unscaled_Str32Ldr8_1(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -3
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_2
+; CHECK: ubfx w0, w1, #16, #8
+define i8 @Unscaled_Str32Ldr8_2(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str32Ldr8_3
+; CHECK: lsr w0, w1, #24
+define i8 @Unscaled_Str32Ldr8_3(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i8*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str16Ldr16
+; CHECK: mov w0, w1
+define i16 @Unscaled_Str16Ldr16(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i16*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -1
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_Str16Ldr8_0
+; CHECK: and w0, w1, #0xff
+define i8 @Unscaled_Str16Ldr8_0(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i8*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -2
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: Unscaled_Str16Ldr8_1
+; CHECK: ubfx w0, w1, #8, #8
+define i8 @Unscaled_Str16Ldr8_1(i16* nocapture %P, i16 %v, i64 %n) {
+entry:
+ %0 = bitcast i16* %P to i8*
+ %arrayidx0 = getelementptr inbounds i16, i16* %P, i64 -1
+ store i16 %v, i16* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i8, i8* %0, i64 -1
+ %1 = load i8, i8* %arrayidx1
+ ret i8 %1
+}
+
+; CHECK-LABEL: StrVolatileLdr
+; CHECK: ldrh
+define i16 @StrVolatileLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+ %1 = load volatile i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: StrNotInRangeLdr
+; CHECK: ldrh
+define i16 @StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: Unscaled_StrNotInRangeLdr
+; CHECK: ldurh
+define i16 @Unscaled_StrNotInRangeLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 -1
+ store i32 %v, i32* %arrayidx0
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 -3
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+; CHECK-LABEL: StrCallLdr
+; CHECK: ldrh
+define i16 @StrCallLdr(i32* nocapture %P, i32 %v, i64 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ %c = call i1 @test_dummy()
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 1
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
+
+declare i1 @test_dummy()
+
+; CHECK-LABEL: StrStrLdr
+; CHECK: ldrh
+define i16 @StrStrLdr(i32 %v, i32* %P, i32* %P2, i32 %n) {
+entry:
+ %0 = bitcast i32* %P to i16*
+ %arrayidx0 = getelementptr inbounds i32, i32* %P, i64 1
+ store i32 %v, i32* %arrayidx0
+ store i32 %n, i32* %P2
+ %arrayidx1 = getelementptr inbounds i16, i16* %0, i64 2
+ %1 = load i16, i16* %arrayidx1
+ ret i16 %1
+}
diff --git a/test/CodeGen/AArch64/regress-tblgen-chains.ll b/test/CodeGen/AArch64/regress-tblgen-chains.ll
index 0d301bbd502..2062f8b8b38 100644
--- a/test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ b/test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -27,8 +27,8 @@ define i64 @test_chains() {
; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR:#-?[0-9]+]]]
; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
-; CHECK: sturb {{w[0-9]+}}, [x29, [[LOCADDR]]]
-; CHECK: ldurb {{w[0-9]+}}, [x29, [[LOCADDR]]]
+; CHECK: sturb w[[STRVAL:[0-9]+]], [x29, [[LOCADDR]]]
+; CHECK: mov {{w[0-9]+}}, w[[STRVAL]]
%ret.1 = load i8, i8* %locvar
%ret.2 = zext i8 %ret.1 to i64