diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index fee6937e7d502..966ffb7a1fbd2 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -278,6 +278,13 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   /// Ordered list of DAG postprocessing steps.
   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
 
+  /// Used to compute single-iteration dependencies (i.e., buildSchedGraph).
+  AliasAnalysis *AA;
+
+  /// Used to compute loop-carried dependencies (i.e.,
+  /// addLoopCarriedDependences).
+  BatchAAResults BAA;
+
   /// Helper class to implement Johnson's circuit finding algorithm.
   class Circuits {
     std::vector<SUnit> &SUnits;
@@ -323,13 +330,14 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
                     const RegisterClassInfo &rci, unsigned II,
-                    TargetInstrInfo::PipelinerLoopInfo *PLI)
+                    TargetInstrInfo::PipelinerLoopInfo *PLI, AliasAnalysis *AA)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
         RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI),
-        Topo(SUnits, &ExitSU) {
+        Topo(SUnits, &ExitSU), AA(AA), BAA(*AA) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
     if (SwpEnableCopyToPhi)
       Mutations.push_back(std::make_unique<CopyToPhiMutation>());
+    BAA.enableCrossIterationMode();
   }
 
   void schedule() override;
@@ -394,7 +402,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
                             const MachineInstr *OtherMI) const;
 
 private:
-  void addLoopCarriedDependences(AAResults *AA);
+  void addLoopCarriedDependences();
   void updatePhiDependences();
   void changeDependences();
   unsigned calculateResMII();
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 6cb0299a30d7a..07bffc6c3de90 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -237,6 +237,37 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
 INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
                     "Modulo Software Pipelining", false, false)
 
+namespace {
+
+/// This class holds an SUnit corresponding to a memory operation and other
+/// information related to the instruction.
+struct SUnitWithMemInfo {
+  SUnit *SU;
+  SmallVector<const Value *> UnderlyingObjs;
+
+  /// The value of a memory operand.
+  const Value *MemOpValue = nullptr;
+
+  /// The offset of a memory operand.
+  int64_t MemOpOffset = 0;
+
+  AAMDNodes AATags;
+
+  /// True if all the underlying objects are identified.
+  bool IsAllIdentified = false;
+
+  SUnitWithMemInfo(SUnit *SU);
+
+  bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
+
+  bool isUnknown() const { return MemOpValue == nullptr; }
+
+private:
+  bool getUnderlyingObjects();
+};
+
+} // end anonymous namespace
+
 /// The "main" function for implementing Swing Modulo Scheduling.
 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
   if (skipFunction(mf.getFunction()))
@@ -470,9 +501,10 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
 
+  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   SwingSchedulerDAG SMS(
       *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
-      II_setByPragma, LI.LoopPipelinerInfo.get());
+      II_setByPragma, LI.LoopPipelinerInfo.get(), AA);
 
   MachineBasicBlock *MBB = L.getHeader();
   // The kernel should not include any terminator instructions.  These
@@ -560,9 +592,8 @@ void SwingSchedulerDAG::setMAX_II() {
 /// We override the schedule function in ScheduleDAGInstrs to implement the
 /// scheduling part of the Swing Modulo Scheduling algorithm.
 void SwingSchedulerDAG::schedule() {
-  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
   buildSchedGraph(AA);
-  addLoopCarriedDependences(AA);
+  addLoopCarriedDependences();
   updatePhiDependences();
   Topo.InitDAGTopologicalSorting();
   changeDependences();
@@ -810,113 +841,132 @@ static bool isDependenceBarrier(MachineInstr &MI) {
          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
 }
 
-/// Return the underlying objects for the memory references of an instruction.
+SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
+  if (!getUnderlyingObjects())
+    return;
+  IsAllIdentified = true;
+  for (const Value *Obj : UnderlyingObjs)
+    if (!isIdentifiedObject(Obj)) {
+      IsAllIdentified = false;
+      break;
+    }
+}
+
+bool SUnitWithMemInfo::isTriviallyDisjoint(
+    const SUnitWithMemInfo &Other) const {
+  // If all underlying objects are identified objects and there is no overlap
+  // between them, then these two instructions are disjoint.
+  if (!IsAllIdentified || !Other.IsAllIdentified)
+    return false;
+  for (const Value *Obj : UnderlyingObjs)
+    if (llvm::is_contained(Other.UnderlyingObjs, Obj))
+      return false;
+  return true;
+}
+
+/// Collect the underlying objects for the memory references of an instruction.
 /// This function calls the code in ValueTracking, but first checks that the
 /// instruction has a memory operand.
-static void getUnderlyingObjects(const MachineInstr *MI,
-                                 SmallVectorImpl<const Value *> &Objs) {
+/// Returns false if we cannot find the underlying objects.
+bool SUnitWithMemInfo::getUnderlyingObjects() {
+  const MachineInstr *MI = SU->getInstr();
   if (!MI->hasOneMemOperand())
-    return;
+    return false;
   MachineMemOperand *MM = *MI->memoperands_begin();
   if (!MM->getValue())
-    return;
-  getUnderlyingObjects(MM->getValue(), Objs);
-  for (const Value *V : Objs) {
-    if (!isIdentifiedObject(V)) {
-      Objs.clear();
-      return;
-    }
-  }
+    return false;
+  MemOpValue = MM->getValue();
+  MemOpOffset = MM->getOffset();
+  llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs);
+
+  // TODO: A no-alias scope may be valid only within a single iteration. In
+  // that case we need to peel it off, as LoopAccessAnalysis does.
+  AATags = MM->getAAInfo();
+  return true;
 }
 
 /// Add a chain edge between a load and store if the store can be an
 /// alias of the load on a subsequent iteration, i.e., a loop carried
 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
 /// but that code doesn't create loop carried dependences.
-void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
-  MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
-  Value *UnknownValue =
-      UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
+void SwingSchedulerDAG::addLoopCarriedDependences() {
+  SmallVector<SUnitWithMemInfo> PendingLoads;
   for (auto &SU : SUnits) {
     MachineInstr &MI = *SU.getInstr();
     if (isDependenceBarrier(MI))
       PendingLoads.clear();
     else if (MI.mayLoad()) {
-      SmallVector<const Value *, 4> Objs;
-      ::getUnderlyingObjects(&MI, Objs);
-      if (Objs.empty())
-        Objs.push_back(UnknownValue);
-      for (const auto *V : Objs) {
-        SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
-        SUs.push_back(&SU);
-      }
+      PendingLoads.emplace_back(&SU);
     } else if (MI.mayStore()) {
-      SmallVector<const Value *, 4> Objs;
-      ::getUnderlyingObjects(&MI, Objs);
-      if (Objs.empty())
-        Objs.push_back(UnknownValue);
-      for (const auto *V : Objs) {
-        MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
-            PendingLoads.find(V);
-        if (I == PendingLoads.end())
+      SUnitWithMemInfo Store(&SU);
+      for (const SUnitWithMemInfo &Load : PendingLoads) {
+        if (Load.isTriviallyDisjoint(Store))
           continue;
-        for (auto *Load : I->second) {
-          if (isSuccOrder(Load, &SU))
-            continue;
-          MachineInstr &LdMI = *Load->getInstr();
-          // First, perform the cheaper check that compares the base register.
-          // If they are the same and the load offset is less than the store
-          // offset, then mark the dependence as loop carried potentially.
-          const MachineOperand *BaseOp1, *BaseOp2;
-          int64_t Offset1, Offset2;
-          bool Offset1IsScalable, Offset2IsScalable;
-          if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
-                                           Offset1IsScalable, TRI) &&
-              TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
-                                           Offset2IsScalable, TRI)) {
-            if (BaseOp1->isIdenticalTo(*BaseOp2) &&
-                Offset1IsScalable == Offset2IsScalable &&
-                (int)Offset1 < (int)Offset2) {
-              assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
-                     "What happened to the chain edge?");
-              SDep Dep(Load, SDep::Barrier);
-              Dep.setLatency(1);
-              SU.addPred(Dep);
-              continue;
-            }
-          }
-          // Second, the more expensive check that uses alias analysis on the
-          // base registers. If they alias, and the load offset is less than
-          // the store offset, then mark the dependence as loop carried.
-          if (!AA) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-            continue;
-          }
-          MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
-          MachineMemOperand *MMO2 = *MI.memoperands_begin();
-          if (!MMO1->getValue() || !MMO2->getValue()) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-            continue;
-          }
-          if (MMO1->getValue() == MMO2->getValue() &&
-              MMO1->getOffset() <= MMO2->getOffset()) {
-            SDep Dep(Load, SDep::Barrier);
+        if (isSuccOrder(Load.SU, Store.SU))
+          continue;
+        MachineInstr &LdMI = *Load.SU->getInstr();
+        // First, perform the cheaper check that compares the base register.
+        // If they are the same and the load offset is less than the store
+        // offset, then mark the dependence as loop carried potentially.
+        const MachineOperand *BaseOp1, *BaseOp2;
+        int64_t Offset1, Offset2;
+        bool Offset1IsScalable, Offset2IsScalable;
+        if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
+                                         Offset1IsScalable, TRI) &&
+            TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
+                                         Offset2IsScalable, TRI)) {
+          if (BaseOp1->isIdenticalTo(*BaseOp2) &&
+              Offset1IsScalable == Offset2IsScalable &&
+              (int)Offset1 < (int)Offset2) {
+            assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
+                   "What happened to the chain edge?");
+            SDep Dep(Load.SU, SDep::Barrier);
             Dep.setLatency(1);
             SU.addPred(Dep);
             continue;
           }
-          if (!AA->isNoAlias(
-                  MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()),
-                  MemoryLocation::getAfter(MMO2->getValue(),
-                                           MMO2->getAAInfo()))) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-          }
-        }
+        }
+        // Second, the more expensive check that uses alias analysis on the
+        // base registers. If they alias, and the load offset is less than
+        // the store offset, then mark the dependence as loop carried.
+        if (Load.isUnknown() || Store.isUnknown()) {
+          SDep Dep(Load.SU, SDep::Barrier);
+          Dep.setLatency(1);
+          SU.addPred(Dep);
+          continue;
+        }
+        if (Load.MemOpValue == Store.MemOpValue &&
+            Load.MemOpOffset <= Store.MemOpOffset) {
+          SDep Dep(Load.SU, SDep::Barrier);
+          Dep.setLatency(1);
+          SU.addPred(Dep);
+          continue;
+        }
+
+        bool IsNoAlias = [&] {
+          if (BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(Load.MemOpValue,
+                                                             Load.AATags),
+                            MemoryLocation::getBeforeOrAfter(Store.MemOpValue,
+                                                             Store.AATags)))
+            return true;
+
+          // AliasAnalysis sometimes gives up on following the underlying
+          // object. In such a case, separate checks on the underlying objects
+          // may still prove that the two accesses cannot alias.
+          for (const Value *LoadObj : Load.UnderlyingObjs)
+            for (const Value *StoreObj : Store.UnderlyingObjs)
+              if (!BAA.isNoAlias(
+                      MemoryLocation::getBeforeOrAfter(LoadObj, Load.AATags),
+                      MemoryLocation::getBeforeOrAfter(StoreObj, Store.AATags)))
+                return false;
+
+          return true;
+        }();
+
+        if (!IsNoAlias) {
+          SDep Dep(Load.SU, SDep::Barrier);
+          Dep.setLatency(1);
+          SU.addPred(Dep);
+        }
       }
     }
   }
diff --git a/llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir b/llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir
new file mode 100644
index 0000000000000..8163074b589d8
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir
@@ -0,0 +1,72 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test that the pipeliner correctly detects the loop-carried dependency
+# between the load and the store, indicated by an `Ord` dependency from SU(2)
+# to SU(4). Note that there is no dependency within a single iteration.
+
+# CHECK: SU(2): %7:intregs = L2_loadri_io %5:intregs, 0 :: (load (s32) from %ir.ptr.load)
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Predecessors:
+# CHECK-NEXT: SU(0): Data Latency=0 Reg=%5
+# CHECK-NEXT: Successors:
+# CHECK-DAG: SU(3): Data Latency=2 Reg=%7
+# CHECK-DAG: SU(4): Ord Latency=1 Barrier
+# CHECK-NEXT: SU(3): %8:intregs = F2_sfadd %7:intregs, %3:intregs, implicit $usr
+# CHECK: SU(4): S2_storeri_io %6:intregs, 0, %8:intregs :: (store (s32) into %ir.ptr.store)
+
+
+--- |
+  define void @foo(ptr noalias %p0, ptr noalias %p1, i32 %n) {
+  entry:
+    br label %body
+
+  body:                                             ; preds = %body, %entry
+    %i = phi i32 [ 0, %entry ], [ %i.next, %body ]
+    %ptr.load = phi ptr [ %p0, %entry ], [ %p1, %body ]
+    %ptr.store = phi ptr [ %p1, %entry ], [ %p0, %body ]
+    %v = load float, ptr %ptr.load, align 4
+    %add = fadd float %v, 1.000000e+00
+    store float %add, ptr %ptr.store, align 4
+    %i.next = add i32 %i, 1
+    %cond = icmp slt i32 %i.next, %n
+    br i1 %cond, label %body, label %exit
+
+  exit:                                             ; preds = %body
+    ret void
+  }
+...
+---
+name: foo
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2
+
+    %6:intregs = COPY $r2
+    %5:intregs = COPY $r1
+    %4:intregs = COPY $r0
+    %9:intregs = A2_tfrsi 1065353216
+    %12:intregs = COPY %6
+    J2_loop0r %bb.1, %12, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+  bb.1.body (machine-block-address-taken):
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+
+    %1:intregs = PHI %4, %bb.0, %5, %bb.1
+    %2:intregs = PHI %5, %bb.0, %4, %bb.1
+    %8:intregs = L2_loadri_io %1, 0 :: (load (s32) from %ir.ptr.load)
+    %10:intregs = F2_sfadd killed %8, %9, implicit $usr
+    S2_storeri_io %2, 0, killed %10 :: (store (s32) into %ir.ptr.store)
+    ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+
+  bb.2.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-no-alias.mir b/llvm/test/CodeGen/Hexagon/swp-no-alias.mir
new file mode 100644
index 0000000000000..38b7212702ff9
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-no-alias.mir
@@ -0,0 +1,151 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test that there are no loop-carried dependencies among the memory instructions.
+
+# CHECK: SU(0): %8:intregs = PHI %1:intregs, %bb.1, %9:intregs, %bb.2
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Successors:
+# CHECK-DAG: SU(6): Data Latency=0 Reg=%8
+# CHECK-DAG: SU(5): Data Latency=0 Reg=%8
+# CHECK-DAG: SU(3): Data Latency=0 Reg=%8
+# CHECK-DAG: SU(6): Anti Latency=1
+# CHECK-NEXT: SU(1): %10:intregs = PHI %2:intregs, %bb.1, %11:intregs, %bb.2
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Successors:
+# CHECK-DAG: SU(7): Data Latency=0 Reg=%10
+# CHECK-DAG: SU(4): Data Latency=0 Reg=%10
+# CHECK-DAG: SU(2): Data Latency=0 Reg=%10
+# CHECK-DAG: SU(7): Anti Latency=1
+# CHECK-NEXT: SU(2): %12:hvxvr = V6_vL32b_ai %10:intregs, 0 :: (load (s1024) from %ir.iptr.09, !tbaa !4)
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Predecessors:
+# CHECK-NEXT: SU(1): Data Latency=0 Reg=%10
+# CHECK-NEXT: Successors:
+# CHECK-NEXT: SU(3): Data Latency=0 Reg=%12
+# CHECK-NEXT: SU(3): V6_vS32b_ai %8:intregs, 0, %12:hvxvr :: (store (s1024) into %ir.optr.010, !tbaa !4)
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Predecessors:
+# CHECK-DAG: SU(2): Data Latency=0 Reg=%12
+# CHECK-DAG: SU(0): Data Latency=0 Reg=%8
+# CHECK-NEXT: SU(4): %13:hvxvr = V6_vL32b_ai %10:intregs, 128 :: (load (s1024) from %ir.cgep, !tbaa !4)
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Predecessors:
+# CHECK-NEXT: SU(1): Data Latency=0 Reg=%10
+# CHECK-NEXT: Successors:
+# CHECK-NEXT: SU(5): Data Latency=0 Reg=%13
+# CHECK-NEXT: SU(5): V6_vS32b_ai %8:intregs, 128, %13:hvxvr :: (store (s1024) into %ir.cgep3, !tbaa !4)
+
+
+
+
+--- |
+  define dso_local void @foo(ptr noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out, i32 noundef %width) local_unnamed_addr #0 {
+  entry:
+    %cmp7 = icmp sgt i32 %width, 0
+    br i1 %cmp7, label %for.body.preheader, label %for.end
+
+  for.body.preheader:                               ; preds = %entry
+    %0 = add i32 %width, 128
+    br label %for.body
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %optr.010 = phi ptr [ %cgep4, %for.body ], [ %out, %for.body.preheader ]
+    %iptr.09 = phi ptr [ %cgep5, %for.body ], [ %in, %for.body.preheader ]
+    %ald = load <128 x i8>, ptr %iptr.09, align 128, !tbaa !4
+    %cst = bitcast <128 x i8> %ald to <32 x i32>
+    store <32 x i32> %cst, ptr %optr.010, align 128, !tbaa !4
+    %cgep = getelementptr i8, ptr %iptr.09, i32 128
+    %ald1 = load <128 x i8>, ptr %cgep, align 128, !tbaa !4
+    %cst2 = bitcast <128 x i8> %ald1 to <32 x i32>
+    %cgep3 = getelementptr i8, ptr %optr.010, i32 128
+    store <32 x i32> %cst2, ptr %cgep3, align 128, !tbaa !4
+    %lsr.iv.next = add i32 %lsr.iv, -128
+    %cmp = icmp samesign ugt i32 %lsr.iv.next, 128
+    %cgep4 = getelementptr i8, ptr %optr.010, i32 256
+    %cgep5 = getelementptr i8, ptr %iptr.09, i32 256
+    br i1 %cmp, label %for.body, label %for.end
+
+  for.end:                                          ; preds = %for.body, %entry
+    ret void
+  }
+
+  attributes #0 = { "target-cpu"="hexagonv60" "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" }
+
+  !llvm.module.flags = !{!0, !1, !2, !3}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 8, !"PIC Level", i32 2}
+  !2 = !{i32 7, !"PIE Level", i32 2}
+  !3 = !{i32 7, !"frame-pointer", i32 2}
+  !4 = !{!5, !5, i64 0}
+  !5 = !{!"omnipotent char", !6, i64 0}
+  !6 = !{!"Simple C/C++ TBAA"}
+...
+---
+name: foo
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.3(0x30000000)
+    liveins: $r0, $r1, $r2
+
+    %9:intregs = COPY $r2
+    %8:intregs = COPY $r1
+    %7:intregs = COPY $r0
+    %10:predregs = C2_cmpgti %9, 0
+    J2_jumpf %10, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    successors: %bb.2(0x80000000)
+
+    %0:intregs = A2_addi %9, 128
+    %15:intregs = A2_addi %0, -1
+    %16:intregs = S2_lsr_i_r %15, 7
+    %17:intregs = COPY %16
+    J2_loop0r %bb.2, %17, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+  bb.2.for.body (machine-block-address-taken):
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+
+    %2:intregs = PHI %8, %bb.1, %5, %bb.2
+    %3:intregs = PHI %7, %bb.1, %6, %bb.2
+    %12:hvxvr = V6_vL32b_ai %3, 0 :: (load (s1024) from %ir.iptr.09, !tbaa !4)
+    V6_vS32b_ai %2, 0, killed %12 :: (store (s1024) into %ir.optr.010, !tbaa !4)
+    %13:hvxvr = V6_vL32b_ai %3, 128 :: (load (s1024) from %ir.cgep, !tbaa !4)
+    V6_vS32b_ai %2, 128, killed %13 :: (store (s1024) into %ir.cgep3, !tbaa !4)
+    %5:intregs = A2_addi %2, 256
+    %6:intregs = A2_addi %3, 256
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.3.for.end:
+    PS_jmpret $r31, implicit-def dead $pc
...