diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index fee6937e7d502..966ffb7a1fbd2 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -278,6 +278,13 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   /// Ordered list of DAG postprocessing steps.
   std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
 
+  /// Used to compute single-iteration dependencies (i.e., buildSchedGraph).
+  AliasAnalysis *AA;
+
+  /// Used to compute loop-carried dependencies (i.e.,
+  /// addLoopCarriedDependences).
+  BatchAAResults BAA;
+
   /// Helper class to implement Johnson's circuit finding algorithm.
   class Circuits {
     std::vector<SUnit> &SUnits;
@@ -323,13 +330,14 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
 public:
   SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
                     const RegisterClassInfo &rci, unsigned II,
-                    TargetInstrInfo::PipelinerLoopInfo *PLI)
+                    TargetInstrInfo::PipelinerLoopInfo *PLI, AliasAnalysis *AA)
       : ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
         RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI),
-        Topo(SUnits, &ExitSU) {
+        Topo(SUnits, &ExitSU), AA(AA), BAA(*AA) {
     P.MF->getSubtarget().getSMSMutations(Mutations);
     if (SwpEnableCopyToPhi)
       Mutations.push_back(std::make_unique<CopyToPhiMutation>());
+    BAA.enableCrossIterationMode();
   }
 
   void schedule() override;
@@ -394,7 +402,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
                             const MachineInstr *OtherMI) const;
 
 private:
-  void addLoopCarriedDependences(AAResults *AA);
+  void addLoopCarriedDependences();
   void updatePhiDependences();
   void changeDependences();
   unsigned calculateResMII();
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 6cb0299a30d7a..07bffc6c3de90 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -237,6 +237,37 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
 INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
                     "Modulo Software Pipelining", false, false)
 
+namespace {
+
+/// This class holds an SUnit corresponding to a memory operation and other
+/// information related to the instruction.
+struct SUnitWithMemInfo {
+  SUnit *SU;
+  SmallVector<const Value *> UnderlyingObjs;
+
+  /// The value of a memory operand.
+  const Value *MemOpValue = nullptr;
+
+  /// The offset of a memory operand.
+  int64_t MemOpOffset = 0;
+
+  AAMDNodes AATags;
+
+  /// True if all the underlying objects are identified.
+  bool IsAllIdentified = false;
+
+  SUnitWithMemInfo(SUnit *SU);
+
+  bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
+
+  bool isUnknown() const { return MemOpValue == nullptr; }
+
+private:
+  bool getUnderlyingObjects();
+};
+
+} // end anonymous namespace
+
 /// The "main" function for implementing Swing Modulo Scheduling.
 bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
   if (skipFunction(mf.getFunction()))
@@ -470,9 +501,10 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
 bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
   assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
 
+  AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   SwingSchedulerDAG SMS(
       *this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
-      II_setByPragma, LI.LoopPipelinerInfo.get());
+      II_setByPragma, LI.LoopPipelinerInfo.get(), AA);
 
   MachineBasicBlock *MBB = L.getHeader();
   // The kernel should not include any terminator instructions.  These
@@ -560,9 +592,8 @@ void SwingSchedulerDAG::setMAX_II() {
 /// We override the schedule function in ScheduleDAGInstrs to implement the
 /// scheduling part of the Swing Modulo Scheduling algorithm.
 void SwingSchedulerDAG::schedule() {
-  AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
   buildSchedGraph(AA);
-  addLoopCarriedDependences(AA);
+  addLoopCarriedDependences();
   updatePhiDependences();
   Topo.InitDAGTopologicalSorting();
   changeDependences();
@@ -810,113 +841,132 @@ static bool isDependenceBarrier(MachineInstr &MI) {
          (!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
 }
 
-/// Return the underlying objects for the memory references of an instruction.
+SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
+  if (!getUnderlyingObjects())
+    return;
+  IsAllIdentified = true;
+  for (const Value *Obj : UnderlyingObjs)
+    if (!isIdentifiedObject(Obj)) {
+      IsAllIdentified = false;
+      break;
+    }
+}
+
+bool SUnitWithMemInfo::isTriviallyDisjoint(
+    const SUnitWithMemInfo &Other) const {
+  // If all underlying objects are identified objects and there is no overlap
+  // between them, then these two instructions are disjoint.
+  if (!IsAllIdentified || !Other.IsAllIdentified)
+    return false;
+  for (const Value *Obj : UnderlyingObjs)
+    if (llvm::is_contained(Other.UnderlyingObjs, Obj))
+      return false;
+  return true;
+}
+
+/// Collect the underlying objects for the memory references of an instruction.
 /// This function calls the code in ValueTracking, but first checks that the
 /// instruction has a memory operand.
-static void getUnderlyingObjects(const MachineInstr *MI,
-                                 SmallVectorImpl<const Value *> &Objs) {
+/// Returns false if we cannot find the underlying objects.
+bool SUnitWithMemInfo::getUnderlyingObjects() {
+  const MachineInstr *MI = SU->getInstr();
   if (!MI->hasOneMemOperand())
-    return;
+    return false;
   MachineMemOperand *MM = *MI->memoperands_begin();
   if (!MM->getValue())
-    return;
-  getUnderlyingObjects(MM->getValue(), Objs);
-  for (const Value *V : Objs) {
-    if (!isIdentifiedObject(V)) {
-      Objs.clear();
-      return;
-    }
-  }
+    return false;
+  MemOpValue = MM->getValue();
+  MemOpOffset = MM->getOffset();
+  llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs);
+
+  // TODO: A no-alias scope may be valid only within a single iteration. In
+  // that case we need to peel it off, as LoopAccessAnalysis does.
+  AATags = MM->getAAInfo();
+  return true;
 }
 
 /// Add a chain edge between a load and store if the store can be an
 /// alias of the load on a subsequent iteration, i.e., a loop carried
 /// dependence. This code is very similar to the code in ScheduleDAGInstrs
 /// but that code doesn't create loop carried dependences.
-void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
-  MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
-  Value *UnknownValue =
-      UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
+void SwingSchedulerDAG::addLoopCarriedDependences() {
+  SmallVector<SUnitWithMemInfo> PendingLoads;
   for (auto &SU : SUnits) {
     MachineInstr &MI = *SU.getInstr();
     if (isDependenceBarrier(MI))
       PendingLoads.clear();
     else if (MI.mayLoad()) {
-      SmallVector<const Value *, 4> Objs;
-      ::getUnderlyingObjects(&MI, Objs);
-      if (Objs.empty())
-        Objs.push_back(UnknownValue);
-      for (const auto *V : Objs) {
-        SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
-        SUs.push_back(&SU);
-      }
+      PendingLoads.emplace_back(&SU);
     } else if (MI.mayStore()) {
-      SmallVector<const Value *, 4> Objs;
-      ::getUnderlyingObjects(&MI, Objs);
-      if (Objs.empty())
-        Objs.push_back(UnknownValue);
-      for (const auto *V : Objs) {
-        MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
-            PendingLoads.find(V);
-        if (I == PendingLoads.end())
+      SUnitWithMemInfo Store(&SU);
+      for (const SUnitWithMemInfo &Load : PendingLoads) {
+        if (Load.isTriviallyDisjoint(Store))
           continue;
-        for (auto *Load : I->second) {
-          if (isSuccOrder(Load, &SU))
-            continue;
-          MachineInstr &LdMI = *Load->getInstr();
-          // First, perform the cheaper check that compares the base register.
-          // If they are the same and the load offset is less than the store
-          // offset, then mark the dependence as loop carried potentially.
-          const MachineOperand *BaseOp1, *BaseOp2;
-          int64_t Offset1, Offset2;
-          bool Offset1IsScalable, Offset2IsScalable;
-          if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
-                                           Offset1IsScalable, TRI) &&
-              TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
-                                           Offset2IsScalable, TRI)) {
-            if (BaseOp1->isIdenticalTo(*BaseOp2) &&
-                Offset1IsScalable == Offset2IsScalable &&
-                (int)Offset1 < (int)Offset2) {
-              assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
-                     "What happened to the chain edge?");
-              SDep Dep(Load, SDep::Barrier);
-              Dep.setLatency(1);
-              SU.addPred(Dep);
-              continue;
-            }
-          }
-          // Second, the more expensive check that uses alias analysis on the
-          // base registers. If they alias, and the load offset is less than
-          // the store offset, then mark the dependence as loop carried.
-          if (!AA) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-            continue;
-          }
-          MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
-          MachineMemOperand *MMO2 = *MI.memoperands_begin();
-          if (!MMO1->getValue() || !MMO2->getValue()) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-            continue;
-          }
-          if (MMO1->getValue() == MMO2->getValue() &&
-              MMO1->getOffset() <= MMO2->getOffset()) {
-            SDep Dep(Load, SDep::Barrier);
+        if (isSuccOrder(Load.SU, Store.SU))
+          continue;
+        MachineInstr &LdMI = *Load.SU->getInstr();
+        // First, perform the cheaper check that compares the base register.
+        // If they are the same and the load offset is less than the store
+        // offset, then mark the dependence as loop carried potentially.
+        const MachineOperand *BaseOp1, *BaseOp2;
+        int64_t Offset1, Offset2;
+        bool Offset1IsScalable, Offset2IsScalable;
+        if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
+                                         Offset1IsScalable, TRI) &&
+            TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
+                                         Offset2IsScalable, TRI)) {
+          if (BaseOp1->isIdenticalTo(*BaseOp2) &&
+              Offset1IsScalable == Offset2IsScalable &&
+              (int)Offset1 < (int)Offset2) {
+            assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
+                   "What happened to the chain edge?");
+            SDep Dep(Load.SU, SDep::Barrier);
             Dep.setLatency(1);
             SU.addPred(Dep);
             continue;
           }
-          if (!AA->isNoAlias(
-                  MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()),
-                  MemoryLocation::getAfter(MMO2->getValue(),
-                                           MMO2->getAAInfo()))) {
-            SDep Dep(Load, SDep::Barrier);
-            Dep.setLatency(1);
-            SU.addPred(Dep);
-          }
-        }
+        }
+        // Second, the more expensive check that uses alias analysis on the
+        // base registers. If they alias, and the load offset is less than
+        // the store offset, then mark the dependence as loop carried.
+        if (Load.isUnknown() || Store.isUnknown()) {
+          SDep Dep(Load.SU, SDep::Barrier);
+          Dep.setLatency(1);
+          SU.addPred(Dep);
+          continue;
+        }
+        if (Load.MemOpValue == Store.MemOpValue &&
+            Load.MemOpOffset <= Store.MemOpOffset) {
+          SDep Dep(Load.SU, SDep::Barrier);
+          Dep.setLatency(1);
+          SU.addPred(Dep);
+          continue;
+        }
+
+        bool IsNoAlias = [&] {
+          if (BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(Load.MemOpValue,
+                                                             Load.AATags),
+                            MemoryLocation::getBeforeOrAfter(Store.MemOpValue,
+                                                             Store.AATags)))
+            return true;
+
+          // AliasAnalysis sometimes gives up on following the underlying
+          // object. In such a case, separate checks on the underlying objects
+          // may still prove that the two accesses cannot alias.
+          for (const Value *LoadObj : Load.UnderlyingObjs)
+            for (const Value *StoreObj : Store.UnderlyingObjs)
+              if (!BAA.isNoAlias(
+                      MemoryLocation::getBeforeOrAfter(LoadObj, Load.AATags),
+                      MemoryLocation::getBeforeOrAfter(StoreObj, Store.AATags)))
+                return false;
+
+          return true;
+        }();
+
+        if (!IsNoAlias) {
+          SDep Dep(Load.SU, SDep::Barrier);
+          Dep.setLatency(1);
+          SU.addPred(Dep);
+        }
       }
     }
   }
diff --git a/llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir b/llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir
new file mode 100644
index 0000000000000..8163074b589d8
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-alias-cross-iteration.mir
@@ -0,0 +1,72 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test that the pipeliner correctly detects the loop-carried dependency
+# between the load and the store, indicated by an `Ord` dependency from SU(2)
+# to SU(4). Note that there is no dependency within a single iteration.
+
+# CHECK: SU(2): %7:intregs = L2_loadri_io %5:intregs, 0 :: (load (s32) from %ir.ptr.load)
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Predecessors:
+# CHECK-NEXT: SU(0): Data Latency=0 Reg=%5
+# CHECK-NEXT: Successors:
+# CHECK-DAG: SU(3): Data Latency=2 Reg=%7
+# CHECK-DAG: SU(4): Ord Latency=1 Barrier
+# CHECK-NEXT: SU(3): %8:intregs = F2_sfadd %7:intregs, %3:intregs, implicit $usr
+# CHECK: SU(4): S2_storeri_io %6:intregs, 0, %8:intregs :: (store (s32) into %ir.ptr.store)
+
+
+--- |
+  define void @foo(ptr noalias %p0, ptr noalias %p1, i32 %n) {
+  entry:
+    br label %body
+
+  body:                                             ; preds = %body, %entry
+    %i = phi i32 [ 0, %entry ], [ %i.next, %body ]
+    %ptr.load = phi ptr [ %p0, %entry ], [ %p1, %body ]
+    %ptr.store = phi ptr [ %p1, %entry ], [ %p0, %body ]
+    %v = load float, ptr %ptr.load, align 4
+    %add = fadd float %v, 1.000000e+00
+    store float %add, ptr %ptr.store, align 4
+    %i.next = add i32 %i, 1
+    %cond = icmp slt i32 %i.next, %n
+    br i1 %cond, label %body, label %exit
+
+  exit:                                             ; preds = %body
+    ret void
+  }
+...
+---
+name: foo
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1(0x80000000)
+    liveins: $r0, $r1, $r2
+
+    %6:intregs = COPY $r2
+    %5:intregs = COPY $r1
+    %4:intregs = COPY $r0
+    %9:intregs = A2_tfrsi 1065353216
+    %12:intregs = COPY %6
+    J2_loop0r %bb.1, %12, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+  bb.1.body (machine-block-address-taken):
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+
+    %1:intregs = PHI %4, %bb.0, %5, %bb.1
+    %2:intregs = PHI %5, %bb.0, %4, %bb.1
+    %8:intregs = L2_loadri_io %1, 0 :: (load (s32) from %ir.ptr.load)
+    %10:intregs = F2_sfadd killed %8, %9, implicit $usr
+    S2_storeri_io %2, 0, killed %10 :: (store (s32) into %ir.ptr.store)
+    ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.2, implicit-def dead $pc
+
+  bb.2.exit:
+    PS_jmpret $r31, implicit-def dead $pc
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-no-alias.mir b/llvm/test/CodeGen/Hexagon/swp-no-alias.mir
new file mode 100644
index 0000000000000..38b7212702ff9
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-no-alias.mir
@@ -0,0 +1,151 @@
+# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Test that there are no loop-carried dependencies among the memory instructions.
+
+# CHECK: SU(0): %8:intregs = PHI %1:intregs, %bb.1, %9:intregs, %bb.2
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Successors:
+# CHECK-DAG: SU(6): Data Latency=0 Reg=%8
+# CHECK-DAG: SU(5): Data Latency=0 Reg=%8
+# CHECK-DAG: SU(3): Data Latency=0 Reg=%8
+# CHECK-DAG: SU(6): Anti Latency=1
+# CHECK-NEXT: SU(1): %10:intregs = PHI %2:intregs, %bb.1, %11:intregs, %bb.2
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Successors:
+# CHECK-DAG: SU(7): Data Latency=0 Reg=%10
+# CHECK-DAG: SU(4): Data Latency=0 Reg=%10
+# CHECK-DAG: SU(2): Data Latency=0 Reg=%10
+# CHECK-DAG: SU(7): Anti Latency=1
+# CHECK-NEXT: SU(2): %12:hvxvr = V6_vL32b_ai %10:intregs, 0 :: (load (s1024) from %ir.iptr.09, !tbaa !4)
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Predecessors:
+# CHECK-NEXT: SU(1): Data Latency=0 Reg=%10
+# CHECK-NEXT: Successors:
+# CHECK-NEXT: SU(3): Data Latency=0 Reg=%12
+# CHECK-NEXT: SU(3): V6_vS32b_ai %8:intregs, 0, %12:hvxvr :: (store (s1024) into %ir.optr.010, !tbaa !4)
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Predecessors:
+# CHECK-DAG: SU(2): Data Latency=0 Reg=%12
+# CHECK-DAG: SU(0): Data Latency=0 Reg=%8
+# CHECK-NEXT: SU(4): %13:hvxvr = V6_vL32b_ai %10:intregs, 128 :: (load (s1024) from %ir.cgep, !tbaa !4)
+# CHECK-NEXT: # preds left
+# CHECK-NEXT: # succs left
+# CHECK-NEXT: # rdefs left
+# CHECK-NEXT: Latency
+# CHECK-NEXT: Depth
+# CHECK-NEXT: Height
+# CHECK-NEXT: Predecessors:
+# CHECK-NEXT: SU(1): Data Latency=0 Reg=%10
+# CHECK-NEXT: Successors:
+# CHECK-NEXT: SU(5): Data Latency=0 Reg=%13
+# CHECK-NEXT: SU(5): V6_vS32b_ai %8:intregs, 128, %13:hvxvr :: (store (s1024) into %ir.cgep3, !tbaa !4)
+
+
+
+
+--- |
+  define dso_local void @foo(ptr noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out, i32 noundef %width) local_unnamed_addr #0 {
+  entry:
+    %cmp7 = icmp sgt i32 %width, 0
+    br i1 %cmp7, label %for.body.preheader, label %for.end
+
+  for.body.preheader:                               ; preds = %entry
+    %0 = add i32 %width, 128
+    br label %for.body
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv = phi i32 [ %0, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %optr.010 = phi ptr [ %cgep4, %for.body ], [ %out, %for.body.preheader ]
+    %iptr.09 = phi ptr [ %cgep5, %for.body ], [ %in, %for.body.preheader ]
+    %ald = load <128 x i8>, ptr %iptr.09, align 128, !tbaa !4
+    %cst = bitcast <128 x i8> %ald to <32 x i32>
+    store <32 x i32> %cst, ptr %optr.010, align 128, !tbaa !4
+    %cgep = getelementptr i8, ptr %iptr.09, i32 128
+    %ald1 = load <128 x i8>, ptr %cgep, align 128, !tbaa !4
+    %cst2 = bitcast <128 x i8> %ald1 to <32 x i32>
+    %cgep3 = getelementptr i8, ptr %optr.010, i32 128
+    store <32 x i32> %cst2, ptr %cgep3, align 128, !tbaa !4
+    %lsr.iv.next = add i32 %lsr.iv, -128
+    %cmp = icmp samesign ugt i32 %lsr.iv.next, 128
+    %cgep4 = getelementptr i8, ptr %optr.010, i32 256
+    %cgep5 = getelementptr i8, ptr %iptr.09, i32 256
+    br i1 %cmp, label %for.body, label %for.end
+
+  for.end:                                          ; preds = %for.body, %entry
+    ret void
+  }
+
+  attributes #0 = { "target-cpu"="hexagonv60" "target-features"="+hvx-length128b,+hvxv69,+v66,-long-calls" }
+
+  !llvm.module.flags = !{!0, !1, !2, !3}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{i32 8, !"PIC Level", i32 2}
+  !2 = !{i32 7, !"PIE Level", i32 2}
+  !3 = !{i32 7, !"frame-pointer", i32 2}
+  !4 = !{!5, !5, i64 0}
+  !5 = !{!"omnipotent char", !6, i64 0}
+  !6 = !{!"Simple C/C++ TBAA"}
+...
+---
+name: foo
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.3(0x30000000)
+    liveins: $r0, $r1, $r2
+
+    %9:intregs = COPY $r2
+    %8:intregs = COPY $r1
+    %7:intregs = COPY $r0
+    %10:predregs = C2_cmpgti %9, 0
+    J2_jumpf %10, %bb.3, implicit-def dead $pc
+    J2_jump %bb.1, implicit-def dead $pc
+
+  bb.1.for.body.preheader:
+    successors: %bb.2(0x80000000)
+
+    %0:intregs = A2_addi %9, 128
+    %15:intregs = A2_addi %0, -1
+    %16:intregs = S2_lsr_i_r %15, 7
+    %17:intregs = COPY %16
+    J2_loop0r %bb.2, %17, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+  bb.2.for.body (machine-block-address-taken):
+    successors: %bb.2(0x7c000000), %bb.3(0x04000000)
+
+    %2:intregs = PHI %8, %bb.1, %5, %bb.2
+    %3:intregs = PHI %7, %bb.1, %6, %bb.2
+    %12:hvxvr = V6_vL32b_ai %3, 0 :: (load (s1024) from %ir.iptr.09, !tbaa !4)
+    V6_vS32b_ai %2, 0, killed %12 :: (store (s1024) into %ir.optr.010, !tbaa !4)
+    %13:hvxvr = V6_vL32b_ai %3, 128 :: (load (s1024) from %ir.cgep, !tbaa !4)
+    V6_vS32b_ai %2, 128, killed %13 :: (store (s1024) into %ir.cgep3, !tbaa !4)
+    %5:intregs = A2_addi %2, 256
+    %6:intregs = A2_addi %3, 256
+    ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+    J2_jump %bb.3, implicit-def dead $pc
+
+  bb.3.for.end:
+    PS_jmpret $r31, implicit-def dead $pc
...