Skip to content

Commit 3cd6b86

Browse files
authored
[MachinePipeliner] Use AliasAnalysis properly when analyzing loop-carried dependencies (#136691)
MachinePipeliner uses AliasAnalysis to collect loop-carried memory dependencies. To analyze loop-carried dependencies, we need to explicitly tell AliasAnalysis that the values may come from different iterations. Before this patch, MachinePipeliner didn't do this, so some loop-carried dependencies might be missed. For example, in the following case, there is a loop-carried dependency from the load to the store, but it wasn't considered. ``` def @f(ptr noalias %p0, ptr noalias %p1) { entry: br label %body loop: %idx0 = phi ptr [ %p0, %entry ], [ %p1, %body ] %idx1 = phi ptr [ %p1, %entry ], [ %p0, %body ] %v0 = load %idx0 ... store %v1, %idx1 ... } ``` Further, the handling of the underlying objects was not sound. If there is no information about memory operands (i.e., `memoperands()` is empty), it must be handled conservatively. However, Machinepipeliner uses a dummy value (namely `UnknownValue`). It is distinguished from other "known" objects, causing necessary dependencies to be missed. (NOTE: in such cases, `buildSchedGraph` adds non-loop-carried dependencies correctly, so perhaps a critical problem has not occurred.) This patch fixes the above problems. This change has increased false dependencies that didn't exist before. Therefore, this patch also introduces additional alias checks with the underlying objects. Split off from #135148
1 parent a133170 commit 3cd6b86

File tree

4 files changed

+371
-91
lines changed

4 files changed

+371
-91
lines changed

llvm/include/llvm/CodeGen/MachinePipeliner.h

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,13 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
278278
/// Ordered list of DAG postprocessing steps.
279279
std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
280280

281+
/// Used to compute single-iteration dependencies (i.e., buildSchedGraph).
282+
AliasAnalysis *AA;
283+
284+
/// Used to compute loop-carried dependencies (i.e.,
285+
/// addLoopCarriedDependences).
286+
BatchAAResults BAA;
287+
281288
/// Helper class to implement Johnson's circuit finding algorithm.
282289
class Circuits {
283290
std::vector<SUnit> &SUnits;
@@ -323,13 +330,14 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
323330
public:
324331
SwingSchedulerDAG(MachinePipeliner &P, MachineLoop &L, LiveIntervals &lis,
325332
const RegisterClassInfo &rci, unsigned II,
326-
TargetInstrInfo::PipelinerLoopInfo *PLI)
333+
TargetInstrInfo::PipelinerLoopInfo *PLI, AliasAnalysis *AA)
327334
: ScheduleDAGInstrs(*P.MF, P.MLI, false), Pass(P), Loop(L), LIS(lis),
328335
RegClassInfo(rci), II_setByPragma(II), LoopPipelinerInfo(PLI),
329-
Topo(SUnits, &ExitSU) {
336+
Topo(SUnits, &ExitSU), AA(AA), BAA(*AA) {
330337
P.MF->getSubtarget().getSMSMutations(Mutations);
331338
if (SwpEnableCopyToPhi)
332339
Mutations.push_back(std::make_unique<CopyToPhiMutation>());
340+
BAA.enableCrossIterationMode();
333341
}
334342

335343
void schedule() override;
@@ -394,7 +402,7 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
394402
const MachineInstr *OtherMI) const;
395403

396404
private:
397-
void addLoopCarriedDependences(AAResults *AA);
405+
void addLoopCarriedDependences();
398406
void updatePhiDependences();
399407
void changeDependences();
400408
unsigned calculateResMII();

llvm/lib/CodeGen/MachinePipeliner.cpp

Lines changed: 137 additions & 88 deletions
Original file line numberDiff line numberDiff line change
@@ -237,6 +237,37 @@ INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
237237
INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
238238
"Modulo Software Pipelining", false, false)
239239

240+
namespace {
241+
242+
/// This class holds an SUnit corresponding to a memory operation and other
243+
/// information related to the instruction.
244+
struct SUnitWithMemInfo {
245+
SUnit *SU;
246+
SmallVector<const Value *, 2> UnderlyingObjs;
247+
248+
/// The value of a memory operand.
249+
const Value *MemOpValue = nullptr;
250+
251+
/// The offset of a memory operand.
252+
int64_t MemOpOffset = 0;
253+
254+
AAMDNodes AATags;
255+
256+
/// True if all the underlying objects are identified.
257+
bool IsAllIdentified = false;
258+
259+
SUnitWithMemInfo(SUnit *SU);
260+
261+
bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
262+
263+
bool isUnknown() const { return MemOpValue == nullptr; }
264+
265+
private:
266+
bool getUnderlyingObjects();
267+
};
268+
269+
} // end anonymous namespace
270+
240271
/// The "main" function for implementing Swing Modulo Scheduling.
241272
bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
242273
if (skipFunction(mf.getFunction()))
@@ -470,9 +501,10 @@ void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
470501
bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
471502
assert(L.getBlocks().size() == 1 && "SMS works on single blocks only.");
472503

504+
AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
473505
SwingSchedulerDAG SMS(
474506
*this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
475-
II_setByPragma, LI.LoopPipelinerInfo.get());
507+
II_setByPragma, LI.LoopPipelinerInfo.get(), AA);
476508

477509
MachineBasicBlock *MBB = L.getHeader();
478510
// The kernel should not include any terminator instructions. These
@@ -560,9 +592,8 @@ void SwingSchedulerDAG::setMAX_II() {
560592
/// We override the schedule function in ScheduleDAGInstrs to implement the
561593
/// scheduling part of the Swing Modulo Scheduling algorithm.
562594
void SwingSchedulerDAG::schedule() {
563-
AliasAnalysis *AA = &Pass.getAnalysis<AAResultsWrapperPass>().getAAResults();
564595
buildSchedGraph(AA);
565-
addLoopCarriedDependences(AA);
596+
addLoopCarriedDependences();
566597
updatePhiDependences();
567598
Topo.InitDAGTopologicalSorting();
568599
changeDependences();
@@ -810,113 +841,131 @@ static bool isDependenceBarrier(MachineInstr &MI) {
810841
(!MI.mayLoad() || !MI.isDereferenceableInvariantLoad()));
811842
}
812843

813-
/// Return the underlying objects for the memory references of an instruction.
844+
SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
845+
if (!getUnderlyingObjects())
846+
return;
847+
for (const Value *Obj : UnderlyingObjs)
848+
if (!isIdentifiedObject(Obj)) {
849+
IsAllIdentified = false;
850+
break;
851+
}
852+
}
853+
854+
bool SUnitWithMemInfo::isTriviallyDisjoint(
855+
const SUnitWithMemInfo &Other) const {
856+
// If all underlying objects are identified objects and there is no overlap
857+
// between them, then these two instructions are disjoint.
858+
if (!IsAllIdentified || !Other.IsAllIdentified)
859+
return false;
860+
for (const Value *Obj : UnderlyingObjs)
861+
if (llvm::is_contained(Other.UnderlyingObjs, Obj))
862+
return false;
863+
return true;
864+
}
865+
866+
/// Collect the underlying objects for the memory references of an instruction.
814867
/// This function calls the code in ValueTracking, but first checks that the
815868
/// instruction has a memory operand.
816-
static void getUnderlyingObjects(const MachineInstr *MI,
817-
SmallVectorImpl<const Value *> &Objs) {
869+
/// Returns false if we cannot find the underlying objects.
870+
bool SUnitWithMemInfo::getUnderlyingObjects() {
871+
const MachineInstr *MI = SU->getInstr();
818872
if (!MI->hasOneMemOperand())
819-
return;
873+
return false;
820874
MachineMemOperand *MM = *MI->memoperands_begin();
821875
if (!MM->getValue())
822-
return;
823-
getUnderlyingObjects(MM->getValue(), Objs);
824-
for (const Value *V : Objs) {
825-
if (!isIdentifiedObject(V)) {
826-
Objs.clear();
827-
return;
828-
}
829-
}
876+
return false;
877+
MemOpValue = MM->getValue();
878+
MemOpOffset = MM->getOffset();
879+
llvm::getUnderlyingObjects(MemOpValue, UnderlyingObjs);
880+
881+
// TODO: A no alias scope may be valid only in a single iteration. In this
882+
// case we need to peel off it like LoopAccessAnalysis does.
883+
AATags = MM->getAAInfo();
884+
return true;
830885
}
831886

832887
/// Add a chain edge between a load and store if the store can be an
833888
/// alias of the load on a subsequent iteration, i.e., a loop carried
834889
/// dependence. This code is very similar to the code in ScheduleDAGInstrs
835890
/// but that code doesn't create loop carried dependences.
836-
void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
837-
MapVector<const Value *, SmallVector<SUnit *, 4>> PendingLoads;
838-
Value *UnknownValue =
839-
UndefValue::get(Type::getVoidTy(MF.getFunction().getContext()));
891+
void SwingSchedulerDAG::addLoopCarriedDependences() {
892+
SmallVector<SUnitWithMemInfo, 4> PendingLoads;
840893
for (auto &SU : SUnits) {
841894
MachineInstr &MI = *SU.getInstr();
842895
if (isDependenceBarrier(MI))
843896
PendingLoads.clear();
844897
else if (MI.mayLoad()) {
845-
SmallVector<const Value *, 4> Objs;
846-
::getUnderlyingObjects(&MI, Objs);
847-
if (Objs.empty())
848-
Objs.push_back(UnknownValue);
849-
for (const auto *V : Objs) {
850-
SmallVector<SUnit *, 4> &SUs = PendingLoads[V];
851-
SUs.push_back(&SU);
852-
}
898+
PendingLoads.emplace_back(&SU);
853899
} else if (MI.mayStore()) {
854-
SmallVector<const Value *, 4> Objs;
855-
::getUnderlyingObjects(&MI, Objs);
856-
if (Objs.empty())
857-
Objs.push_back(UnknownValue);
858-
for (const auto *V : Objs) {
859-
MapVector<const Value *, SmallVector<SUnit *, 4>>::iterator I =
860-
PendingLoads.find(V);
861-
if (I == PendingLoads.end())
900+
SUnitWithMemInfo Store(&SU);
901+
for (const SUnitWithMemInfo &Load : PendingLoads) {
902+
if (Load.isTriviallyDisjoint(Store))
862903
continue;
863-
for (auto *Load : I->second) {
864-
if (isSuccOrder(Load, &SU))
865-
continue;
866-
MachineInstr &LdMI = *Load->getInstr();
867-
// First, perform the cheaper check that compares the base register.
868-
// If they are the same and the load offset is less than the store
869-
// offset, then mark the dependence as loop carried potentially.
870-
const MachineOperand *BaseOp1, *BaseOp2;
871-
int64_t Offset1, Offset2;
872-
bool Offset1IsScalable, Offset2IsScalable;
873-
if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
874-
Offset1IsScalable, TRI) &&
875-
TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
876-
Offset2IsScalable, TRI)) {
877-
if (BaseOp1->isIdenticalTo(*BaseOp2) &&
878-
Offset1IsScalable == Offset2IsScalable &&
879-
(int)Offset1 < (int)Offset2) {
880-
assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
881-
"What happened to the chain edge?");
882-
SDep Dep(Load, SDep::Barrier);
883-
Dep.setLatency(1);
884-
SU.addPred(Dep);
885-
continue;
886-
}
887-
}
888-
// Second, the more expensive check that uses alias analysis on the
889-
// base registers. If they alias, and the load offset is less than
890-
// the store offset, the mark the dependence as loop carried.
891-
if (!AA) {
892-
SDep Dep(Load, SDep::Barrier);
893-
Dep.setLatency(1);
894-
SU.addPred(Dep);
895-
continue;
896-
}
897-
MachineMemOperand *MMO1 = *LdMI.memoperands_begin();
898-
MachineMemOperand *MMO2 = *MI.memoperands_begin();
899-
if (!MMO1->getValue() || !MMO2->getValue()) {
900-
SDep Dep(Load, SDep::Barrier);
901-
Dep.setLatency(1);
902-
SU.addPred(Dep);
903-
continue;
904-
}
905-
if (MMO1->getValue() == MMO2->getValue() &&
906-
MMO1->getOffset() <= MMO2->getOffset()) {
907-
SDep Dep(Load, SDep::Barrier);
904+
if (isSuccOrder(Load.SU, Store.SU))
905+
continue;
906+
MachineInstr &LdMI = *Load.SU->getInstr();
907+
// First, perform the cheaper check that compares the base register.
908+
// If they are the same and the load offset is less than the store
909+
// offset, then mark the dependence as loop carried potentially.
910+
const MachineOperand *BaseOp1, *BaseOp2;
911+
int64_t Offset1, Offset2;
912+
bool Offset1IsScalable, Offset2IsScalable;
913+
if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
914+
Offset1IsScalable, TRI) &&
915+
TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
916+
Offset2IsScalable, TRI)) {
917+
if (BaseOp1->isIdenticalTo(*BaseOp2) &&
918+
Offset1IsScalable == Offset2IsScalable &&
919+
(int)Offset1 < (int)Offset2) {
920+
assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
921+
"What happened to the chain edge?");
922+
SDep Dep(Load.SU, SDep::Barrier);
908923
Dep.setLatency(1);
909924
SU.addPred(Dep);
910925
continue;
911926
}
912-
if (!AA->isNoAlias(
913-
MemoryLocation::getAfter(MMO1->getValue(), MMO1->getAAInfo()),
914-
MemoryLocation::getAfter(MMO2->getValue(),
915-
MMO2->getAAInfo()))) {
916-
SDep Dep(Load, SDep::Barrier);
917-
Dep.setLatency(1);
918-
SU.addPred(Dep);
919-
}
927+
}
928+
// Second, the more expensive check that uses alias analysis on the
929+
// base registers. If they alias, and the load offset is less than
930+
// the store offset, the mark the dependence as loop carried.
931+
if (Load.isUnknown() || Store.isUnknown()) {
932+
SDep Dep(Load.SU, SDep::Barrier);
933+
Dep.setLatency(1);
934+
SU.addPred(Dep);
935+
continue;
936+
}
937+
if (Load.MemOpValue == Store.MemOpValue &&
938+
Load.MemOpOffset <= Store.MemOpOffset) {
939+
SDep Dep(Load.SU, SDep::Barrier);
940+
Dep.setLatency(1);
941+
SU.addPred(Dep);
942+
continue;
943+
}
944+
945+
bool IsNoAlias = [&] {
946+
if (BAA.isNoAlias(MemoryLocation::getBeforeOrAfter(Load.MemOpValue,
947+
Load.AATags),
948+
MemoryLocation::getBeforeOrAfter(Store.MemOpValue,
949+
Store.AATags)))
950+
return true;
951+
952+
// AliasAnalysis sometimes gives up on following the underlying
953+
// object. In such a case, separate checks for underlying objects may
954+
// prove that there are no aliases between two accesses.
955+
for (const Value *LoadObj : Load.UnderlyingObjs)
956+
for (const Value *StoreObj : Store.UnderlyingObjs)
957+
if (!BAA.isNoAlias(
958+
MemoryLocation::getBeforeOrAfter(LoadObj, Load.AATags),
959+
MemoryLocation::getBeforeOrAfter(StoreObj, Store.AATags)))
960+
return false;
961+
962+
return true;
963+
}();
964+
965+
if (!IsNoAlias) {
966+
SDep Dep(Load.SU, SDep::Barrier);
967+
Dep.setLatency(1);
968+
SU.addPred(Dep);
920969
}
921970
}
922971
}
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
# RUN: llc -mtriple=hexagon -run-pass pipeliner -debug-only=pipeliner %s -o /dev/null 2>&1 | FileCheck %s
2+
# REQUIRES: asserts
3+
4+
# Test that pipeliner correctly detects the loop-carried dependency between the
5+
# load and the store, which is indicated by `Ord` dependency from SU(2) to
6+
# SU(4). Note that there is no dependency within a single iteration.
7+
8+
# CHECK: SU(2): %7:intregs = L2_loadri_io %5:intregs, 0 :: (load (s32) from %ir.ptr.load)
9+
# CHECK-NEXT: # preds left
10+
# CHECK-NEXT: # succs left
11+
# CHECK-NEXT: # rdefs left
12+
# CHECK-NEXT: Latency
13+
# CHECK-NEXT: Depth
14+
# CHECK-NEXT: Height
15+
# CHECK-NEXT: Predecessors:
16+
# CHECK-NEXT: SU(0): Data Latency=0 Reg=%5
17+
# CHECK-NEXT: Successors:
18+
# CHECK-DAG: SU(3): Data Latency=2 Reg=%7
19+
# CHECK-DAG: SU(4): Ord Latency=1 Barrier
20+
# CHECK-NEXT: SU(3): %8:intregs = F2_sfadd %7:intregs, %3:intregs, implicit $usr
21+
# CHECK: SU(4): S2_storeri_io %6:intregs, 0, %8:intregs :: (store (s32) into %ir.ptr.store)
22+
23+
24+
--- |
25+
define void @foo(ptr noalias %p0, ptr noalias %p1, i32 %n) {
26+
entry:
27+
br label %body
28+
29+
body: ; preds = %body, %entry
30+
%i = phi i32 [ 0, %entry ], [ %i.next, %body ]
31+
%ptr.load = phi ptr [ %p0, %entry ], [ %p1, %body ]
32+
%ptr.store = phi ptr [ %p1, %entry ], [ %p0, %body ]
33+
%v = load float, ptr %ptr.load, align 4
34+
%add = fadd float %v, 1.000000e+00
35+
store float %add, ptr %ptr.store, align 4
36+
%i.next = add i32 %i, 1
37+
%cond = icmp slt i32 %i.next, %n
38+
br i1 %cond, label %body, label %exit
39+
40+
exit: ; preds = %body
41+
ret void
42+
}
43+
...
44+
---
45+
name: foo
46+
tracksRegLiveness: true
47+
body: |
48+
bb.0.entry:
49+
successors: %bb.1(0x80000000)
50+
liveins: $r0, $r1, $r2
51+
52+
%6:intregs = COPY $r2
53+
%5:intregs = COPY $r1
54+
%4:intregs = COPY $r0
55+
%9:intregs = A2_tfrsi 1065353216
56+
%12:intregs = COPY %6
57+
J2_loop0r %bb.1, %12, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
58+
59+
bb.1.body (machine-block-address-taken):
60+
successors: %bb.1(0x7c000000), %bb.2(0x04000000)
61+
62+
%1:intregs = PHI %4, %bb.0, %5, %bb.1
63+
%2:intregs = PHI %5, %bb.0, %4, %bb.1
64+
%8:intregs = L2_loadri_io %1, 0 :: (load (s32) from %ir.ptr.load)
65+
%10:intregs = F2_sfadd killed %8, %9, implicit $usr
66+
S2_storeri_io %2, 0, killed %10 :: (store (s32) into %ir.ptr.store)
67+
ENDLOOP0 %bb.1, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
68+
J2_jump %bb.2, implicit-def dead $pc
69+
70+
bb.2.exit:
71+
PS_jmpret $r31, implicit-def dead $pc
72+
...

0 commit comments

Comments
 (0)