@@ -266,6 +266,82 @@ struct SUnitWithMemInfo {
266
266
bool getUnderlyingObjects ();
267
267
};
268
268
269
+ // / Add loop-carried chain dependencies. This class handles the same type of
270
+ // / dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
271
+ // / account dependencies across iterations.
272
+ class LoopCarriedOrderDepsTracker {
273
+ // Type of instruction that is relevant to order-dependencies
274
+ enum class InstrTag {
275
+ Barrier = 0 , // /< A barrier event instruction.
276
+ LoadOrStore = 1 , // /< An instruction that may load or store memory, but is
277
+ // /< not a barrier event.
278
+ FPExceptions = 2 , // /< An instruction that does not match above, but may
279
+ // /< raise floatin-point exceptions.
280
+ };
281
+
282
+ struct TaggedSUnit : PointerIntPair<SUnit *, 2 > {
283
+ TaggedSUnit (SUnit *SU, InstrTag Tag)
284
+ : PointerIntPair<SUnit *, 2 >(SU, unsigned (Tag)) {}
285
+
286
+ InstrTag getTag () const { return InstrTag (getInt ()); }
287
+ };
288
+
289
+ // / Holds loads and stores with memory related information.
290
+ struct LoadStoreChunk {
291
+ SmallVector<SUnitWithMemInfo, 4 > Loads;
292
+ SmallVector<SUnitWithMemInfo, 4 > Stores;
293
+
294
+ void append (SUnit *SU);
295
+ };
296
+
297
+ SwingSchedulerDAG *DAG;
298
+ BatchAAResults *BAA;
299
+ std::vector<SUnit> &SUnits;
300
+
301
+ // / The size of SUnits, for convenience.
302
+ const unsigned N;
303
+
304
+ // / Loop-carried Edges.
305
+ std::vector<BitVector> LoopCarried;
306
+
307
+ // / Instructions related to chain dependencies. They are one of the
308
+ // / following:
309
+ // /
310
+ // / 1. Barrier event.
311
+ // / 2. Load, but neither a barrier event, invariant load, nor may load trap
312
+ // / value.
313
+ // / 3. Store, but not a barrier event.
314
+ // / 4. None of them, but may raise floating-point exceptions.
315
+ // /
316
+ // / This is used when analyzing loop-carried dependencies that access global
317
+ // / barrier instructions.
318
+ std::vector<TaggedSUnit> TaggedSUnits;
319
+
320
+ const TargetInstrInfo *TII = nullptr ;
321
+ const TargetRegisterInfo *TRI = nullptr ;
322
+
323
+ public:
324
+ LoopCarriedOrderDepsTracker (SwingSchedulerDAG *SSD, BatchAAResults *BAA,
325
+ const TargetInstrInfo *TII,
326
+ const TargetRegisterInfo *TRI);
327
+
328
+ // / The main function to compute loop-carried order-dependencies.
329
+ void computeDependencies ();
330
+
331
+ const BitVector &getLoopCarried (unsigned Idx) const {
332
+ return LoopCarried[Idx];
333
+ }
334
+
335
+ private:
336
+ // / Tags to \p SU if the instruction may affect the order-dependencies.
337
+ std::optional<TaggedSUnit> checkInstrType (SUnit *SU) const ;
338
+
339
+ void addLoopCarriedDepenenciesForChunks (const LoadStoreChunk &From,
340
+ const LoadStoreChunk &To);
341
+
342
+ void computeDependenciesAux ();
343
+ };
344
+
269
345
} // end anonymous namespace
270
346
271
347
// / The "main" function for implementing Swing Modulo Scheduling.
@@ -593,13 +669,19 @@ void SwingSchedulerDAG::setMAX_II() {
593
669
// / scheduling part of the Swing Modulo Scheduling algorithm.
594
670
void SwingSchedulerDAG::schedule () {
595
671
buildSchedGraph (AA);
596
- addLoopCarriedDependences ();
672
+ const LoopCarriedEdges LCE = addLoopCarriedDependences ();
597
673
updatePhiDependences ();
598
674
Topo.InitDAGTopologicalSorting ();
599
675
changeDependences ();
600
676
postProcessDAG ();
601
677
DDG = std::make_unique<SwingSchedulerDDG>(SUnits, &EntrySU, &ExitSU);
602
- LLVM_DEBUG (dump ());
678
+ LLVM_DEBUG ({
679
+ dump ();
680
+ dbgs () << " ===== Loop Carried Edges Begin =====\n " ;
681
+ for (SUnit &SU : SUnits)
682
+ LCE.dump (&SU, TRI, &MRI);
683
+ dbgs () << " ===== Loop Carried Edges End =====\n " ;
684
+ });
603
685
604
686
NodeSetType NodeSets;
605
687
findCircuits (NodeSets);
@@ -832,15 +914,6 @@ static bool isSuccOrder(SUnit *SUa, SUnit *SUb) {
832
914
return false ;
833
915
}
834
916
835
- // / Return true if the instruction causes a chain between memory
836
- // / references before and after it.
837
- static bool isDependenceBarrier (MachineInstr &MI) {
838
- return MI.isCall () || MI.mayRaiseFPException () ||
839
- MI.hasUnmodeledSideEffects () ||
840
- (MI.hasOrderedMemoryRef () &&
841
- (!MI.mayLoad () || !MI.isDereferenceableInvariantLoad ()));
842
- }
843
-
844
917
SUnitWithMemInfo::SUnitWithMemInfo (SUnit *SU) : SU(SU) {
845
918
if (!getUnderlyingObjects ())
846
919
return ;
@@ -941,28 +1014,116 @@ static bool hasLoopCarriedMemDep(const SUnitWithMemInfo &Src,
941
1014
return false ;
942
1015
}
943
1016
1017
+ void LoopCarriedOrderDepsTracker::LoadStoreChunk::append (SUnit *SU) {
1018
+ const MachineInstr *MI = SU->getInstr ();
1019
+ if (!MI->mayLoadOrStore ())
1020
+ return ;
1021
+ (MI->mayStore () ? Stores : Loads).emplace_back (SU);
1022
+ }
1023
+
1024
+ LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker (
1025
+ SwingSchedulerDAG *SSD, BatchAAResults *BAA, const TargetInstrInfo *TII,
1026
+ const TargetRegisterInfo *TRI)
1027
+ : DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1028
+ LoopCarried(N, BitVector(N)), TII(TII), TRI(TRI) {}
1029
+
1030
+ void LoopCarriedOrderDepsTracker::computeDependencies () {
1031
+ // Traverse all instructions and extract only what we are targetting.
1032
+ for (auto &SU : SUnits) {
1033
+ auto Tagged = checkInstrType (&SU);
1034
+
1035
+ // This instruction has no loop-carried order-dependencies.
1036
+ if (!Tagged)
1037
+ continue ;
1038
+ TaggedSUnits.push_back (*Tagged);
1039
+ }
1040
+
1041
+ computeDependenciesAux ();
1042
+
1043
+ LLVM_DEBUG ({
1044
+ for (unsigned I = 0 ; I != N; I++)
1045
+ assert (!LoopCarried[I].test (I) && " Unexpected self-loop" );
1046
+ });
1047
+ }
1048
+
1049
+ std::optional<LoopCarriedOrderDepsTracker::TaggedSUnit>
1050
+ LoopCarriedOrderDepsTracker::checkInstrType (SUnit *SU) const {
1051
+ MachineInstr *MI = SU->getInstr ();
1052
+ if (TII->isGlobalMemoryObject (MI))
1053
+ return TaggedSUnit (SU, InstrTag::Barrier);
1054
+
1055
+ if (MI->mayStore () ||
1056
+ (MI->mayLoad () && !MI->isDereferenceableInvariantLoad ()))
1057
+ return TaggedSUnit (SU, InstrTag::LoadOrStore);
1058
+
1059
+ if (MI->mayRaiseFPException ())
1060
+ return TaggedSUnit (SU, InstrTag::FPExceptions);
1061
+
1062
+ return std::nullopt;
1063
+ }
1064
+
1065
+ void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks (
1066
+ const LoadStoreChunk &From, const LoadStoreChunk &To) {
1067
+ // Add dependencies for load-to-store (WAR) from top to bottom.
1068
+ for (const SUnitWithMemInfo &Src : From.Loads )
1069
+ for (const SUnitWithMemInfo &Dst : To.Stores )
1070
+ if (Src.SU ->NodeNum < Dst.SU ->NodeNum &&
1071
+ hasLoopCarriedMemDep (Src, Dst, *BAA, TII, TRI))
1072
+ LoopCarried[Src.SU ->NodeNum ].set (Dst.SU ->NodeNum );
1073
+
1074
+ // TODO: The following dependencies are missed.
1075
+ //
1076
+ // - Dependencies for load-to-store from bottom to top.
1077
+ // - Dependencies for store-to-load (RAW).
1078
+ // - Dependencies for store-to-store (WAW).
1079
+ }
1080
+
1081
+ void LoopCarriedOrderDepsTracker::computeDependenciesAux () {
1082
+ SmallVector<LoadStoreChunk, 2 > Chunks (1 );
1083
+ for (const auto &TSU : TaggedSUnits) {
1084
+ InstrTag Tag = TSU.getTag ();
1085
+ SUnit *SU = TSU.getPointer ();
1086
+ switch (Tag) {
1087
+ case InstrTag::Barrier:
1088
+ Chunks.emplace_back ();
1089
+ break ;
1090
+ case InstrTag::LoadOrStore:
1091
+ Chunks.back ().append (SU);
1092
+ break ;
1093
+ case InstrTag::FPExceptions:
1094
+ // TODO: Handle this properly.
1095
+ break ;
1096
+ }
1097
+ }
1098
+
1099
+ // Add dependencies between memory operations. If there are one or more
1100
+ // barrier events between two memory instructions, we don't add a
1101
+ // loop-carried dependence for them.
1102
+ for (const LoadStoreChunk &Chunk : Chunks)
1103
+ addLoopCarriedDepenenciesForChunks (Chunk, Chunk);
1104
+
1105
+ // TODO: If there are multiple barrier instructions, dependencies from the
1106
+ // last barrier instruction (or load/store below it) to the first barrier
1107
+ // instruction (or load/store above it).
1108
+ }
1109
+
944
1110
// / Add a chain edge between a load and store if the store can be an
945
1111
// / alias of the load on a subsequent iteration, i.e., a loop carried
946
1112
// / dependence. This code is very similar to the code in ScheduleDAGInstrs
947
1113
// / but that code doesn't create loop carried dependences.
948
- void SwingSchedulerDAG::addLoopCarriedDependences () {
949
- SmallVector<SUnitWithMemInfo, 4 > PendingLoads;
950
- for (auto &SU : SUnits) {
951
- MachineInstr &MI = *SU.getInstr ();
952
- if (isDependenceBarrier (MI))
953
- PendingLoads.clear ();
954
- else if (MI.mayLoad ()) {
955
- PendingLoads.emplace_back (&SU);
956
- } else if (MI.mayStore ()) {
957
- SUnitWithMemInfo Store (&SU);
958
- for (const SUnitWithMemInfo &Load : PendingLoads)
959
- if (hasLoopCarriedMemDep (Load, Store, BAA, TII, TRI)) {
960
- SDep Dep (Load.SU , SDep::Barrier);
961
- Dep.setLatency (1 );
962
- SU.addPred (Dep);
963
- }
964
- }
965
- }
1114
+ // / TODO: Also compute output-dependencies.
1115
+ LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences () {
1116
+ LoopCarriedEdges LCE;
1117
+
1118
+ // Add loop-carried order-dependencies
1119
+ LoopCarriedOrderDepsTracker LCODTracker (this , &BAA, TII, TRI);
1120
+ LCODTracker.computeDependencies ();
1121
+ for (unsigned I = 0 ; I != SUnits.size (); I++)
1122
+ for (const int Succ : LCODTracker.getLoopCarried (I).set_bits ())
1123
+ LCE.OrderDeps [&SUnits[I]].insert (&SUnits[Succ]);
1124
+
1125
+ LCE.modifySUnits (SUnits);
1126
+ return LCE;
966
1127
}
967
1128
968
1129
// / Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
@@ -4002,3 +4163,37 @@ const SwingSchedulerDDG::EdgesType &
4002
4163
SwingSchedulerDDG::getOutEdges (const SUnit *SU) const {
4003
4164
return getEdges (SU).Succs ;
4004
4165
}
4166
+
4167
+ void LoopCarriedEdges::modifySUnits (std::vector<SUnit> &SUnits) {
4168
+ // Currently this function simply adds all dependencies represented by this
4169
+ // object. After we properly handle missed dependencies, the logic here will
4170
+ // be more complex, as currently missed edges should not be added to the DAG.
4171
+ for (SUnit &SU : SUnits) {
4172
+ SUnit *Src = &SU;
4173
+ if (auto *OrderDep = getOrderDepOrNull (Src)) {
4174
+ SDep Dep (Src, SDep::Barrier);
4175
+ Dep.setLatency (1 );
4176
+ for (SUnit *Dst : *OrderDep)
4177
+ Dst->addPred (Dep);
4178
+ }
4179
+ }
4180
+ }
4181
+
4182
+ void LoopCarriedEdges::dump (SUnit *SU, const TargetRegisterInfo *TRI,
4183
+ const MachineRegisterInfo *MRI) const {
4184
+ const auto *Order = getOrderDepOrNull (SU);
4185
+
4186
+ if (!Order)
4187
+ return ;
4188
+
4189
+ const auto DumpSU = [](const SUnit *SU) {
4190
+ std::ostringstream OSS;
4191
+ OSS << " SU(" << SU->NodeNum << " )" ;
4192
+ return OSS.str ();
4193
+ };
4194
+
4195
+ dbgs () << " Loop carried edges from " << DumpSU (SU) << " \n "
4196
+ << " Order\n " ;
4197
+ for (SUnit *Dst : *Order)
4198
+ dbgs () << " " << DumpSU (Dst) << " \n " ;
4199
+ }
0 commit comments