Skip to content

Commit 8c9a585

Browse files
committed
extend fuse producer to multi-level extractSliceOp
1 parent 4762f3b commit 8c9a585

File tree

5 files changed

+293
-1
lines changed

5 files changed

+293
-1
lines changed

mlir/include/mlir/Dialect/SCF/Transforms/TileUsingInterface.h

+8
Original file line numberDiff line numberDiff line change
@@ -136,11 +136,19 @@ struct SCFFuseProducerOfSliceResult {
136136
Value tiledAndFusedProducer; // Tile and fused producer value.
137137
SmallVector<Operation *> tiledOps;
138138
};
139+
/// Single-level implementation of fusing the producer of `candidateSliceOp`
/// with respect to the given `loops`.
/// NOTE(review): callers in this change treat an empty optional as "fusion
/// failed"; confirm against the definition.
std::optional<SCFFuseProducerOfSliceResult>
140+
tileAndFuseProducerOfSliceImpl(RewriterBase &rewriter,
141+
tensor::ExtractSliceOp candidateSliceOp,
142+
MutableArrayRef<LoopLikeOpInterface> loops);
143+
139144
std::optional<SCFFuseProducerOfSliceResult>
140145
tileAndFuseProducerOfSlice(RewriterBase &rewriter,
141146
tensor::ExtractSliceOp candidateSliceOp,
142147
MutableArrayRef<LoopLikeOpInterface> loops);
143148

149+
/// Variant of `tileAndFuseProducerOfSlice` that takes only the candidate slice
/// operation and handles multi-level `tensor.extract_slice` chains, i.e. a
/// slice whose source is (through loop iter_args) itself a slice. The loop
/// nests are discovered from the IR instead of being passed by the caller.
/// Returns `std::nullopt` when the chain cannot be resolved or fusion fails.
std::optional<SCFFuseProducerOfSliceResult>
150+
tileAndFuseProducerOfSlice(RewriterBase &rewriter, Operation *candidateSliceOp);
151+
144152
/// Reconstruct the fused producer from within the tiled-and-fused code. Based
145153
/// on the slice of the producer computed in place it is possible that within
146154
/// the loop nest same slice of the producer is computed multiple times. It is

mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp

+130-1
Original file line numberDiff line numberDiff line change
@@ -835,7 +835,7 @@ getUntiledProducerFromSliceSource(OpOperand *source,
835835
/// Implementation of fusing producer of a single slice by computing the
836836
/// slice of the producer in-place.
837837
std::optional<scf::SCFFuseProducerOfSliceResult>
838-
mlir::scf::tileAndFuseProducerOfSlice(
838+
mlir::scf::tileAndFuseProducerOfSliceImpl(
839839
RewriterBase &rewriter, tensor::ExtractSliceOp candidateSliceOp,
840840
MutableArrayRef<LoopLikeOpInterface> loops) {
841841
// 1. Get the producer of the source (potentially walking through
@@ -949,6 +949,135 @@ mlir::scf::tileAndFuseProducerOfSlice(
949949
tileAndFuseResult->tiledOps};
950950
}
951951

952+
/// Get the Root source of target ExtractSliceOp
953+
/// %0 =
954+
/// %1 = scf.for(%arg1 = %0)
955+
/// %2 = extract %arg1
956+
/// %3 = scf.for(%arg2 = %2)
957+
/// %4 = extract %args2
958+
/// ...
959+
/// @param targetSliceOp: %4 = extract %args2
960+
/// @param extractSliceOpChain: chain of all related extract sliceOp
961+
/// @return Value of Root Source : %0
962+
static FailureOr<Value> getRootSourceOfExtractSliceOp(
963+
Operation *targetSliceOp,
964+
SmallVectorImpl<tensor::ExtractSliceOp> &extractSliceOpChain,
965+
int curDepth = 0, int maxDepth = 5) {
966+
assert(isa<tensor::ExtractSliceOp>(targetSliceOp));
967+
// control recursive time in avoid of stack overflow
968+
if (curDepth > maxDepth)
969+
return failure();
970+
971+
auto extractOp = cast<tensor::ExtractSliceOp>(targetSliceOp);
972+
extractSliceOpChain.push_back(extractOp);
973+
Value rootSource = extractOp.getSourceMutable().get();
974+
975+
while (true) {
976+
if (auto iterArg = dyn_cast<BlockArgument>(rootSource)) {
977+
if (auto outerLoop = dyn_cast<LoopLikeOpInterface>(
978+
iterArg.getOwner()->getParentOp())) {
979+
rootSource = outerLoop.getTiedLoopInit(iterArg)->get();
980+
continue;
981+
}
982+
return failure();
983+
} else if (auto sliceOp =
984+
rootSource.getDefiningOp<tensor::ExtractSliceOp>()) {
985+
// walk up loop to find larger candidate extractSliceOp
986+
return getRootSourceOfExtractSliceOp(sliceOp, extractSliceOpChain,
987+
curDepth + 1);
988+
}
989+
break;
990+
}
991+
return rootSource;
992+
}
993+
994+
/// Recursively find the outer nest loops of given loop(included) while the
995+
/// predict function succeed, sorted from outer to inner.
996+
///
997+
/// @param loop: target loop, note that this loop will be also included. I.e.
998+
/// if no other nest loops were found, just return itself.
999+
/// @param pred: predict function, the termination condition of recursive
1000+
/// process.
1001+
/// @return Outer Nest Loops: nest loops outside given target loop(included).
1002+
///
1003+
/// E.g.
1004+
///
1005+
/// ```
1006+
/// %0 = scf.for()
1007+
/// %1 = scf.for()
1008+
/// %2 = scf.for()
1009+
/// ```
1010+
///
1011+
/// If `%2 = scf.for` is given without specific prediction function, this
1012+
/// function will return three nest loops: %0 + %1 + %2.
1013+
static SmallVector<LoopLikeOpInterface>
1014+
getOuterNestLoopsWhile(LoopLikeOpInterface loop,
1015+
std::function<LogicalResult(LoopLikeOpInterface)> pred) {
1016+
SmallVector<LoopLikeOpInterface> nestLoops = {loop};
1017+
auto outerLoop = dyn_cast<LoopLikeOpInterface>(loop->getParentOp());
1018+
while (outerLoop && succeeded(pred(outerLoop))) {
1019+
nestLoops.push_back(outerLoop);
1020+
outerLoop = dyn_cast<LoopLikeOpInterface>(outerLoop->getParentOp());
1021+
}
1022+
// sorted from outer to inner
1023+
return {nestLoops.rbegin(), nestLoops.rend()};
1024+
}
1025+
1026+
/// Enhanced version of `tileAndFuseProducerOfSliceImpl`, which can deal with
1027+
/// multi-level `extractSliceOp`. E.g.
1028+
///
1029+
/// ```
1030+
/// %0 = untiled_producer
1031+
/// %1 = scf.for(%arg1 = %0)
1032+
/// %2 = extract %arg1
1033+
/// %3 = scf.for(%arg2 = %2)
1034+
/// %4 = extract %args2
1035+
/// %5 = tiled_consumer ins(%4)
1036+
/// ```
1037+
std::optional<scf::SCFFuseProducerOfSliceResult>
1038+
mlir::scf::tileAndFuseProducerOfSlice(RewriterBase &rewriter,
1039+
Operation *candidateSliceOp) {
1040+
SmallVector<tensor::ExtractSliceOp> sliceOpChain;
1041+
if (failed(getRootSourceOfExtractSliceOp(candidateSliceOp, sliceOpChain))) {
1042+
return std::nullopt;
1043+
}
1044+
1045+
std::optional<scf::SCFFuseProducerOfSliceResult> fuseProducerResult;
1046+
// reverse from outer to inner
1047+
std::reverse(sliceOpChain.begin(), sliceOpChain.end());
1048+
// multiple application of `tileAndFuseProducerOfSliceImpl`
1049+
for (auto &&[index, sliceOp] : llvm::enumerate(sliceOpChain)) {
1050+
// get nest loops between next candidate sliceOp and tiled producer.
1051+
auto whileProducerOutOfBlock =
1052+
[&fuseProducerResult](LoopLikeOpInterface loop) -> LogicalResult {
1053+
if (fuseProducerResult) {
1054+
Block &body = loop->getRegion(0).front();
1055+
if (fuseProducerResult->tiledAndFusedProducer.getDefiningOp()
1056+
->getBlock() == &body)
1057+
return failure();
1058+
}
1059+
return success();
1060+
};
1061+
SmallVector<LoopLikeOpInterface> outerLoops =
1062+
getOuterNestLoopsWhile(sliceOp->getParentOfType<LoopLikeOpInterface>(),
1063+
whileProducerOutOfBlock);
1064+
fuseProducerResult =
1065+
tileAndFuseProducerOfSliceImpl(rewriter, sliceOp, outerLoops);
1066+
if (!fuseProducerResult) {
1067+
return std::nullopt;
1068+
}
1069+
}
1070+
return fuseProducerResult;
1071+
}
1072+
1073+
/// To be compatible with previous behavior
1074+
std::optional<scf::SCFFuseProducerOfSliceResult>
1075+
mlir::scf::tileAndFuseProducerOfSlice(
1076+
RewriterBase &rewriter, tensor::ExtractSliceOp candidateSliceOp,
1077+
MutableArrayRef<LoopLikeOpInterface> loops) {
1078+
return tileAndFuseProducerOfSliceImpl(rewriter, candidateSliceOp, loops);
1079+
}
1080+
9521081
/// Reconstruct the fused producer from within the tiled-and-fused code.
9531082
LogicalResult mlir::scf::yieldReplacementForFusedProducer(
9541083
RewriterBase &rewriter, tensor::ExtractSliceOp sliceOp,
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
// RUN: mlir-opt --transform-interpreter --cse --split-input-file %s | FileCheck %s
2+
3+
#map = affine_map<(d0) -> (d0 * 128)>
4+
module {
5+
func.func @gemm_fill_fusion_multi_level_extract_slice(%arg0: tensor<256x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256x256xf32>) -> tensor<256x256xf32> {
6+
%c0 = arith.constant 0 : index
7+
%c64 = arith.constant 64 : index
8+
%c128 = arith.constant 128 : index
9+
%cst = arith.constant 0.000000e+00 : f32
10+
%dest0 = tensor.empty() : tensor<256x256xf32>
11+
%dest1 = linalg.fill ins(%cst : f32) outs(%dest0 : tensor<256x256xf32>) -> tensor<256x256xf32>
12+
%1 = scf.forall (%arg3, %arg4) in (2, 2) shared_outs(%arg5 = %dest1) -> tensor<256x256xf32> {
13+
%iv0 = affine.apply #map(%arg3)
14+
%iv1 = affine.apply #map(%arg4)
15+
%extracted_slice_1 = tensor.extract_slice %arg5[%iv0, %iv1] [128, 128] [1, 1] : tensor<256x256xf32> to tensor<128x128xf32>
16+
%extracted_slice_2 = tensor.extract_slice %arg0[%iv0, 0] [128, 512] [1, 1] : tensor<256x512xf32> to tensor<128x512xf32>
17+
%extracted_slice_3 = tensor.extract_slice %arg1[0, %iv1] [512, 128] [1, 1] : tensor<512x256xf32> to tensor<512x128xf32>
18+
%2 = scf.for %arg6 = %c0 to %c128 step %c64 iter_args(%arg7 = %extracted_slice_1) -> (tensor<128x128xf32>) {
19+
%3 = scf.for %arg8 = %c0 to %c128 step %c64 iter_args(%arg9 = %arg7) -> (tensor<128x128xf32>) {
20+
%extracted_slice_4 = tensor.extract_slice %arg9[%arg6, %arg8] [64, 64] [1, 1] : tensor<128x128xf32> to tensor<64x64xf32>
21+
%extracted_slice_5 = tensor.extract_slice %extracted_slice_2[%arg6, 0] [64, 512] [1, 1] : tensor<128x512xf32> to tensor<64x512xf32>
22+
%extracted_slice_6 = tensor.extract_slice %extracted_slice_3[0, %arg8] [512, 64] [1, 1] : tensor<512x128xf32> to tensor<512x64xf32>
23+
%4 = linalg.matmul ins(%extracted_slice_5, %extracted_slice_6 : tensor<64x512xf32>, tensor<512x64xf32>) outs(%extracted_slice_4 : tensor<64x64xf32>) -> tensor<64x64xf32>
24+
%insert_slice = tensor.insert_slice %4 into %arg9[%arg6, %arg8] [64, 64] [1, 1] : tensor<64x64xf32> into tensor<128x128xf32>
25+
scf.yield %insert_slice : tensor<128x128xf32>
26+
}
27+
scf.yield %3 : tensor<128x128xf32>
28+
}
29+
scf.forall.in_parallel {
30+
tensor.parallel_insert_slice %2 into %arg5[%iv0, %iv1] [128, 128] [1, 1] : tensor<128x128xf32> into tensor<256x256xf32>
31+
}
32+
}
33+
return %1 : tensor<256x256xf32>
34+
}
35+
}
36+
37+
module attributes {transform.with_named_sequence} {
  transform.named_sequence @__transform_main(%root : !transform.any_op {transform.readonly}) {
    // Locate the tiled consumer.
    %consumer = transform.structured.match ops{["linalg.matmul"]} in %root
      : (!transform.any_op) -> !transform.any_op
    // Operand #2 of the matmul is its `outs` operand, i.e. the innermost
    // tensor.extract_slice of the multi-level slice chain.
    %candidate_slice = transform.get_producer_of_operand %consumer[2]
      : (!transform.any_op) -> !transform.any_op
    // Fuse the producer reached through that slice chain.
    %orig_producer, %fused_producer = transform.test.fuse_producer %candidate_slice
      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
    transform.yield
  }
}
48+
49+
// CHECK: #[[MAP0:.*]] = affine_map<(d0) -> (d0 * 128)>
50+
// CHECK: func.func @gemm_fill_fusion_multi_level_extract_slice(
51+
// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<256x512xf32>
52+
// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<512x256xf32>
53+
// CHECK-SAME: %[[ARG2:[a-zA-Z0-9]+]]: tensor<256x256xf32>
54+
// CHECK: %[[C0:.*]] = arith.constant 0 : index
55+
// CHECK: %[[dest0:.*]] = tensor.empty() : tensor<256x256xf32>
56+
// CHECK: %[[FORALL_RESULT:.*]] = scf.forall (%[[IV1:.*]], %[[IV2:.*]]) in (2, 2)
57+
// CHECK-SAME: shared_outs(%[[INIT_ARG0:.*]] = %[[dest0]])
58+
// CHECK-SAME: {
59+
// CHECK: %[[AFFINE_IV1:.*]] = affine.apply #[[MAP0]](%[[IV1]])
60+
// CHECK: %[[AFFINE_IV2:.*]] = affine.apply #[[MAP0]](%[[IV2]])
61+
// CHECK: %[[FILL_OUT_SLICE0:.*]] = tensor.extract_slice %[[INIT_ARG0]][%[[AFFINE_IV1]], %[[AFFINE_IV2]]] [128, 128] [1, 1]
62+
// CHECK: %[[INPUT_SLICE0:.*]] = tensor.extract_slice %[[ARG0]][%[[AFFINE_IV1]], 0] [128, 512] [1, 1]
63+
// CHECK: %[[WEIGHT_SLICE0:.*]] = tensor.extract_slice %[[ARG1]][0, %[[AFFINE_IV2]]] [512, 128] [1, 1]
64+
// CHECK: %[[LOOP_RESULT1:.*]] = scf.for %[[IV3:.*]] = %[[C0]]
65+
// CHECK-SAME: iter_args(%[[INIT_ARG1:.*]] = %[[FILL_OUT_SLICE0]])
66+
// CHECK-SAME: {
67+
// CHECK: %[[LOOP_RESULT2:.*]] = scf.for %[[IV4:.*]] = %[[C0]]
68+
// CHECK-SAME: iter_args(%[[INIT_ARG2:.*]] = %[[INIT_ARG1]])
69+
// CHECK-SAME: {
70+
// CHECK: %[[FILL_OUT_SLICE1:.*]] = tensor.extract_slice %[[INIT_ARG2]][%[[IV3]], %[[IV4]]] [64, 64] [1, 1]
71+
// CHECK: %[[TILED_FILL_OUT:.*]] = linalg.fill
72+
// CHECK-SAME: outs(%[[FILL_OUT_SLICE1]] :
73+
// CHECK: %[[INPUT_SLICE1:.*]] = tensor.extract_slice %[[INPUT_SLICE0]][%[[IV3]], 0] [64, 512] [1, 1]
74+
// CHECK: %[[WEIGHT_SLICE1:.*]] = tensor.extract_slice %[[WEIGHT_SLICE0]][0, %[[IV4]]] [512, 64] [1, 1]
75+
// CHECK: %[[TILED_MAT_OUT:.*]] = linalg.matmul
76+
// CHECK-SAME: outs(%[[TILED_FILL_OUT]] :
77+
// CHECK: %[[INSERT_MAT:.*]] = tensor.insert_slice %[[TILED_MAT_OUT]] into %[[INIT_ARG2]][%[[IV3]], %[[IV4]]] [64, 64] [1, 1]
78+
// CHECK: scf.yield %[[INSERT_MAT]] :
79+
// CHECK: }
80+
// CHECK: scf.yield %[[LOOP_RESULT2]] :
81+
// CHECK: }
82+
// CHECK: scf.forall.in_parallel {
83+
// CHECK: tensor.parallel_insert_slice %[[LOOP_RESULT1]] into %[[INIT_ARG0]][%[[AFFINE_IV1]], %[[AFFINE_IV2]]] [128, 128] [1, 1]
84+
// CHECK: }
85+
// CHECK: }
86+
// CHECK: return %[[FORALL_RESULT]] :

mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.cpp

+50
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,56 @@ transform::TestFuseAndYieldOp::apply(TransformRewriter &rewriter,
160160
: DiagnosedSilenceableFailure::success();
161161
}
162162

163+
//===----------------------------------------------------------------------===//
164+
// TestFuseProducerOp
165+
//===----------------------------------------------------------------------===//
166+
167+
/// Apply fusing of producer transformation to all payload ops and store both
168+
/// the original producer operation as well as the fused producer operation.
169+
template <typename Range>
170+
static LogicalResult
171+
applyFuseProducer(RewriterBase &rewriter, Operation *transformOp,
172+
Range &&payloadOps, TransformResults &transformResults) {
173+
SmallVector<Operation *> originalProducerOps;
174+
SmallVector<Operation *> fusedProducerOps;
175+
176+
for (Operation *target : payloadOps) {
177+
rewriter.setInsertionPoint(target);
178+
179+
std::optional<scf::SCFFuseProducerOfSliceResult> fuseProducerResults =
180+
scf::tileAndFuseProducerOfSlice(rewriter, target);
181+
182+
if (!fuseProducerResults)
183+
return failure();
184+
185+
// Report back the relevant handles to the transform op.
186+
originalProducerOps.push_back(fuseProducerResults->origProducer.getOwner());
187+
fusedProducerOps.push_back(fuseProducerResults->tiledOps[0]);
188+
}
189+
190+
transformResults.set(transformOp->getOpResult(0), originalProducerOps);
191+
transformResults.set(transformOp->getOpResult(1), fusedProducerOps);
192+
return success();
193+
}
194+
195+
/// Run producer fusion on every payload op associated with the target handle.
/// Any failure reported by `applyFuseProducer` is surfaced as a definite
/// failure; otherwise the op succeeds.
DiagnosedSilenceableFailure
transform::TestFuseProducerOp::apply(TransformRewriter &rewriter,
                                     TransformResults &transformResults,
                                     TransformState &state) {
  if (failed(applyFuseProducer(rewriter, getOperation(),
                               state.getPayloadOps(getTarget()),
                               transformResults)))
    return DiagnosedSilenceableFailure::definiteFailure();
  return DiagnosedSilenceableFailure::success();
}
205+
206+
/// Declare the memory effects of this transform op: the target handle is
/// consumed, every op result is a newly produced handle, and the payload IR
/// is modified.
void transform::TestFuseProducerOp::getEffects(
    SmallVectorImpl<MemoryEffects::EffectInstance> &effects) {
  // Operand side: the incoming handle is consumed.
  consumesHandle(getTargetMutable(), effects);
  // Result side: all results are fresh handles.
  producesHandle(getOperation()->getOpResults(), effects);
  // The payload IR itself is rewritten.
  modifiesPayload(effects);
}
212+
163213
//===----------------------------------------------------------------------===//
164214
// TestFuseConsumerOp
165215
//===----------------------------------------------------------------------===//

mlir/test/lib/Interfaces/TilingInterface/TestTilingInterfaceTransformOps.td

+19
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,25 @@ def TestFuseAndYieldOp : Op<Transform_Dialect, "test.fuse_and_yield",
4949
}];
5050
}
5151

52+
// Test-only transform op exercising `scf::tileAndFuseProducerOfSlice`.
def TestFuseProducerOp : Op<Transform_Dialect, "test.fuse_producer",
       [DeclareOpInterfaceMethods<TransformOpInterface>,
        DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
        ReportTrackingListenerFailuresOpTrait]> {
  let description = [{
    Fuses the producer of each slice operation pointed to by the target
    handle by calling `scf::tileAndFuseProducerOfSlice`. The op takes no
    options.

    Returns two handles: one to the original producer operations and one to
    the corresponding tiled-and-fused producer operations.
  }];

  let arguments =
    (ins TransformHandleTypeInterface:$target);
  let results = (outs TransformHandleTypeInterface:$producer,
                      TransformHandleTypeInterface:$fused_producer);

  let assemblyFormat = [{
    $target attr-dict `:` functional-type(operands, results)
  }];
}
70+
5271
def TestFuseConsumerOp : Op<Transform_Dialect, "test.fuse_consumer",
5372
[DeclareOpInterfaceMethods<TransformOpInterface>,
5473
DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,

0 commit comments

Comments
 (0)