
Commit 66118fd

fixup! Propagate the mask to underlying intrinsics as well

1 parent 69389a3
13 files changed: +397, -369 lines

llvm/include/llvm/CodeGen/TargetLowering.h (+9, -11)
@@ -3157,11 +3157,11 @@ class TargetLoweringBase {
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
-  /// \p LoadOp is a vector load or vp.load instruction.
+  /// \p LI is the vector load instruction.
   /// \p Shuffles is the shufflevector list to DE-interleave the loaded vector.
   /// \p Indices is the corresponding indices for each shufflevector.
   /// \p Factor is the interleave factor.
-  virtual bool lowerInterleavedLoad(Instruction *LoadOp,
+  virtual bool lowerInterleavedLoad(LoadInst *LI,
                                     ArrayRef<ShuffleVectorInst *> Shuffles,
                                     ArrayRef<unsigned> Indices,
                                     unsigned Factor) const {
@@ -3171,24 +3171,23 @@ class TargetLoweringBase {
   /// Lower an interleaved store to target specific intrinsics. Return
   /// true on success.
   ///
-  /// \p StoreOp is a vector store or vp.store instruction.
+  /// \p SI is the vector store instruction.
   /// \p SVI is the shufflevector to RE-interleave the stored vector.
   /// \p Factor is the interleave factor.
-  virtual bool lowerInterleavedStore(Instruction *StoreOp,
-                                     ShuffleVectorInst *SVI,
+  virtual bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                                      unsigned Factor) const {
     return false;
   }

-  /// Lower an interleaved load to target specific intrinsics. Return
+  /// Lower a deinterleaved load to target specific intrinsics. Return
   /// true on success.
   ///
   /// \p Load is a vp.load instruction.
   /// \p Mask is a mask value
   /// \p DeinterleaveRes is a list of deinterleaved results.
   virtual bool
-  lowerDeinterleavedIntrinsicToVPLoad(VPIntrinsic *Load, Value *Mask,
-                                      ArrayRef<Value *> DeinterleaveRes) const {
+  lowerDeinterleavedVPLoad(VPIntrinsic *Load, Value *Mask,
+                           ArrayRef<Value *> DeinterleaveRes) const {
     return false;
   }

@@ -3198,9 +3197,8 @@ class TargetLoweringBase {
   /// \p Store is the vp.store instruction.
   /// \p Mask is a mask value
   /// \p InterleaveOps is a list of values being interleaved.
-  virtual bool
-  lowerInterleavedIntrinsicToVPStore(VPIntrinsic *Store, Value *Mask,
-                                     ArrayRef<Value *> InterleaveOps) const {
+  virtual bool lowerInterleavedVPStore(VPIntrinsic *Store, Value *Mask,
+                                       ArrayRef<Value *> InterleaveOps) const {
     return false;
   }

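For orientation, a minimal IR sketch (illustrative, not part of this commit; value names are made up) of the shuffle-based pattern the renamed lowerDeinterleavedVPLoad hook handles: a masked vp.load whose lanes are split out by de-interleaving shufflevectors. The hook receives the vp.load, the per-lane mask, and the shuffles as the deinterleaved results.

; Factor-2 de-interleave of a masked vp.load
%wide = call <8 x i32> @llvm.vp.load.v8i32.p0(ptr %p, <8 x i1> %m, i32 8)
%even = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
%odd  = shufflevector <8 x i32> %wide, <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>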
llvm/include/llvm/IR/IntrinsicsRISCV.td (+20)
@@ -1705,19 +1705,39 @@ let TargetPrefix = "riscv" in {

   // Segment loads/stores for fixed vectors.
   foreach nf = [2, 3, 4, 5, 6, 7, 8] in {
+    // Input: (pointer, vl)
     def int_riscv_seg # nf # _load
           : DefaultAttrsIntrinsic<!listconcat([llvm_anyvector_ty],
                                               !listsplat(LLVMMatchType<0>,
                                                          !add(nf, -1))),
                                   [llvm_anyptr_ty, llvm_anyint_ty],
                                   [NoCapture<ArgIndex<0>>, IntrReadMem]>;
+    // Input: (pointer, mask, vl)
+    def int_riscv_seg # nf # _load_mask
+          : DefaultAttrsIntrinsic<!listconcat([llvm_anyvector_ty],
+                                              !listsplat(LLVMMatchType<0>,
+                                                         !add(nf, -1))),
+                                  [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                   llvm_anyint_ty],
+                                  [NoCapture<ArgIndex<0>>, IntrReadMem]>;
+
+    // Input: (<stored values>, pointer, vl)
     def int_riscv_seg # nf # _store
           : DefaultAttrsIntrinsic<[],
                                   !listconcat([llvm_anyvector_ty],
                                               !listsplat(LLVMMatchType<0>,
                                                          !add(nf, -1)),
                                               [llvm_anyptr_ty, llvm_anyint_ty]),
                                   [NoCapture<ArgIndex<nf>>, IntrWriteMem]>;
+    // Input: (<stored values>, pointer, mask, vl)
+    def int_riscv_seg # nf # _store_mask
+          : DefaultAttrsIntrinsic<[],
+                                  !listconcat([llvm_anyvector_ty],
+                                              !listsplat(LLVMMatchType<0>,
+                                                         !add(nf, -1)),
+                                              [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                                               llvm_anyint_ty]),
+                                  [NoCapture<ArgIndex<nf>>, IntrWriteMem]>;
   }

 } // TargetPrefix = "riscv"

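A hedged sketch of how the new masked variants could look at an IR call site, assuming the usual overloaded-type name mangling (the exact .v4i32.i64 suffixes depend on the types used and are an assumption here, not taken from the diff):

; Masked segment load, nf = 2: (pointer, mask, vl)
%seg = call { <4 x i32>, <4 x i32> } @llvm.riscv.seg2.load.mask.v4i32.i64(ptr %p, <4 x i1> %m, i64 4)

; Masked segment store, nf = 2: (<stored values>, pointer, mask, vl)
call void @llvm.riscv.seg2.store.mask.v4i32.i64(<4 x i32> %v0, <4 x i32> %v1, ptr %p, <4 x i1> %m, i64 4)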
llvm/lib/CodeGen/InterleavedAccessPass.cpp (+89, -99)
@@ -250,6 +250,31 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
   return false;
 }

+/// Return true if it's a non-all-zeros, interleaving mask. For instance,
+/// 111000111000 is interleaved from three 1010 masks.
+/// \p LaneMask returns the mask of an individual lane.
+static bool isInterleavedConstantMask(unsigned Factor, ConstantVector *Mask,
+                                      SmallVectorImpl<Constant *> &LaneMask) {
+  unsigned LaneMaskLen = LaneMask.size();
+  if (auto *Splat = Mask->getSplatValue()) {
+    // All-zeros mask.
+    if (Splat->isZeroValue())
+      return false;
+    // All-ones mask.
+    std::fill(LaneMask.begin(), LaneMask.end(),
+              ConstantInt::getTrue(Mask->getContext()));
+  } else {
+    for (unsigned Idx = 0U, N = LaneMaskLen * Factor; Idx < N; ++Idx) {
+      Constant *Ref = Mask->getAggregateElement((Idx / Factor) * Factor);
+      if (Ref != Mask->getAggregateElement(Idx))
+        return false;
+      LaneMask[Idx / Factor] = Ref;
+    }
+  }
+
+  return true;
+}
+
 bool InterleavedAccessImpl::lowerInterleavedLoad(
     Instruction *LoadOp, SmallSetVector<Instruction *, 32> &DeadInsts) {
   if (isa<ScalableVectorType>(LoadOp->getType()))
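A worked example of what isInterleavedConstantMask accepts (illustrative constants): with Factor = 3 and a lane mask of length 4, every group of Factor consecutive mask elements must be uniform, and each group contributes one element of the per-lane mask.

; vp.load/vp.store mask, Factor = 3, four segments:
;   <12 x i1> <1,1,1, 0,0,0, 1,1,1, 0,0,0>
; decomposes into the per-lane mask
;   <4 x i1> <1,0,1,0>
; whereas <1,1,0, 0,0,0, ...> is rejected: its first group is not uniform.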
@@ -261,8 +286,7 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   } else if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
     assert(VPLoad->getIntrinsicID() == Intrinsic::vp_load);
     // Require a constant mask and evl.
-    if (!isa<ConstantVector>(VPLoad->getArgOperand(1)) ||
-        !isa<ConstantInt>(VPLoad->getArgOperand(2)))
+    if (!isa<ConstantVector>(VPLoad->getArgOperand(1)))
       return false;
   } else {
     llvm_unreachable("unsupported load operation");
@@ -315,24 +339,6 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
                                             NumLoadElements))
     return false;

-  // If this is a vp.load, record its mask (NOT shuffle mask).
-  BitVector MaskedIndices(NumLoadElements);
-  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
-    auto *Mask = cast<ConstantVector>(VPLoad->getArgOperand(1));
-    assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
-           NumLoadElements);
-    if (auto *Splat = Mask->getSplatValue()) {
-      // All-zeros mask, bail out early.
-      if (Splat->isZeroValue())
-        return false;
-    } else {
-      for (unsigned i = 0U; i < NumLoadElements; ++i) {
-        if (Mask->getAggregateElement(i)->isZeroValue())
-          MaskedIndices.set(i);
-      }
-    }
-  }
-
   // Holds the corresponding index for each DE-interleave shuffle.
   SmallVector<unsigned, 4> Indices;

@@ -373,48 +379,35 @@ bool InterleavedAccessImpl::lowerInterleavedLoad(
   bool BinOpShuffleChanged =
       replaceBinOpShuffles(BinOpShuffles.getArrayRef(), Shuffles, LoadOp);

-  // Check if we extract only the unmasked elements.
-  if (MaskedIndices.any()) {
-    if (any_of(Shuffles, [&](const auto *Shuffle) {
-          ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
-          for (int Idx : ShuffleMask) {
-            if (Idx < 0)
-              continue;
-            if (MaskedIndices.test(unsigned(Idx)))
-              return true;
-          }
-          return false;
-        })) {
-      LLVM_DEBUG(dbgs() << "IA: trying to extract a masked element through "
-                        << "shufflevector\n");
-      return false;
-    }
-  }
-  // Check if we extract only the elements within evl.
+  // Check if the de-interleaved vp.load masks are the same.
+  unsigned ShuffleMaskLen = Shuffles[0]->getShuffleMask().size();
+  SmallVector<Constant *, 8> LaneMask(ShuffleMaskLen, nullptr);
   if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
-    uint64_t EVL = cast<ConstantInt>(VPLoad->getArgOperand(2))->getZExtValue();
-    if (any_of(Shuffles, [&](const auto *Shuffle) {
-          ArrayRef<int> ShuffleMask = Shuffle->getShuffleMask();
-          for (int Idx : ShuffleMask) {
-            if (Idx < 0)
-              continue;
-            if (unsigned(Idx) >= EVL)
-              return true;
-          }
-          return false;
-        })) {
-      LLVM_DEBUG(
-          dbgs() << "IA: trying to extract an element out of EVL range\n");
+    if (!isInterleavedConstantMask(
+            Factor, cast<ConstantVector>(VPLoad->getArgOperand(1)), LaneMask))
       return false;
-    }
   }

   LLVM_DEBUG(dbgs() << "IA: Found an interleaved load: " << *LoadOp << "\n");

-  // Try to create target specific intrinsics to replace the load and shuffles.
-  if (!TLI->lowerInterleavedLoad(LoadOp, Shuffles, Indices, Factor)) {
-    // If Extracts is not empty, tryReplaceExtracts made changes earlier.
-    return !Extracts.empty() || BinOpShuffleChanged;
+  if (auto *VPLoad = dyn_cast<VPIntrinsic>(LoadOp)) {
+    auto *MaskVec = ConstantVector::get(LaneMask);
+    // Sometimes the number of Shuffles might be less than Factor, so we have
+    // to fill the gaps with null. Also, lowerDeinterleavedVPLoad expects them
+    // to be sorted.
+    SmallVector<Value *, 4> ShuffleValues(Factor, nullptr);
+    for (auto [Idx, ShuffleMaskIdx] : enumerate(Indices))
+      ShuffleValues[ShuffleMaskIdx] = Shuffles[Idx];
+    if (!TLI->lowerDeinterleavedVPLoad(VPLoad, MaskVec, ShuffleValues))
+      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+      return !Extracts.empty() || BinOpShuffleChanged;
+  } else {
+    // Try to create target specific intrinsics to replace the load and
+    // shuffles.
+    if (!TLI->lowerInterleavedLoad(cast<LoadInst>(LoadOp), Shuffles, Indices,
+                                   Factor))
+      // If Extracts is not empty, tryReplaceExtracts made changes earlier.
+      return !Extracts.empty() || BinOpShuffleChanged;
   }

   DeadInsts.insert_range(Shuffles);
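An IR sketch (not from this commit; names and constants are illustrative) of the gap-filling above: with Factor = 3, shuffles may survive for only some lanes, so the missing slots are passed as null and ShuffleValues[i] always corresponds to lane i.

; Only lanes 0 and 2 are extracted; ShuffleValues becomes
; { %lane0, nullptr, %lane2 } for the lowerDeinterleavedVPLoad call.
; %cmask stands for a constant interleaved mask such as the factor-3
; example shown earlier.
%wide  = call <12 x i32> @llvm.vp.load.v12i32.p0(ptr %p, <12 x i1> %cmask, i32 12)
%lane0 = shufflevector <12 x i32> %wide, <12 x i32> poison, <4 x i32> <i32 0, i32 3, i32 6, i32 9>
%lane2 = shufflevector <12 x i32> %wide, <12 x i32> poison, <4 x i32> <i32 2, i32 5, i32 8, i32 11>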
@@ -530,9 +523,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
     StoredValue = SI->getValueOperand();
   } else if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
     assert(VPStore->getIntrinsicID() == Intrinsic::vp_store);
-    // Require a constant mask and evl.
-    if (!isa<ConstantVector>(VPStore->getArgOperand(2)) ||
-        !isa<ConstantInt>(VPStore->getArgOperand(3)))
+    // Require a constant mask.
+    if (!isa<ConstantVector>(VPStore->getArgOperand(2)))
       return false;
     StoredValue = VPStore->getArgOperand(0);
   } else {
@@ -545,53 +537,53 @@ bool InterleavedAccessImpl::lowerInterleavedStore(

   unsigned NumStoredElements =
       cast<FixedVectorType>(SVI->getType())->getNumElements();
-  // If this is a vp.store, record its mask (NOT shuffle mask).
-  BitVector MaskedIndices(NumStoredElements);
-  if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
-    auto *Mask = cast<ConstantVector>(VPStore->getArgOperand(2));
-    assert(cast<FixedVectorType>(Mask->getType())->getNumElements() ==
-           NumStoredElements);
-    if (auto *Splat = Mask->getSplatValue()) {
-      // All-zeros mask, bail out early.
-      if (Splat->isZeroValue())
-        return false;
-    } else {
-      for (unsigned i = 0U; i < NumStoredElements; ++i) {
-        if (Mask->getAggregateElement(i)->isZeroValue())
-          MaskedIndices.set(i);
-      }
-    }
-  }
-
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
   if (!isReInterleaveMask(SVI, Factor, MaxFactor))
     return false;
+  assert(NumStoredElements % Factor == 0 &&
+         "number of stored elements should be a multiple of Factor");

-  // Check if we store only the unmasked elements.
-  if (MaskedIndices.any()) {
-    if (any_of(SVI->getShuffleMask(), [&](int Idx) {
-          return Idx >= 0 && MaskedIndices.test(unsigned(Idx));
-        })) {
-      LLVM_DEBUG(dbgs() << "IA: trying to store a masked element\n");
-      return false;
-    }
-  }
-  // Check if we store only the elements within evl.
+  // Check if the de-interleaved vp.store masks are the same.
+  unsigned LaneMaskLen = NumStoredElements / Factor;
+  SmallVector<Constant *, 8> LaneMask(LaneMaskLen, nullptr);
   if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
-    uint64_t EVL = cast<ConstantInt>(VPStore->getArgOperand(3))->getZExtValue();
-    if (any_of(SVI->getShuffleMask(),
-               [&](int Idx) { return Idx >= 0 && unsigned(Idx) >= EVL; })) {
-      LLVM_DEBUG(dbgs() << "IA: trying to store an element out of EVL range\n");
+    if (!isInterleavedConstantMask(
+            Factor, cast<ConstantVector>(VPStore->getArgOperand(2)), LaneMask))
       return false;
-    }
   }

   LLVM_DEBUG(dbgs() << "IA: Found an interleaved store: " << *StoreOp << "\n");

-  // Try to create target specific intrinsics to replace the store and shuffle.
-  if (!TLI->lowerInterleavedStore(StoreOp, SVI, Factor))
-    return false;
+  if (auto *VPStore = dyn_cast<VPIntrinsic>(StoreOp)) {
+    IRBuilder<> Builder(VPStore);
+    // We need to effectively de-interleave the shufflemask because
+    // lowerInterleavedVPStore expects individual de-interleaved values.
+    SmallVector<Value *, 10> NewShuffles;
+    SmallVector<int, 16> NewShuffleMask(LaneMaskLen);
+    auto ShuffleMask = SVI->getShuffleMask();
+
+    for (unsigned i = 0; i < Factor; i++) {
+      for (unsigned j = 0; j < LaneMaskLen; j++)
+        NewShuffleMask[j] = ShuffleMask[i + Factor * j];
+
+      NewShuffles.push_back(Builder.CreateShuffleVector(
+          SVI->getOperand(0), SVI->getOperand(1), NewShuffleMask));
+    }
+
+    // Try to create target specific intrinsics to replace the vp.store and
+    // shuffle.
+    if (!TLI->lowerInterleavedVPStore(VPStore, ConstantVector::get(LaneMask),
+                                      NewShuffles))
+      // We already created new shuffles.
+      return true;
+  } else {
+    // Try to create target specific intrinsics to replace the store and
+    // shuffle.
+    if (!TLI->lowerInterleavedStore(cast<StoreInst>(StoreOp), SVI, Factor))
+      return false;
+  }

   // Already have a new target specific interleaved store. Erase the old store.
   DeadInsts.insert(StoreOp);
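A matching store-side sketch (illustrative, not from this commit): a factor-2 re-interleaving shuffle feeding a vp.store with an interleaved constant mask. The loop above rebuilds the original lanes; here NewShuffleMask selects <0,1,2,3> (that is, %a) for i = 0 and <4,5,6,7> (%b) for i = 1, and the per-lane mask handed to lowerInterleavedVPStore is <1,0,1,0>.

%iv = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
call void @llvm.vp.store.v8i32.p0(<8 x i32> %iv, ptr %p, <8 x i1> <i1 true, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 false>, i32 8)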
@@ -806,8 +798,7 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(

     // Since lowerInterleaveLoad expects Shuffles and LoadInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerDeinterleavedIntrinsicToVPLoad(VPLoad, Mask,
-                                                  DeinterleaveValues))
+    if (!TLI->lowerDeinterleavedVPLoad(VPLoad, Mask, DeinterleaveValues))
       return false;

   } else {
@@ -859,8 +850,7 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(

     // Since lowerInterleavedStore expects Shuffle and StoreInst, use special
     // TLI function to emit target-specific interleaved instruction.
-    if (!TLI->lowerInterleavedIntrinsicToVPStore(VPStore, Mask,
-                                                 InterleaveValues))
+    if (!TLI->lowerInterleavedVPStore(VPStore, Mask, InterleaveValues))
       return false;
   } else {
     auto *SI = cast<StoreInst>(StoredBy);

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp (+2, -10)
@@ -17176,18 +17176,14 @@ static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
 ///      %vec0 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 0
 ///      %vec1 = extractelement { <4 x i32>, <4 x i32> } %ld2, i32 1
 bool AArch64TargetLowering::lowerInterleavedLoad(
-    Instruction *LoadOp, ArrayRef<ShuffleVectorInst *> Shuffles,
+    LoadInst *LI, ArrayRef<ShuffleVectorInst *> Shuffles,
     ArrayRef<unsigned> Indices, unsigned Factor) const {
   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");
   assert(!Shuffles.empty() && "Empty shufflevector input");
   assert(Shuffles.size() == Indices.size() &&
          "Unmatched number of shufflevectors and indices");

-  auto *LI = dyn_cast<LoadInst>(LoadOp);
-  if (!LI)
-    return false;
-
   const DataLayout &DL = LI->getDataLayout();

   VectorType *VTy = Shuffles[0]->getType();
@@ -17363,17 +17359,13 @@ bool hasNearbyPairedStore(Iter It, Iter End, Value *Ptr, const DataLayout &DL) {
 ///        %sub.v1 = shuffle <32 x i32> %v0, <32 x i32> v1, <32, 33, 34, 35>
 ///        %sub.v2 = shuffle <32 x i32> %v0, <32 x i32> v1, <16, 17, 18, 19>
 ///        call void llvm.aarch64.neon.st3(%sub.v0, %sub.v1, %sub.v2, %ptr)
-bool AArch64TargetLowering::lowerInterleavedStore(Instruction *StoreOp,
+bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
                                                   ShuffleVectorInst *SVI,
                                                   unsigned Factor) const {

   assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
          "Invalid interleave factor");

-  auto *SI = dyn_cast<StoreInst>(StoreOp);
-  if (!SI)
-    return false;
-
   auto *VecTy = cast<FixedVectorType>(SVI->getType());
   assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
