Skip to content

[AMDGPU] Do not add cost for register aligned shufflevectors #137052

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
79 changes: 73 additions & 6 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1120,6 +1120,64 @@ Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
}
}

/// Certain shufflemasks may not be either Identity masks or InsertSubvector
/// masks, but do not require instructions to produce. An example is if we are
/// shuffling two <16 x i8> sources with the 16 element mask: {0, 1, 2, 3, 4, 5,
/// 6, 7, 24, 25, 26, 27, poison, poison, posion, poison}. The result of this
/// shuffle is {first v4i8 of src0, second v4i8 of src0, third v4i8 of src1,
/// posion}. In order to produce this result, we do not need to insert shuffle
/// code, as these vectors already exist the source registers. Thus, we simply
/// need to ensure these registers are contiguous to produce the result.
/// countIdentityPerms analyzes the \p Mask to count the number of such register
/// aligned vectors (based on the provided \p ScalarSize ).
static unsigned countIdentityPerms(ArrayRef<int> Mask, unsigned ScalarSize) {
unsigned IdentityPerms = 0;
unsigned EltsPerPerm = 32 / ScalarSize;
if (!EltsPerPerm)
return 0;

// Split the shuffle mask into a number of 32 bit wide shuffles.
for (unsigned PermCand = 0; PermCand < (Mask.size() / EltsPerPerm);
PermCand++) {
std::pair<int, int> BasisIndex(-1, -1);
bool FoundMismatch = false;

// Analyze the 32 bit mask for register-aligned vectors.
for (int PermElement = 0; PermElement < (int)EltsPerPerm; PermElement++) {
unsigned Index = PermCand * EltsPerPerm + PermElement;
assert(Index < Mask.size());
int MaskVal = Mask[Index];

// Maskval of -1 is dont-care.
if (MaskVal == -1)
continue;
if (BasisIndex.second == -1) {
// Check if this mask represents alignment to bit position in the
// regsiter.
if (PermElement > MaskVal || ((MaskVal - PermElement) % EltsPerPerm)) {
FoundMismatch = true;
}
BasisIndex = {MaskVal, PermElement};
continue;
}

if (MaskVal < BasisIndex.first) {
FoundMismatch = true;
break;
}

// Check if this mask is contiguous with the previously matched mask
if ((MaskVal - BasisIndex.first) != (PermElement - BasisIndex.second)) {
FoundMismatch = true;
break;
}
}
if (!FoundMismatch)
IdentityPerms += 1;
}
return IdentityPerms;
}

InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
VectorType *VT, ArrayRef<int> Mask,
TTI::TargetCostKind CostKind,
Expand All @@ -1133,12 +1191,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,

// Larger vector widths may require additional instructions, but are
// typically cheaper than scalarized versions.
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
unsigned ScalarSize = DL.getTypeSizeInBits(VT->getElementType());
if (ST->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
ScalarSize == 16) {
bool HasVOP3P = ST->hasVOP3PInsts();
unsigned RequestedElts =
count_if(Mask, [](int MaskElt) { return MaskElt != -1; });
unsigned NumVectorElts = cast<FixedVectorType>(VT)->getNumElements();
if (RequestedElts == 0)
return 0;
switch (Kind) {
Expand All @@ -1149,9 +1208,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
// half of a register, so any swizzle of two elements is free.
if (HasVOP3P && NumVectorElts == 2)
return 0;
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
unsigned NumPerms = alignTo(Mask.size(), 2) / 2;
unsigned IdentPerms = countIdentityPerms(Mask, ScalarSize);
assert(IdentPerms <= NumPerms);
NumPerms -= IdentPerms;
// SK_Broadcast just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Broadcast ? 1 : NumPerms;
unsigned NumPermMasks =
Kind == (TTI::SK_Broadcast && NumPerms > 1) ? 1 : NumPerms;
return NumPerms + NumPermMasks;
}
case TTI::SK_ExtractSubvector:
Expand All @@ -1166,9 +1229,13 @@ InstructionCost GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
case TTI::SK_PermuteTwoSrc:
case TTI::SK_Splice:
case TTI::SK_Select: {
unsigned NumPerms = alignTo(RequestedElts, 2) / 2;
unsigned NumPerms = alignTo(Mask.size(), 2) / 2;
unsigned IdentPerms = countIdentityPerms(Mask, ScalarSize);
assert(IdentPerms <= NumPerms);
NumPerms -= IdentPerms;
// SK_Select just reuses the same mask
unsigned NumPermMasks = Kind == TTI::SK_Select ? 1 : NumPerms;
unsigned NumPermMasks =
Kind == (TTI::SK_Select && NumPerms > 1) ? 1 : NumPerms;
return NumPerms + NumPermMasks;
}

Expand Down
Loading
Loading