Skip to content

Commit e377dc4

Browse files
authored
[AMDGPU] Max. WG size-induced occupancy limits max. waves/EU (#137807)
The default maximum waves/EU returned by the `AMDGPUSubtarget::getWavesPerEU` family of methods is currently the maximum number of waves/EU supported by the subtarget (only a valid occupancy range in "amdgpu-waves-per-eu" may lower that maximum). This ignores the maximum achievable occupancy imposed by flat workgroup size and LDS usage, resulting in situations where `AMDGPUSubtarget::getWavesPerEU` produces a maximum higher than the one from `AMDGPUSubtarget::getOccupancyWithWorkGroupSizes`. This commit limits the waves/EU range's maximum to the maximum achievable occupancy derived from flat workgroup sizes and LDS usage. The change only affects functions that restrict flat workgroup size with "amdgpu-flat-work-group-size", since the default range of flat workgroup sizes achieves the maximum number of waves/EU supported by the subtarget. Improvements to the handling of "amdgpu-waves-per-eu" are left for a follow-up PR (e.g., I think the attribute should be able to lower the full range of waves/EU produced by these methods).
1 parent 212f245 commit e377dc4

7 files changed

+128
-114
lines changed

llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp

+11-2
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ class AMDGPUInformationCache : public InformationCache {
209209
getWavesPerEU(const Function &F,
210210
std::pair<unsigned, unsigned> FlatWorkGroupSize) {
211211
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
212-
return ST.getWavesPerEU(F, FlatWorkGroupSize);
212+
return ST.getWavesPerEU(FlatWorkGroupSize, getLDSSize(F), F);
213213
}
214214

215215
std::optional<std::pair<unsigned, unsigned>>
@@ -230,7 +230,8 @@ class AMDGPUInformationCache : public InformationCache {
230230
std::pair<unsigned, unsigned> WavesPerEU,
231231
std::pair<unsigned, unsigned> FlatWorkGroupSize) {
232232
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
233-
return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize);
233+
return ST.getEffectiveWavesPerEU(WavesPerEU, FlatWorkGroupSize,
234+
getLDSSize(F));
234235
}
235236

236237
unsigned getMaxWavesPerEU(const Function &F) {
@@ -255,6 +256,14 @@ class AMDGPUInformationCache : public InformationCache {
255256
return Status;
256257
}
257258

259+
/// Returns the minimum amount of LDS space used by a workgroup running
260+
/// function \p F.
261+
static unsigned getLDSSize(const Function &F) {
262+
return AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
263+
{0, UINT32_MAX}, true)
264+
.first;
265+
}
266+
258267
/// Get the constant access bitmap for \p C.
259268
uint8_t getConstantAccess(const Constant *C,
260269
SmallPtrSetImpl<const Constant *> &Visited) {

llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp

+9-23
Original file line numberDiff line numberDiff line change
@@ -195,12 +195,14 @@ class AMDGPUPromoteAllocaToVector : public FunctionPass {
195195
}
196196
};
197197

198-
unsigned getMaxVGPRs(const TargetMachine &TM, const Function &F) {
198+
static unsigned getMaxVGPRs(unsigned LDSBytes, const TargetMachine &TM,
199+
const Function &F) {
199200
if (!TM.getTargetTriple().isAMDGCN())
200201
return 128;
201202

202203
const GCNSubtarget &ST = TM.getSubtarget<GCNSubtarget>(F);
203-
unsigned MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
204+
unsigned MaxVGPRs = ST.getMaxNumVGPRs(
205+
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), LDSBytes, F).first);
204206

205207
// A non-entry function has only 32 caller preserved registers.
206208
// Do not promote alloca which will force spilling unless we know the function
@@ -336,10 +338,9 @@ bool AMDGPUPromoteAllocaImpl::run(Function &F, bool PromoteToLDS) {
336338
if (!ST.isPromoteAllocaEnabled())
337339
return false;
338340

339-
MaxVGPRs = getMaxVGPRs(TM, F);
340-
setFunctionLimits(F);
341-
342341
bool SufficientLDS = PromoteToLDS && hasSufficientLocalMem(F);
342+
MaxVGPRs = getMaxVGPRs(CurrentLocalMemUsage, TM, F);
343+
setFunctionLimits(F);
343344

344345
unsigned VectorizationBudget =
345346
(PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
@@ -1452,29 +1453,14 @@ bool AMDGPUPromoteAllocaImpl::hasSufficientLocalMem(const Function &F) {
14521453
}
14531454

14541455
unsigned MaxOccupancy =
1455-
ST.getOccupancyWithWorkGroupSizes(CurrentLocalMemUsage, F).second;
1456-
1457-
// Restrict local memory usage so that we don't drastically reduce occupancy,
1458-
// unless it is already significantly reduced.
1459-
1460-
// TODO: Have some sort of hint or other heuristics to guess occupancy based
1461-
// on other factors..
1462-
unsigned OccupancyHint = ST.getWavesPerEU(F).second;
1463-
if (OccupancyHint == 0)
1464-
OccupancyHint = 7;
1465-
1466-
// Clamp to max value.
1467-
OccupancyHint = std::min(OccupancyHint, ST.getMaxWavesPerEU());
1468-
1469-
// Check the hint but ignore it if it's obviously wrong from the existing LDS
1470-
// usage.
1471-
MaxOccupancy = std::min(OccupancyHint, MaxOccupancy);
1456+
ST.getWavesPerEU(ST.getFlatWorkGroupSizes(F), CurrentLocalMemUsage, F)
1457+
.second;
14721458

14731459
// Round up to the next tier of usage.
14741460
unsigned MaxSizeWithWaveCount =
14751461
ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
14761462

1477-
// Program is possibly broken by using more local mem than available.
1463+
// Program may already use more LDS than is usable at maximum occupancy.
14781464
if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
14791465
return false;
14801466

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp

+35-28
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,9 @@ AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
5555
return getLocalMemorySize() / WorkGroupsPerCU;
5656
}
5757

58-
std::pair<unsigned, unsigned>
59-
AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
60-
const Function &F) const {
58+
std::pair<unsigned, unsigned> AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(
59+
uint32_t LDSBytes, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
60+
6161
// FIXME: We should take into account the LDS allocation granularity.
6262
const unsigned MaxWGsLDS = getLocalMemorySize() / std::max(LDSBytes, 1u);
6363

@@ -81,7 +81,7 @@ AMDGPUSubtarget::getOccupancyWithWorkGroupSizes(uint32_t LDSBytes,
8181
// workgroups, maximum number of waves, and minimum occupancy. The opposite is
8282
// generally true for the minimum group size. LDS or barrier resource
8383
// limitations can flip those minimums/maximums.
84-
const auto [MinWGSize, MaxWGSize] = getFlatWorkGroupSizes(F);
84+
const auto [MinWGSize, MaxWGSize] = FlatWorkGroupSizes;
8585
auto [MinWavesPerWG, MaxWGsPerCU, MaxWavesPerCU] = PropsFromWGSize(MinWGSize);
8686
auto [MaxWavesPerWG, MinWGsPerCU, MinWavesPerCU] = PropsFromWGSize(MaxWGSize);
8787

@@ -180,45 +180,52 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
180180
}
181181

182182
std::pair<unsigned, unsigned> AMDGPUSubtarget::getEffectiveWavesPerEU(
183-
std::pair<unsigned, unsigned> Requested,
184-
std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
185-
// Default minimum/maximum number of waves per execution unit.
186-
std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
187-
188-
// If minimum/maximum flat work group sizes were explicitly requested using
189-
// "amdgpu-flat-workgroup-size" attribute, then set default minimum/maximum
190-
// number of waves per execution unit to values implied by requested
191-
// minimum/maximum flat work group sizes.
192-
unsigned MinImpliedByFlatWorkGroupSize =
193-
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
194-
Default.first = MinImpliedByFlatWorkGroupSize;
183+
std::pair<unsigned, unsigned> RequestedWavesPerEU,
184+
std::pair<unsigned, unsigned> FlatWorkGroupSizes, unsigned LDSBytes) const {
185+
// Default minimum/maximum number of waves per EU. The range of flat workgroup
186+
// sizes limits the achievable maximum, and we aim to support enough waves per
187+
// EU so that we can concurrently execute all waves of a single workgroup of
188+
// maximum size on a CU.
189+
std::pair<unsigned, unsigned> Default = {
190+
getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second),
191+
getOccupancyWithWorkGroupSizes(LDSBytes, FlatWorkGroupSizes).second};
192+
Default.first = std::min(Default.first, Default.second);
195193

196194
// Make sure requested minimum is less than requested maximum.
197-
if (Requested.second && Requested.first > Requested.second)
195+
if (RequestedWavesPerEU.second &&
196+
RequestedWavesPerEU.first > RequestedWavesPerEU.second)
198197
return Default;
199198

200-
// Make sure requested values do not violate subtarget's specifications.
201-
if (Requested.first < getMinWavesPerEU() ||
202-
Requested.second > getMaxWavesPerEU())
199+
// Make sure requested values do not violate subtarget's specifications and
200+
// are compatible with values implied by minimum/maximum flat workgroup sizes.
201+
if (RequestedWavesPerEU.first < Default.first ||
202+
RequestedWavesPerEU.second > Default.second)
203203
return Default;
204204

205-
// Make sure requested values are compatible with values implied by requested
206-
// minimum/maximum flat work group sizes.
207-
if (Requested.first < MinImpliedByFlatWorkGroupSize)
208-
return Default;
205+
return RequestedWavesPerEU;
206+
}
209207

210-
return Requested;
208+
std::pair<unsigned, unsigned>
209+
AMDGPUSubtarget::getWavesPerEU(const Function &F) const {
210+
// Default/requested minimum/maximum flat work group sizes.
211+
std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
212+
// Minimum number of bytes allocated in the LDS.
213+
unsigned LDSBytes = AMDGPU::getIntegerPairAttribute(F, "amdgpu-lds-size",
214+
{0, UINT32_MAX}, true)
215+
.first;
216+
return getWavesPerEU(FlatWorkGroupSizes, LDSBytes, F);
211217
}
212218

213-
std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
214-
const Function &F, std::pair<unsigned, unsigned> FlatWorkGroupSizes) const {
219+
std::pair<unsigned, unsigned>
220+
AMDGPUSubtarget::getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
221+
unsigned LDSBytes, const Function &F) const {
215222
// Default minimum/maximum number of waves per execution unit.
216223
std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
217224

218225
// Requested minimum/maximum number of waves per execution unit.
219226
std::pair<unsigned, unsigned> Requested =
220227
AMDGPU::getIntegerPairAttribute(F, "amdgpu-waves-per-eu", Default, true);
221-
return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes);
228+
return getEffectiveWavesPerEU(Requested, FlatWorkGroupSizes, LDSBytes);
222229
}
223230

224231
static unsigned getReqdWorkGroupSize(const Function &Kernel, unsigned Dim) {

llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h

+26-14
Original file line numberDiff line numberDiff line change
@@ -106,21 +106,24 @@ class AMDGPUSubtarget {
106106
/// be converted to integer, violate subtarget's specifications, or are not
107107
/// compatible with minimum/maximum number of waves limited by flat work group
108108
/// size, register usage, and/or lds usage.
109-
std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const {
110-
// Default/requested minimum/maximum flat work group sizes.
111-
std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
112-
return getWavesPerEU(F, FlatWorkGroupSizes);
113-
}
109+
std::pair<unsigned, unsigned> getWavesPerEU(const Function &F) const;
114110

115-
/// Overload which uses the specified values for the flat work group sizes,
116-
/// rather than querying the function itself. \p FlatWorkGroupSizes Should
117-
/// correspond to the function's value for getFlatWorkGroupSizes.
111+
/// Overload which uses the specified values for the flat workgroup sizes and
112+
/// LDS space rather than querying the function itself. \p FlatWorkGroupSizes
113+
/// should correspond to the function's value for getFlatWorkGroupSizes and \p
114+
/// LDSBytes to the per-workgroup LDS allocation.
118115
std::pair<unsigned, unsigned>
119-
getWavesPerEU(const Function &F,
120-
std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
121-
std::pair<unsigned, unsigned> getEffectiveWavesPerEU(
122-
std::pair<unsigned, unsigned> WavesPerEU,
123-
std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
116+
getWavesPerEU(std::pair<unsigned, unsigned> FlatWorkGroupSizes,
117+
unsigned LDSBytes, const Function &F) const;
118+
119+
/// Returns the target minimum/maximum number of waves per EU. This is based
120+
/// on the minimum/maximum number of \p RequestedWavesPerEU and further
121+
/// limited by the maximum achievable occupancy derived from the range of \p
122+
/// FlatWorkGroupSizes and number of \p LDSBytes per workgroup.
123+
std::pair<unsigned, unsigned>
124+
getEffectiveWavesPerEU(std::pair<unsigned, unsigned> RequestedWavesPerEU,
125+
std::pair<unsigned, unsigned> FlatWorkGroupSizes,
126+
unsigned LDSBytes) const;
124127

125128
/// Return the amount of LDS that can be used that will not restrict the
126129
/// occupancy lower than WaveCount.
@@ -133,7 +136,16 @@ class AMDGPUSubtarget {
133136
/// This notably depends on the range of allowed flat group sizes for the
134137
/// function and hardware characteristics.
135138
std::pair<unsigned, unsigned>
136-
getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const;
139+
getOccupancyWithWorkGroupSizes(uint32_t LDSBytes, const Function &F) const {
140+
return getOccupancyWithWorkGroupSizes(LDSBytes, getFlatWorkGroupSizes(F));
141+
}
142+
143+
/// Overload which uses the specified values for the flat work group sizes,
144+
/// rather than querying the function itself. \p FlatWorkGroupSizes should
145+
/// correspond to the function's value for getFlatWorkGroupSizes.
146+
std::pair<unsigned, unsigned> getOccupancyWithWorkGroupSizes(
147+
uint32_t LDSBytes,
148+
std::pair<unsigned, unsigned> FlatWorkGroupSizes) const;
137149

138150
/// Subtarget's minimum/maximum occupancy, in number of waves per EU, that can
139151
/// be achieved when the only function running on a CU is \p MF. This notably

llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll

+6-6
Original file line numberDiff line numberDiff line change
@@ -24,20 +24,20 @@ entry:
2424
attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
2525

2626
; CHECK-LABEL: {{^}}min_128_max_128:
27-
; CHECK: SGPRBlocks: 0
28-
; CHECK: VGPRBlocks: 0
29-
; CHECK: NumSGPRsForWavesPerEU: 1
30-
; CHECK: NumVGPRsForWavesPerEU: 1
27+
; CHECK: SGPRBlocks: 8
28+
; CHECK: VGPRBlocks: 7
29+
; CHECK: NumSGPRsForWavesPerEU: 65
30+
; CHECK: NumVGPRsForWavesPerEU: 29
3131
define amdgpu_kernel void @min_128_max_128() #2 {
3232
entry:
3333
ret void
3434
}
3535
attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
3636

3737
; CHECK-LABEL: {{^}}min_1024_max_1024
38-
; CHECK: SGPRBlocks: 2
38+
; CHECK: SGPRBlocks: 8
3939
; CHECK: VGPRBlocks: 10
40-
; CHECK: NumSGPRsForWavesPerEU: 24{{$}}
40+
; CHECK: NumSGPRsForWavesPerEU: 65
4141
; CHECK: NumVGPRsForWavesPerEU: 43
4242
@var = addrspace(1) global float 0.0
4343
define amdgpu_kernel void @min_1024_max_1024() #3 {

llvm/test/CodeGen/AMDGPU/load-global-i16.ll

+40-40
Original file line numberDiff line numberDiff line change
@@ -6581,50 +6581,50 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
65816581
; GCN-NOHSA-SI-NEXT: s_mov_b32 s9, s7
65826582
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0
65836583
; GCN-NOHSA-SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
6584+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v8, 0
65846585
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(1)
6585-
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v10, 16, v1
6586-
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v14, 16, v2
6587-
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v18, 16, v0
6588-
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v16, 0xffff, v0
6589-
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v12, 0xffff, v2
6590-
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v8, 0xffff, v1
6591-
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v3
6592-
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v3
6593-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v21, 0
6586+
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v13, 16, v1
6587+
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v17, 16, v3
65946588
; GCN-NOHSA-SI-NEXT: s_waitcnt vmcnt(0)
6595-
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v22, 16, v5
6596-
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v26, 16, v6
6597-
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v24, 0xffff, v6
6598-
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v6, 16, v4
6599-
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
6600-
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v30, 16, v7
6601-
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v28, 0xffff, v7
6602-
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v20, 0xffff, v5
6603-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v23, v21
6604-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v29, v21
6605-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v31, v21
6606-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v21
6607-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v21
6608-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v9, v21
6609-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v11, v21
6610-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v5, v21
6611-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v7, v21
6612-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v25, v21
6613-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v27, v21
6614-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v13, v21
6615-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v15, v21
6616-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v17, v21
6617-
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v19, v21
6589+
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v9, 16, v5
6590+
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v21, 16, v2
6591+
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v25, 16, v0
6592+
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v23, 0xffff, v0
6593+
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v19, 0xffff, v2
6594+
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v11, 0xffff, v1
6595+
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v15, 0xffff, v3
6596+
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v2, 16, v6
6597+
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v0, 0xffff, v6
6598+
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v29, 16, v4
6599+
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v27, 0xffff, v4
6600+
; GCN-NOHSA-SI-NEXT: v_lshrrev_b32_e32 v33, 16, v7
6601+
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v31, 0xffff, v7
6602+
; GCN-NOHSA-SI-NEXT: v_and_b32_e32 v7, 0xffff, v5
6603+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v10, v8
6604+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v32, v8
6605+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v34, v8
6606+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v16, v8
6607+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v18, v8
6608+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v12, v8
6609+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v14, v8
6610+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v28, v8
6611+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v30, v8
6612+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v1, v8
6613+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v3, v8
6614+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v20, v8
6615+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v22, v8
6616+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v24, v8
6617+
; GCN-NOHSA-SI-NEXT: v_mov_b32_e32 v26, v8
66186618
; GCN-NOHSA-SI-NEXT: s_mov_b32 s0, s4
66196619
; GCN-NOHSA-SI-NEXT: s_mov_b32 s1, s5
6620-
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
6621-
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
6622-
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
6623-
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
6624-
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
6625-
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
6626-
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
6627-
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[16:19], off, s[0:3], 0
6620+
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
6621+
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:112
6622+
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
6623+
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
6624+
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:64
6625+
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
6626+
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32
6627+
; GCN-NOHSA-SI-NEXT: buffer_store_dwordx4 v[23:26], off, s[0:3], 0
66286628
; GCN-NOHSA-SI-NEXT: s_endpgm
66296629
;
66306630
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:

llvm/test/CodeGen/AMDGPU/schedule-regpressure-limit-clustering.ll

+1-1
Original file line numberDiff line numberDiff line change
@@ -42,4 +42,4 @@ bb2:
4242
declare i32 @llvm.amdgcn.workitem.id.x() #0
4343

4444
attributes #0 = { nounwind readnone }
45-
attributes #1 = { "amdgpu-num-vgpr"="9" "amdgpu-flat-work-group-size"="1024,1024" }
45+
attributes #1 = { "amdgpu-num-vgpr"="9" }

0 commit comments

Comments
 (0)