Skip to content

Commit a61cc1b

Browse files
authored
[AMDGPU][True16][CodeGen] Skip combineDpp with t16 instructions (#128918)
We only emits v_mov_b32/64_dpp. Don't combine t16 instructions with mov dpp. Update the test inputs to be legal. It is future work to emit v_mov_b16_dpp, and then update GCNDPPCombine to combine it with the 16-bit instructions.
1 parent 8078665 commit a61cc1b

File tree

4 files changed

+186
-4
lines changed

4 files changed

+186
-4
lines changed

llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
215215

216216
bool HasVOP3DPP = ST->hasVOP3DPP();
217217
auto OrigOp = OrigMI.getOpcode();
218+
if (ST->useRealTrue16Insts() && AMDGPU::isTrue16Inst(OrigOp)) {
219+
LLVM_DEBUG(
220+
dbgs() << " failed: Did not expect any 16-bit uses of dpp values\n");
221+
return nullptr;
222+
}
218223
auto DPPOp = getDPPOp(OrigOp, IsShrinkable);
219224
if (DPPOp == -1) {
220225
LLVM_DEBUG(dbgs() << " failed: no DPP opcode\n");
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
3+
# XUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=gcn-dpp-combine -mattr=+real-true16 -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN,GFX1150
4+
5+
# FIXME-TRUE16 add gfx1200 runline when we have those true16 instructions supported
6+
7+
---
8+
9+
# V_MOV_B16_t16_e64_dpp is unsupported to combine
10+
# GCN-label: name: vop3_u16
11+
# GCN: %4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
12+
# GCN: %6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec
13+
name: vop3_u16
14+
tracksRegLiveness: true
15+
body: |
16+
bb.0:
17+
liveins: $vgpr0, $vgpr1, $vgpr2
18+
19+
%0:vgpr_16 = COPY $vgpr0
20+
%1:vgpr_16 = COPY $vgpr1
21+
%2:vgpr_16 = COPY $vgpr2
22+
%3:vgpr_16 = IMPLICIT_DEF
23+
%4:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
24+
%5:vgpr_16 = V_ADD_NC_U16_t16_e64 0, %4, 0, %3, 0, 0, implicit $exec
25+
%6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %5, 0, 1, 15, 15, 1, implicit $exec
26+
%7:vgpr_16 = V_ADD_NC_U16_t16_e64 4, %6, 8, %5, 0, 0, implicit $exec
27+
...

llvm/test/CodeGen/AMDGPU/dpp_combine.ll

Lines changed: 30 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
2-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
3-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
4-
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GCN
1+
; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
2+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX9GFX10
3+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
4+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
5+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=+real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-TRUE16
6+
; RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -mattr=-real-true16 -verify-machineinstrs < %s | FileCheck %s -check-prefixes=GCN,GFX11-FAKE16
57

68
; GCN-LABEL: {{^}}dpp_add:
79
; GCN: global_load_{{dword|b32}} [[V:v[0-9]+]],
@@ -63,6 +65,30 @@ define amdgpu_kernel void @dpp_mul(ptr addrspace(1) %arg) {
6365
ret void
6466
}
6567

68+
; It is not expected to see a sequence of v_mov_b32_dpp feeding into a 16 bit instruction
69+
; GCN-LABEL: {{^}}dpp_fadd_f16:
70+
; GFX9GFX10: global_load_{{dword|b32}} [[V:v[0-9]+]],
71+
; GFX9GFX10: v_add_f16_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1{{$}}
72+
; GFX11-TRUE16: v_mov_b32_dpp {{v[0-9]+}}, {{v[0-9]+}} quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
73+
; GFX11-TRUE16: v_add_f16_e32
74+
; GFX11-FAKE16: global_load_{{dword|b32}} [[V:v[0-9]+]],
75+
; GFX11-FAKE16: v_add_f16_e64_dpp [[V]], [[V]], [[V]] quad_perm:[1,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1
76+
define amdgpu_kernel void @dpp_fadd_f16(ptr addrspace(1) %arg) {
77+
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
78+
%gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %id
79+
%load = load i32, ptr addrspace(1) %gep
80+
%tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %load, i32 %load, i32 1, i32 15, i32 15, i1 1) #0
81+
%tmp01 = trunc i32 %tmp0 to i16
82+
%tmp1 = bitcast i16 %tmp01 to half
83+
%tt = trunc i32 %load to i16
84+
%t = bitcast i16 %tt to half
85+
%add = fadd half %tmp1, %t
86+
%tmp2 = bitcast half %add to i16
87+
%tmp3 = zext i16 %tmp2 to i32
88+
store i32 %tmp3, ptr addrspace(1) %gep
89+
ret void
90+
}
91+
6692
declare i32 @llvm.amdgcn.workitem.id.x()
6793
declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #0
6894
declare float @llvm.ceil.f32(float)
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
2+
# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=gcn-dpp-combine -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GCN
3+
4+
---
5+
6+
name: vopc
7+
tracksRegLiveness: true
8+
body: |
9+
bb.0:
10+
liveins: $vgpr0, $vgpr1, $vgpr2
11+
12+
; GCN-LABEL: name: vopc
13+
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
14+
; GCN-NEXT: {{ $}}
15+
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
16+
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
17+
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
18+
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
19+
; GCN-NEXT: V_CMP_LT_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
20+
; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec
21+
; GCN-NEXT: V_CMPX_GT_U32_nosdst_e64 [[V_MOV_B32_dpp]], [[COPY]], implicit-def $exec, implicit $mode, implicit $exec
22+
; GCN-NEXT: V_CMP_CLASS_F32_e32_dpp 2, [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
23+
; GCN-NEXT: V_CMP_NGE_F32_e32_dpp 0, [[COPY1]], 0, [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $mode, implicit $exec
24+
; GCN-NEXT: [[V_CMP_NGE_F32_e64_dpp:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F32_e64_dpp 0, [[COPY1]], 0, [[COPY]], 0, 1, 15, 15, 1, implicit $mode, implicit $exec
25+
; GCN-NEXT: [[S_AND_B32_:%[0-9]+]]:sgpr_32 = S_AND_B32 [[V_CMP_NGE_F32_e64_dpp]], 10101, implicit-def $scc
26+
; GCN-NEXT: V_CMP_GT_I32_e32_dpp [[COPY1]], [[COPY]], 1, 15, 15, 1, implicit-def $vcc, implicit $exec
27+
%0:vgpr_32 = COPY $vgpr0
28+
%1:vgpr_32 = COPY $vgpr1
29+
%2:vgpr_32 = COPY $vgpr2
30+
%3:vgpr_32 = IMPLICIT_DEF
31+
32+
%4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
33+
V_CMP_LT_F32_e32 %4, %0, implicit-def $vcc, implicit $mode, implicit $exec
34+
35+
%10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
36+
V_CMPX_GT_U32_nosdst_e64 %10, %0, implicit-def $exec, implicit $mode, implicit $exec
37+
38+
%11:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
39+
%12:sgpr_32 = V_CMP_CLASS_F32_e64 2, %11, %0, implicit $mode, implicit $exec
40+
41+
%13:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
42+
%14:sgpr_32 = V_CMP_NGE_F32_e64 0, %13, 0, %0, 0, implicit $mode, implicit $exec
43+
44+
%17:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
45+
%18:sgpr_32 = V_CMP_NGE_F32_e64 0, %17, 0, %0, 0, implicit $mode, implicit $exec
46+
%19:sgpr_32 = S_AND_B32 %18, 10101, implicit-def $scc
47+
48+
%20:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
49+
V_CMP_LT_I32_e32 %0, %20, implicit-def $vcc, implicit $exec
50+
51+
...
52+
---
53+
54+
# V_MOV_B16_t16_e64_dpp is unsupported to combine
55+
name: vopc_16
56+
tracksRegLiveness: true
57+
body: |
58+
bb.0:
59+
liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16
60+
61+
; GCN-LABEL: name: vopc_16
62+
; GCN: liveins: $vgpr0_lo16, $vgpr1_hi16, $vgpr255_hi16
63+
; GCN-NEXT: {{ $}}
64+
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_16 = COPY $vgpr0_lo16
65+
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY $vgpr1_hi16
66+
; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY $vgpr255_hi16
67+
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
68+
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
69+
; GCN-NEXT: V_CMPX_EQ_I16_t16_nosdst_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]], 0, implicit-def $exec, implicit-def $vcc_lo, implicit $mode, implicit $exec
70+
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp1:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
71+
; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp1]], 0, [[COPY]], 0, implicit-def $vcc_lo, implicit $mode, implicit $exec
72+
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp2:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
73+
; GCN-NEXT: [[V_CMP_GE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F16_t16_e64 1, [[V_MOV_B16_t16_e64_dpp2]], 0, [[COPY]], 1, 0, implicit $mode, implicit $exec
74+
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp3:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]], 0, [[COPY1]], 0, 1, 15, 15, 1, implicit $exec
75+
; GCN-NEXT: [[V_CMP_NGE_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, [[V_CMP_NGE_F16_t16_e64_]], 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
76+
%0:vgpr_16 = COPY $vgpr0_lo16
77+
%1:vgpr_16 = COPY $vgpr1_hi16
78+
%2:vgpr_16 = COPY $vgpr255_hi16
79+
%3:vgpr_16 = IMPLICIT_DEF
80+
81+
%5:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
82+
V_CMPX_EQ_I16_t16_nosdst_e64 0, %5, 0, %0, 0, implicit-def $exec, implicit-def $vcc, implicit $mode, implicit $exec
83+
84+
%6:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
85+
%7:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %6, 0, %0, 0, implicit-def $vcc, implicit $mode, implicit $exec
86+
87+
%8:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
88+
%9:sgpr_32 = V_CMP_GE_F16_t16_e64 1, %8, 0, %0, 1, 0, implicit $mode, implicit $exec
89+
90+
%15:vgpr_16 = V_MOV_B16_t16_e64_dpp %3, 0, %1, 0, 1, 15, 15, 1, implicit $exec
91+
%16:sgpr_32 = V_CMP_NGE_F16_t16_e64 0, %16, 0, %0, 0, 0, implicit $mode, implicit $exec
92+
93+
...
94+
---
95+
96+
name: mask_not_full
97+
tracksRegLiveness: true
98+
body: |
99+
bb.0:
100+
liveins: $vgpr0, $vgpr1, $vgpr2
101+
102+
; GCN-LABEL: name: mask_not_full
103+
; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
104+
; GCN-NEXT: {{ $}}
105+
; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
106+
; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
107+
; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
108+
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
109+
; GCN-NEXT: [[V_MOV_B16_t16_e64_dpp:%[0-9]+]]:vgpr_16 = V_MOV_B16_t16_e64_dpp [[DEF]].lo16, 0, [[COPY1]].hi16, 0, 1, 15, 14, 1, implicit $exec
110+
; GCN-NEXT: [[V_CMP_CLASS_F16_t16_e64_:%[0-9]+]]:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, [[V_MOV_B16_t16_e64_dpp]], 0, [[COPY]].lo16, 0, implicit-def $vcc_lo, implicit $mode, implicit $exec
111+
; GCN-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[V_MOV_B32_e32_]], [[COPY1]], 1, 13, 15, 1, implicit $exec
112+
; GCN-NEXT: [[V_CMP_GE_F32_e64_:%[0-9]+]]:sgpr_32 = V_CMP_GE_F32_e64 1, [[V_MOV_B32_dpp]], 0, [[COPY]], 1, implicit $mode, implicit $exec
113+
%0:vgpr_32 = COPY $vgpr0
114+
%1:vgpr_32 = COPY $vgpr1
115+
%2:vgpr_32 = IMPLICIT_DEF
116+
%3:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
117+
118+
%4:vgpr_16 = V_MOV_B16_t16_e64_dpp %2.lo16, 0, %1.hi16, 0, 1, 15, 14, 1, implicit $exec
119+
%99:sgpr_32 = V_CMP_CLASS_F16_t16_e64 0, %4, 0, %0.lo16, 0, implicit-def $vcc, implicit $mode, implicit $exec
120+
121+
%5:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 13, 15, 1, implicit $exec
122+
%6:sgpr_32 = V_CMP_GE_F32_e64 1, %5, 0, %0, 1, implicit $mode, implicit $exec
123+
124+
...

0 commit comments

Comments
 (0)