xref: /llvm-project/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (revision 11b040192640ef3b1f481124c440f464ed6ec86a)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs %s -o - | FileCheck %s -check-prefix=GFX11
3
; Intrinsic declaration; its only call in this file is the first instruction of @f2.
4declare i32 @llvm.amdgcn.workitem.id.x()
5
; Callee used by @f0: takes no arguments and returns an all-zero <2 x i64>.
; It carries attribute group #0 (noinline optnone, defined near the end of the file).
; The expected output materialises the result as four zero moves into v0..v3.
6define <2 x i64> @f1() #0 {
7; GFX11-LABEL: f1:
8; GFX11:       ; %bb.0:
9; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
10; GFX11-NEXT:    v_mov_b32_e32 v0, 0
11; GFX11-NEXT:    v_mov_b32_e32 v1, 0
12; GFX11-NEXT:    v_mov_b32_e32 v2, 0
13; GFX11-NEXT:    v_mov_b32_e32 v3, 0
14; GFX11-NEXT:    s_setpc_b64 s[30:31]
15  ret <2 x i64> zeroinitializer
16}
17
; Calls @f1 and discards the result, then returns.
; In the expected output s30/s31 are written into lanes 0 and 1 of v4 before the
; call and read back after it, and an s_delay_alu is expected directly after the
; s_swappc_b64, ahead of the first v_readlane_b32.
18define void @f0() {
19; GFX11-LABEL: f0:
20; GFX11:       ; %bb.0: ; %bb
21; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
22; GFX11-NEXT:    s_mov_b32 s2, s33
23; GFX11-NEXT:    s_mov_b32 s33, s32
24; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
25; GFX11-NEXT:    scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill
26; GFX11-NEXT:    s_mov_b32 exec_lo, s0
27; GFX11-NEXT:    s_add_i32 s32, s32, 16
28; GFX11-NEXT:    s_getpc_b64 s[0:1]
29; GFX11-NEXT:    s_add_u32 s0, s0, f1@gotpcrel32@lo+4
30; GFX11-NEXT:    s_addc_u32 s1, s1, f1@gotpcrel32@hi+12
31; GFX11-NEXT:    v_writelane_b32 v4, s30, 0
32; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
33; GFX11-NEXT:    v_writelane_b32 v4, s31, 1
34; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
36; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
37; GFX11-NEXT:    v_readlane_b32 s31, v4, 1
38; GFX11-NEXT:    v_readlane_b32 s30, v4, 0
39; GFX11-NEXT:    s_mov_b32 s32, s33
40; GFX11-NEXT:    s_xor_saveexec_b32 s0, -1
41; GFX11-NEXT:    scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload
42; GFX11-NEXT:    s_mov_b32 exec_lo, s0
43; GFX11-NEXT:    s_mov_b32 s33, s2
44; GFX11-NEXT:    s_waitcnt vmcnt(0)
45; GFX11-NEXT:    s_setpc_b64 s[30:31]
46bb:
47  ; The returned vector is never used; only the call itself matters here.
48  %i = call <2 x i64> @f1()
49  ret void
50}
50
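51; FIXME: This was noted as generating "instid1(/* invalid instid value */)"; none of the s_delay_alu lines currently checked for @f2 contain that text, so this note may be stale.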
; Kernel whose expected output contains a series of s_delay_alu instructions.
; Control flow of the IR below:
;   bb   -> bb14 when %arg * workitem.id.x is 0, otherwise bb43
;   bb14 -> bb16 when %arg3 is true, otherwise bb15
;   bb16 -> bb17 when %arg5 is true, otherwise bb18
;   bb17 -> loops on itself while %arg11 is true, then bb43
;   bb18 -> loops on itself unconditionally
;   bb15 and bb43 each call @f0 and end in unreachable
52define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg4, i1 %arg5, ptr %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10, i1 %arg11) {
53; GFX11-LABEL: f2:
54; GFX11:       ; %bb.0: ; %bb
55; GFX11-NEXT:    s_mov_b64 s[16:17], s[4:5]
56; GFX11-NEXT:    v_mov_b32_e32 v31, v0
57; GFX11-NEXT:    s_load_b32 s19, s[16:17], 0x24
58; GFX11-NEXT:    s_mov_b32 s12, s13
59; GFX11-NEXT:    s_mov_b64 s[10:11], s[6:7]
60; GFX11-NEXT:    s_mov_b64 s[6:7], s[2:3]
61; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v31
62; GFX11-NEXT:    s_mov_b64 s[4:5], s[0:1]
63; GFX11-NEXT:    s_mov_b32 s20, 0
64; GFX11-NEXT:    s_mov_b32 s0, -1
65; GFX11-NEXT:    s_mov_b32 s3, exec_lo
66; GFX11-NEXT:    s_mov_b32 s32, 0
67; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX11-NEXT:    v_mul_lo_u32 v0, s19, v0
69; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
70; GFX11-NEXT:    v_cmpx_eq_u32_e32 0, v0
71; GFX11-NEXT:    s_cbranch_execz .LBB2_13
72; GFX11-NEXT:  ; %bb.1: ; %bb14
73; GFX11-NEXT:    s_load_b128 s[20:23], s[16:17], 0x2c
74; GFX11-NEXT:    s_mov_b32 s18, 0
75; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
76; GFX11-NEXT:    s_bitcmp1_b32 s21, 0
77; GFX11-NEXT:    s_cselect_b32 s24, -1, 0
78; GFX11-NEXT:    s_bitcmp0_b32 s21, 0
79; GFX11-NEXT:    s_cbranch_scc0 .LBB2_3
80; GFX11-NEXT:  ; %bb.2: ; %bb15
81; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
82; GFX11-NEXT:    s_addc_u32 s9, s17, 0
83; GFX11-NEXT:    s_getpc_b64 s[0:1]
84; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
85; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
86; GFX11-NEXT:    s_mov_b32 s13, s14
87; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
88; GFX11-NEXT:    s_mov_b32 s21, s14
89; GFX11-NEXT:    s_mov_b32 s14, s15
90; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
91; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
92; GFX11-NEXT:    s_mov_b32 s14, s21
93; GFX11-NEXT:    s_mov_b32 s2, -1
94; GFX11-NEXT:    s_cbranch_execz .LBB2_4
95; GFX11-NEXT:    s_branch .LBB2_12
96; GFX11-NEXT:  .LBB2_3:
97; GFX11-NEXT:    s_mov_b32 s2, 0
98; GFX11-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
99; GFX11-NEXT:    s_cbranch_vccnz .LBB2_12
100; GFX11-NEXT:  .LBB2_4: ; %bb16
101; GFX11-NEXT:    s_load_b32 s0, s[16:17], 0x54
102; GFX11-NEXT:    s_bitcmp1_b32 s23, 0
103; GFX11-NEXT:    s_cselect_b32 s9, -1, 0
104; GFX11-NEXT:    s_and_b32 s1, s23, 1
105; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
106; GFX11-NEXT:    s_bitcmp1_b32 s0, 0
107; GFX11-NEXT:    s_mov_b32 s0, -1
108; GFX11-NEXT:    s_cselect_b32 s8, -1, 0
109; GFX11-NEXT:    s_cmp_eq_u32 s1, 0
110; GFX11-NEXT:    s_cbranch_scc0 .LBB2_8
111; GFX11-NEXT:  ; %bb.5: ; %bb18.preheader
112; GFX11-NEXT:    s_load_b128 s[28:31], s[16:17], 0x44
113; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
114; GFX11-NEXT:    s_mul_hi_u32 s0, s29, s28
115; GFX11-NEXT:    s_mul_i32 s1, s29, s28
116; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
117; GFX11-NEXT:    v_alignbit_b32 v0, s0, s1, 1
118; GFX11-NEXT:    s_mov_b32 s1, 0
119; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
120; GFX11-NEXT:    v_mov_b32_e32 v0, 0
121; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
122; GFX11-NEXT:    s_or_b32 s0, s0, 1
123; GFX11-NEXT:    s_lshr_b32 s0, s0, s30
124; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
125; GFX11-NEXT:    s_mul_i32 s0, s0, s22
126; GFX11-NEXT:    s_mul_i32 s0, s0, s20
127; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
128; GFX11-NEXT:    s_or_b32 s0, s19, s0
129; GFX11-NEXT:    s_lshl_b64 s[20:21], s[0:1], 1
130; GFX11-NEXT:    s_mov_b32 s0, s1
131; GFX11-NEXT:    global_load_u16 v1, v0, s[20:21]
132; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s24
133; GFX11-NEXT:    s_waitcnt vmcnt(0)
134; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
135; GFX11-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc_lo
136; GFX11-NEXT:    s_mov_b32 vcc_lo, 0
137; GFX11-NEXT:    .p2align 6
138; GFX11-NEXT:  .LBB2_6: ; %bb18
139; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
140; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
141; GFX11-NEXT:    v_readfirstlane_b32 s13, v0
142; GFX11-NEXT:    s_cmp_lg_u32 s1, 0
143; GFX11-NEXT:    s_cselect_b32 s1, -1, 0
144; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
145; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s1
146; GFX11-NEXT:    s_and_b32 s1, s8, s1
147; GFX11-NEXT:    s_and_b32 s1, s1, exec_lo
148; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
149; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
150; GFX11-NEXT:    s_cselect_b32 s1, s19, s13
151; GFX11-NEXT:    s_and_b32 s13, 0xffff, s0
152; GFX11-NEXT:    s_and_b32 s1, s1, 1
153; GFX11-NEXT:    s_cmp_lg_u32 s13, 0
154; GFX11-NEXT:    s_cselect_b32 s13, -1, 0
155; GFX11-NEXT:    s_and_b32 s20, s9, exec_lo
156; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s13
157; GFX11-NEXT:    v_readfirstlane_b32 s13, v1
158; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
159; GFX11-NEXT:    v_readfirstlane_b32 s19, v2
160; GFX11-NEXT:    s_cselect_b32 s13, s19, s13
161; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
162; GFX11-NEXT:    s_bitcmp1_b32 s13, 0
163; GFX11-NEXT:    s_cselect_b32 s13, 0x100, 0
164; GFX11-NEXT:    s_or_b32 s0, s13, s0
165; GFX11-NEXT:    s_cbranch_vccz .LBB2_6
166; GFX11-NEXT:  ; %bb.7: ; %Flow
167; GFX11-NEXT:    s_mov_b32 s0, 0
168; GFX11-NEXT:  .LBB2_8: ; %Flow12
169; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
170; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
171; GFX11-NEXT:    s_cbranch_vccz .LBB2_12
172; GFX11-NEXT:  ; %bb.9:
173; GFX11-NEXT:    s_xor_b32 s0, s8, -1
174; GFX11-NEXT:  .LBB2_10: ; %bb17
175; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
176; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
177; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
178; GFX11-NEXT:    s_cbranch_vccz .LBB2_10
179; GFX11-NEXT:  ; %bb.11: ; %Flow6
180; GFX11-NEXT:    s_mov_b32 s18, -1
181; GFX11-NEXT:  .LBB2_12: ; %Flow11
182; GFX11-NEXT:    s_and_b32 s20, s2, exec_lo
183; GFX11-NEXT:    s_or_not1_b32 s0, s18, exec_lo
184; GFX11-NEXT:  .LBB2_13: ; %Flow9
185; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
186; GFX11-NEXT:    s_and_saveexec_b32 s3, s0
187; GFX11-NEXT:    s_cbranch_execz .LBB2_15
188; GFX11-NEXT:  ; %bb.14: ; %bb43
189; GFX11-NEXT:    s_add_u32 s8, s16, 0x58
190; GFX11-NEXT:    s_addc_u32 s9, s17, 0
191; GFX11-NEXT:    s_getpc_b64 s[0:1]
192; GFX11-NEXT:    s_add_u32 s0, s0, f0@gotpcrel32@lo+4
193; GFX11-NEXT:    s_addc_u32 s1, s1, f0@gotpcrel32@hi+12
194; GFX11-NEXT:    s_mov_b32 s13, s14
195; GFX11-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
196; GFX11-NEXT:    s_mov_b32 s14, s15
197; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
198; GFX11-NEXT:    s_swappc_b64 s[30:31], s[0:1]
199; GFX11-NEXT:    s_or_b32 s20, s20, exec_lo
200; GFX11-NEXT:  .LBB2_15: ; %Flow14
201; GFX11-NEXT:    s_or_b32 exec_lo, exec_lo, s3
202; GFX11-NEXT:    s_and_saveexec_b32 s0, s20
203; GFX11-NEXT:  ; %bb.16: ; %UnifiedUnreachableBlock
204; GFX11-NEXT:    ; divergent unreachable
205; GFX11-NEXT:  ; %bb.17: ; %UnifiedReturnBlock
206; GFX11-NEXT:    s_endpgm
207bb:
  ; %i12 = %arg * workitem id x. `icmp ult i32 %i12, 1` holds only when %i12 is 0,
  ; which is why the expected output compares for equality with 0.
208  %i = tail call i32 @llvm.amdgcn.workitem.id.x()
209  %i12 = mul i32 %arg, %i
210  %i13 = icmp ult i32 %i12, 1
211  br i1 %i13, label %bb14, label %bb43
212
213bb14:
214  br i1 %arg3, label %bb16, label %bb15
215
216bb15:
  ; Call followed by unreachable: this path never returns to the kernel.
217  call void @f0()
218  unreachable
219
220bb16:
221  br i1 %arg5, label %bb17, label %bb18
222
223bb17:
  ; Empty self-loop that spins while %arg11 is true and leaves to bb43 otherwise.
224  br i1 %arg11, label %bb17, label %bb43
225
226bb18:
  ; %i19 and %i20 are the loop-carried values; both start at 0 on entry from bb16.
227  %i19 = phi i16 [ %i38, %bb18 ], [ 0, %bb16 ]
228  %i20 = phi i16 [ %i42, %bb18 ], [ 0, %bb16 ]
  ; %i21 through %i34 use only kernel arguments and the loaded value, never the
  ; phis; the expected output performs this arithmetic and the load under the
  ; %bb18.preheader label rather than inside the loop body.
229  %i21 = zext i32 %arg7 to i64
230  %i22 = zext i32 %arg8 to i64
231  %i23 = mul i64 %i22, %i21
232  %i24 = lshr i64 %i23, 1
233  %i25 = trunc i64 %i24 to i32
234  %i26 = or i32 1, %i25
235  %i27 = lshr i32 %i26, %arg9
236  %i28 = mul i32 %i27, %arg4
237  %i29 = mul i32 %i28, %arg2
238  %i30 = or i32 %arg, %i29
239  %i31 = zext i32 %i30 to i64
  ; GEP over a 2-byte struct from a null addrspace(1) base: the address is %i31 * 2.
240  %i32 = getelementptr { [2 x i8] }, ptr addrspace(1) null, i64 %i31
241  %i33 = load i16, ptr addrspace(1) %i32, align 2
242  %i34 = icmp ult i16 %i33, 1
  ; Next value of %i19: 1 or 0, selected from %i35, %arg11 and %arg3.
243  %i35 = icmp ne i16 %i19, 0
244  %i36 = select i1 %arg11, i1 %i35, i1 false
245  %i37 = select i1 %i36, i1 %i35, i1 %arg3
246  %i38 = select i1 %i37, i16 1, i16 0
  ; Next value of %i20: the old value with 256 OR-ed in when %i40 is true.
247  %i39 = icmp ne i16 %i20, 0
248  %i40 = select i1 %arg5, i1 %i39, i1 %i34
249  %i41 = select i1 %i40, i16 256, i16 0
250  %i42 = or i16 %i41, %i20
  ; Unconditional back edge: the IR gives this loop no exit.
251  br label %bb18
252
253bb43:
  ; Same shape as bb15: call @f0, then unreachable.
254  call void @f0()
255  unreachable
256}
257
; Attribute group #0 is referenced only by @f1 in this file.
258attributes #0 = { noinline optnone }
259
; Single module flag: key "amdhsa_code_object_version" with value 500.
260!llvm.module.flags = !{!0}
261!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
262