; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc -mtriple=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s

; s_sleep is used to emit an instruction that is always 4 bytes.
; Inline asm always assumes each instruction is the maximum size.
declare void @llvm.amdgcn.s.sleep(i32) #0

declare i32 @llvm.amdgcn.workitem.id.x() #1

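; Sizing note for the byte counts in the bodies below: -amdgpu-s-branch-bits=4
; artificially narrows the signed branch immediate to 4 bits, so a short
; branch only reaches a few dwords before it must be relaxed. Each v_nop_e64
; is a VOP3 encoding (8 bytes, 2 dwords) and s_sleep is SOPP (4 bytes,
; 1 dword); e.g. the first test branches over 3 x v_nop_e64 + s_sleep
; = 24 + 4 = 28 bytes. The asm blocks are sized to land just inside or
; just outside the reachable range.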
define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
; GCN-LABEL: uniform_conditional_max_short_forward_branch:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_eq_u32 s0, 0
; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
; GCN-NEXT:  ; %bb.1: ; %bb2
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_sleep 0
; GCN-NEXT:  .LBB0_2: ; %bb3
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
bb:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
; 24 bytes
  call void asm sideeffect
  "v_nop_e64
  v_nop_e64
  v_nop_e64", ""() #0
  call void @llvm.amdgcn.s.sleep(i32 0)
  br label %bb3

bb3:
  store volatile i32 %cnd, ptr addrspace(1) %arg
  ret void
}

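; One v_nop_e64 longer (32 bytes), so the branch over %bb2 no longer fits
; in the narrowed branch immediate. The conditional branch is inverted
; (s_cbranch_scc0) to skip an expanded long-branch sequence, s_getpc_b64
; followed by s_add_u32/s_addc_u32 of the label delta and s_setpc_b64,
; which is why branch relaxation needs a reserved SGPR pair (s[2:3] here).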
define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
; GCN-LABEL: uniform_conditional_min_long_forward_branch:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_eq_u32 s0, 0
; GCN-NEXT:    s_cbranch_scc0 .LBB1_1
; GCN-NEXT:  ; %bb.3: ; %bb0
; GCN-NEXT:    s_getpc_b64 s[2:3]
; GCN-NEXT:  .Lpost_getpc0:
; GCN-NEXT:    s_add_u32 s2, s2, (.LBB1_2-.Lpost_getpc0)&4294967295
; GCN-NEXT:    s_addc_u32 s3, s3, (.LBB1_2-.Lpost_getpc0)>>32
; GCN-NEXT:    s_setpc_b64 s[2:3]
; GCN-NEXT:  .LBB1_1: ; %bb2
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:  .LBB1_2: ; %bb3
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
bb0:
  %cmp = icmp eq i32 %cnd, 0
  br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch

bb2:
; 32 bytes
  call void asm sideeffect
  "v_nop_e64
  v_nop_e64
  v_nop_e64
  v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %cnd, ptr addrspace(1) %arg
  ret void
}

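; Same relaxation, but the condition comes from a VALU compare: v_cmp
; writes a 64-bit SGPR pair, which is ANDed with exec into vcc, and the
; relaxed branch keys off s_cbranch_vccz instead of SCC.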
define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 {
; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    v_cmp_eq_f32_e64 s[2:3], s0, 0
; GCN-NEXT:    s_and_b64 vcc, exec, s[2:3]
; GCN-NEXT:    s_cbranch_vccz .LBB2_1
; GCN-NEXT:  ; %bb.3: ; %bb0
; GCN-NEXT:    s_getpc_b64 s[8:9]
; GCN-NEXT:  .Lpost_getpc1:
; GCN-NEXT:    s_add_u32 s8, s8, (.LBB2_2-.Lpost_getpc1)&4294967295
; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB2_2-.Lpost_getpc1)>>32
; GCN-NEXT:    s_setpc_b64 s[8:9]
; GCN-NEXT:  .LBB2_1: ; %bb2
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:     ; 32 bytes
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:  .LBB2_2: ; %bb3
; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s7, 0xf000
; GCN-NEXT:    s_mov_b32 s6, -1
; GCN-NEXT:    v_mov_b32_e32 v0, s0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
bb0:
  %cmp = fcmp oeq float %cnd, 0.0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
  v_nop_e64
  v_nop_e64
  v_nop_e64
  v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile float %cnd, ptr addrspace(1) %arg
  ret void
}

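; Divergent variant: the branch depends on a per-lane load, so the skip
; is implemented by masking exec with s_and_saveexec_b64, relaxation
; hangs off s_cbranch_execnz, and exec is restored with s_or_b64 in %bb3.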
define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
; GCN-LABEL: min_long_forward_vbranch:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
; GCN-NEXT:    v_mov_b32_e32 v1, 0
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, 0
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v1, s1
; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
; GCN-NEXT:    s_cbranch_execnz .LBB3_1
; GCN-NEXT:  ; %bb.3: ; %bb
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:  .Lpost_getpc2:
; GCN-NEXT:    s_add_u32 s6, s6, (.LBB3_2-.Lpost_getpc2)&4294967295
; GCN-NEXT:    s_addc_u32 s7, s7, (.LBB3_2-.Lpost_getpc2)>>32
; GCN-NEXT:    s_setpc_b64 s[6:7]
; GCN-NEXT:  .LBB3_1: ; %bb2
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:     ; 32 bytes
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:  .LBB3_2: ; %bb3
; GCN-NEXT:    s_or_b64 exec, exec, s[0:1]
; GCN-NEXT:    s_mov_b32 s0, s2
; GCN-NEXT:    s_mov_b32 s1, s2
; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
bb:
  %tid = call i32 @llvm.amdgcn.workitem.id.x()
  %tid.ext = zext i32 %tid to i64
  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tid.ext
  %load = load volatile i32, ptr addrspace(1) %gep
  %cmp = icmp eq i32 %load, 0
  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch

bb2:
  call void asm sideeffect " ; 32 bytes
  v_nop_e64
  v_nop_e64
  v_nop_e64
  v_nop_e64", ""() #0
  br label %bb3

bb3:
  store volatile i32 %load, ptr addrspace(1) %gep
  ret void
}

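; Backward branch out of range: the loop latch cannot reach its header
; with a short branch, so the same s_getpc_b64/s_setpc_b64 expansion is
; used with a negative delta (.LBB4_1 is above .Lpost_getpc3).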
define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 {
; GCN-LABEL: long_backward_sbranch:
; GCN:       ; %bb.0: ; %bb
; GCN-NEXT:    s_mov_b32 s0, 0
; GCN-NEXT:  .LBB4_1: ; %bb2
; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
; GCN-NEXT:    s_add_i32 s0, s0, 1
; GCN-NEXT:    s_cmp_lt_i32 s0, 10
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_cbranch_scc0 .LBB4_2
; GCN-NEXT:  ; %bb.3: ; %bb2
; GCN-NEXT:    ; in Loop: Header=BB4_1 Depth=1
; GCN-NEXT:    s_getpc_b64 s[2:3]
; GCN-NEXT:  .Lpost_getpc3:
; GCN-NEXT:    s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc3)&4294967295
; GCN-NEXT:    s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc3)>>32
; GCN-NEXT:    s_setpc_b64 s[2:3]
; GCN-NEXT:  .LBB4_2: ; %bb3
; GCN-NEXT:    s_endpgm

bb:
  br label %bb2

bb2:
  %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
  ; 24 bytes
  call void asm sideeffect
  "v_nop_e64
  v_nop_e64
  v_nop_e64", ""() #0
  %inc = add nsw i32 %loop.idx, 1 ; add cost 4
  %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
  br i1 %cmp, label %bb2, label %bb3 ; -

bb3:
  ret void
}

; Requires expansion of the unconditional branch from %bb2 to %bb4 (and
; expansion of the conditional branch from %bb0 to %bb3).

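; Control-flow lowering also introduces a %Flow block here, and three
; separate branches end up out of range (.Lpost_getpc4/5/6), including
; both expanded unconditional branches out of %bb3.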
define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) {
; GCN-LABEL: uniform_unconditional_min_long_forward_branch:
; GCN:       ; %bb.0: ; %bb0
; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    s_cmp_eq_u32 s0, 0
; GCN-NEXT:    s_mov_b64 s[0:1], -1
; GCN-NEXT:    s_cbranch_scc0 .LBB5_1
; GCN-NEXT:  ; %bb.7: ; %bb0
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:  .Lpost_getpc5:
; GCN-NEXT:    s_add_u32 s6, s6, (.LBB5_4-.Lpost_getpc5)&4294967295
; GCN-NEXT:    s_addc_u32 s7, s7, (.LBB5_4-.Lpost_getpc5)>>32
; GCN-NEXT:    s_setpc_b64 s[6:7]
; GCN-NEXT:  .LBB5_1: ; %Flow
; GCN-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
; GCN-NEXT:    s_cbranch_vccnz .LBB5_3
; GCN-NEXT:  .LBB5_2: ; %bb2
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    v_mov_b32_e32 v0, 17
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:  .LBB5_3: ; %bb4
; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
; GCN-NEXT:    s_mov_b32 s3, 0xf000
; GCN-NEXT:    s_mov_b32 s2, -1
; GCN-NEXT:    s_waitcnt expcnt(0)
; GCN-NEXT:    v_mov_b32_e32 v0, 63
; GCN-NEXT:    s_waitcnt lgkmcnt(0)
; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
; GCN-NEXT:    s_waitcnt vmcnt(0)
; GCN-NEXT:    s_endpgm
; GCN-NEXT:  .LBB5_4: ; %bb3
; GCN-NEXT:    ;;#ASMSTART
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    v_nop_e64
; GCN-NEXT:    ;;#ASMEND
; GCN-NEXT:    s_mov_b64 vcc, exec
; GCN-NEXT:    s_cbranch_execnz .LBB5_5
; GCN-NEXT:  ; %bb.9: ; %bb3
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:  .Lpost_getpc6:
; GCN-NEXT:    s_add_u32 s6, s6, (.LBB5_2-.Lpost_getpc6)&4294967295
; GCN-NEXT:    s_addc_u32 s7, s7, (.LBB5_2-.Lpost_getpc6)>>32
; GCN-NEXT:    s_setpc_b64 s[6:7]
; GCN-NEXT:  .LBB5_5: ; %bb3
; GCN-NEXT:    s_getpc_b64 s[6:7]
; GCN-NEXT:  .Lpost_getpc4:
; GCN-NEXT:    s_add_u32 s6, s6, (.LBB5_3-.Lpost_getpc4)&4294967295
; GCN-NEXT:    s_addc_u32 s7, s7, (.LBB5_3-.Lpost_getpc4)>>32
; GCN-NEXT:    s_setpc_b64 s[6:7]
bb0:
  %tmp = icmp ne i32 %arg1, 0
  br i1 %tmp, label %bb2, label %bb3

bb2:
  store volatile i32 17, ptr addrspace(1) undef
  br label %bb4

bb3:
  ; 32 byte asm
  call void asm sideeffect
  "v_nop_e64
  v_nop_e64
  v_nop_e64
  v_nop_e64", ""() #0
  br label %bb4

bb4:
  store volatile i32 63, ptr addrspace(1) %arg
  ret void
}

attributes #0 = { nounwind }
attributes #1 = { nounwind readnone }