xref: /llvm-project/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (revision d3508ccd1512c57094ec7b321d147aa72c9fbc7e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -amdgpu-s-branch-bits=4 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
3; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -verify-machineinstrs -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX11 %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -amdgpu-s-branch-bits=5 -simplifycfg-require-and-preserve-domtree=1 -amdgpu-long-branch-factor=0 < %s | FileCheck -enable-var-scope -check-prefix=GFX12 %s
5
6
7; FIXME: We should use llvm-mc for this, but we can't even parse our own output.
8;        See PR33579.
9; RUN: llc -mtriple=amdgcn -verify-machineinstrs -amdgpu-s-branch-bits=4 -amdgpu-long-branch-factor=0 -o %t.o -filetype=obj -simplifycfg-require-and-preserve-domtree=1 %s
10; RUN: llvm-readobj -r %t.o | FileCheck --check-prefix=OBJ %s
11
12; OBJ:       Relocations [
13; OBJ-NEXT: ]
14
15; Restrict maximum branch to between +7 and -8 dwords (4 branch bits; the GFX11/GFX12 runs use 5)
16
17; Used to emit an always 4 byte instruction. Inline asm always assumes
18; each instruction is the maximum size.
19declare void @llvm.amdgcn.s.sleep(i32) #0
20
; Work-item id intrinsic; @min_long_forward_vbranch uses it to index %arg.
21declare i32 @llvm.amdgcn.workitem.id.x() #1
22
23
; Forward conditional branch over %bb2, whose body is 24 bytes of inline asm
; plus one s_sleep. The GCN checks expect the branch to stay a single
; s_cbranch_scc1; the GFX11 and GFX12 checks expect it to be inverted and
; followed by an s_getpc_b64 / s_setpc_b64 long-branch sequence.
24define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
25; GCN-LABEL: uniform_conditional_max_short_forward_branch:
26; GCN:       ; %bb.0: ; %bb
27; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
28; GCN-NEXT:    s_waitcnt lgkmcnt(0)
29; GCN-NEXT:    s_cmp_eq_u32 s0, 0
30; GCN-NEXT:    s_cbranch_scc1 .LBB0_2
31; GCN-NEXT:  ; %bb.1: ; %bb2
32; GCN-NEXT:    ;;#ASMSTART
33; GCN-NEXT:    v_nop_e64
34; GCN-NEXT:    v_nop_e64
35; GCN-NEXT:    v_nop_e64
36; GCN-NEXT:    ;;#ASMEND
37; GCN-NEXT:    s_sleep 0
38; GCN-NEXT:  .LBB0_2: ; %bb3
39; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
40; GCN-NEXT:    s_mov_b32 s7, 0xf000
41; GCN-NEXT:    s_mov_b32 s6, -1
42; GCN-NEXT:    v_mov_b32_e32 v0, s0
43; GCN-NEXT:    s_waitcnt lgkmcnt(0)
44; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
45; GCN-NEXT:    s_waitcnt vmcnt(0)
46; GCN-NEXT:    s_endpgm
47;
48; GFX11-LABEL: uniform_conditional_max_short_forward_branch:
49; GFX11:       ; %bb.0: ; %bb
50; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
51; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
52; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
53; GFX11-NEXT:    s_cbranch_scc0 .LBB0_1
54; GFX11-NEXT:  ; %bb.3: ; %bb
55; GFX11-NEXT:    s_getpc_b64 s[2:3]
56; GFX11-NEXT:  .Lpost_getpc0:
57; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
58; GFX11-NEXT:    s_add_u32 s2, s2, (.LBB0_2-.Lpost_getpc0)&4294967295
59; GFX11-NEXT:    s_addc_u32 s3, s3, (.LBB0_2-.Lpost_getpc0)>>32
60; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
61; GFX11-NEXT:    s_setpc_b64 s[2:3]
62; GFX11-NEXT:  .LBB0_1: ; %bb2
63; GFX11-NEXT:    ;;#ASMSTART
64; GFX11-NEXT:    v_nop_e64
65; GFX11-NEXT:    v_nop_e64
66; GFX11-NEXT:    v_nop_e64
67; GFX11-NEXT:    ;;#ASMEND
68; GFX11-NEXT:    s_sleep 0
69; GFX11-NEXT:  .LBB0_2: ; %bb3
70; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
71; GFX11-NEXT:    v_mov_b32_e32 v0, 0
72; GFX11-NEXT:    v_mov_b32_e32 v1, s0
73; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
74; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
75; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
76; GFX11-NEXT:    s_endpgm
77;
78; GFX12-LABEL: uniform_conditional_max_short_forward_branch:
79; GFX12:       ; %bb.0: ; %bb
80; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x2c
81; GFX12-NEXT:    s_wait_kmcnt 0x0
82; GFX12-NEXT:    s_cmp_eq_u32 s0, 0
83; GFX12-NEXT:    s_cbranch_scc0 .LBB0_1
84; GFX12-NEXT:  ; %bb.3: ; %bb
85; GFX12-NEXT:    s_getpc_b64 s[2:3]
86; GFX12-NEXT:  .Lpost_getpc0:
87; GFX12-NEXT:    s_wait_alu 0xfffe
88; GFX12-NEXT:    s_add_co_u32 s2, s2, (.LBB0_2-.Lpost_getpc0)&4294967295
89; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, (.LBB0_2-.Lpost_getpc0)>>32
90; GFX12-NEXT:    s_wait_alu 0xfffe
91; GFX12-NEXT:    s_setpc_b64 s[2:3]
92; GFX12-NEXT:  .LBB0_1: ; %bb2
93; GFX12-NEXT:    ;;#ASMSTART
94; GFX12-NEXT:    v_nop_e64
95; GFX12-NEXT:    v_nop_e64
96; GFX12-NEXT:    v_nop_e64
97; GFX12-NEXT:    ;;#ASMEND
98; GFX12-NEXT:    s_sleep 0
99; GFX12-NEXT:  .LBB0_2: ; %bb3
100; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
101; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
102; GFX12-NEXT:    s_wait_kmcnt 0x0
103; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
104; GFX12-NEXT:    s_wait_storecnt 0x0
105; GFX12-NEXT:    s_endpgm
106bb:
107  %cmp = icmp eq i32 %cnd, 0
108  br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
109
110bb2:
111; 24 bytes (three v_nop_e64 counted at 8 bytes each)
112  call void asm sideeffect
113  "v_nop_e64
114  v_nop_e64
115  v_nop_e64", ""() #0
116  call void @llvm.amdgcn.s.sleep(i32 0)
117  br label %bb3
118
119bb3:
120  store volatile i32 %cnd, ptr addrspace(1) %arg
121  ret void
122}
123
; Forward conditional branch over %bb2, which holds 32 bytes of inline asm.
; All three check blocks expect an inverted s_cbranch_scc0 followed by an
; s_getpc_b64 / s_setpc_b64 long-branch sequence targeting %bb3.
124define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
125; GCN-LABEL: uniform_conditional_min_long_forward_branch:
126; GCN:       ; %bb.0: ; %bb0
127; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
128; GCN-NEXT:    s_waitcnt lgkmcnt(0)
129; GCN-NEXT:    s_cmp_eq_u32 s0, 0
130; GCN-NEXT:    s_cbranch_scc0 .LBB1_1
131; GCN-NEXT:  ; %bb.3: ; %bb0
132; GCN-NEXT:    s_getpc_b64 s[2:3]
133; GCN-NEXT:  .Lpost_getpc0:
134; GCN-NEXT:    s_add_u32 s2, s2, (.LBB1_2-.Lpost_getpc0)&4294967295
135; GCN-NEXT:    s_addc_u32 s3, s3, (.LBB1_2-.Lpost_getpc0)>>32
136; GCN-NEXT:    s_setpc_b64 s[2:3]
137; GCN-NEXT:  .LBB1_1: ; %bb2
138; GCN-NEXT:    ;;#ASMSTART
139; GCN-NEXT:    v_nop_e64
140; GCN-NEXT:    v_nop_e64
141; GCN-NEXT:    v_nop_e64
142; GCN-NEXT:    v_nop_e64
143; GCN-NEXT:    ;;#ASMEND
144; GCN-NEXT:  .LBB1_2: ; %bb3
145; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
146; GCN-NEXT:    s_mov_b32 s7, 0xf000
147; GCN-NEXT:    s_mov_b32 s6, -1
148; GCN-NEXT:    v_mov_b32_e32 v0, s0
149; GCN-NEXT:    s_waitcnt lgkmcnt(0)
150; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
151; GCN-NEXT:    s_waitcnt vmcnt(0)
152; GCN-NEXT:    s_endpgm
153;
154; GFX11-LABEL: uniform_conditional_min_long_forward_branch:
155; GFX11:       ; %bb.0: ; %bb0
156; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
157; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
158; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
159; GFX11-NEXT:    s_cbranch_scc0 .LBB1_1
160; GFX11-NEXT:  ; %bb.3: ; %bb0
161; GFX11-NEXT:    s_getpc_b64 s[2:3]
162; GFX11-NEXT:  .Lpost_getpc1:
163; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
164; GFX11-NEXT:    s_add_u32 s2, s2, (.LBB1_2-.Lpost_getpc1)&4294967295
165; GFX11-NEXT:    s_addc_u32 s3, s3, (.LBB1_2-.Lpost_getpc1)>>32
166; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
167; GFX11-NEXT:    s_setpc_b64 s[2:3]
168; GFX11-NEXT:  .LBB1_1: ; %bb2
169; GFX11-NEXT:    ;;#ASMSTART
170; GFX11-NEXT:    v_nop_e64
171; GFX11-NEXT:    v_nop_e64
172; GFX11-NEXT:    v_nop_e64
173; GFX11-NEXT:    v_nop_e64
174; GFX11-NEXT:    ;;#ASMEND
175; GFX11-NEXT:  .LBB1_2: ; %bb3
176; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
177; GFX11-NEXT:    v_mov_b32_e32 v0, 0
178; GFX11-NEXT:    v_mov_b32_e32 v1, s0
179; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
180; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
181; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
182; GFX11-NEXT:    s_endpgm
183;
184; GFX12-LABEL: uniform_conditional_min_long_forward_branch:
185; GFX12:       ; %bb.0: ; %bb0
186; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x2c
187; GFX12-NEXT:    s_wait_kmcnt 0x0
188; GFX12-NEXT:    s_cmp_eq_u32 s0, 0
189; GFX12-NEXT:    s_cbranch_scc0 .LBB1_1
190; GFX12-NEXT:  ; %bb.3: ; %bb0
191; GFX12-NEXT:    s_getpc_b64 s[2:3]
192; GFX12-NEXT:  .Lpost_getpc1:
193; GFX12-NEXT:    s_wait_alu 0xfffe
194; GFX12-NEXT:    s_add_co_u32 s2, s2, (.LBB1_2-.Lpost_getpc1)&4294967295
195; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, (.LBB1_2-.Lpost_getpc1)>>32
196; GFX12-NEXT:    s_wait_alu 0xfffe
197; GFX12-NEXT:    s_setpc_b64 s[2:3]
198; GFX12-NEXT:  .LBB1_1: ; %bb2
199; GFX12-NEXT:    ;;#ASMSTART
200; GFX12-NEXT:    v_nop_e64
201; GFX12-NEXT:    v_nop_e64
202; GFX12-NEXT:    v_nop_e64
203; GFX12-NEXT:    v_nop_e64
204; GFX12-NEXT:    ;;#ASMEND
205; GFX12-NEXT:  .LBB1_2: ; %bb3
206; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
207; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
208; GFX12-NEXT:    s_wait_kmcnt 0x0
209; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
210; GFX12-NEXT:    s_wait_storecnt 0x0
211; GFX12-NEXT:    s_endpgm
212bb0:
213  %cmp = icmp eq i32 %cnd, 0
214  br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
215
216bb2:
217; 32 bytes (four v_nop_e64 counted at 8 bytes each)
218  call void asm sideeffect
219  "v_nop_e64
220  v_nop_e64
221  v_nop_e64
222  v_nop_e64", ""() #0
223  br label %bb3
224
225bb3:
226  store volatile i32 %cnd, ptr addrspace(1) %arg
227  ret void
228}
229
; Forward branch over 32 bytes of inline asm whose condition is a float
; compare (fcmp oeq against 0.0). The GCN and GFX11 checks expect
; v_cmp_eq_f32_e64 feeding s_cbranch_vccz; the GFX12 checks expect
; s_cmp_eq_f32 feeding s_cbranch_scc0. Every check block expects the
; s_getpc_b64 / s_setpc_b64 long-branch sequence to %bb3.
230define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 {
231; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
232; GCN:       ; %bb.0: ; %bb0
233; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
234; GCN-NEXT:    s_waitcnt lgkmcnt(0)
235; GCN-NEXT:    v_cmp_eq_f32_e64 s[2:3], s0, 0
236; GCN-NEXT:    s_and_b64 vcc, exec, s[2:3]
237; GCN-NEXT:    s_cbranch_vccz .LBB2_1
238; GCN-NEXT:  ; %bb.3: ; %bb0
239; GCN-NEXT:    s_getpc_b64 s[2:3]
240; GCN-NEXT:  .Lpost_getpc1:
241; GCN-NEXT:    s_add_u32 s2, s2, (.LBB2_2-.Lpost_getpc1)&4294967295
242; GCN-NEXT:    s_addc_u32 s3, s3, (.LBB2_2-.Lpost_getpc1)>>32
243; GCN-NEXT:    s_setpc_b64 s[2:3]
244; GCN-NEXT:  .LBB2_1: ; %bb2
245; GCN-NEXT:    ;;#ASMSTART
246; GCN-NEXT:     ; 32 bytes
247; GCN-NEXT:    v_nop_e64
248; GCN-NEXT:    v_nop_e64
249; GCN-NEXT:    v_nop_e64
250; GCN-NEXT:    v_nop_e64
251; GCN-NEXT:    ;;#ASMEND
252; GCN-NEXT:  .LBB2_2: ; %bb3
253; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
254; GCN-NEXT:    s_mov_b32 s7, 0xf000
255; GCN-NEXT:    s_mov_b32 s6, -1
256; GCN-NEXT:    v_mov_b32_e32 v0, s0
257; GCN-NEXT:    s_waitcnt lgkmcnt(0)
258; GCN-NEXT:    buffer_store_dword v0, off, s[4:7], 0
259; GCN-NEXT:    s_waitcnt vmcnt(0)
260; GCN-NEXT:    s_endpgm
261;
262; GFX11-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
263; GFX11:       ; %bb.0: ; %bb0
264; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
265; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
266; GFX11-NEXT:    v_cmp_eq_f32_e64 s[2:3], s0, 0
267; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
268; GFX11-NEXT:    s_and_b64 vcc, exec, s[2:3]
269; GFX11-NEXT:    s_cbranch_vccz .LBB2_1
270; GFX11-NEXT:  ; %bb.3: ; %bb0
271; GFX11-NEXT:    s_getpc_b64 s[2:3]
272; GFX11-NEXT:  .Lpost_getpc2:
273; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
274; GFX11-NEXT:    s_add_u32 s2, s2, (.LBB2_2-.Lpost_getpc2)&4294967295
275; GFX11-NEXT:    s_addc_u32 s3, s3, (.LBB2_2-.Lpost_getpc2)>>32
276; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
277; GFX11-NEXT:    s_setpc_b64 s[2:3]
278; GFX11-NEXT:  .LBB2_1: ; %bb2
279; GFX11-NEXT:    ;;#ASMSTART
280; GFX11-NEXT:     ; 32 bytes
281; GFX11-NEXT:    v_nop_e64
282; GFX11-NEXT:    v_nop_e64
283; GFX11-NEXT:    v_nop_e64
284; GFX11-NEXT:    v_nop_e64
285; GFX11-NEXT:    ;;#ASMEND
286; GFX11-NEXT:  .LBB2_2: ; %bb3
287; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
288; GFX11-NEXT:    v_mov_b32_e32 v0, 0
289; GFX11-NEXT:    v_mov_b32_e32 v1, s0
290; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
291; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3] dlc
292; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
293; GFX11-NEXT:    s_endpgm
294;
295; GFX12-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
296; GFX12:       ; %bb.0: ; %bb0
297; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x2c
298; GFX12-NEXT:    s_wait_kmcnt 0x0
299; GFX12-NEXT:    s_cmp_eq_f32 s0, 0
300; GFX12-NEXT:    s_cbranch_scc0 .LBB2_1
301; GFX12-NEXT:  ; %bb.3: ; %bb0
302; GFX12-NEXT:    s_getpc_b64 s[2:3]
303; GFX12-NEXT:  .Lpost_getpc2:
304; GFX12-NEXT:    s_wait_alu 0xfffe
305; GFX12-NEXT:    s_add_co_u32 s2, s2, (.LBB2_2-.Lpost_getpc2)&4294967295
306; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, (.LBB2_2-.Lpost_getpc2)>>32
307; GFX12-NEXT:    s_wait_alu 0xfffe
308; GFX12-NEXT:    s_setpc_b64 s[2:3]
309; GFX12-NEXT:  .LBB2_1: ; %bb2
310; GFX12-NEXT:    ;;#ASMSTART
311; GFX12-NEXT:     ; 32 bytes
312; GFX12-NEXT:    v_nop_e64
313; GFX12-NEXT:    v_nop_e64
314; GFX12-NEXT:    v_nop_e64
315; GFX12-NEXT:    v_nop_e64
316; GFX12-NEXT:    ;;#ASMEND
317; GFX12-NEXT:  .LBB2_2: ; %bb3
318; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
319; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
320; GFX12-NEXT:    s_wait_kmcnt 0x0
321; GFX12-NEXT:    global_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
322; GFX12-NEXT:    s_wait_storecnt 0x0
323; GFX12-NEXT:    s_endpgm
324bb0:
325  %cmp = fcmp oeq float %cnd, 0.0
326  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
327
328bb2:
329  call void asm sideeffect " ; 32 bytes
330  v_nop_e64
331  v_nop_e64
332  v_nop_e64
333  v_nop_e64", ""() #0
334  br label %bb3
335
336bb3:
337  store volatile float %cnd, ptr addrspace(1) %arg
338  ret void
339}
340
; Forward branch over 32 bytes of inline asm whose condition comes from a
; volatile load addressed by the work-item id. The checks expect exec masking
; (s_and_saveexec_b64 on GCN, v_cmpx_ne_u32 on GFX11 and GFX12) and an
; s_cbranch_execnz followed by the s_getpc_b64 / s_setpc_b64 sequence to %bb3.
341define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
342; GCN-LABEL: min_long_forward_vbranch:
343; GCN:       ; %bb.0: ; %bb
344; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
345; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
346; GCN-NEXT:    v_mov_b32_e32 v1, 0
347; GCN-NEXT:    s_mov_b32 s3, 0xf000
348; GCN-NEXT:    s_mov_b32 s2, 0
349; GCN-NEXT:    s_waitcnt lgkmcnt(0)
350; GCN-NEXT:    buffer_load_dword v2, v[0:1], s[0:3], 0 addr64 glc
351; GCN-NEXT:    s_waitcnt vmcnt(0)
352; GCN-NEXT:    v_mov_b32_e32 v1, s1
353; GCN-NEXT:    v_add_i32_e32 v0, vcc, s0, v0
354; GCN-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
355; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
356; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
357; GCN-NEXT:    s_cbranch_execnz .LBB3_1
358; GCN-NEXT:  ; %bb.3: ; %bb
359; GCN-NEXT:    s_getpc_b64 s[4:5]
360; GCN-NEXT:  .Lpost_getpc2:
361; GCN-NEXT:    s_add_u32 s4, s4, (.LBB3_2-.Lpost_getpc2)&4294967295
362; GCN-NEXT:    s_addc_u32 s5, s5, (.LBB3_2-.Lpost_getpc2)>>32
363; GCN-NEXT:    s_setpc_b64 s[4:5]
364; GCN-NEXT:  .LBB3_1: ; %bb2
365; GCN-NEXT:    ;;#ASMSTART
366; GCN-NEXT:     ; 32 bytes
367; GCN-NEXT:    v_nop_e64
368; GCN-NEXT:    v_nop_e64
369; GCN-NEXT:    v_nop_e64
370; GCN-NEXT:    v_nop_e64
371; GCN-NEXT:    ;;#ASMEND
372; GCN-NEXT:  .LBB3_2: ; %bb3
373; GCN-NEXT:    s_or_b64 exec, exec, s[0:1]
374; GCN-NEXT:    s_mov_b32 s0, s2
375; GCN-NEXT:    s_mov_b32 s1, s2
376; GCN-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
377; GCN-NEXT:    s_waitcnt vmcnt(0)
378; GCN-NEXT:    s_endpgm
379;
380; GFX11-LABEL: min_long_forward_vbranch:
381; GFX11:       ; %bb.0: ; %bb
382; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
383; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
384; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
385; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
386; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
387; GFX11-NEXT:    global_load_b32 v2, v0, s[0:1] glc dlc
388; GFX11-NEXT:    s_waitcnt vmcnt(0)
389; GFX11-NEXT:    v_add_co_u32 v0, s[2:3], s0, v0
390; GFX11-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s[2:3]
391; GFX11-NEXT:    s_mov_b64 s[0:1], exec
392; GFX11-NEXT:    v_cmpx_ne_u32_e32 0, v2
393; GFX11-NEXT:    s_cbranch_execnz .LBB3_1
394; GFX11-NEXT:  ; %bb.3: ; %bb
395; GFX11-NEXT:    s_getpc_b64 s[2:3]
396; GFX11-NEXT:  .Lpost_getpc3:
397; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
398; GFX11-NEXT:    s_add_u32 s2, s2, (.LBB3_2-.Lpost_getpc3)&4294967295
399; GFX11-NEXT:    s_addc_u32 s3, s3, (.LBB3_2-.Lpost_getpc3)>>32
400; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
401; GFX11-NEXT:    s_setpc_b64 s[2:3]
402; GFX11-NEXT:  .LBB3_1: ; %bb2
403; GFX11-NEXT:    ;;#ASMSTART
404; GFX11-NEXT:     ; 32 bytes
405; GFX11-NEXT:    v_nop_e64
406; GFX11-NEXT:    v_nop_e64
407; GFX11-NEXT:    v_nop_e64
408; GFX11-NEXT:    v_nop_e64
409; GFX11-NEXT:    ;;#ASMEND
410; GFX11-NEXT:  .LBB3_2: ; %bb3
411; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
412; GFX11-NEXT:    global_store_b32 v[0:1], v2, off dlc
413; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
414; GFX11-NEXT:    s_endpgm
415;
416; GFX12-LABEL: min_long_forward_vbranch:
417; GFX12:       ; %bb.0: ; %bb
418; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
419; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
420; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
421; GFX12-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
422; GFX12-NEXT:    s_wait_kmcnt 0x0
423; GFX12-NEXT:    global_load_b32 v2, v0, s[0:1] scope:SCOPE_SYS
424; GFX12-NEXT:    s_wait_loadcnt 0x0
425; GFX12-NEXT:    v_add_co_u32 v0, s0, s0, v0
426; GFX12-NEXT:    v_add_co_ci_u32_e64 v1, null, s1, 0, s0
427; GFX12-NEXT:    s_mov_b32 s0, exec_lo
428; GFX12-NEXT:    v_cmpx_ne_u32_e32 0, v2
429; GFX12-NEXT:    s_cbranch_execnz .LBB3_1
430; GFX12-NEXT:  ; %bb.3: ; %bb
431; GFX12-NEXT:    s_getpc_b64 s[2:3]
432; GFX12-NEXT:  .Lpost_getpc3:
433; GFX12-NEXT:    s_wait_alu 0xfffe
434; GFX12-NEXT:    s_add_co_u32 s2, s2, (.LBB3_2-.Lpost_getpc3)&4294967295
435; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, (.LBB3_2-.Lpost_getpc3)>>32
436; GFX12-NEXT:    s_wait_alu 0xfffe
437; GFX12-NEXT:    s_setpc_b64 s[2:3]
438; GFX12-NEXT:  .LBB3_1: ; %bb2
439; GFX12-NEXT:    ;;#ASMSTART
440; GFX12-NEXT:     ; 32 bytes
441; GFX12-NEXT:    v_nop_e64
442; GFX12-NEXT:    v_nop_e64
443; GFX12-NEXT:    v_nop_e64
444; GFX12-NEXT:    v_nop_e64
445; GFX12-NEXT:    ;;#ASMEND
446; GFX12-NEXT:  .LBB3_2: ; %bb3
447; GFX12-NEXT:    s_wait_alu 0xfffe
448; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s0
449; GFX12-NEXT:    global_store_b32 v[0:1], v2, off scope:SCOPE_SYS
450; GFX12-NEXT:    s_wait_storecnt 0x0
451; GFX12-NEXT:    s_endpgm
452bb:
453  %tid = call i32 @llvm.amdgcn.workitem.id.x()
454  %tid.ext = zext i32 %tid to i64
455  %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %tid.ext
456  %load = load volatile i32, ptr addrspace(1) %gep
457  %cmp = icmp eq i32 %load, 0
458  br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
459
460bb2:
461  call void asm sideeffect " ; 32 bytes
462  v_nop_e64
463  v_nop_e64
464  v_nop_e64
465  v_nop_e64", ""() #0
466  br label %bb3
467
468bb3:
469  store volatile i32 %load, ptr addrspace(1) %gep
470  ret void
471}
472
; Loop that re-enters %bb2 while the incremented counter is below 10. The
; checks expect the loop exit as s_cbranch_scc0 and the backward edge to %bb2
; as an s_getpc_b64 / s_setpc_b64 long-branch sequence.
473define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 {
474; GCN-LABEL: long_backward_sbranch:
475; GCN:       ; %bb.0: ; %bb
476; GCN-NEXT:    s_mov_b32 s0, 0
477; GCN-NEXT:  .LBB4_1: ; %bb2
478; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
479; GCN-NEXT:    s_add_i32 s0, s0, 1
480; GCN-NEXT:    s_cmp_lt_i32 s0, 10
481; GCN-NEXT:    ;;#ASMSTART
482; GCN-NEXT:    v_nop_e64
483; GCN-NEXT:    v_nop_e64
484; GCN-NEXT:    v_nop_e64
485; GCN-NEXT:    ;;#ASMEND
486; GCN-NEXT:    s_cbranch_scc0 .LBB4_2
487; GCN-NEXT:  ; %bb.3: ; %bb2
488; GCN-NEXT:    ; in Loop: Header=BB4_1 Depth=1
489; GCN-NEXT:    s_getpc_b64 s[2:3]
490; GCN-NEXT:  .Lpost_getpc3:
491; GCN-NEXT:    s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc3)&4294967295
492; GCN-NEXT:    s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc3)>>32
493; GCN-NEXT:    s_setpc_b64 s[2:3]
494; GCN-NEXT:  .LBB4_2: ; %bb3
495; GCN-NEXT:    s_endpgm
496;
497; GFX11-LABEL: long_backward_sbranch:
498; GFX11:       ; %bb.0: ; %bb
499; GFX11-NEXT:    s_mov_b32 s0, 0
500; GFX11-NEXT:    .p2align 6
501; GFX11-NEXT:  .LBB4_1: ; %bb2
502; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
503; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
504; GFX11-NEXT:    s_add_i32 s0, s0, 1
505; GFX11-NEXT:    ;;#ASMSTART
506; GFX11-NEXT:    v_nop_e64
507; GFX11-NEXT:    v_nop_e64
508; GFX11-NEXT:    v_nop_e64
509; GFX11-NEXT:    ;;#ASMEND
510; GFX11-NEXT:    s_cmp_lt_i32 s0, 10
511; GFX11-NEXT:    s_cbranch_scc0 .LBB4_2
512; GFX11-NEXT:  ; %bb.3: ; %bb2
513; GFX11-NEXT:    ; in Loop: Header=BB4_1 Depth=1
514; GFX11-NEXT:    s_getpc_b64 s[2:3]
515; GFX11-NEXT:  .Lpost_getpc4:
516; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
517; GFX11-NEXT:    s_add_u32 s2, s2, (.LBB4_1-.Lpost_getpc4)&4294967295
518; GFX11-NEXT:    s_addc_u32 s3, s3, (.LBB4_1-.Lpost_getpc4)>>32
519; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
520; GFX11-NEXT:    s_setpc_b64 s[2:3]
521; GFX11-NEXT:  .LBB4_2: ; %bb3
522; GFX11-NEXT:    s_endpgm
523;
524; GFX12-LABEL: long_backward_sbranch:
525; GFX12:       ; %bb.0: ; %bb
526; GFX12-NEXT:    s_mov_b32 s0, 0
527; GFX12-NEXT:  .LBB4_1: ; %bb2
528; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
529; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
530; GFX12-NEXT:    s_add_co_i32 s0, s0, 1
531; GFX12-NEXT:    ;;#ASMSTART
532; GFX12-NEXT:    v_nop_e64
533; GFX12-NEXT:    v_nop_e64
534; GFX12-NEXT:    v_nop_e64
535; GFX12-NEXT:    ;;#ASMEND
536; GFX12-NEXT:    s_cmp_lt_i32 s0, 10
537; GFX12-NEXT:    s_cbranch_scc0 .LBB4_2
538; GFX12-NEXT:  ; %bb.3: ; %bb2
539; GFX12-NEXT:    ; in Loop: Header=BB4_1 Depth=1
540; GFX12-NEXT:    s_getpc_b64 s[2:3]
541; GFX12-NEXT:  .Lpost_getpc4:
542; GFX12-NEXT:    s_wait_alu 0xfffe
543; GFX12-NEXT:    s_add_co_u32 s2, s2, (.LBB4_1-.Lpost_getpc4)&4294967295
544; GFX12-NEXT:    s_add_co_ci_u32 s3, s3, (.LBB4_1-.Lpost_getpc4)>>32
545; GFX12-NEXT:    s_wait_alu 0xfffe
546; GFX12-NEXT:    s_setpc_b64 s[2:3]
547; GFX12-NEXT:  .LBB4_2: ; %bb3
548; GFX12-NEXT:    s_endpgm
549bb:
550  br label %bb2
551
552bb2:
553  %loop.idx = phi i32 [ 0, %bb ], [ %inc, %bb2 ]
554  ; 24 bytes
555  call void asm sideeffect
556  "v_nop_e64
557  v_nop_e64
558  v_nop_e64", ""() #0
559  %inc = add nsw i32 %loop.idx, 1 ; add cost 4
560  %cmp = icmp slt i32 %inc, 10 ; condition cost = 8
561  br i1 %cmp, label %bb2, label %bb3 ; backward branch to the loop header %bb2
562
563bb3:
564  ret void
565}
566
567; Requires expansion of unconditional branch from %bb2 to %bb4 (and
568; expansion of conditional branch from %bb0 to %bb3).
569
; Entry branches to %bb2 or %bb3, and both continue to %bb4. In the checks
; %bb3 is laid out after the s_endpgm of %bb4 and its exits are
; s_getpc_b64 / s_setpc_b64 sequences. The GCN and GFX12 checks also expand
; the entry branch to %bb3; the GFX11 checks keep it as one s_cbranch_scc1.
570define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) {
571; GCN-LABEL: uniform_unconditional_min_long_forward_branch:
572; GCN:       ; %bb.0: ; %bb0
573; GCN-NEXT:    s_load_dword s0, s[4:5], 0xb
574; GCN-NEXT:    s_waitcnt lgkmcnt(0)
575; GCN-NEXT:    s_cmp_eq_u32 s0, 0
576; GCN-NEXT:    s_mov_b64 s[0:1], -1
577; GCN-NEXT:    s_cbranch_scc0 .LBB5_1
578; GCN-NEXT:  ; %bb.7: ; %bb0
579; GCN-NEXT:    s_getpc_b64 s[0:1]
580; GCN-NEXT:  .Lpost_getpc5:
581; GCN-NEXT:    s_add_u32 s0, s0, (.LBB5_4-.Lpost_getpc5)&4294967295
582; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB5_4-.Lpost_getpc5)>>32
583; GCN-NEXT:    s_setpc_b64 s[0:1]
584; GCN-NEXT:  .LBB5_1: ; %Flow
585; GCN-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
586; GCN-NEXT:    s_cbranch_vccnz .LBB5_3
587; GCN-NEXT:  .LBB5_2: ; %bb2
588; GCN-NEXT:    s_mov_b32 s3, 0xf000
589; GCN-NEXT:    s_mov_b32 s2, -1
590; GCN-NEXT:    v_mov_b32_e32 v0, 17
591; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
592; GCN-NEXT:    s_waitcnt vmcnt(0)
593; GCN-NEXT:  .LBB5_3: ; %bb4
594; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
595; GCN-NEXT:    s_mov_b32 s3, 0xf000
596; GCN-NEXT:    s_mov_b32 s2, -1
597; GCN-NEXT:    s_waitcnt expcnt(0)
598; GCN-NEXT:    v_mov_b32_e32 v0, 63
599; GCN-NEXT:    s_waitcnt lgkmcnt(0)
600; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
601; GCN-NEXT:    s_waitcnt vmcnt(0)
602; GCN-NEXT:    s_endpgm
603; GCN-NEXT:  .LBB5_4: ; %bb3
604; GCN-NEXT:    ;;#ASMSTART
605; GCN-NEXT:    v_nop_e64
606; GCN-NEXT:    v_nop_e64
607; GCN-NEXT:    v_nop_e64
608; GCN-NEXT:    v_nop_e64
609; GCN-NEXT:    ;;#ASMEND
610; GCN-NEXT:    s_cbranch_execnz .LBB5_5
611; GCN-NEXT:  ; %bb.9: ; %bb3
612; GCN-NEXT:    s_getpc_b64 s[0:1]
613; GCN-NEXT:  .Lpost_getpc6:
614; GCN-NEXT:    s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc6)&4294967295
615; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc6)>>32
616; GCN-NEXT:    s_setpc_b64 s[0:1]
617; GCN-NEXT:  .LBB5_5: ; %bb3
618; GCN-NEXT:    s_getpc_b64 s[0:1]
619; GCN-NEXT:  .Lpost_getpc4:
620; GCN-NEXT:    s_add_u32 s0, s0, (.LBB5_3-.Lpost_getpc4)&4294967295
621; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB5_3-.Lpost_getpc4)>>32
622; GCN-NEXT:    s_setpc_b64 s[0:1]
623;
624; GFX11-LABEL: uniform_unconditional_min_long_forward_branch:
625; GFX11:       ; %bb.0: ; %bb0
626; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x2c
627; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
628; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
629; GFX11-NEXT:    s_mov_b64 s[0:1], -1
630; GFX11-NEXT:    s_cbranch_scc1 .LBB5_4
631; GFX11-NEXT:  ; %bb.1: ; %Flow
632; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[0:1]
633; GFX11-NEXT:    s_cbranch_vccnz .LBB5_3
634; GFX11-NEXT:  .LBB5_2: ; %bb2
635; GFX11-NEXT:    v_mov_b32_e32 v0, 17
636; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
637; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
638; GFX11-NEXT:  .LBB5_3: ; %bb4
639; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
640; GFX11-NEXT:    v_mov_b32_e32 v0, 0
641; GFX11-NEXT:    v_mov_b32_e32 v1, 63
642; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
643; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1] dlc
644; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
645; GFX11-NEXT:    s_endpgm
646; GFX11-NEXT:  .LBB5_4: ; %bb3
647; GFX11-NEXT:    ;;#ASMSTART
648; GFX11-NEXT:    v_nop_e64
649; GFX11-NEXT:    v_nop_e64
650; GFX11-NEXT:    v_nop_e64
651; GFX11-NEXT:    v_nop_e64
652; GFX11-NEXT:    ;;#ASMEND
653; GFX11-NEXT:    s_cbranch_execnz .LBB5_5
654; GFX11-NEXT:  ; %bb.7: ; %bb3
655; GFX11-NEXT:    s_getpc_b64 s[0:1]
656; GFX11-NEXT:  .Lpost_getpc6:
657; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
658; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB5_2-.Lpost_getpc6)&4294967295
659; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB5_2-.Lpost_getpc6)>>32
660; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
661; GFX11-NEXT:    s_setpc_b64 s[0:1]
662; GFX11-NEXT:  .LBB5_5: ; %bb3
663; GFX11-NEXT:    s_getpc_b64 s[0:1]
664; GFX11-NEXT:  .Lpost_getpc5:
665; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
666; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB5_3-.Lpost_getpc5)&4294967295
667; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB5_3-.Lpost_getpc5)>>32
668; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
669; GFX11-NEXT:    s_setpc_b64 s[0:1]
670;
671; GFX12-LABEL: uniform_unconditional_min_long_forward_branch:
672; GFX12:       ; %bb.0: ; %bb0
673; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x2c
674; GFX12-NEXT:    s_wait_kmcnt 0x0
675; GFX12-NEXT:    s_cmp_eq_u32 s0, 0
676; GFX12-NEXT:    s_mov_b32 s0, -1
677; GFX12-NEXT:    s_cbranch_scc0 .LBB5_1
678; GFX12-NEXT:  ; %bb.7: ; %bb0
679; GFX12-NEXT:    s_getpc_b64 s[0:1]
680; GFX12-NEXT:  .Lpost_getpc6:
681; GFX12-NEXT:    s_wait_alu 0xfffe
682; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB5_4-.Lpost_getpc6)&4294967295
683; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB5_4-.Lpost_getpc6)>>32
684; GFX12-NEXT:    s_wait_alu 0xfffe
685; GFX12-NEXT:    s_setpc_b64 s[0:1]
686; GFX12-NEXT:  .LBB5_1: ; %Flow
687; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
688; GFX12-NEXT:    s_cbranch_vccnz .LBB5_3
689; GFX12-NEXT:  .LBB5_2: ; %bb2
690; GFX12-NEXT:    v_mov_b32_e32 v0, 17
691; GFX12-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
692; GFX12-NEXT:    s_wait_storecnt 0x0
693; GFX12-NEXT:  .LBB5_3: ; %bb4
694; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
695; GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 63
696; GFX12-NEXT:    s_wait_kmcnt 0x0
697; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1] scope:SCOPE_SYS
698; GFX12-NEXT:    s_wait_storecnt 0x0
699; GFX12-NEXT:    s_endpgm
700; GFX12-NEXT:  .LBB5_4: ; %bb3
701; GFX12-NEXT:    ;;#ASMSTART
702; GFX12-NEXT:    v_nop_e64
703; GFX12-NEXT:    v_nop_e64
704; GFX12-NEXT:    v_nop_e64
705; GFX12-NEXT:    v_nop_e64
706; GFX12-NEXT:    ;;#ASMEND
707; GFX12-NEXT:    s_cbranch_execnz .LBB5_5
708; GFX12-NEXT:  ; %bb.9: ; %bb3
709; GFX12-NEXT:    s_getpc_b64 s[0:1]
710; GFX12-NEXT:  .Lpost_getpc7:
711; GFX12-NEXT:    s_wait_alu 0xfffe
712; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB5_2-.Lpost_getpc7)&4294967295
713; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB5_2-.Lpost_getpc7)>>32
714; GFX12-NEXT:    s_wait_alu 0xfffe
715; GFX12-NEXT:    s_setpc_b64 s[0:1]
716; GFX12-NEXT:  .LBB5_5: ; %bb3
717; GFX12-NEXT:    s_getpc_b64 s[0:1]
718; GFX12-NEXT:  .Lpost_getpc5:
719; GFX12-NEXT:    s_wait_alu 0xfffe
720; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB5_3-.Lpost_getpc5)&4294967295
721; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB5_3-.Lpost_getpc5)>>32
722; GFX12-NEXT:    s_wait_alu 0xfffe
723; GFX12-NEXT:    s_setpc_b64 s[0:1]
724bb0:
725  %tmp = icmp ne i32 %arg1, 0
726  br i1 %tmp, label %bb2, label %bb3
727
728bb2:
729  store volatile i32 17, ptr addrspace(1) undef
730  br label %bb4
731
732bb3:
733  ; 32 byte asm
734  call void asm sideeffect
735  "v_nop_e64
736  v_nop_e64
737  v_nop_e64
738  v_nop_e64", ""() #0
739  br label %bb4
740
741bb4:
742  store volatile i32 63, ptr addrspace(1) %arg
743  ret void
744}
745
; Infinite loop (%loop branches only to itself) around 32 bytes of inline asm;
; %arg and %arg1 are unused. The checks expect an s_cbranch_vccz to a
; %DummyReturnBlock holding s_endpgm, and the backward edge to %loop as an
; s_getpc_b64 / s_setpc_b64 long-branch sequence.
746define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr addrspace(1) %arg, i32 %arg1) {
747; GCN-LABEL: uniform_unconditional_min_long_backward_branch:
748; GCN:       ; %bb.0: ; %entry
749; GCN-NEXT:    s_and_b64 vcc, exec, -1
750; GCN-NEXT:  .LBB6_1: ; %loop
751; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
752; GCN-NEXT:    ;;#ASMSTART
753; GCN-NEXT:    v_nop_e64
754; GCN-NEXT:    v_nop_e64
755; GCN-NEXT:    v_nop_e64
756; GCN-NEXT:    v_nop_e64
757; GCN-NEXT:    ;;#ASMEND
758; GCN-NEXT:    s_mov_b64 vcc, vcc
759; GCN-NEXT:    s_cbranch_vccz .LBB6_2
760; GCN-NEXT:  ; %bb.3: ; %loop
761; GCN-NEXT:    ; in Loop: Header=BB6_1 Depth=1
762; GCN-NEXT:    s_getpc_b64 s[0:1]
763; GCN-NEXT:  .Lpost_getpc7:
764; GCN-NEXT:    s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc7)&4294967295
765; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc7)>>32
766; GCN-NEXT:    s_setpc_b64 s[0:1]
767; GCN-NEXT:  .LBB6_2: ; %DummyReturnBlock
768; GCN-NEXT:    s_endpgm
769;
770; GFX11-LABEL: uniform_unconditional_min_long_backward_branch:
771; GFX11:       ; %bb.0: ; %entry
772; GFX11-NEXT:    s_and_b64 vcc, exec, -1
773; GFX11-NEXT:    .p2align 6
774; GFX11-NEXT:  .LBB6_1: ; %loop
775; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
776; GFX11-NEXT:    ;;#ASMSTART
777; GFX11-NEXT:    v_nop_e64
778; GFX11-NEXT:    v_nop_e64
779; GFX11-NEXT:    v_nop_e64
780; GFX11-NEXT:    v_nop_e64
781; GFX11-NEXT:    ;;#ASMEND
782; GFX11-NEXT:    s_cbranch_vccz .LBB6_2
783; GFX11-NEXT:  ; %bb.3: ; %loop
784; GFX11-NEXT:    ; in Loop: Header=BB6_1 Depth=1
785; GFX11-NEXT:    s_getpc_b64 s[0:1]
786; GFX11-NEXT:  .Lpost_getpc7:
787; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
788; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB6_1-.Lpost_getpc7)&4294967295
789; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB6_1-.Lpost_getpc7)>>32
790; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
791; GFX11-NEXT:    s_setpc_b64 s[0:1]
792; GFX11-NEXT:  .LBB6_2: ; %DummyReturnBlock
793; GFX11-NEXT:    s_endpgm
794;
795; GFX12-LABEL: uniform_unconditional_min_long_backward_branch:
796; GFX12:       ; %bb.0: ; %entry
797; GFX12-NEXT:    s_mov_b32 vcc_lo, exec_lo
798; GFX12-NEXT:  .LBB6_1: ; %loop
799; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
800; GFX12-NEXT:    ;;#ASMSTART
801; GFX12-NEXT:    v_nop_e64
802; GFX12-NEXT:    v_nop_e64
803; GFX12-NEXT:    v_nop_e64
804; GFX12-NEXT:    v_nop_e64
805; GFX12-NEXT:    ;;#ASMEND
806; GFX12-NEXT:    s_cbranch_vccz .LBB6_2
807; GFX12-NEXT:  ; %bb.3: ; %loop
808; GFX12-NEXT:    ; in Loop: Header=BB6_1 Depth=1
809; GFX12-NEXT:    s_getpc_b64 s[0:1]
810; GFX12-NEXT:  .Lpost_getpc8:
811; GFX12-NEXT:    s_wait_alu 0xfffe
812; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB6_1-.Lpost_getpc8)&4294967295
813; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB6_1-.Lpost_getpc8)>>32
814; GFX12-NEXT:    s_wait_alu 0xfffe
815; GFX12-NEXT:    s_setpc_b64 s[0:1]
816; GFX12-NEXT:  .LBB6_2: ; %DummyReturnBlock
817; GFX12-NEXT:    s_endpgm
818entry:
819  br label %loop
820
821loop:
822  ; 32 byte asm
823  call void asm sideeffect
824  "v_nop_e64
825  v_nop_e64
826  v_nop_e64
827  v_nop_e64", ""() #0
828  br label %loop
829}
830
831; Expansion of branch from %bb1 to %bb3 introduces need to expand
832; branch from %bb0 to %bb2
833
; Kernel @expand_requires_expand(i32 %cond0).
; CFG: %bb0 goes to %bb2 when %cond0 < 0 and to %bb1 otherwise; %bb1 does a
; volatile load and goes to %bb3 when the loaded value equals 3, otherwise to
; %bb2; %bb2 falls through to %bb3, which returns.
; In the checks below, all three run lines reach %bb3 (.LBB7_4) from the %Flow
; block through an s_getpc_b64 ... s_setpc_b64 sequence instead of a direct
; conditional branch to that label.
834define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
835; GCN-LABEL: expand_requires_expand:
836; GCN:       ; %bb.0: ; %bb0
837; GCN-NEXT:    s_load_dword s0, s[4:5], 0x9
838; GCN-NEXT:    s_waitcnt lgkmcnt(0)
839; GCN-NEXT:    s_cmp_lt_i32 s0, 0
840; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
841; GCN-NEXT:    s_and_b64 vcc, exec, s[0:1]
842; GCN-NEXT:    s_cbranch_vccnz .LBB7_2
843; GCN-NEXT:  ; %bb.1: ; %bb1
844; GCN-NEXT:    s_load_dword s0, s[0:1], 0x0
845; GCN-NEXT:    s_waitcnt lgkmcnt(0)
846; GCN-NEXT:    s_cmp_lg_u32 s0, 3
847; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
848; GCN-NEXT:  .LBB7_2: ; %Flow
849; GCN-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
850; GCN-NEXT:    s_cbranch_vccz .LBB7_3
851; GCN-NEXT:  ; %bb.5: ; %Flow
852; GCN-NEXT:    s_getpc_b64 s[0:1]
853; GCN-NEXT:  .Lpost_getpc8:
854; GCN-NEXT:    s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc8)&4294967295
855; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc8)>>32
856; GCN-NEXT:    s_setpc_b64 s[0:1]
857; GCN-NEXT:  .LBB7_3: ; %bb2
858; GCN-NEXT:    ;;#ASMSTART
859; GCN-NEXT:    v_nop_e64
860; GCN-NEXT:    v_nop_e64
861; GCN-NEXT:    v_nop_e64
862; GCN-NEXT:    v_nop_e64
863; GCN-NEXT:    ;;#ASMEND
864; GCN-NEXT:  .LBB7_4: ; %bb3
865; GCN-NEXT:    ;;#ASMSTART
866; GCN-NEXT:    v_nop_e64
867; GCN-NEXT:    ;;#ASMEND
868; GCN-NEXT:    ;;#ASMSTART
869; GCN-NEXT:    v_nop_e64
870; GCN-NEXT:    ;;#ASMEND
871; GCN-NEXT:    s_endpgm
872;
873; GFX11-LABEL: expand_requires_expand:
874; GFX11:       ; %bb.0: ; %bb0
875; GFX11-NEXT:    s_load_b32 s0, s[4:5], 0x24
876; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
877; GFX11-NEXT:    s_cmp_lt_i32 s0, 0
878; GFX11-NEXT:    s_cselect_b64 s[0:1], -1, 0
879; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
880; GFX11-NEXT:    s_and_b64 vcc, exec, s[0:1]
881; GFX11-NEXT:    s_cbranch_vccnz .LBB7_2
882; GFX11-NEXT:  ; %bb.1: ; %bb1
883; GFX11-NEXT:    s_load_b32 s0, s[0:1], 0x0
884; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
885; GFX11-NEXT:    s_cmp_lg_u32 s0, 3
886; GFX11-NEXT:    s_cselect_b64 s[0:1], -1, 0
887; GFX11-NEXT:  .LBB7_2: ; %Flow
888; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
889; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[0:1]
890; GFX11-NEXT:    s_cbranch_vccz .LBB7_3
891; GFX11-NEXT:  ; %bb.5: ; %Flow
892; GFX11-NEXT:    s_getpc_b64 s[0:1]
893; GFX11-NEXT:  .Lpost_getpc8:
894; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
895; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB7_4-.Lpost_getpc8)&4294967295
896; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB7_4-.Lpost_getpc8)>>32
897; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
898; GFX11-NEXT:    s_setpc_b64 s[0:1]
899; GFX11-NEXT:  .LBB7_3: ; %bb2
900; GFX11-NEXT:    ;;#ASMSTART
901; GFX11-NEXT:    v_nop_e64
902; GFX11-NEXT:    v_nop_e64
903; GFX11-NEXT:    v_nop_e64
904; GFX11-NEXT:    v_nop_e64
905; GFX11-NEXT:    ;;#ASMEND
906; GFX11-NEXT:  .LBB7_4: ; %bb3
907; GFX11-NEXT:    ;;#ASMSTART
908; GFX11-NEXT:    v_nop_e64
909; GFX11-NEXT:    ;;#ASMEND
910; GFX11-NEXT:    ;;#ASMSTART
911; GFX11-NEXT:    v_nop_e64
912; GFX11-NEXT:    ;;#ASMEND
913; GFX11-NEXT:    s_endpgm
914;
915; GFX12-LABEL: expand_requires_expand:
916; GFX12:       ; %bb.0: ; %bb0
917; GFX12-NEXT:    s_load_b32 s0, s[4:5], 0x24
918; GFX12-NEXT:    s_wait_kmcnt 0x0
919; GFX12-NEXT:    s_cmp_lt_i32 s0, 0
920; GFX12-NEXT:    s_cselect_b32 s0, -1, 0
921; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
922; GFX12-NEXT:    s_and_b32 vcc_lo, exec_lo, s0
923; GFX12-NEXT:    s_cbranch_vccnz .LBB7_2
924; GFX12-NEXT:  ; %bb.1: ; %bb1
925; GFX12-NEXT:    s_load_b32 s0, s[0:1], 0x0
926; GFX12-NEXT:    s_wait_kmcnt 0x0
927; GFX12-NEXT:    s_cmp_lg_u32 s0, 3
928; GFX12-NEXT:    s_cselect_b32 s0, -1, 0
929; GFX12-NEXT:  .LBB7_2: ; %Flow
930; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
931; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
932; GFX12-NEXT:    s_cbranch_vccz .LBB7_3
933; GFX12-NEXT:  ; %bb.5: ; %Flow
934; GFX12-NEXT:    s_getpc_b64 s[0:1]
935; GFX12-NEXT:  .Lpost_getpc9:
936; GFX12-NEXT:    s_wait_alu 0xfffe
937; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB7_4-.Lpost_getpc9)&4294967295
938; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB7_4-.Lpost_getpc9)>>32
939; GFX12-NEXT:    s_wait_alu 0xfffe
940; GFX12-NEXT:    s_setpc_b64 s[0:1]
941; GFX12-NEXT:  .LBB7_3: ; %bb2
942; GFX12-NEXT:    ;;#ASMSTART
943; GFX12-NEXT:    v_nop_e64
944; GFX12-NEXT:    v_nop_e64
945; GFX12-NEXT:    v_nop_e64
946; GFX12-NEXT:    v_nop_e64
947; GFX12-NEXT:    ;;#ASMEND
948; GFX12-NEXT:  .LBB7_4: ; %bb3
949; GFX12-NEXT:    ;;#ASMSTART
950; GFX12-NEXT:    v_nop_e64
951; GFX12-NEXT:    ;;#ASMEND
952; GFX12-NEXT:    ;;#ASMSTART
953; GFX12-NEXT:    v_nop_e64
954; GFX12-NEXT:    ;;#ASMEND
955; GFX12-NEXT:    s_endpgm
956bb0:
  ; %tmp is not referenced anywhere else in this function.
957  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
958  %cmp0 = icmp slt i32 %cond0, 0
959  br i1 %cmp0, label %bb2, label %bb1
960
961bb1:
  ; Volatile load through an undef addrspace(4) pointer; the loaded value is
  ; used only to form the branch condition %cmp1.
962  %val = load volatile i32, ptr addrspace(4) undef
963  %cmp1 = icmp eq i32 %val, 3
964  br i1 %cmp1, label %bb3, label %bb2
965
966bb2:
  ; Four v_nop_e64 in one inline asm statement. In the checks this block sits
  ; between the expanded branch and its target %bb3 (.LBB7_3 .. .LBB7_4).
967  call void asm sideeffect
968  "v_nop_e64
969  v_nop_e64
970  v_nop_e64
971  v_nop_e64", ""() #0
972  br label %bb3
973
974bb3:
975; These NOPs prevent tail-duplication-based outlining
976; from firing, which defeats the need to expand the branches and this test.
977  call void asm sideeffect
978  "v_nop_e64", ""() #0
979  call void asm sideeffect
980  "v_nop_e64", ""() #0
981  ret void
982}
983
984; Requires expanding of required skip branch.
985
; Kernel @uniform_inside_divergent(ptr addrspace(1) %out, i32 %cond).
; An outer branch on the workitem id (%tid < 16) encloses an inner branch on
; the kernel argument (%cond == 0); %endif is the join point for both.
; In the GCN checks (RUN line with -amdgpu-s-branch-bits=4) the skip from
; %entry to %endif is emitted as s_cbranch_execnz over an s_getpc_b64 ...
; s_setpc_b64 sequence targeting .LBB8_3. The GFX11 and GFX12 checks (RUN lines
; with -amdgpu-s-branch-bits=5) use a plain s_cbranch_execz .LBB8_3.
986define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %cond) #0 {
987; GCN-LABEL: uniform_inside_divergent:
988; GCN:       ; %bb.0: ; %entry
989; GCN-NEXT:    v_cmp_gt_u32_e32 vcc, 16, v0
990; GCN-NEXT:    s_and_saveexec_b64 s[6:7], vcc
991; GCN-NEXT:    s_cbranch_execnz .LBB8_1
992; GCN-NEXT:  ; %bb.4: ; %entry
993; GCN-NEXT:    s_getpc_b64 s[0:1]
994; GCN-NEXT:  .Lpost_getpc9:
995; GCN-NEXT:    s_add_u32 s0, s0, (.LBB8_3-.Lpost_getpc9)&4294967295
996; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB8_3-.Lpost_getpc9)>>32
997; GCN-NEXT:    s_setpc_b64 s[0:1]
998; GCN-NEXT:  .LBB8_1: ; %if
999; GCN-NEXT:    s_load_dword s8, s[4:5], 0xb
1000; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1001; GCN-NEXT:    s_mov_b32 s3, 0xf000
1002; GCN-NEXT:    s_mov_b32 s2, -1
1003; GCN-NEXT:    v_mov_b32_e32 v0, 0
1004; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1005; GCN-NEXT:    s_cmp_lg_u32 s8, 0
1006; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1007; GCN-NEXT:    s_cbranch_scc1 .LBB8_3
1008; GCN-NEXT:  ; %bb.2: ; %if_uniform
1009; GCN-NEXT:    s_waitcnt expcnt(0)
1010; GCN-NEXT:    v_mov_b32_e32 v0, 1
1011; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1012; GCN-NEXT:  .LBB8_3: ; %endif
1013; GCN-NEXT:    s_or_b64 exec, exec, s[6:7]
1014; GCN-NEXT:    s_sleep 5
1015; GCN-NEXT:    s_endpgm
1016;
1017; GFX11-LABEL: uniform_inside_divergent:
1018; GFX11:       ; %bb.0: ; %entry
1019; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1020; GFX11-NEXT:    s_mov_b64 s[0:1], exec
1021; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1022; GFX11-NEXT:    v_cmpx_gt_u32_e32 16, v0
1023; GFX11-NEXT:    s_cbranch_execz .LBB8_3
1024; GFX11-NEXT:  ; %bb.1: ; %if
1025; GFX11-NEXT:    s_clause 0x1
1026; GFX11-NEXT:    s_load_b32 s6, s[4:5], 0x2c
1027; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
1028; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1029; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1030; GFX11-NEXT:    s_cmp_lg_u32 s6, 0
1031; GFX11-NEXT:    global_store_b32 v0, v0, s[2:3]
1032; GFX11-NEXT:    s_cbranch_scc1 .LBB8_3
1033; GFX11-NEXT:  ; %bb.2: ; %if_uniform
1034; GFX11-NEXT:    v_mov_b32_e32 v1, 1
1035; GFX11-NEXT:    global_store_b32 v0, v1, s[2:3]
1036; GFX11-NEXT:  .LBB8_3: ; %endif
1037; GFX11-NEXT:    s_or_b64 exec, exec, s[0:1]
1038; GFX11-NEXT:    s_sleep 5
1039; GFX11-NEXT:    s_endpgm
1040;
1041; GFX12-LABEL: uniform_inside_divergent:
1042; GFX12:       ; %bb.0: ; %entry
1043; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
1044; GFX12-NEXT:    s_mov_b32 s3, exec_lo
1045; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1046; GFX12-NEXT:    v_cmpx_gt_u32_e32 16, v0
1047; GFX12-NEXT:    s_cbranch_execz .LBB8_3
1048; GFX12-NEXT:  ; %bb.1: ; %if
1049; GFX12-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
1050; GFX12-NEXT:    v_mov_b32_e32 v0, 0
1051; GFX12-NEXT:    s_wait_kmcnt 0x0
1052; GFX12-NEXT:    s_cmp_lg_u32 s2, 0
1053; GFX12-NEXT:    global_store_b32 v0, v0, s[0:1]
1054; GFX12-NEXT:    s_cbranch_scc1 .LBB8_3
1055; GFX12-NEXT:  ; %bb.2: ; %if_uniform
1056; GFX12-NEXT:    v_mov_b32_e32 v1, 1
1057; GFX12-NEXT:    global_store_b32 v0, v1, s[0:1]
1058; GFX12-NEXT:  .LBB8_3: ; %endif
1059; GFX12-NEXT:    s_or_b32 exec_lo, exec_lo, s3
1060; GFX12-NEXT:    s_sleep 5
1061; GFX12-NEXT:    s_endpgm
1062entry:
  ; Outer condition: depends on the workitem id, so it is lowered with exec
  ; masking in the checks (s_and_saveexec_b64 / v_cmpx_gt_u32).
1063  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1064  %d_cmp = icmp ult i32 %tid, 16
1065  br i1 %d_cmp, label %if, label %endif
1066
1067if:
  ; Inner condition: depends only on the kernel argument %cond and is lowered
  ; to a scalar compare plus s_cbranch_scc1 in the checks.
1068  store i32 0, ptr addrspace(1) %out
1069  %u_cmp = icmp eq i32 %cond, 0
1070  br i1 %u_cmp, label %if_uniform, label %endif
1071
1072if_uniform:
1073  store i32 1, ptr addrspace(1) %out
1074  br label %endif
1075
1076endif:
1077  ; layout can remove the split branch if it can copy the return block.
1078  ; This call makes the return block long enough that it doesn't get copied.
1079  call void @llvm.amdgcn.s.sleep(i32 5);
1080  ret void
1081}
1082
1083; si_mask_branch
1084
; Kernel @analyze_mask_branch (no arguments).
; %entry obtains a float from inline asm ("=v" output) and branches on
; %reg > 0.0 to either the %loop / %loop_body cycle or %ret. In the checks this
; branch is lowered with exec masking (s_and_saveexec_b64 or v_cmpx, followed
; by s_cbranch_execz / s_cbranch_execnz).
; All three run lines expand both the branch from %Flow1 to the return block
; (.LBB9_5) and the loop back-edge to .LBB9_4 into s_getpc_b64 ...
; s_setpc_b64 sequences.
1085define amdgpu_kernel void @analyze_mask_branch() #0 {
1086; GCN-LABEL: analyze_mask_branch:
1087; GCN:       ; %bb.0: ; %entry
1088; GCN-NEXT:    ;;#ASMSTART
1089; GCN-NEXT:    v_mov_b32_e64 v0, 0
1090; GCN-NEXT:    ;;#ASMEND
1091; GCN-NEXT:    v_cmp_nlt_f32_e32 vcc, 0, v0
1092; GCN-NEXT:    s_and_saveexec_b64 s[0:1], vcc
1093; GCN-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1094; GCN-NEXT:    s_cbranch_execz .LBB9_2
1095; GCN-NEXT:  ; %bb.1: ; %ret
1096; GCN-NEXT:    s_mov_b32 s3, 0xf000
1097; GCN-NEXT:    s_mov_b32 s2, -1
1098; GCN-NEXT:    v_mov_b32_e32 v0, 7
1099; GCN-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1100; GCN-NEXT:    s_waitcnt vmcnt(0)
1101; GCN-NEXT:  .LBB9_2: ; %Flow1
1102; GCN-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
1103; GCN-NEXT:    s_cbranch_execnz .LBB9_3
1104; GCN-NEXT:  ; %bb.6: ; %Flow1
1105; GCN-NEXT:    s_getpc_b64 s[0:1]
1106; GCN-NEXT:  .Lpost_getpc10:
1107; GCN-NEXT:    s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295
1108; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32
1109; GCN-NEXT:    s_setpc_b64 s[0:1]
1110; GCN-NEXT:  .LBB9_3: ; %loop.preheader
1111; GCN-NEXT:    s_and_b64 vcc, exec, 0
1112; GCN-NEXT:  .LBB9_4: ; %loop
1113; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
1114; GCN-NEXT:    ;;#ASMSTART
1115; GCN-NEXT:    v_nop_e64
1116; GCN-NEXT:    v_nop_e64
1117; GCN-NEXT:    ;;#ASMEND
1118; GCN-NEXT:    ;;#ASMSTART
1119; GCN-NEXT:    v_nop_e64
1120; GCN-NEXT:    v_nop_e64
1121; GCN-NEXT:    v_nop_e64
1122; GCN-NEXT:    v_nop_e64
1123; GCN-NEXT:    ;;#ASMEND
1124; GCN-NEXT:    s_mov_b64 vcc, vcc
1125; GCN-NEXT:    s_cbranch_vccnz .LBB9_5
1126; GCN-NEXT:  ; %bb.8: ; %loop
1127; GCN-NEXT:    ; in Loop: Header=BB9_4 Depth=1
1128; GCN-NEXT:    s_getpc_b64 s[0:1]
1129; GCN-NEXT:  .Lpost_getpc11:
1130; GCN-NEXT:    s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295
1131; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32
1132; GCN-NEXT:    s_setpc_b64 s[0:1]
1133; GCN-NEXT:  .LBB9_5: ; %UnifiedReturnBlock
1134; GCN-NEXT:    s_endpgm
1135;
1136; GFX11-LABEL: analyze_mask_branch:
1137; GFX11:       ; %bb.0: ; %entry
1138; GFX11-NEXT:    s_mov_b64 s[0:1], exec
1139; GFX11-NEXT:    ;;#ASMSTART
1140; GFX11-NEXT:    v_mov_b32_e64 v0, 0
1141; GFX11-NEXT:    ;;#ASMEND
1142; GFX11-NEXT:    v_cmpx_nlt_f32_e32 0, v0
1143; GFX11-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
1144; GFX11-NEXT:    s_cbranch_execz .LBB9_2
1145; GFX11-NEXT:  ; %bb.1: ; %ret
1146; GFX11-NEXT:    v_mov_b32_e32 v0, 7
1147; GFX11-NEXT:    global_store_b32 v[0:1], v0, off dlc
1148; GFX11-NEXT:    s_waitcnt_vscnt null, 0x0
1149; GFX11-NEXT:  .LBB9_2: ; %Flow1
1150; GFX11-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
1151; GFX11-NEXT:    s_cbranch_execnz .LBB9_3
1152; GFX11-NEXT:  ; %bb.6: ; %Flow1
1153; GFX11-NEXT:    s_getpc_b64 s[0:1]
1154; GFX11-NEXT:  .Lpost_getpc9:
1155; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1156; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_5-.Lpost_getpc9)&4294967295
1157; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_5-.Lpost_getpc9)>>32
1158; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1159; GFX11-NEXT:    s_setpc_b64 s[0:1]
1160; GFX11-NEXT:  .LBB9_3: ; %loop.preheader
1161; GFX11-NEXT:    s_and_b64 vcc, exec, 0
1162; GFX11-NEXT:    .p2align 6
1163; GFX11-NEXT:  .LBB9_4: ; %loop
1164; GFX11-NEXT:    ; =>This Inner Loop Header: Depth=1
1165; GFX11-NEXT:    ;;#ASMSTART
1166; GFX11-NEXT:    v_nop_e64
1167; GFX11-NEXT:    v_nop_e64
1168; GFX11-NEXT:    ;;#ASMEND
1169; GFX11-NEXT:    ;;#ASMSTART
1170; GFX11-NEXT:    v_nop_e64
1171; GFX11-NEXT:    v_nop_e64
1172; GFX11-NEXT:    v_nop_e64
1173; GFX11-NEXT:    v_nop_e64
1174; GFX11-NEXT:    ;;#ASMEND
1175; GFX11-NEXT:    s_cbranch_vccnz .LBB9_5
1176; GFX11-NEXT:  ; %bb.8: ; %loop
1177; GFX11-NEXT:    ; in Loop: Header=BB9_4 Depth=1
1178; GFX11-NEXT:    s_getpc_b64 s[0:1]
1179; GFX11-NEXT:  .Lpost_getpc10:
1180; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1181; GFX11-NEXT:    s_add_u32 s0, s0, (.LBB9_4-.Lpost_getpc10)&4294967295
1182; GFX11-NEXT:    s_addc_u32 s1, s1, (.LBB9_4-.Lpost_getpc10)>>32
1183; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1184; GFX11-NEXT:    s_setpc_b64 s[0:1]
1185; GFX11-NEXT:  .LBB9_5: ; %UnifiedReturnBlock
1186; GFX11-NEXT:    s_endpgm
1187;
1188; GFX12-LABEL: analyze_mask_branch:
1189; GFX12:       ; %bb.0: ; %entry
1190; GFX12-NEXT:    s_mov_b32 s0, exec_lo
1191; GFX12-NEXT:    ;;#ASMSTART
1192; GFX12-NEXT:    v_mov_b32_e64 v0, 0
1193; GFX12-NEXT:    ;;#ASMEND
1194; GFX12-NEXT:    v_cmpx_nlt_f32_e32 0, v0
1195; GFX12-NEXT:    s_xor_b32 s0, exec_lo, s0
1196; GFX12-NEXT:    s_cbranch_execz .LBB9_2
1197; GFX12-NEXT:  ; %bb.1: ; %ret
1198; GFX12-NEXT:    v_mov_b32_e32 v0, 7
1199; GFX12-NEXT:    global_store_b32 v[0:1], v0, off scope:SCOPE_SYS
1200; GFX12-NEXT:    s_wait_storecnt 0x0
1201; GFX12-NEXT:  .LBB9_2: ; %Flow1
1202; GFX12-NEXT:    s_and_not1_saveexec_b32 s0, s0
1203; GFX12-NEXT:    s_cbranch_execnz .LBB9_3
1204; GFX12-NEXT:  ; %bb.6: ; %Flow1
1205; GFX12-NEXT:    s_getpc_b64 s[0:1]
1206; GFX12-NEXT:  .Lpost_getpc10:
1207; GFX12-NEXT:    s_wait_alu 0xfffe
1208; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB9_5-.Lpost_getpc10)&4294967295
1209; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB9_5-.Lpost_getpc10)>>32
1210; GFX12-NEXT:    s_wait_alu 0xfffe
1211; GFX12-NEXT:    s_setpc_b64 s[0:1]
1212; GFX12-NEXT:  .LBB9_3: ; %loop.preheader
1213; GFX12-NEXT:    s_mov_b32 vcc_lo, 0
1214; GFX12-NEXT:  .LBB9_4: ; %loop
1215; GFX12-NEXT:    ; =>This Inner Loop Header: Depth=1
1216; GFX12-NEXT:    ;;#ASMSTART
1217; GFX12-NEXT:    v_nop_e64
1218; GFX12-NEXT:    v_nop_e64
1219; GFX12-NEXT:    ;;#ASMEND
1220; GFX12-NEXT:    ;;#ASMSTART
1221; GFX12-NEXT:    v_nop_e64
1222; GFX12-NEXT:    v_nop_e64
1223; GFX12-NEXT:    v_nop_e64
1224; GFX12-NEXT:    v_nop_e64
1225; GFX12-NEXT:    ;;#ASMEND
1226; GFX12-NEXT:    s_cbranch_vccnz .LBB9_5
1227; GFX12-NEXT:  ; %bb.8: ; %loop
1228; GFX12-NEXT:    ; in Loop: Header=BB9_4 Depth=1
1229; GFX12-NEXT:    s_getpc_b64 s[0:1]
1230; GFX12-NEXT:  .Lpost_getpc11:
1231; GFX12-NEXT:    s_wait_alu 0xfffe
1232; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB9_4-.Lpost_getpc11)&4294967295
1233; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB9_4-.Lpost_getpc11)>>32
1234; GFX12-NEXT:    s_wait_alu 0xfffe
1235; GFX12-NEXT:    s_setpc_b64 s[0:1]
1236; GFX12-NEXT:  .LBB9_5: ; %UnifiedReturnBlock
1237; GFX12-NEXT:    s_endpgm
1238entry:
  ; The compared value comes from inline asm, so it is opaque to the optimizer.
1239  %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
1240  %cmp0 = fcmp ogt float %reg, 0.000000e+00
1241  br i1 %cmp0, label %loop, label %ret
1242
1243loop:
  ; %phi is 1.0 on entry and 0.0 when coming from %loop_body, so %cmp1
  ; (%phi < 8.0) holds on every iteration. The checks materialize a constant
  ; condition ahead of the loop (s_and_b64 vcc, exec, 0 / s_mov_b32 vcc_lo, 0).
1244  %phi = phi float [ 0.000000e+00, %loop_body ], [ 1.000000e+00, %entry ]
1245  call void asm sideeffect
1246  "v_nop_e64
1247  v_nop_e64", ""() #0
1248  %cmp1 = fcmp olt float %phi, 8.0
1249  br i1 %cmp1, label %loop_body, label %ret
1250
1251loop_body:
  ; Four more v_nop_e64 that lengthen the loop body ahead of the back-edge.
1252  call void asm sideeffect
1253  "v_nop_e64
1254  v_nop_e64
1255  v_nop_e64
1256  v_nop_e64", ""() #0
1257  br label %loop
1258
1259ret:
  ; Volatile store of 7 through an undef addrspace(1) pointer.
1260  store volatile i32 7, ptr addrspace(1) undef
1261  ret void
1262}
1263
; Kernel @long_branch_hang.
; %bb selects %bb9 when %arg4 > 5 and %bb13 otherwise. Each of those either
; continues to %bb14 or skips straight to %bb19. %bb19 stores the phi %tmp20
; (undef on the edges from %bb9 and %bb13, %tmp18 from %bb14) to the element of
; %arg at index %arg5.
; The GCN and GFX11 checks contain two and one s_getpc_b64 ... s_setpc_b64
; expansions respectively; the GFX12 checks contain six (.Lpost_getpc12
; through .Lpost_getpc17).
; NOTE(review): the name suggests this is a regression test for a hang during
; branch relaxation -- confirm against the originating commit.
1264define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
1265; GCN-LABEL: long_branch_hang:
1266; GCN:       ; %bb.0: ; %bb
1267; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
1268; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1269; GCN-NEXT:    s_cmp_eq_u32 s0, 0
1270; GCN-NEXT:    s_cselect_b64 s[6:7], -1, 0
1271; GCN-NEXT:    s_cmp_lg_u32 s0, 0
1272; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
1273; GCN-NEXT:    s_cmp_lt_i32 s3, 6
1274; GCN-NEXT:    s_cbranch_scc1 .LBB10_1
1275; GCN-NEXT:  ; %bb.8: ; %bb
1276; GCN-NEXT:    s_getpc_b64 s[8:9]
1277; GCN-NEXT:  .Lpost_getpc12:
1278; GCN-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295
1279; GCN-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32
1280; GCN-NEXT:    s_setpc_b64 s[8:9]
1281; GCN-NEXT:  .LBB10_1: ; %bb13
1282; GCN-NEXT:    ;;#ASMSTART
1283; GCN-NEXT:    v_nop_e64
1284; GCN-NEXT:    v_nop_e64
1285; GCN-NEXT:    v_nop_e64
1286; GCN-NEXT:    v_nop_e64
1287; GCN-NEXT:    ;;#ASMEND
1288; GCN-NEXT:    s_cbranch_execz .LBB10_3
1289; GCN-NEXT:    s_branch .LBB10_4
1290; GCN-NEXT:  .LBB10_2:
1291; GCN-NEXT:    s_mov_b64 s[8:9], 0
1292; GCN-NEXT:  .LBB10_3: ; %bb9
1293; GCN-NEXT:    s_cmp_lt_i32 s3, 11
1294; GCN-NEXT:    s_cselect_b64 s[8:9], -1, 0
1295; GCN-NEXT:    s_cmp_ge_i32 s2, s3
1296; GCN-NEXT:    s_cselect_b64 s[10:11], -1, 0
1297; GCN-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
1298; GCN-NEXT:  .LBB10_4: ; %Flow5
1299; GCN-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
1300; GCN-NEXT:    s_cbranch_vccz .LBB10_5
1301; GCN-NEXT:  ; %bb.10: ; %Flow5
1302; GCN-NEXT:    s_getpc_b64 s[0:1]
1303; GCN-NEXT:  .Lpost_getpc13:
1304; GCN-NEXT:    s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295
1305; GCN-NEXT:    s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc13)>>32
1306; GCN-NEXT:    s_setpc_b64 s[0:1]
1307; GCN-NEXT:  .LBB10_5: ; %bb14
1308; GCN-NEXT:    s_cmp_lt_i32 s1, 9
1309; GCN-NEXT:    s_cselect_b64 s[0:1], -1, 0
1310; GCN-NEXT:    s_cmp_lt_i32 s2, s3
1311; GCN-NEXT:    s_cselect_b64 s[2:3], -1, 0
1312; GCN-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1313; GCN-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1314; GCN-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1315; GCN-NEXT:    s_branch .LBB10_7
1316; GCN-NEXT:  .LBB10_6:
1317; GCN-NEXT:    ; implicit-def: $vgpr0
1318; GCN-NEXT:  .LBB10_7: ; %bb19
1319; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xf
1320; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1321; GCN-NEXT:    s_mov_b32 s3, 0xf000
1322; GCN-NEXT:    s_mov_b32 s2, 0
1323; GCN-NEXT:    s_waitcnt lgkmcnt(0)
1324; GCN-NEXT:    s_lshl_b64 s[4:5], s[6:7], 2
1325; GCN-NEXT:    v_mov_b32_e32 v1, s4
1326; GCN-NEXT:    v_mov_b32_e32 v2, s5
1327; GCN-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
1328; GCN-NEXT:    s_endpgm
1329;
1330; GFX11-LABEL: long_branch_hang:
1331; GFX11:       ; %bb.0: ; %bb
1332; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
1333; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1334; GFX11-NEXT:    s_cmp_eq_u32 s0, 0
1335; GFX11-NEXT:    s_cselect_b64 s[6:7], -1, 0
1336; GFX11-NEXT:    s_cmp_lg_u32 s0, 0
1337; GFX11-NEXT:    s_cselect_b64 s[8:9], -1, 0
1338; GFX11-NEXT:    s_cmp_lt_i32 s3, 6
1339; GFX11-NEXT:    s_cbranch_scc1 .LBB10_1
1340; GFX11-NEXT:  ; %bb.8: ; %bb
1341; GFX11-NEXT:    s_getpc_b64 s[8:9]
1342; GFX11-NEXT:  .Lpost_getpc11:
1343; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1344; GFX11-NEXT:    s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc11)&4294967295
1345; GFX11-NEXT:    s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc11)>>32
1346; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1347; GFX11-NEXT:    s_setpc_b64 s[8:9]
1348; GFX11-NEXT:  .LBB10_1: ; %bb13
1349; GFX11-NEXT:    ;;#ASMSTART
1350; GFX11-NEXT:    v_nop_e64
1351; GFX11-NEXT:    v_nop_e64
1352; GFX11-NEXT:    v_nop_e64
1353; GFX11-NEXT:    v_nop_e64
1354; GFX11-NEXT:    ;;#ASMEND
1355; GFX11-NEXT:    s_cbranch_execz .LBB10_3
1356; GFX11-NEXT:    s_branch .LBB10_4
1357; GFX11-NEXT:  .LBB10_2:
1358; GFX11-NEXT:    s_mov_b64 s[8:9], 0
1359; GFX11-NEXT:  .LBB10_3: ; %bb9
1360; GFX11-NEXT:    s_cmp_lt_i32 s3, 11
1361; GFX11-NEXT:    s_cselect_b64 s[8:9], -1, 0
1362; GFX11-NEXT:    s_cmp_ge_i32 s2, s3
1363; GFX11-NEXT:    s_cselect_b64 s[10:11], -1, 0
1364; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1365; GFX11-NEXT:    s_and_b64 s[8:9], s[10:11], s[8:9]
1366; GFX11-NEXT:  .LBB10_4: ; %Flow5
1367; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1368; GFX11-NEXT:    s_and_not1_b64 vcc, exec, s[8:9]
1369; GFX11-NEXT:    s_cbranch_vccnz .LBB10_6
1370; GFX11-NEXT:  ; %bb.5: ; %bb14
1371; GFX11-NEXT:    s_cmp_lt_i32 s1, 9
1372; GFX11-NEXT:    s_cselect_b64 s[0:1], -1, 0
1373; GFX11-NEXT:    s_cmp_lt_i32 s2, s3
1374; GFX11-NEXT:    s_cselect_b64 s[2:3], -1, 0
1375; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1376; GFX11-NEXT:    s_or_b64 s[0:1], s[2:3], s[0:1]
1377; GFX11-NEXT:    s_and_b64 s[0:1], s[6:7], s[0:1]
1378; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1379; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
1380; GFX11-NEXT:    s_branch .LBB10_7
1381; GFX11-NEXT:  .LBB10_6:
1382; GFX11-NEXT:    ; implicit-def: $vgpr0
1383; GFX11-NEXT:  .LBB10_7: ; %bb19
1384; GFX11-NEXT:    s_clause 0x1
1385; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c
1386; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
1387; GFX11-NEXT:    v_mov_b32_e32 v1, 0
1388; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1389; GFX11-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1390; GFX11-NEXT:    s_waitcnt_depctr 0xfffe
1391; GFX11-NEXT:    s_add_u32 s0, s2, s0
1392; GFX11-NEXT:    s_addc_u32 s1, s3, s1
1393; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1394; GFX11-NEXT:    s_endpgm
1395;
1396; GFX12-LABEL: long_branch_hang:
1397; GFX12:       ; %bb.0: ; %bb
1398; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
1399; GFX12-NEXT:    s_mov_b32 s7, -1
1400; GFX12-NEXT:    s_wait_kmcnt 0x0
1401; GFX12-NEXT:    s_cmp_eq_u32 s0, 0
1402; GFX12-NEXT:    s_cselect_b32 s6, -1, 0
1403; GFX12-NEXT:    s_cmp_lg_u32 s0, 0
1404; GFX12-NEXT:    s_mov_b32 s0, 0
1405; GFX12-NEXT:    s_cselect_b32 s8, -1, 0
1406; GFX12-NEXT:    s_cmp_lt_i32 s3, 6
1407; GFX12-NEXT:    s_cbranch_scc0 .LBB10_1
1408; GFX12-NEXT:  ; %bb.18: ; %bb
1409; GFX12-NEXT:    s_getpc_b64 s[10:11]
1410; GFX12-NEXT:  .Lpost_getpc17:
1411; GFX12-NEXT:    s_wait_alu 0xfffe
1412; GFX12-NEXT:    s_add_co_u32 s10, s10, (.LBB10_4-.Lpost_getpc17)&4294967295
1413; GFX12-NEXT:    s_add_co_ci_u32 s11, s11, (.LBB10_4-.Lpost_getpc17)>>32
1414; GFX12-NEXT:    s_wait_alu 0xfffe
1415; GFX12-NEXT:    s_setpc_b64 s[10:11]
1416; GFX12-NEXT:  .LBB10_1: ; %Flow
1417; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s7
1418; GFX12-NEXT:    s_cbranch_vccnz .LBB10_2
1419; GFX12-NEXT:  ; %bb.10: ; %Flow
1420; GFX12-NEXT:    s_getpc_b64 s[8:9]
1421; GFX12-NEXT:  .Lpost_getpc13:
1422; GFX12-NEXT:    s_wait_alu 0xfffe
1423; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_5-.Lpost_getpc13)&4294967295
1424; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_5-.Lpost_getpc13)>>32
1425; GFX12-NEXT:    s_wait_alu 0xfffe
1426; GFX12-NEXT:    s_setpc_b64 s[8:9]
1427; GFX12-NEXT:  .LBB10_2: ; %Flow5
1428; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
1429; GFX12-NEXT:    s_cbranch_vccz .LBB10_3
1430; GFX12-NEXT:  ; %bb.12: ; %Flow5
1431; GFX12-NEXT:    s_getpc_b64 s[0:1]
1432; GFX12-NEXT:  .Lpost_getpc14:
1433; GFX12-NEXT:    s_wait_alu 0xfffe
1434; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB10_6-.Lpost_getpc14)&4294967295
1435; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB10_6-.Lpost_getpc14)>>32
1436; GFX12-NEXT:    s_wait_alu 0xfffe
1437; GFX12-NEXT:    s_setpc_b64 s[0:1]
1438; GFX12-NEXT:  .LBB10_3: ; %bb14
1439; GFX12-NEXT:    s_cmp_lt_i32 s1, 9
1440; GFX12-NEXT:    s_cselect_b32 s0, -1, 0
1441; GFX12-NEXT:    s_cmp_lt_i32 s2, s3
1442; GFX12-NEXT:    s_cselect_b32 s1, -1, 0
1443; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1444; GFX12-NEXT:    s_or_b32 s0, s1, s0
1445; GFX12-NEXT:    s_and_b32 s0, s6, s0
1446; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
1447; GFX12-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
1448; GFX12-NEXT:  ; %bb.8: ; %bb14
1449; GFX12-NEXT:    s_getpc_b64 s[0:1]
1450; GFX12-NEXT:  .Lpost_getpc12:
1451; GFX12-NEXT:    s_wait_alu 0xfffe
1452; GFX12-NEXT:    s_add_co_u32 s0, s0, (.LBB10_7-.Lpost_getpc12)&4294967295
1453; GFX12-NEXT:    s_add_co_ci_u32 s1, s1, (.LBB10_7-.Lpost_getpc12)>>32
1454; GFX12-NEXT:    s_wait_alu 0xfffe
1455; GFX12-NEXT:    s_setpc_b64 s[0:1]
1456; GFX12-NEXT:  .LBB10_4: ; %bb13
1457; GFX12-NEXT:    s_mov_b32 s0, s8
1458; GFX12-NEXT:    ;;#ASMSTART
1459; GFX12-NEXT:    v_nop_e64
1460; GFX12-NEXT:    v_nop_e64
1461; GFX12-NEXT:    v_nop_e64
1462; GFX12-NEXT:    v_nop_e64
1463; GFX12-NEXT:    ;;#ASMEND
1464; GFX12-NEXT:    s_cbranch_execz .LBB10_5
1465; GFX12-NEXT:  ; %bb.14: ; %bb13
1466; GFX12-NEXT:    s_getpc_b64 s[8:9]
1467; GFX12-NEXT:  .Lpost_getpc15:
1468; GFX12-NEXT:    s_wait_alu 0xfffe
1469; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_2-.Lpost_getpc15)&4294967295
1470; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_2-.Lpost_getpc15)>>32
1471; GFX12-NEXT:    s_wait_alu 0xfffe
1472; GFX12-NEXT:    s_setpc_b64 s[8:9]
1473; GFX12-NEXT:  .LBB10_5: ; %bb9
1474; GFX12-NEXT:    s_cmp_lt_i32 s3, 11
1475; GFX12-NEXT:    s_cselect_b32 s0, -1, 0
1476; GFX12-NEXT:    s_cmp_ge_i32 s2, s3
1477; GFX12-NEXT:    s_cselect_b32 s7, -1, 0
1478; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
1479; GFX12-NEXT:    s_and_b32 s0, s7, s0
1480; GFX12-NEXT:    s_and_not1_b32 vcc_lo, exec_lo, s0
1481; GFX12-NEXT:    s_cbranch_vccnz .LBB10_6
1482; GFX12-NEXT:  ; %bb.16: ; %bb9
1483; GFX12-NEXT:    s_getpc_b64 s[8:9]
1484; GFX12-NEXT:  .Lpost_getpc16:
1485; GFX12-NEXT:    s_wait_alu 0xfffe
1486; GFX12-NEXT:    s_add_co_u32 s8, s8, (.LBB10_3-.Lpost_getpc16)&4294967295
1487; GFX12-NEXT:    s_add_co_ci_u32 s9, s9, (.LBB10_3-.Lpost_getpc16)>>32
1488; GFX12-NEXT:    s_wait_alu 0xfffe
1489; GFX12-NEXT:    s_setpc_b64 s[8:9]
1490; GFX12-NEXT:  .LBB10_6:
1491; GFX12-NEXT:    ; implicit-def: $vgpr0
1492; GFX12-NEXT:  .LBB10_7: ; %bb19
1493; GFX12-NEXT:    s_clause 0x1
1494; GFX12-NEXT:    s_load_b64 s[0:1], s[4:5], 0x3c
1495; GFX12-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
1496; GFX12-NEXT:    v_mov_b32_e32 v1, 0
1497; GFX12-NEXT:    s_wait_kmcnt 0x0
1498; GFX12-NEXT:    s_wait_alu 0xfffe
1499; GFX12-NEXT:    s_lshl_b64 s[0:1], s[0:1], 2
1500; GFX12-NEXT:    s_wait_alu 0xfffe
1501; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[2:3], s[0:1]
1502; GFX12-NEXT:    global_store_b32 v1, v0, s[0:1]
1503; GFX12-NEXT:    s_endpgm
1504bb:
  ; %tmp and %tmp6 are computed here but first used in %bb13 / %bb14.
1505  %tmp = icmp slt i32 %arg2, 9
1506  %tmp6 = icmp eq i32 %arg1, 0
1507  %tmp8 = icmp sgt i32 %arg4, 5
1508  br i1 %tmp8, label %bb9, label %bb13
1509
1510bb9:                                              ; preds = %bb
1511  %tmp7 = icmp sgt i32 %arg4, 10                  ; avoid being optimized away through the domination
1512  %tmp11 = icmp slt i32 %arg3, %arg4
1513  %tmp12 = or i1 %tmp11, %tmp7
1514  br i1 %tmp12, label %bb19, label %bb14
1515
1516bb13:                                             ; preds = %bb
  ; Four v_nop_e64 in one inline asm statement ahead of this block's branch.
1517  call void asm sideeffect
1518  "v_nop_e64
1519  v_nop_e64
1520  v_nop_e64
1521  v_nop_e64", ""() #0
1522  br i1 %tmp6, label %bb19, label %bb14
1523
1524bb14:                                             ; preds = %bb13, %bb9
  ; %tmp18 = zext(%tmp6 & ((%arg3 < %arg4) | %tmp)); the only defined input of
  ; the %tmp20 phi in %bb19.
1525  %tmp15 = icmp slt i32 %arg3, %arg4
1526  %tmp16 = or i1 %tmp15, %tmp
1527  %tmp17 = and i1 %tmp6, %tmp16
1528  %tmp18 = zext i1 %tmp17 to i32
1529  br label %bb19
1530
1531bb19:                                             ; preds = %bb14, %bb13, %bb9
1532  %tmp20 = phi i32 [ undef, %bb9 ], [ undef, %bb13 ], [ %tmp18, %bb14 ]
1533  %tmp21 = getelementptr inbounds i32, ptr addrspace(1) %arg, i64 %arg5
1534  store i32 %tmp20, ptr addrspace(1) %tmp21, align 4
1535  ret void
1536}
1537
; Attribute groups referenced by the declarations and definitions in this test:
; #0 (nounwind) is attached to the kernel definitions, the inline asm call
; sites and the llvm.amdgcn.s.sleep declaration; #1 (nounwind readnone) is
; attached to the llvm.amdgcn.workitem.id.x declaration.
1538attributes #0 = { nounwind }
1539attributes #1 = { nounwind readnone }
1540