xref: /llvm-project/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.reduce.umax.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=0 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX8DAGISEL %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -global-isel=1 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX8GISEL %s
4; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=0 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX9DAGISEL %s
5; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -global-isel=1 -verify-machineinstrs < %s | FileCheck  -check-prefixes=GFX9GISEL %s
6; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1064DAGISEL %s
7; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1064GISEL %s
8; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10DAGISEL,GFX1032DAGISEL %s
9; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10GISEL,GFX1032GISEL %s
10; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1164DAGISEL %s
11; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1164GISEL %s
12; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11DAGISEL,GFX1132DAGISEL %s
13; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11GISEL,GFX1132GISEL %s
14
; Intrinsics exercised by the kernels in this file. The reduction takes an i32
; value plus an i32 immediate; every call visible here passes 1 for that
; immediate (presumably the reduction-strategy selector -- TODO confirm against
; the AMDGPU backend documentation).
15declare i32 @llvm.amdgcn.wave.reduce.umax.i32(i32, i32 immarg)
16declare i32 @llvm.amdgcn.workitem.id.x()
17
; Reduces the kernel argument %in with wave.reduce.umax and stores the result
; to %out. The expected output for every target contains no loop, only the
; scalar loads of the two kernel arguments followed by a store of the loaded
; %in value, i.e. the reduction of this input is expected to fold to the input.
18define amdgpu_kernel void @uniform_value(ptr addrspace(1) %out, i32 %in) {
19; GFX8DAGISEL-LABEL: uniform_value:
20; GFX8DAGISEL:       ; %bb.0: ; %entry
21; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
22; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
23; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
24; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
25; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
26; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s2
27; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
28; GFX8DAGISEL-NEXT:    s_endpgm
29;
30; GFX8GISEL-LABEL: uniform_value:
31; GFX8GISEL:       ; %bb.0: ; %entry
32; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
33; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
34; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
35; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s2
36; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
37; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
38; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
39; GFX8GISEL-NEXT:    s_endpgm
40;
41; GFX9DAGISEL-LABEL: uniform_value:
42; GFX9DAGISEL:       ; %bb.0: ; %entry
43; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
44; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
45; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
46; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
47; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
48; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
49; GFX9DAGISEL-NEXT:    s_endpgm
50;
51; GFX9GISEL-LABEL: uniform_value:
52; GFX9GISEL:       ; %bb.0: ; %entry
53; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
54; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
55; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
56; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
57; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s2
58; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
59; GFX9GISEL-NEXT:    s_endpgm
60;
61; GFX10DAGISEL-LABEL: uniform_value:
62; GFX10DAGISEL:       ; %bb.0: ; %entry
63; GFX10DAGISEL-NEXT:    s_clause 0x1
64; GFX10DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
65; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
66; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
67; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
69; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
70; GFX10DAGISEL-NEXT:    s_endpgm
71;
72; GFX10GISEL-LABEL: uniform_value:
73; GFX10GISEL:       ; %bb.0: ; %entry
74; GFX10GISEL-NEXT:    s_clause 0x1
75; GFX10GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
76; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
77; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
78; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
79; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, s2
80; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
81; GFX10GISEL-NEXT:    s_endpgm
82;
83; GFX1164DAGISEL-LABEL: uniform_value:
84; GFX1164DAGISEL:       ; %bb.0: ; %entry
85; GFX1164DAGISEL-NEXT:    s_clause 0x1
86; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
87; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
88; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
89; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
90; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
91; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
92; GFX1164DAGISEL-NEXT:    s_endpgm
93;
94; GFX1164GISEL-LABEL: uniform_value:
95; GFX1164GISEL:       ; %bb.0: ; %entry
96; GFX1164GISEL-NEXT:    s_clause 0x1
97; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
98; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
99; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
100; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
101; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s2
102; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
103; GFX1164GISEL-NEXT:    s_endpgm
104;
105; GFX1132DAGISEL-LABEL: uniform_value:
106; GFX1132DAGISEL:       ; %bb.0: ; %entry
107; GFX1132DAGISEL-NEXT:    s_clause 0x1
108; GFX1132DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
109; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
110; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
111; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
112; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
113; GFX1132DAGISEL-NEXT:    s_endpgm
114;
115; GFX1132GISEL-LABEL: uniform_value:
116; GFX1132GISEL:       ; %bb.0: ; %entry
117; GFX1132GISEL-NEXT:    s_clause 0x1
118; GFX1132GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
119; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
120; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
121; GFX1132GISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
122; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
123; GFX1132GISEL-NEXT:    s_endpgm
124entry:
    ; The reduced value comes straight from a kernel argument.
125    %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
126    store i32 %result, ptr addrspace(1) %out
127    ret void
128}
129
; Reduces the literal constant 123 with wave.reduce.umax and stores the result
; to %out. The expected output for every target materialises 0x7b (123) with a
; plain move and stores it; no reduction loop is expected.
130define amdgpu_kernel void @const_value(ptr addrspace(1) %out) {
131; GFX8DAGISEL-LABEL: const_value:
132; GFX8DAGISEL:       ; %bb.0: ; %entry
133; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
134; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
135; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
136; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
137; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
138; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
139; GFX8DAGISEL-NEXT:    s_endpgm
140;
141; GFX8GISEL-LABEL: const_value:
142; GFX8GISEL:       ; %bb.0: ; %entry
143; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
144; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, 0x7b
145; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
146; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
147; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
148; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
149; GFX8GISEL-NEXT:    s_endpgm
150;
151; GFX9DAGISEL-LABEL: const_value:
152; GFX9DAGISEL:       ; %bb.0: ; %entry
153; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
154; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
155; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
156; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
157; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
158; GFX9DAGISEL-NEXT:    s_endpgm
159;
160; GFX9GISEL-LABEL: const_value:
161; GFX9GISEL:       ; %bb.0: ; %entry
162; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
163; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
164; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
165; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
166; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
167; GFX9GISEL-NEXT:    s_endpgm
168;
169; GFX10DAGISEL-LABEL: const_value:
170; GFX10DAGISEL:       ; %bb.0: ; %entry
171; GFX10DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
172; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
173; GFX10DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
174; GFX10DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
175; GFX10DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
176; GFX10DAGISEL-NEXT:    s_endpgm
177;
178; GFX10GISEL-LABEL: const_value:
179; GFX10GISEL:       ; %bb.0: ; %entry
180; GFX10GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
181; GFX10GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
182; GFX10GISEL-NEXT:    v_mov_b32_e32 v1, 0
183; GFX10GISEL-NEXT:    s_waitcnt lgkmcnt(0)
184; GFX10GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
185; GFX10GISEL-NEXT:    s_endpgm
186;
187; GFX1164DAGISEL-LABEL: const_value:
188; GFX1164DAGISEL:       ; %bb.0: ; %entry
189; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
190; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
191; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0x7b
192; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
193; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
194; GFX1164DAGISEL-NEXT:    s_endpgm
195;
196; GFX1164GISEL-LABEL: const_value:
197; GFX1164GISEL:       ; %bb.0: ; %entry
198; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
199; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, 0x7b
200; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
201; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
202; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
203; GFX1164GISEL-NEXT:    s_endpgm
204;
205; GFX1132DAGISEL-LABEL: const_value:
206; GFX1132DAGISEL:       ; %bb.0: ; %entry
207; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
208; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7b
209; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
210; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
211; GFX1132DAGISEL-NEXT:    s_endpgm
212;
213; GFX1132GISEL-LABEL: const_value:
214; GFX1132GISEL:       ; %bb.0: ; %entry
215; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
216; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
217; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
219; GFX1132GISEL-NEXT:    s_endpgm
220entry:
    ; 123 is the 0x7b immediate that the expected assembly stores.
221    %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 123, i32 1)
222    store i32 %result, ptr addrspace(1) %out
223    ret void
224}
225
; Reduces a poison operand with wave.reduce.umax and stores the result to %out.
; The expected output for every target is a lone s_endpgm, so neither the
; reduction nor the store is expected to survive code generation. The %in
; argument is not used by the body.
226define amdgpu_kernel void @poison_value(ptr addrspace(1) %out, i32 %in) {
227; GFX8DAGISEL-LABEL: poison_value:
228; GFX8DAGISEL:       ; %bb.0: ; %entry
229; GFX8DAGISEL-NEXT:    s_endpgm
230;
231; GFX8GISEL-LABEL: poison_value:
232; GFX8GISEL:       ; %bb.0: ; %entry
233; GFX8GISEL-NEXT:    s_endpgm
234;
235; GFX9DAGISEL-LABEL: poison_value:
236; GFX9DAGISEL:       ; %bb.0: ; %entry
237; GFX9DAGISEL-NEXT:    s_endpgm
238;
239; GFX9GISEL-LABEL: poison_value:
240; GFX9GISEL:       ; %bb.0: ; %entry
241; GFX9GISEL-NEXT:    s_endpgm
242;
243; GFX10DAGISEL-LABEL: poison_value:
244; GFX10DAGISEL:       ; %bb.0: ; %entry
245; GFX10DAGISEL-NEXT:    s_endpgm
246;
247; GFX10GISEL-LABEL: poison_value:
248; GFX10GISEL:       ; %bb.0: ; %entry
249; GFX10GISEL-NEXT:    s_endpgm
250;
251; GFX11DAGISEL-LABEL: poison_value:
252; GFX11DAGISEL:       ; %bb.0: ; %entry
253; GFX11DAGISEL-NEXT:    s_endpgm
254;
255; GFX11GISEL-LABEL: poison_value:
256; GFX11GISEL:       ; %bb.0: ; %entry
257; GFX11GISEL-NEXT:    s_endpgm
258entry:
259    %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 poison, i32 1)
260    store i32 %result, ptr addrspace(1) %out
261    ret void
262}
263
; Reduces the per-lane workitem id (x) with wave.reduce.umax and stores the
; result to %out. The %in argument is not used by the body.
;
; Expected code shape on every target is a scalar loop over a copy of the exec
; mask, with an accumulator SGPR initialised to 0. Each iteration
;   1. finds the lowest set bit of the mask copy (s_ff1 on GFX8/9/10, s_ctz on
;      GFX11),
;   2. reads v0 from that lane with v_readlane_b32,
;   3. clears the bit in the mask copy (s_bitset0),
;   4. folds the lane value into the accumulator with s_max_u32,
; and the loop repeats while the mask copy is non-zero. The accumulator is then
; moved to a VGPR and stored.
;
; The wave64 variants copy the full 64-bit exec into an SGPR pair; the
; GFX1032/GFX1132 variants copy exec_lo into a single SGPR and use the 32-bit
; forms of the same instructions. The GFX11 variants additionally mask v0 with
; 0x3ff before entering the loop.
264define amdgpu_kernel void @divergent_value(ptr addrspace(1) %out, i32 %in) {
265; GFX8DAGISEL-LABEL: divergent_value:
266; GFX8DAGISEL:       ; %bb.0: ; %entry
267; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
268; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
269; GFX8DAGISEL-NEXT:    s_mov_b32 s4, 0
270; GFX8DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
271; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
272; GFX8DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
273; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
274; GFX8DAGISEL-NEXT:    s_max_u32 s4, s4, s6
275; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
276; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
277; GFX8DAGISEL-NEXT:  ; %bb.2:
278; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
279; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v0, s0
280; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
281; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s4
282; GFX8DAGISEL-NEXT:    flat_store_dword v[0:1], v2
283; GFX8DAGISEL-NEXT:    s_endpgm
284;
285; GFX8GISEL-LABEL: divergent_value:
286; GFX8GISEL:       ; %bb.0: ; %entry
287; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
288; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
289; GFX8GISEL-NEXT:    s_mov_b32 s4, 0
290; GFX8GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
291; GFX8GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
292; GFX8GISEL-NEXT:    v_readlane_b32 s6, v0, s5
293; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
294; GFX8GISEL-NEXT:    s_max_u32 s4, s4, s6
295; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
296; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
297; GFX8GISEL-NEXT:  ; %bb.2:
298; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
299; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
300; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s4
301; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
302; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
303; GFX8GISEL-NEXT:    s_endpgm
304;
305; GFX9DAGISEL-LABEL: divergent_value:
306; GFX9DAGISEL:       ; %bb.0: ; %entry
307; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
308; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
309; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
310; GFX9DAGISEL-NEXT:    s_mov_b32 s4, 0
311; GFX9DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
312; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
313; GFX9DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
314; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
315; GFX9DAGISEL-NEXT:    s_max_u32 s4, s4, s6
316; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
317; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
318; GFX9DAGISEL-NEXT:  ; %bb.2:
319; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
320; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
321; GFX9DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
322; GFX9DAGISEL-NEXT:    s_endpgm
323;
324; GFX9GISEL-LABEL: divergent_value:
325; GFX9GISEL:       ; %bb.0: ; %entry
326; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
327; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
328; GFX9GISEL-NEXT:    s_mov_b32 s4, 0
329; GFX9GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
330; GFX9GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
331; GFX9GISEL-NEXT:    v_readlane_b32 s6, v0, s5
332; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
333; GFX9GISEL-NEXT:    s_max_u32 s4, s4, s6
334; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
335; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
336; GFX9GISEL-NEXT:  ; %bb.2:
337; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s4
338; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
339; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
340; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
341; GFX9GISEL-NEXT:    s_endpgm
342;
343; GFX1064DAGISEL-LABEL: divergent_value:
344; GFX1064DAGISEL:       ; %bb.0: ; %entry
345; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
346; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
347; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
348; GFX1064DAGISEL-NEXT:    s_mov_b32 s4, 0
349; GFX1064DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
350; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
351; GFX1064DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
352; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
353; GFX1064DAGISEL-NEXT:    s_max_u32 s4, s4, s6
354; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
355; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
356; GFX1064DAGISEL-NEXT:  ; %bb.2:
357; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
358; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
359; GFX1064DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
360; GFX1064DAGISEL-NEXT:    s_endpgm
361;
362; GFX1064GISEL-LABEL: divergent_value:
363; GFX1064GISEL:       ; %bb.0: ; %entry
364; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
365; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
366; GFX1064GISEL-NEXT:    s_mov_b32 s4, 0
367; GFX1064GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
368; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s5, s[2:3]
369; GFX1064GISEL-NEXT:    v_readlane_b32 s6, v0, s5
370; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
371; GFX1064GISEL-NEXT:    s_max_u32 s4, s4, s6
372; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
373; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
374; GFX1064GISEL-NEXT:  ; %bb.2:
375; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s4
376; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
377; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
378; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
379; GFX1064GISEL-NEXT:    s_endpgm
380;
381; GFX1032DAGISEL-LABEL: divergent_value:
382; GFX1032DAGISEL:       ; %bb.0: ; %entry
383; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
384; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
385; GFX1032DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
386; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, 0
387; GFX1032DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
388; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s4, s3
389; GFX1032DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
390; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s3, s4
391; GFX1032DAGISEL-NEXT:    s_max_u32 s2, s2, s5
392; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
393; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
394; GFX1032DAGISEL-NEXT:  ; %bb.2:
395; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
396; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
397; GFX1032DAGISEL-NEXT:    global_store_dword v1, v0, s[0:1]
398; GFX1032DAGISEL-NEXT:    s_endpgm
399;
400; GFX1032GISEL-LABEL: divergent_value:
401; GFX1032GISEL:       ; %bb.0: ; %entry
402; GFX1032GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
403; GFX1032GISEL-NEXT:    s_mov_b32 s3, exec_lo
404; GFX1032GISEL-NEXT:    s_mov_b32 s2, 0
405; GFX1032GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
406; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s4, s3
407; GFX1032GISEL-NEXT:    v_readlane_b32 s5, v0, s4
408; GFX1032GISEL-NEXT:    s_bitset0_b32 s3, s4
409; GFX1032GISEL-NEXT:    s_max_u32 s2, s2, s5
410; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s3, 0
411; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
412; GFX1032GISEL-NEXT:  ; %bb.2:
413; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s2
414; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
415; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
416; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
417; GFX1032GISEL-NEXT:    s_endpgm
418;
419; GFX1164DAGISEL-LABEL: divergent_value:
420; GFX1164DAGISEL:       ; %bb.0: ; %entry
421; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
422; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, 0
423; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
424; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
425; GFX1164DAGISEL-NEXT:    s_mov_b32 s4, 0
426; GFX1164DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
427; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
428; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
429; GFX1164DAGISEL-NEXT:    v_readlane_b32 s6, v0, s5
430; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s5
431; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
432; GFX1164DAGISEL-NEXT:    s_max_u32 s4, s4, s6
433; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
434; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
435; GFX1164DAGISEL-NEXT:  ; %bb.2:
436; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, s4
437; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
438; GFX1164DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
439; GFX1164DAGISEL-NEXT:    s_endpgm
440;
441; GFX1164GISEL-LABEL: divergent_value:
442; GFX1164GISEL:       ; %bb.0: ; %entry
443; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
444; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
445; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
446; GFX1164GISEL-NEXT:    s_mov_b32 s4, 0
447; GFX1164GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
448; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s5, s[2:3]
449; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
450; GFX1164GISEL-NEXT:    v_readlane_b32 s6, v0, s5
451; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s5
452; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
453; GFX1164GISEL-NEXT:    s_max_u32 s4, s4, s6
454; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
455; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
456; GFX1164GISEL-NEXT:  ; %bb.2:
457; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s4
458; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
459; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
460; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
461; GFX1164GISEL-NEXT:    s_endpgm
462;
463; GFX1132DAGISEL-LABEL: divergent_value:
464; GFX1132DAGISEL:       ; %bb.0: ; %entry
465; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
466; GFX1132DAGISEL-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
467; GFX1132DAGISEL-NEXT:    s_mov_b32 s3, exec_lo
468; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, 0
469; GFX1132DAGISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
470; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s4, s3
471; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
472; GFX1132DAGISEL-NEXT:    v_readlane_b32 s5, v0, s4
473; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s3, s4
474; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
475; GFX1132DAGISEL-NEXT:    s_max_u32 s2, s2, s5
476; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s3, 0
477; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB3_1
478; GFX1132DAGISEL-NEXT:  ; %bb.2:
479; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, s2
480; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX1132DAGISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
482; GFX1132DAGISEL-NEXT:    s_endpgm
483;
484; GFX1132GISEL-LABEL: divergent_value:
485; GFX1132GISEL:       ; %bb.0: ; %entry
486; GFX1132GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
487; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
488; GFX1132GISEL-NEXT:    s_mov_b32 s3, exec_lo
489; GFX1132GISEL-NEXT:    s_mov_b32 s2, 0
490; GFX1132GISEL-NEXT:  .LBB3_1: ; =>This Inner Loop Header: Depth=1
491; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s4, s3
492; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
493; GFX1132GISEL-NEXT:    v_readlane_b32 s5, v0, s4
494; GFX1132GISEL-NEXT:    s_bitset0_b32 s3, s4
495; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
496; GFX1132GISEL-NEXT:    s_max_u32 s2, s2, s5
497; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s3, 0
498; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB3_1
499; GFX1132GISEL-NEXT:  ; %bb.2:
500; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
501; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
502; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
503; GFX1132GISEL-NEXT:    s_endpgm
504entry:
    ; The workitem id differs per lane, so the reduction input is divergent.
505    %id.x = call i32 @llvm.amdgcn.workitem.id.x()
506    %result = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %id.x, i32 1)
507    store i32 %result, ptr addrspace(1) %out
508    ret void
509}
510
511define amdgpu_kernel void @divergent_cfg(ptr addrspace(1) %out, i32 %in) {
512; GFX8DAGISEL-LABEL: divergent_cfg:
513; GFX8DAGISEL:       ; %bb.0: ; %entry
514; GFX8DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
515; GFX8DAGISEL-NEXT:    ; implicit-def: $sgpr2
516; GFX8DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
517; GFX8DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
518; GFX8DAGISEL-NEXT:  ; %bb.1: ; %else
519; GFX8DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
520; GFX8DAGISEL-NEXT:    ; implicit-def: $vgpr0
521; GFX8DAGISEL-NEXT:  ; %bb.2: ; %Flow
522; GFX8DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
523; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
524; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
525; GFX8DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
526; GFX8DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
527; GFX8DAGISEL-NEXT:  ; %bb.3: ; %if
528; GFX8DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
529; GFX8DAGISEL-NEXT:    s_mov_b32 s6, 0
530; GFX8DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
531; GFX8DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
532; GFX8DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
533; GFX8DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
534; GFX8DAGISEL-NEXT:    s_max_u32 s6, s6, s8
535; GFX8DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
536; GFX8DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
537; GFX8DAGISEL-NEXT:  ; %bb.5:
538; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
539; GFX8DAGISEL-NEXT:  .LBB4_6: ; %endif
540; GFX8DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
541; GFX8DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
542; GFX8DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
543; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v3, s1
544; GFX8DAGISEL-NEXT:    v_mov_b32_e32 v2, s0
545; GFX8DAGISEL-NEXT:    flat_store_dword v[2:3], v1
546; GFX8DAGISEL-NEXT:    s_endpgm
547;
548; GFX8GISEL-LABEL: divergent_cfg:
549; GFX8GISEL:       ; %bb.0: ; %entry
550; GFX8GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
551; GFX8GISEL-NEXT:    ; implicit-def: $sgpr6
552; GFX8GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
553; GFX8GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
554; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_2
555; GFX8GISEL-NEXT:  ; %bb.1: ; %else
556; GFX8GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
557; GFX8GISEL-NEXT:    ; implicit-def: $vgpr0
558; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
559; GFX8GISEL-NEXT:    s_mov_b32 s6, s2
560; GFX8GISEL-NEXT:  .LBB4_2: ; %Flow
561; GFX8GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
562; GFX8GISEL-NEXT:    s_cbranch_execz .LBB4_5
563; GFX8GISEL-NEXT:  ; %bb.3: ; %if
564; GFX8GISEL-NEXT:    s_mov_b64 s[2:3], exec
565; GFX8GISEL-NEXT:    s_mov_b32 s6, 0
566; GFX8GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
567; GFX8GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
568; GFX8GISEL-NEXT:    v_readlane_b32 s8, v0, s7
569; GFX8GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
570; GFX8GISEL-NEXT:    s_max_u32 s6, s6, s8
571; GFX8GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
572; GFX8GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
573; GFX8GISEL-NEXT:  .LBB4_5: ; %endif
574; GFX8GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
575; GFX8GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
576; GFX8GISEL-NEXT:    v_mov_b32_e32 v2, s6
577; GFX8GISEL-NEXT:    s_waitcnt lgkmcnt(0)
578; GFX8GISEL-NEXT:    v_mov_b32_e32 v0, s0
579; GFX8GISEL-NEXT:    v_mov_b32_e32 v1, s1
580; GFX8GISEL-NEXT:    flat_store_dword v[0:1], v2
581; GFX8GISEL-NEXT:    s_endpgm
582;
583; GFX9DAGISEL-LABEL: divergent_cfg:
584; GFX9DAGISEL:       ; %bb.0: ; %entry
585; GFX9DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
586; GFX9DAGISEL-NEXT:    ; implicit-def: $sgpr2
587; GFX9DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
588; GFX9DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
589; GFX9DAGISEL-NEXT:  ; %bb.1: ; %else
590; GFX9DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
591; GFX9DAGISEL-NEXT:    ; implicit-def: $vgpr0
592; GFX9DAGISEL-NEXT:  ; %bb.2: ; %Flow
593; GFX9DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
594; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
595; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
596; GFX9DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
597; GFX9DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
598; GFX9DAGISEL-NEXT:  ; %bb.3: ; %if
599; GFX9DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
600; GFX9DAGISEL-NEXT:    s_mov_b32 s6, 0
601; GFX9DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
602; GFX9DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
603; GFX9DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
604; GFX9DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
605; GFX9DAGISEL-NEXT:    s_max_u32 s6, s6, s8
606; GFX9DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
607; GFX9DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
608; GFX9DAGISEL-NEXT:  ; %bb.5:
609; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
610; GFX9DAGISEL-NEXT:  .LBB4_6: ; %endif
611; GFX9DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
612; GFX9DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
613; GFX9DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
614; GFX9DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
615; GFX9DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
616; GFX9DAGISEL-NEXT:    s_endpgm
617;
618; GFX9GISEL-LABEL: divergent_cfg:
619; GFX9GISEL:       ; %bb.0: ; %entry
620; GFX9GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
621; GFX9GISEL-NEXT:    ; implicit-def: $sgpr6
622; GFX9GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
623; GFX9GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
624; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_2
625; GFX9GISEL-NEXT:  ; %bb.1: ; %else
626; GFX9GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
627; GFX9GISEL-NEXT:    ; implicit-def: $vgpr0
628; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
629; GFX9GISEL-NEXT:    s_mov_b32 s6, s2
630; GFX9GISEL-NEXT:  .LBB4_2: ; %Flow
631; GFX9GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
632; GFX9GISEL-NEXT:    s_cbranch_execz .LBB4_5
633; GFX9GISEL-NEXT:  ; %bb.3: ; %if
634; GFX9GISEL-NEXT:    s_mov_b64 s[2:3], exec
635; GFX9GISEL-NEXT:    s_mov_b32 s6, 0
636; GFX9GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
637; GFX9GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
638; GFX9GISEL-NEXT:    v_readlane_b32 s8, v0, s7
639; GFX9GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
640; GFX9GISEL-NEXT:    s_max_u32 s6, s6, s8
641; GFX9GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
642; GFX9GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
643; GFX9GISEL-NEXT:  .LBB4_5: ; %endif
644; GFX9GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
645; GFX9GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
646; GFX9GISEL-NEXT:    v_mov_b32_e32 v0, s6
647; GFX9GISEL-NEXT:    v_mov_b32_e32 v1, 0
648; GFX9GISEL-NEXT:    s_waitcnt lgkmcnt(0)
649; GFX9GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
650; GFX9GISEL-NEXT:    s_endpgm
651;
652; GFX1064DAGISEL-LABEL: divergent_cfg:
653; GFX1064DAGISEL:       ; %bb.0: ; %entry
654; GFX1064DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc, 15, v0
655; GFX1064DAGISEL-NEXT:    ; implicit-def: $sgpr2
656; GFX1064DAGISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
657; GFX1064DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
658; GFX1064DAGISEL-NEXT:  ; %bb.1: ; %else
659; GFX1064DAGISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
660; GFX1064DAGISEL-NEXT:    ; implicit-def: $vgpr0
661; GFX1064DAGISEL-NEXT:  ; %bb.2: ; %Flow
662; GFX1064DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
663; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
664; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
665; GFX1064DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
666; GFX1064DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
667; GFX1064DAGISEL-NEXT:  ; %bb.3: ; %if
668; GFX1064DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
669; GFX1064DAGISEL-NEXT:    s_mov_b32 s6, 0
670; GFX1064DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
671; GFX1064DAGISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
672; GFX1064DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
673; GFX1064DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
674; GFX1064DAGISEL-NEXT:    s_max_u32 s6, s6, s8
675; GFX1064DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
676; GFX1064DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
677; GFX1064DAGISEL-NEXT:  ; %bb.5:
678; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
679; GFX1064DAGISEL-NEXT:  .LBB4_6: ; %endif
680; GFX1064DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
681; GFX1064DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
682; GFX1064DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
683; GFX1064DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
684; GFX1064DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
685; GFX1064DAGISEL-NEXT:    s_endpgm
686;
687; GFX1064GISEL-LABEL: divergent_cfg:
688; GFX1064GISEL:       ; %bb.0: ; %entry
689; GFX1064GISEL-NEXT:    v_cmp_le_u32_e32 vcc, 16, v0
690; GFX1064GISEL-NEXT:    ; implicit-def: $sgpr6
691; GFX1064GISEL-NEXT:    s_and_saveexec_b64 s[0:1], vcc
692; GFX1064GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
693; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_2
694; GFX1064GISEL-NEXT:  ; %bb.1: ; %else
695; GFX1064GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
696; GFX1064GISEL-NEXT:    ; implicit-def: $vgpr0
697; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
698; GFX1064GISEL-NEXT:    s_mov_b32 s6, s2
699; GFX1064GISEL-NEXT:  .LBB4_2: ; %Flow
700; GFX1064GISEL-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
701; GFX1064GISEL-NEXT:    s_cbranch_execz .LBB4_5
702; GFX1064GISEL-NEXT:  ; %bb.3: ; %if
703; GFX1064GISEL-NEXT:    s_mov_b64 s[2:3], exec
704; GFX1064GISEL-NEXT:    s_mov_b32 s6, 0
705; GFX1064GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
706; GFX1064GISEL-NEXT:    s_ff1_i32_b64 s7, s[2:3]
707; GFX1064GISEL-NEXT:    v_readlane_b32 s8, v0, s7
708; GFX1064GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
709; GFX1064GISEL-NEXT:    s_max_u32 s6, s6, s8
710; GFX1064GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
711; GFX1064GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
712; GFX1064GISEL-NEXT:  .LBB4_5: ; %endif
713; GFX1064GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
714; GFX1064GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
715; GFX1064GISEL-NEXT:    v_mov_b32_e32 v0, s6
716; GFX1064GISEL-NEXT:    v_mov_b32_e32 v1, 0
717; GFX1064GISEL-NEXT:    s_waitcnt lgkmcnt(0)
718; GFX1064GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
719; GFX1064GISEL-NEXT:    s_endpgm
720;
721; GFX1032DAGISEL-LABEL: divergent_cfg:
722; GFX1032DAGISEL:       ; %bb.0: ; %entry
723; GFX1032DAGISEL-NEXT:    v_cmp_lt_u32_e32 vcc_lo, 15, v0
724; GFX1032DAGISEL-NEXT:    ; implicit-def: $sgpr1
725; GFX1032DAGISEL-NEXT:    s_and_saveexec_b32 s0, vcc_lo
726; GFX1032DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
727; GFX1032DAGISEL-NEXT:  ; %bb.1: ; %else
728; GFX1032DAGISEL-NEXT:    s_load_dword s1, s[4:5], 0x2c
729; GFX1032DAGISEL-NEXT:    ; implicit-def: $vgpr0
730; GFX1032DAGISEL-NEXT:  ; %bb.2: ; %Flow
731; GFX1032DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
732; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
733; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
734; GFX1032DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
735; GFX1032DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
736; GFX1032DAGISEL-NEXT:  ; %bb.3: ; %if
737; GFX1032DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
738; GFX1032DAGISEL-NEXT:    s_mov_b32 s1, 0
739; GFX1032DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
740; GFX1032DAGISEL-NEXT:    s_ff1_i32_b32 s3, s2
741; GFX1032DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
742; GFX1032DAGISEL-NEXT:    s_bitset0_b32 s2, s3
743; GFX1032DAGISEL-NEXT:    s_max_u32 s1, s1, s6
744; GFX1032DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
745; GFX1032DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
746; GFX1032DAGISEL-NEXT:  ; %bb.5:
747; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
748; GFX1032DAGISEL-NEXT:  .LBB4_6: ; %endif
749; GFX1032DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
750; GFX1032DAGISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
751; GFX1032DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
752; GFX1032DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
753; GFX1032DAGISEL-NEXT:    global_store_dword v0, v1, s[0:1]
754; GFX1032DAGISEL-NEXT:    s_endpgm
755;
756; GFX1032GISEL-LABEL: divergent_cfg:
757; GFX1032GISEL:       ; %bb.0: ; %entry
758; GFX1032GISEL-NEXT:    v_cmp_le_u32_e32 vcc_lo, 16, v0
759; GFX1032GISEL-NEXT:    ; implicit-def: $sgpr0
760; GFX1032GISEL-NEXT:    s_and_saveexec_b32 s1, vcc_lo
761; GFX1032GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
762; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_2
763; GFX1032GISEL-NEXT:  ; %bb.1: ; %else
764; GFX1032GISEL-NEXT:    s_load_dword s0, s[4:5], 0x2c
765; GFX1032GISEL-NEXT:    ; implicit-def: $vgpr0
766; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
767; GFX1032GISEL-NEXT:    s_mov_b32 s0, s0
768; GFX1032GISEL-NEXT:  .LBB4_2: ; %Flow
769; GFX1032GISEL-NEXT:    s_andn2_saveexec_b32 s1, s1
770; GFX1032GISEL-NEXT:    s_cbranch_execz .LBB4_5
771; GFX1032GISEL-NEXT:  ; %bb.3: ; %if
772; GFX1032GISEL-NEXT:    s_mov_b32 s2, exec_lo
773; GFX1032GISEL-NEXT:    s_mov_b32 s0, 0
774; GFX1032GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
775; GFX1032GISEL-NEXT:    s_ff1_i32_b32 s3, s2
776; GFX1032GISEL-NEXT:    v_readlane_b32 s6, v0, s3
777; GFX1032GISEL-NEXT:    s_bitset0_b32 s2, s3
778; GFX1032GISEL-NEXT:    s_max_u32 s0, s0, s6
779; GFX1032GISEL-NEXT:    s_cmp_lg_u32 s2, 0
780; GFX1032GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
781; GFX1032GISEL-NEXT:  .LBB4_5: ; %endif
782; GFX1032GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
783; GFX1032GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
784; GFX1032GISEL-NEXT:    v_mov_b32_e32 v0, s0
785; GFX1032GISEL-NEXT:    v_mov_b32_e32 v1, 0
786; GFX1032GISEL-NEXT:    s_waitcnt lgkmcnt(0)
787; GFX1032GISEL-NEXT:    global_store_dword v1, v0, s[2:3]
788; GFX1032GISEL-NEXT:    s_endpgm
789;
790; GFX1164DAGISEL-LABEL: divergent_cfg:
791; GFX1164DAGISEL:       ; %bb.0: ; %entry
792; GFX1164DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
793; GFX1164DAGISEL-NEXT:    s_mov_b64 s[0:1], exec
794; GFX1164DAGISEL-NEXT:    ; implicit-def: $sgpr2
795; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
796; GFX1164DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
797; GFX1164DAGISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
798; GFX1164DAGISEL-NEXT:  ; %bb.1: ; %else
799; GFX1164DAGISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
800; GFX1164DAGISEL-NEXT:    ; implicit-def: $vgpr0
801; GFX1164DAGISEL-NEXT:  ; %bb.2: ; %Flow
802; GFX1164DAGISEL-NEXT:    s_or_saveexec_b64 s[0:1], s[0:1]
803; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
804; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s2
805; GFX1164DAGISEL-NEXT:    s_xor_b64 exec, exec, s[0:1]
806; GFX1164DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
807; GFX1164DAGISEL-NEXT:  ; %bb.3: ; %if
808; GFX1164DAGISEL-NEXT:    s_mov_b64 s[2:3], exec
809; GFX1164DAGISEL-NEXT:    s_mov_b32 s6, 0
810; GFX1164DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
811; GFX1164DAGISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
812; GFX1164DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
813; GFX1164DAGISEL-NEXT:    v_readlane_b32 s8, v0, s7
814; GFX1164DAGISEL-NEXT:    s_bitset0_b64 s[2:3], s7
815; GFX1164DAGISEL-NEXT:    s_max_u32 s6, s6, s8
816; GFX1164DAGISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
817; GFX1164DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
818; GFX1164DAGISEL-NEXT:  ; %bb.5:
819; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v1, s6
820; GFX1164DAGISEL-NEXT:  .LBB4_6: ; %endif
821; GFX1164DAGISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
822; GFX1164DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
823; GFX1164DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
824; GFX1164DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
825; GFX1164DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
826; GFX1164DAGISEL-NEXT:    s_endpgm
827;
828; GFX1164GISEL-LABEL: divergent_cfg:
829; GFX1164GISEL:       ; %bb.0: ; %entry
830; GFX1164GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
831; GFX1164GISEL-NEXT:    s_mov_b64 s[0:1], exec
832; GFX1164GISEL-NEXT:    ; implicit-def: $sgpr6
833; GFX1164GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
834; GFX1164GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
835; GFX1164GISEL-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
836; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_2
837; GFX1164GISEL-NEXT:  ; %bb.1: ; %else
838; GFX1164GISEL-NEXT:    s_load_b32 s2, s[4:5], 0x2c
839; GFX1164GISEL-NEXT:    ; implicit-def: $vgpr0
840; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
841; GFX1164GISEL-NEXT:    s_mov_b32 s6, s2
842; GFX1164GISEL-NEXT:  .LBB4_2: ; %Flow
843; GFX1164GISEL-NEXT:    s_and_not1_saveexec_b64 s[0:1], s[0:1]
844; GFX1164GISEL-NEXT:    s_cbranch_execz .LBB4_5
845; GFX1164GISEL-NEXT:  ; %bb.3: ; %if
846; GFX1164GISEL-NEXT:    s_mov_b64 s[2:3], exec
847; GFX1164GISEL-NEXT:    s_mov_b32 s6, 0
848; GFX1164GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
849; GFX1164GISEL-NEXT:    s_ctz_i32_b64 s7, s[2:3]
850; GFX1164GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
851; GFX1164GISEL-NEXT:    v_readlane_b32 s8, v0, s7
852; GFX1164GISEL-NEXT:    s_bitset0_b64 s[2:3], s7
853; GFX1164GISEL-NEXT:    s_max_u32 s6, s6, s8
854; GFX1164GISEL-NEXT:    s_cmp_lg_u64 s[2:3], 0
855; GFX1164GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
856; GFX1164GISEL-NEXT:  .LBB4_5: ; %endif
857; GFX1164GISEL-NEXT:    s_or_b64 exec, exec, s[0:1]
858; GFX1164GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
859; GFX1164GISEL-NEXT:    v_mov_b32_e32 v0, s6
860; GFX1164GISEL-NEXT:    v_mov_b32_e32 v1, 0
861; GFX1164GISEL-NEXT:    s_waitcnt lgkmcnt(0)
862; GFX1164GISEL-NEXT:    global_store_b32 v1, v0, s[0:1]
863; GFX1164GISEL-NEXT:    s_endpgm
864;
865; GFX1132DAGISEL-LABEL: divergent_cfg:
866; GFX1132DAGISEL:       ; %bb.0: ; %entry
867; GFX1132DAGISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
868; GFX1132DAGISEL-NEXT:    s_mov_b32 s0, exec_lo
869; GFX1132DAGISEL-NEXT:    ; implicit-def: $sgpr1
870; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
871; GFX1132DAGISEL-NEXT:    v_cmpx_lt_u32_e32 15, v0
872; GFX1132DAGISEL-NEXT:    s_xor_b32 s0, exec_lo, s0
873; GFX1132DAGISEL-NEXT:  ; %bb.1: ; %else
874; GFX1132DAGISEL-NEXT:    s_load_b32 s1, s[4:5], 0x2c
875; GFX1132DAGISEL-NEXT:    ; implicit-def: $vgpr0
876; GFX1132DAGISEL-NEXT:  ; %bb.2: ; %Flow
877; GFX1132DAGISEL-NEXT:    s_or_saveexec_b32 s0, s0
878; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
879; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
880; GFX1132DAGISEL-NEXT:    s_xor_b32 exec_lo, exec_lo, s0
881; GFX1132DAGISEL-NEXT:    s_cbranch_execz .LBB4_6
882; GFX1132DAGISEL-NEXT:  ; %bb.3: ; %if
883; GFX1132DAGISEL-NEXT:    s_mov_b32 s2, exec_lo
884; GFX1132DAGISEL-NEXT:    s_mov_b32 s1, 0
885; GFX1132DAGISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
886; GFX1132DAGISEL-NEXT:    s_ctz_i32_b32 s3, s2
887; GFX1132DAGISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
888; GFX1132DAGISEL-NEXT:    v_readlane_b32 s6, v0, s3
889; GFX1132DAGISEL-NEXT:    s_bitset0_b32 s2, s3
890; GFX1132DAGISEL-NEXT:    s_max_u32 s1, s1, s6
891; GFX1132DAGISEL-NEXT:    s_cmp_lg_u32 s2, 0
892; GFX1132DAGISEL-NEXT:    s_cbranch_scc1 .LBB4_4
893; GFX1132DAGISEL-NEXT:  ; %bb.5:
894; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v1, s1
895; GFX1132DAGISEL-NEXT:  .LBB4_6: ; %endif
896; GFX1132DAGISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s0
897; GFX1132DAGISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
898; GFX1132DAGISEL-NEXT:    v_mov_b32_e32 v0, 0
899; GFX1132DAGISEL-NEXT:    s_waitcnt lgkmcnt(0)
900; GFX1132DAGISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
901; GFX1132DAGISEL-NEXT:    s_endpgm
902;
903; GFX1132GISEL-LABEL: divergent_cfg:
904; GFX1132GISEL:       ; %bb.0: ; %entry
905; GFX1132GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
906; GFX1132GISEL-NEXT:    s_mov_b32 s1, exec_lo
907; GFX1132GISEL-NEXT:    ; implicit-def: $sgpr0
908; GFX1132GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
909; GFX1132GISEL-NEXT:    v_cmpx_le_u32_e32 16, v0
910; GFX1132GISEL-NEXT:    s_xor_b32 s1, exec_lo, s1
911; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_2
912; GFX1132GISEL-NEXT:  ; %bb.1: ; %else
913; GFX1132GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x2c
914; GFX1132GISEL-NEXT:    ; implicit-def: $vgpr0
915; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
916; GFX1132GISEL-NEXT:    s_mov_b32 s0, s0
917; GFX1132GISEL-NEXT:  .LBB4_2: ; %Flow
918; GFX1132GISEL-NEXT:    s_and_not1_saveexec_b32 s1, s1
919; GFX1132GISEL-NEXT:    s_cbranch_execz .LBB4_5
920; GFX1132GISEL-NEXT:  ; %bb.3: ; %if
921; GFX1132GISEL-NEXT:    s_mov_b32 s2, exec_lo
922; GFX1132GISEL-NEXT:    s_mov_b32 s0, 0
923; GFX1132GISEL-NEXT:  .LBB4_4: ; =>This Inner Loop Header: Depth=1
924; GFX1132GISEL-NEXT:    s_ctz_i32_b32 s3, s2
925; GFX1132GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
926; GFX1132GISEL-NEXT:    v_readlane_b32 s6, v0, s3
927; GFX1132GISEL-NEXT:    s_bitset0_b32 s2, s3
928; GFX1132GISEL-NEXT:    s_max_u32 s0, s0, s6
929; GFX1132GISEL-NEXT:    s_cmp_lg_u32 s2, 0
930; GFX1132GISEL-NEXT:    s_cbranch_scc1 .LBB4_4
931; GFX1132GISEL-NEXT:  .LBB4_5: ; %endif
932; GFX1132GISEL-NEXT:    s_or_b32 exec_lo, exec_lo, s1
933; GFX1132GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
934; GFX1132GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
935; GFX1132GISEL-NEXT:    s_waitcnt lgkmcnt(0)
936; GFX1132GISEL-NEXT:    global_store_b32 v1, v0, s[2:3]
937; GFX1132GISEL-NEXT:    s_endpgm
938entry:
939  %tid = call i32 @llvm.amdgcn.workitem.id.x()
940  %d_cmp = icmp ult i32 %tid, 16
941  br i1 %d_cmp, label %if, label %else
942
943if:
944  %reducedValTid = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %tid, i32 1)
945  br label %endif
946
947else:
948  %reducedValIn = call i32 @llvm.amdgcn.wave.reduce.umax.i32(i32 %in, i32 1)
949  br label %endif
950
951endif:
952  %combine = phi i32 [%reducedValTid, %if], [%reducedValIn, %else]
953  store i32 %combine, ptr addrspace(1) %out
954  ret void
955}
956