xref: /llvm-project/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
6
; Declarations of the llvm.ctlz overloads (scalar and vector, several bit
; widths) available to the test functions in this file. Each takes the value
; plus an i1 flag as its second operand.
7declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
8declare <2 x i7> @llvm.ctlz.v2i7(<2 x i7>, i1) nounwind readnone
9declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
10declare <2 x i8> @llvm.ctlz.v2i8(<2 x i8>, i1) nounwind readnone
11
12declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
13declare i18 @llvm.ctlz.i18(i18, i1) nounwind readnone
14
15declare <2 x i16> @llvm.ctlz.v2i16(<2 x i16>, i1) nounwind readnone
16declare <3 x i16> @llvm.ctlz.v3i16(<3 x i16>, i1) nounwind readnone
17declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>, i1) nounwind readnone
18
19declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
20declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
21declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
22
23declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
24declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
25declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
26
; Work-item id intrinsic; the v_* tests use it to index their input pointer.
27declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
28
; Kernel-argument input: %val is an i32 kernel argument passed to
; llvm.ctlz.i32 with the i1 flag set to true; the result is stored to %out.
; The amdgcn check blocks expect a single s_flbit_i32_b32, the r600 block a
; single FFBH_UINT, with no zero-input handling around it.
29define amdgpu_kernel void @s_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
30; SI-LABEL: s_ctlz_zero_undef_i32:
31; SI:       ; %bb.0:
32; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
33; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
34; SI-NEXT:    s_mov_b32 s3, 0xf000
35; SI-NEXT:    s_waitcnt lgkmcnt(0)
36; SI-NEXT:    s_flbit_i32_b32 s4, s2
37; SI-NEXT:    s_mov_b32 s2, -1
38; SI-NEXT:    v_mov_b32_e32 v0, s4
39; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
40; SI-NEXT:    s_endpgm
41;
42; VI-LABEL: s_ctlz_zero_undef_i32:
43; VI:       ; %bb.0:
44; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
45; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
46; VI-NEXT:    s_waitcnt lgkmcnt(0)
47; VI-NEXT:    s_flbit_i32_b32 s2, s2
48; VI-NEXT:    v_mov_b32_e32 v0, s0
49; VI-NEXT:    v_mov_b32_e32 v1, s1
50; VI-NEXT:    v_mov_b32_e32 v2, s2
51; VI-NEXT:    flat_store_dword v[0:1], v2
52; VI-NEXT:    s_endpgm
53;
54; EG-LABEL: s_ctlz_zero_undef_i32:
55; EG:       ; %bb.0:
56; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
57; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
58; EG-NEXT:    CF_END
59; EG-NEXT:    PAD
60; EG-NEXT:    ALU clause starting at 4:
61; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
62; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
63; EG-NEXT:     FFBH_UINT * T1.X, KC0[2].Z,
64;
65; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32:
66; GFX9-GISEL:       ; %bb.0:
67; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
68; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
69; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
70; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
71; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
72; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
73; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
74; GFX9-GISEL-NEXT:    s_endpgm
; The i1 true flag is the "zero is poison" operand described in the LLVM
; LangRef entry for llvm.ctlz.
75  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
76  store i32 %ctlz, ptr addrspace(1) %out, align 4
77  ret void
78}
79
; Per-work-item input: each work item loads one i32 from %valptr at the index
; returned by llvm.amdgcn.workitem.id.x, applies llvm.ctlz.i32 with the i1
; flag set to true, and stores the result to %out. The amdgcn check blocks
; expect one v_ffbh_u32_e32; the r600 block expects one FFBH_UINT.
80define amdgpu_kernel void @v_ctlz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
81; SI-LABEL: v_ctlz_zero_undef_i32:
82; SI:       ; %bb.0:
83; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
84; SI-NEXT:    s_mov_b32 s7, 0xf000
85; SI-NEXT:    s_mov_b32 s10, 0
86; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
87; SI-NEXT:    v_mov_b32_e32 v1, 0
88; SI-NEXT:    s_mov_b32 s11, s7
89; SI-NEXT:    s_waitcnt lgkmcnt(0)
90; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
91; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
92; SI-NEXT:    s_mov_b32 s6, -1
93; SI-NEXT:    s_mov_b32 s4, s0
94; SI-NEXT:    s_mov_b32 s5, s1
95; SI-NEXT:    s_waitcnt vmcnt(0)
96; SI-NEXT:    v_ffbh_u32_e32 v0, v0
97; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
98; SI-NEXT:    s_endpgm
99;
100; VI-LABEL: v_ctlz_zero_undef_i32:
101; VI:       ; %bb.0:
102; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
103; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
104; VI-NEXT:    s_waitcnt lgkmcnt(0)
105; VI-NEXT:    v_mov_b32_e32 v1, s3
106; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
107; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
108; VI-NEXT:    flat_load_dword v0, v[0:1]
109; VI-NEXT:    s_waitcnt vmcnt(0)
110; VI-NEXT:    v_ffbh_u32_e32 v2, v0
111; VI-NEXT:    v_mov_b32_e32 v0, s0
112; VI-NEXT:    v_mov_b32_e32 v1, s1
113; VI-NEXT:    flat_store_dword v[0:1], v2
114; VI-NEXT:    s_endpgm
115;
116; EG-LABEL: v_ctlz_zero_undef_i32:
117; EG:       ; %bb.0:
118; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
119; EG-NEXT:    TEX 0 @6
120; EG-NEXT:    ALU 2, @11, KC0[CB0:0-32], KC1[]
121; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
122; EG-NEXT:    CF_END
123; EG-NEXT:    PAD
124; EG-NEXT:    Fetch clause starting at 6:
125; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
126; EG-NEXT:    ALU clause starting at 8:
127; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
128; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
129; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
130; EG-NEXT:    ALU clause starting at 11:
131; EG-NEXT:     FFBH_UINT T0.X, T0.X,
132; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
133; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
134;
135; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32:
136; GFX9-GISEL:       ; %bb.0:
137; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
138; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
139; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
140; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
141; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
142; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
143; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
144; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
145; GFX9-GISEL-NEXT:    s_endpgm
; Index the input by work-item id so the loaded value differs per work item.
146  %tid = call i32 @llvm.amdgcn.workitem.id.x()
147  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
148  %val = load i32, ptr addrspace(1) %in.gep, align 4
149  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
150  store i32 %ctlz, ptr addrspace(1) %out, align 4
151  ret void
152}
153
; <2 x i32> variant of the per-work-item test: loads a two-element vector from
; %valptr at the work-item index, applies llvm.ctlz.v2i32 with the i1 flag set
; to true, and stores the vector to %out. The check blocks expect one
; count-leading-zeros instruction per element (two v_ffbh_u32_e32 on amdgcn,
; two FFBH_UINT on r600).
154define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
155; SI-LABEL: v_ctlz_zero_undef_v2i32:
156; SI:       ; %bb.0:
157; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
158; SI-NEXT:    s_mov_b32 s7, 0xf000
159; SI-NEXT:    s_mov_b32 s10, 0
160; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
161; SI-NEXT:    v_mov_b32_e32 v1, 0
162; SI-NEXT:    s_mov_b32 s11, s7
163; SI-NEXT:    s_waitcnt lgkmcnt(0)
164; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
165; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
166; SI-NEXT:    s_mov_b32 s6, -1
167; SI-NEXT:    s_mov_b32 s4, s0
168; SI-NEXT:    s_mov_b32 s5, s1
169; SI-NEXT:    s_waitcnt vmcnt(0)
170; SI-NEXT:    v_ffbh_u32_e32 v1, v1
171; SI-NEXT:    v_ffbh_u32_e32 v0, v0
172; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
173; SI-NEXT:    s_endpgm
174;
175; VI-LABEL: v_ctlz_zero_undef_v2i32:
176; VI:       ; %bb.0:
177; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
178; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
179; VI-NEXT:    s_waitcnt lgkmcnt(0)
180; VI-NEXT:    v_mov_b32_e32 v1, s3
181; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
182; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
183; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
184; VI-NEXT:    v_mov_b32_e32 v3, s1
185; VI-NEXT:    v_mov_b32_e32 v2, s0
186; VI-NEXT:    s_waitcnt vmcnt(0)
187; VI-NEXT:    v_ffbh_u32_e32 v1, v1
188; VI-NEXT:    v_ffbh_u32_e32 v0, v0
189; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
190; VI-NEXT:    s_endpgm
191;
192; EG-LABEL: v_ctlz_zero_undef_v2i32:
193; EG:       ; %bb.0:
194; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
195; EG-NEXT:    TEX 0 @6
196; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
197; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
198; EG-NEXT:    CF_END
199; EG-NEXT:    PAD
200; EG-NEXT:    Fetch clause starting at 6:
201; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
202; EG-NEXT:    ALU clause starting at 8:
203; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
204; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
205; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
206; EG-NEXT:    ALU clause starting at 11:
207; EG-NEXT:     FFBH_UINT * T0.Y, T0.Y,
208; EG-NEXT:     FFBH_UINT T0.X, T0.X,
209; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
210; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
211;
212; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i32:
213; GFX9-GISEL:       ; %bb.0:
214; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
215; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
216; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
217; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
218; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
219; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
220; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
221; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
222; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
223; GFX9-GISEL-NEXT:    s_endpgm
; The GEP is typed <2 x i32>, so consecutive work items read consecutive
; 8-byte vectors.
224  %tid = call i32 @llvm.amdgcn.workitem.id.x()
225  %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
226  %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
227  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
228  store <2 x i32> %ctlz, ptr addrspace(1) %out, align 8
229  ret void
230}
231
; <4 x i32> variant of the per-work-item test: loads a four-element vector
; from %valptr at the work-item index, applies llvm.ctlz.v4i32 with the i1
; flag set to true, and stores the vector to %out. The check blocks expect one
; count-leading-zeros instruction per element (four v_ffbh_u32_e32 on amdgcn,
; four FFBH_UINT on r600).
232define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
233; SI-LABEL: v_ctlz_zero_undef_v4i32:
234; SI:       ; %bb.0:
235; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
236; SI-NEXT:    s_mov_b32 s7, 0xf000
237; SI-NEXT:    s_mov_b32 s10, 0
238; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
239; SI-NEXT:    v_mov_b32_e32 v1, 0
240; SI-NEXT:    s_mov_b32 s11, s7
241; SI-NEXT:    s_waitcnt lgkmcnt(0)
242; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
243; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
244; SI-NEXT:    s_mov_b32 s6, -1
245; SI-NEXT:    s_mov_b32 s4, s0
246; SI-NEXT:    s_mov_b32 s5, s1
247; SI-NEXT:    s_waitcnt vmcnt(0)
248; SI-NEXT:    v_ffbh_u32_e32 v3, v3
249; SI-NEXT:    v_ffbh_u32_e32 v2, v2
250; SI-NEXT:    v_ffbh_u32_e32 v1, v1
251; SI-NEXT:    v_ffbh_u32_e32 v0, v0
252; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
253; SI-NEXT:    s_endpgm
254;
255; VI-LABEL: v_ctlz_zero_undef_v4i32:
256; VI:       ; %bb.0:
257; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
258; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
259; VI-NEXT:    s_waitcnt lgkmcnt(0)
260; VI-NEXT:    v_mov_b32_e32 v1, s3
261; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
262; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
263; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
264; VI-NEXT:    v_mov_b32_e32 v5, s1
265; VI-NEXT:    v_mov_b32_e32 v4, s0
266; VI-NEXT:    s_waitcnt vmcnt(0)
267; VI-NEXT:    v_ffbh_u32_e32 v3, v3
268; VI-NEXT:    v_ffbh_u32_e32 v2, v2
269; VI-NEXT:    v_ffbh_u32_e32 v1, v1
270; VI-NEXT:    v_ffbh_u32_e32 v0, v0
271; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
272; VI-NEXT:    s_endpgm
273;
274; EG-LABEL: v_ctlz_zero_undef_v4i32:
275; EG:       ; %bb.0:
276; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
277; EG-NEXT:    TEX 0 @6
278; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
279; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
280; EG-NEXT:    CF_END
281; EG-NEXT:    PAD
282; EG-NEXT:    Fetch clause starting at 6:
283; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
284; EG-NEXT:    ALU clause starting at 8:
285; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
286; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
287; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
288; EG-NEXT:    ALU clause starting at 11:
289; EG-NEXT:     FFBH_UINT * T0.W, T0.W,
290; EG-NEXT:     FFBH_UINT * T0.Z, T0.Z,
291; EG-NEXT:     FFBH_UINT * T0.Y, T0.Y,
292; EG-NEXT:     FFBH_UINT T0.X, T0.X,
293; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
294; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
295;
296; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i32:
297; GFX9-GISEL:       ; %bb.0:
298; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
299; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
300; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
301; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
302; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
303; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
304; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
305; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
306; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
307; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
308; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
309; GFX9-GISEL-NEXT:    s_endpgm
; The GEP is typed <4 x i32>, so consecutive work items read consecutive
; 16-byte vectors.
310  %tid = call i32 @llvm.amdgcn.workitem.id.x()
311  %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
312  %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
313  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
314  store <4 x i32> %ctlz, ptr addrspace(1) %out, align 16
315  ret void
316}
317
; i8 kernel-argument input: llvm.ctlz.i8 (i1 flag true) followed by an
; icmp ne 0 / select pattern. The check blocks expect the i8 value to be
; shifted left by 24 and fed to a 32-bit count-leading-zeros instruction,
; with no compare or select in the output.
318define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind {
319; SI-LABEL: s_ctlz_zero_undef_i8_with_select:
320; SI:       ; %bb.0:
321; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
322; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
323; SI-NEXT:    s_mov_b32 s3, 0xf000
324; SI-NEXT:    s_waitcnt lgkmcnt(0)
325; SI-NEXT:    s_lshl_b32 s2, s2, 24
326; SI-NEXT:    s_flbit_i32_b32 s4, s2
327; SI-NEXT:    s_mov_b32 s2, -1
328; SI-NEXT:    v_mov_b32_e32 v0, s4
329; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
330; SI-NEXT:    s_endpgm
331;
332; VI-LABEL: s_ctlz_zero_undef_i8_with_select:
333; VI:       ; %bb.0:
334; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
335; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
336; VI-NEXT:    s_waitcnt lgkmcnt(0)
337; VI-NEXT:    s_lshl_b32 s2, s2, 24
338; VI-NEXT:    s_flbit_i32_b32 s2, s2
339; VI-NEXT:    v_mov_b32_e32 v0, s0
340; VI-NEXT:    v_mov_b32_e32 v1, s1
341; VI-NEXT:    v_mov_b32_e32 v2, s2
342; VI-NEXT:    flat_store_byte v[0:1], v2
343; VI-NEXT:    s_endpgm
344;
345; EG-LABEL: s_ctlz_zero_undef_i8_with_select:
346; EG:       ; %bb.0:
347; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
348; EG-NEXT:    TEX 0 @6
349; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
350; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
351; EG-NEXT:    CF_END
352; EG-NEXT:    PAD
353; EG-NEXT:    Fetch clause starting at 6:
354; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
355; EG-NEXT:    ALU clause starting at 8:
356; EG-NEXT:     MOV * T0.X, 0.0,
357; EG-NEXT:    ALU clause starting at 9:
358; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
359; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
360; EG-NEXT:     FFBH_UINT T0.W, PV.W,
361; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
362; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
363; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
364; EG-NEXT:     LSHL * T1.W, PS, literal.y,
365; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
366; EG-NEXT:     LSHL T0.X, PV.W, PS,
367; EG-NEXT:     LSHL * T0.W, literal.x, PS,
368; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
369; EG-NEXT:     MOV T0.Y, 0.0,
370; EG-NEXT:     MOV * T0.Z, 0.0,
371; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
372; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
373;
374; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i8_with_select:
375; GFX9-GISEL:       ; %bb.0:
376; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
377; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
378; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
379; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
380; GFX9-GISEL-NEXT:    s_lshl_b32 s2, s2, 24
381; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
382; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
383; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
384; GFX9-GISEL-NEXT:    s_endpgm
385  %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
386  %ctlz_ret = icmp ne i8 %val, 0
387  %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
; NOTE(review): the store below writes %ctlz, not %ret, so the icmp/select
; above are dead. v_ctlz_zero_undef_i8_with_select stores %ret instead.
; Confirm whether this is intentional; switching to %ret would require
; regenerating the assertions with update_llc_test_checks.py.
388  store i8 %ctlz, ptr addrspace(1) %out, align 4
389  ret void
390}
391
; i16 kernel-argument input: llvm.ctlz.i16 (i1 flag true) followed by an
; icmp ne 0 / select pattern. The check blocks expect the i16 value to be
; shifted left by 16 and fed to a 32-bit count-leading-zeros instruction,
; with no compare or select in the output.
392define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind {
393; SI-LABEL: s_ctlz_zero_undef_i16_with_select:
394; SI:       ; %bb.0:
395; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
396; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
397; SI-NEXT:    s_mov_b32 s3, 0xf000
398; SI-NEXT:    s_waitcnt lgkmcnt(0)
399; SI-NEXT:    s_lshl_b32 s2, s2, 16
400; SI-NEXT:    s_flbit_i32_b32 s4, s2
401; SI-NEXT:    s_mov_b32 s2, -1
402; SI-NEXT:    v_mov_b32_e32 v0, s4
403; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
404; SI-NEXT:    s_endpgm
405;
406; VI-LABEL: s_ctlz_zero_undef_i16_with_select:
407; VI:       ; %bb.0:
408; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
409; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
410; VI-NEXT:    s_waitcnt lgkmcnt(0)
411; VI-NEXT:    s_lshl_b32 s2, s2, 16
412; VI-NEXT:    s_flbit_i32_b32 s2, s2
413; VI-NEXT:    v_mov_b32_e32 v0, s0
414; VI-NEXT:    v_mov_b32_e32 v1, s1
415; VI-NEXT:    v_mov_b32_e32 v2, s2
416; VI-NEXT:    flat_store_short v[0:1], v2
417; VI-NEXT:    s_endpgm
418;
419; EG-LABEL: s_ctlz_zero_undef_i16_with_select:
420; EG:       ; %bb.0:
421; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
422; EG-NEXT:    TEX 0 @6
423; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
424; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
425; EG-NEXT:    CF_END
426; EG-NEXT:    PAD
427; EG-NEXT:    Fetch clause starting at 6:
428; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
429; EG-NEXT:    ALU clause starting at 8:
430; EG-NEXT:     MOV * T0.X, 0.0,
431; EG-NEXT:    ALU clause starting at 9:
432; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
433; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
434; EG-NEXT:     FFBH_UINT T0.W, PV.W,
435; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
436; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
437; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
438; EG-NEXT:     LSHL * T1.W, PS, literal.y,
439; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
440; EG-NEXT:     LSHL T0.X, PV.W, PS,
441; EG-NEXT:     LSHL * T0.W, literal.x, PS,
442; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
443; EG-NEXT:     MOV T0.Y, 0.0,
444; EG-NEXT:     MOV * T0.Z, 0.0,
445; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
446; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
447;
448; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i16_with_select:
449; GFX9-GISEL:       ; %bb.0:
450; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
451; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
452; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
453; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
454; GFX9-GISEL-NEXT:    s_lshl_b32 s2, s2, 16
455; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
456; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
457; GFX9-GISEL-NEXT:    global_store_short v1, v0, s[0:1]
458; GFX9-GISEL-NEXT:    s_endpgm
459  %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
460  %ctlz_ret = icmp ne i16 %val, 0
461  %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
; NOTE(review): the store below writes %ctlz, not %ret, so the icmp/select
; above are dead. v_ctlz_zero_undef_i16_with_select stores %ret instead.
; Confirm whether this is intentional; switching to %ret would require
; regenerating the assertions with update_llc_test_checks.py.
462  store i16 %ctlz, ptr addrspace(1) %out, align 4
463  ret void
464}
465
; i32 kernel-argument input: llvm.ctlz.i32 (i1 flag true) followed by an
; icmp ne 0 / select pattern. The check blocks expect the same output as
; s_ctlz_zero_undef_i32: a single count-leading-zeros instruction with no
; compare or select.
466define amdgpu_kernel void @s_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind {
467; SI-LABEL: s_ctlz_zero_undef_i32_with_select:
468; SI:       ; %bb.0:
469; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
470; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
471; SI-NEXT:    s_mov_b32 s3, 0xf000
472; SI-NEXT:    s_waitcnt lgkmcnt(0)
473; SI-NEXT:    s_flbit_i32_b32 s4, s2
474; SI-NEXT:    s_mov_b32 s2, -1
475; SI-NEXT:    v_mov_b32_e32 v0, s4
476; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
477; SI-NEXT:    s_endpgm
478;
479; VI-LABEL: s_ctlz_zero_undef_i32_with_select:
480; VI:       ; %bb.0:
481; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
482; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
483; VI-NEXT:    s_waitcnt lgkmcnt(0)
484; VI-NEXT:    s_flbit_i32_b32 s2, s2
485; VI-NEXT:    v_mov_b32_e32 v0, s0
486; VI-NEXT:    v_mov_b32_e32 v1, s1
487; VI-NEXT:    v_mov_b32_e32 v2, s2
488; VI-NEXT:    flat_store_dword v[0:1], v2
489; VI-NEXT:    s_endpgm
490;
491; EG-LABEL: s_ctlz_zero_undef_i32_with_select:
492; EG:       ; %bb.0:
493; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
494; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
495; EG-NEXT:    CF_END
496; EG-NEXT:    PAD
497; EG-NEXT:    ALU clause starting at 4:
498; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
499; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
500; EG-NEXT:     FFBH_UINT * T1.X, KC0[2].Z,
501;
502; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i32_with_select:
503; GFX9-GISEL:       ; %bb.0:
504; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
505; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
506; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
507; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
508; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
509; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
510; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
511; GFX9-GISEL-NEXT:    s_endpgm
512  %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
513  %ctlz_ret = icmp ne i32 %val, 0
514  %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
; NOTE(review): the store below writes %ctlz, not %ret, so the icmp/select
; above are dead. Confirm whether this is intentional; switching to %ret
; would require regenerating the assertions with update_llc_test_checks.py.
515  store i32 %ctlz, ptr addrspace(1) %out, align 4
516  ret void
517}
518
; i64 kernel-argument input: llvm.ctlz.i64 (i1 flag true) followed by an
; icmp ne 0 / select pattern. The amdgcn check blocks expect one
; s_flbit_i32_b64 with a zero upper result dword. The r600 block expects two
; FFBH_UINT (one per 32-bit half) combined with an ADD_INT of 32 and a
; CNDE_INT.
519define amdgpu_kernel void @s_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind {
520; SI-LABEL: s_ctlz_zero_undef_i64_with_select:
521; SI:       ; %bb.0:
522; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
523; SI-NEXT:    s_mov_b32 s7, 0xf000
524; SI-NEXT:    s_mov_b32 s6, -1
525; SI-NEXT:    s_waitcnt lgkmcnt(0)
526; SI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
527; SI-NEXT:    v_mov_b32_e32 v1, 0
528; SI-NEXT:    s_mov_b32 s4, s0
529; SI-NEXT:    s_mov_b32 s5, s1
530; SI-NEXT:    v_mov_b32_e32 v0, s2
531; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
532; SI-NEXT:    s_endpgm
533;
534; VI-LABEL: s_ctlz_zero_undef_i64_with_select:
535; VI:       ; %bb.0:
536; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
537; VI-NEXT:    v_mov_b32_e32 v1, 0
538; VI-NEXT:    s_waitcnt lgkmcnt(0)
539; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
540; VI-NEXT:    v_mov_b32_e32 v3, s1
541; VI-NEXT:    v_mov_b32_e32 v0, s2
542; VI-NEXT:    v_mov_b32_e32 v2, s0
543; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
544; VI-NEXT:    s_endpgm
545;
546; EG-LABEL: s_ctlz_zero_undef_i64_with_select:
547; EG:       ; %bb.0:
548; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
549; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
550; EG-NEXT:    CF_END
551; EG-NEXT:    PAD
552; EG-NEXT:    ALU clause starting at 4:
553; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
554; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
555; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
556; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
557; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
558; EG-NEXT:     MOV T0.Y, 0.0,
559; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
560; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
561;
562; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_with_select:
563; GFX9-GISEL:       ; %bb.0:
564; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
565; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
566; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
567; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
568; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s4, s[2:3]
569; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
570; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
571; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
572; GFX9-GISEL-NEXT:    s_endpgm
573  %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
574  %ctlz_ret = icmp ne i64 %val, 0
575  %ret = select i1 %ctlz_ret, i64 %ctlz, i64 32
; NOTE(review): the store below writes %ctlz, not %ret, so the icmp/select
; above are dead. Confirm whether this is intentional; switching to %ret
; would require regenerating the assertions with update_llc_test_checks.py.
576  store i64 %ctlz, ptr addrspace(1) %out, align 4
577  ret void
578}
579
; Loaded i8 input with a live select: loads an i8 from %arrayidx, applies
; llvm.ctlz.i8 (i1 flag true), then selects between the ctlz result and the
; constant 32 depending on whether the input is non-zero, and stores the
; selected value. The check blocks expect the value shifted left by 24 before
; the 32-bit count-leading-zeros instruction, followed by an explicit
; compare-and-select against 32 (v_cmp_ne_u32 + v_cndmask_b32 on amdgcn,
; CNDE_INT on r600).
580define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
581; SI-LABEL: v_ctlz_zero_undef_i8_with_select:
582; SI:       ; %bb.0:
583; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
584; SI-NEXT:    s_mov_b32 s7, 0xf000
585; SI-NEXT:    s_mov_b32 s6, -1
586; SI-NEXT:    s_mov_b32 s10, s6
587; SI-NEXT:    s_mov_b32 s11, s7
588; SI-NEXT:    s_waitcnt lgkmcnt(0)
589; SI-NEXT:    s_mov_b32 s8, s2
590; SI-NEXT:    s_mov_b32 s9, s3
591; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
592; SI-NEXT:    s_mov_b32 s4, s0
593; SI-NEXT:    s_mov_b32 s5, s1
594; SI-NEXT:    s_waitcnt vmcnt(0)
595; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
596; SI-NEXT:    v_ffbh_u32_e32 v1, v1
597; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
598; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
599; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
600; SI-NEXT:    s_endpgm
601;
602; VI-LABEL: v_ctlz_zero_undef_i8_with_select:
603; VI:       ; %bb.0:
604; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
605; VI-NEXT:    s_waitcnt lgkmcnt(0)
606; VI-NEXT:    v_mov_b32_e32 v0, s2
607; VI-NEXT:    v_mov_b32_e32 v1, s3
608; VI-NEXT:    flat_load_ubyte v0, v[0:1]
609; VI-NEXT:    s_waitcnt vmcnt(0)
610; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v0
611; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
612; VI-NEXT:    v_ffbh_u32_e32 v1, v1
613; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
614; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
615; VI-NEXT:    v_mov_b32_e32 v0, s0
616; VI-NEXT:    v_mov_b32_e32 v1, s1
617; VI-NEXT:    flat_store_byte v[0:1], v2
618; VI-NEXT:    s_endpgm
619;
620; EG-LABEL: v_ctlz_zero_undef_i8_with_select:
621; EG:       ; %bb.0:
622; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
623; EG-NEXT:    TEX 0 @6
624; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
625; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
626; EG-NEXT:    CF_END
627; EG-NEXT:    PAD
628; EG-NEXT:    Fetch clause starting at 6:
629; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
630; EG-NEXT:    ALU clause starting at 8:
631; EG-NEXT:     MOV * T0.X, KC0[2].Z,
632; EG-NEXT:    ALU clause starting at 9:
633; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
634; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
635; EG-NEXT:     FFBH_UINT T0.W, PV.W,
636; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
637; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
638; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
639; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
640; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
641; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
642; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
643; EG-NEXT:     LSHL T0.X, PV.W, PS,
644; EG-NEXT:     LSHL * T0.W, literal.x, PS,
645; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
646; EG-NEXT:     MOV T0.Y, 0.0,
647; EG-NEXT:     MOV * T0.Z, 0.0,
648; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
649; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
650;
651; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_with_select:
652; GFX9-GISEL:       ; %bb.0:
653; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
654; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
655; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
656; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
657; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
658; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v1
659; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
660; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
661; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
662; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
663; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
664; GFX9-GISEL-NEXT:    s_endpgm
665  %val = load i8, ptr addrspace(1) %arrayidx, align 1
666  %ctlz = tail call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
667  %ctlz_ret = icmp ne i8 %val, 0
; The zero-input fallback is 32, not the i8 bit width (8); presumably
; deliberate, since the checked output keeps the explicit select. TODO confirm.
668  %ret = select i1 %ctlz_ret, i8 %ctlz, i8 32
669  store i8 %ret, ptr addrspace(1) %out, align 4
670  ret void
671}
672
; Loaded i16 input with a live select: loads an i16 from %arrayidx with
; align 1, applies llvm.ctlz.i16 (i1 flag true), then selects between the ctlz
; result and the constant 32 depending on whether the input is non-zero, and
; stores the selected value. The amdgcn check blocks expect the under-aligned
; load to be split into two byte loads that are recombined with shift/or; the
; r600 block expects a single VTX_READ_16. All blocks then expect a shift
; left by 16, a 32-bit count-leading-zeros instruction, and an explicit
; compare-and-select against 32.
673define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
674; SI-LABEL: v_ctlz_zero_undef_i16_with_select:
675; SI:       ; %bb.0:
676; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
677; SI-NEXT:    s_mov_b32 s7, 0xf000
678; SI-NEXT:    s_mov_b32 s6, -1
679; SI-NEXT:    s_mov_b32 s10, s6
680; SI-NEXT:    s_mov_b32 s11, s7
681; SI-NEXT:    s_waitcnt lgkmcnt(0)
682; SI-NEXT:    s_mov_b32 s8, s2
683; SI-NEXT:    s_mov_b32 s9, s3
684; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
685; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
686; SI-NEXT:    s_mov_b32 s4, s0
687; SI-NEXT:    s_mov_b32 s5, s1
688; SI-NEXT:    s_waitcnt vmcnt(1)
689; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
690; SI-NEXT:    s_waitcnt vmcnt(0)
691; SI-NEXT:    v_or_b32_e32 v0, v0, v1
692; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v0
693; SI-NEXT:    v_ffbh_u32_e32 v1, v1
694; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
695; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
696; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
697; SI-NEXT:    s_endpgm
698;
699; VI-LABEL: v_ctlz_zero_undef_i16_with_select:
700; VI:       ; %bb.0:
701; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
702; VI-NEXT:    s_waitcnt lgkmcnt(0)
703; VI-NEXT:    s_add_u32 s4, s2, 1
704; VI-NEXT:    s_addc_u32 s5, s3, 0
705; VI-NEXT:    v_mov_b32_e32 v2, s4
706; VI-NEXT:    v_mov_b32_e32 v0, s2
707; VI-NEXT:    v_mov_b32_e32 v3, s5
708; VI-NEXT:    v_mov_b32_e32 v1, s3
709; VI-NEXT:    flat_load_ubyte v2, v[2:3]
710; VI-NEXT:    flat_load_ubyte v0, v[0:1]
711; VI-NEXT:    s_waitcnt vmcnt(1)
712; VI-NEXT:    v_readfirstlane_b32 s2, v2
713; VI-NEXT:    s_waitcnt vmcnt(0)
714; VI-NEXT:    v_readfirstlane_b32 s3, v0
715; VI-NEXT:    s_lshl_b32 s2, s2, 8
716; VI-NEXT:    s_or_b32 s2, s2, s3
717; VI-NEXT:    s_lshl_b32 s3, s2, 16
718; VI-NEXT:    s_and_b32 s2, s2, 0xffff
719; VI-NEXT:    s_flbit_i32_b32 s3, s3
720; VI-NEXT:    s_cmp_lg_u32 s2, 0
721; VI-NEXT:    s_cselect_b32 s2, s3, 32
722; VI-NEXT:    v_mov_b32_e32 v0, s0
723; VI-NEXT:    v_mov_b32_e32 v1, s1
724; VI-NEXT:    v_mov_b32_e32 v2, s2
725; VI-NEXT:    flat_store_short v[0:1], v2
726; VI-NEXT:    s_endpgm
727;
728; EG-LABEL: v_ctlz_zero_undef_i16_with_select:
729; EG:       ; %bb.0:
730; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
731; EG-NEXT:    TEX 0 @6
732; EG-NEXT:    ALU 16, @9, KC0[CB0:0-32], KC1[]
733; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
734; EG-NEXT:    CF_END
735; EG-NEXT:    PAD
736; EG-NEXT:    Fetch clause starting at 6:
737; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
738; EG-NEXT:    ALU clause starting at 8:
739; EG-NEXT:     MOV * T0.X, KC0[2].Z,
740; EG-NEXT:    ALU clause starting at 9:
741; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
742; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
743; EG-NEXT:     FFBH_UINT T0.W, PV.W,
744; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
745; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
746; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
747; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
748; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
749; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
750; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
751; EG-NEXT:     LSHL T0.X, PV.W, PS,
752; EG-NEXT:     LSHL * T0.W, literal.x, PS,
753; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
754; EG-NEXT:     MOV T0.Y, 0.0,
755; EG-NEXT:     MOV * T0.Z, 0.0,
756; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
757; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
758;
759; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i16_with_select:
760; GFX9-GISEL:       ; %bb.0:
761; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
762; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
763; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
764; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
765; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
766; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
767; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
768; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
769; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
770; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
771; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
772; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
773; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
774; GFX9-GISEL-NEXT:    s_endpgm
; align 1 on an i16 load: the amdgcn check blocks above show it lowered as two
; byte loads.
775  %val = load i16, ptr addrspace(1) %arrayidx, align 1
776  %ctlz = tail call i16 @llvm.ctlz.i16(i16 %val, i1 true) nounwind readnone
777  %ctlz_ret = icmp ne i16 %val, 0
; The zero-input fallback is 32, not the i16 bit width (16); presumably
; deliberate, since the checked output keeps the explicit select. TODO confirm.
778  %ret = select i1 %ctlz_ret, i16 %ctlz, i16 32
779  store i16 %ret, ptr addrspace(1) %out, align 4
780  ret void
781}
782
; Loads an i32 from %arrayidx with align 1, takes llvm.ctlz.i32 with the
; second argument set to true, and selects the constant 32 instead of the
; ctlz result when the loaded value is zero. The result is stored to %out.
; What the checks below expect
;   - the SI and VI runs build the dword from four byte loads and then emit
;     v_ffbh_u32 followed by v_min_u32 with 32, with no compare or select
;   - the EG run uses two 16-bit reads, FFBH_UINT and a CNDE_INT that picks 32
;   - the GFX9-GISEL run keeps an explicit v_cmp_ne_u32 plus v_cndmask_b32
783define amdgpu_kernel void @v_ctlz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
784; SI-LABEL: v_ctlz_zero_undef_i32_with_select:
785; SI:       ; %bb.0:
786; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
787; SI-NEXT:    s_mov_b32 s7, 0xf000
788; SI-NEXT:    s_mov_b32 s6, -1
789; SI-NEXT:    s_mov_b32 s10, s6
790; SI-NEXT:    s_mov_b32 s11, s7
791; SI-NEXT:    s_waitcnt lgkmcnt(0)
792; SI-NEXT:    s_mov_b32 s8, s2
793; SI-NEXT:    s_mov_b32 s9, s3
794; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
795; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:3
796; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0
797; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:2
798; SI-NEXT:    s_mov_b32 s4, s0
799; SI-NEXT:    s_mov_b32 s5, s1
800; SI-NEXT:    s_waitcnt vmcnt(3)
801; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
802; SI-NEXT:    s_waitcnt vmcnt(2)
803; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
804; SI-NEXT:    s_waitcnt vmcnt(1)
805; SI-NEXT:    v_or_b32_e32 v0, v0, v2
806; SI-NEXT:    s_waitcnt vmcnt(0)
807; SI-NEXT:    v_or_b32_e32 v1, v1, v3
808; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
809; SI-NEXT:    v_or_b32_e32 v0, v1, v0
810; SI-NEXT:    v_ffbh_u32_e32 v0, v0
811; SI-NEXT:    v_min_u32_e32 v0, 32, v0
812; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
813; SI-NEXT:    s_endpgm
814;
815; VI-LABEL: v_ctlz_zero_undef_i32_with_select:
816; VI:       ; %bb.0:
817; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
818; VI-NEXT:    s_waitcnt lgkmcnt(0)
819; VI-NEXT:    s_add_u32 s4, s2, 3
820; VI-NEXT:    s_addc_u32 s5, s3, 0
821; VI-NEXT:    v_mov_b32_e32 v2, s4
822; VI-NEXT:    v_mov_b32_e32 v3, s5
823; VI-NEXT:    s_add_u32 s4, s2, 2
824; VI-NEXT:    v_mov_b32_e32 v0, s2
825; VI-NEXT:    s_addc_u32 s5, s3, 0
826; VI-NEXT:    v_mov_b32_e32 v1, s3
827; VI-NEXT:    s_add_u32 s2, s2, 1
828; VI-NEXT:    s_addc_u32 s3, s3, 0
829; VI-NEXT:    v_mov_b32_e32 v4, s4
830; VI-NEXT:    v_mov_b32_e32 v7, s3
831; VI-NEXT:    v_mov_b32_e32 v5, s5
832; VI-NEXT:    v_mov_b32_e32 v6, s2
833; VI-NEXT:    flat_load_ubyte v2, v[2:3]
834; VI-NEXT:    flat_load_ubyte v3, v[4:5]
835; VI-NEXT:    flat_load_ubyte v4, v[6:7]
836; VI-NEXT:    flat_load_ubyte v0, v[0:1]
837; VI-NEXT:    s_waitcnt vmcnt(3)
838; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
839; VI-NEXT:    s_waitcnt vmcnt(2)
840; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
841; VI-NEXT:    s_waitcnt vmcnt(1)
842; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
843; VI-NEXT:    s_waitcnt vmcnt(0)
844; VI-NEXT:    v_or_b32_e32 v0, v2, v0
845; VI-NEXT:    v_or_b32_e32 v0, v1, v0
846; VI-NEXT:    v_ffbh_u32_e32 v0, v0
847; VI-NEXT:    v_min_u32_e32 v2, 32, v0
848; VI-NEXT:    v_mov_b32_e32 v0, s0
849; VI-NEXT:    v_mov_b32_e32 v1, s1
850; VI-NEXT:    flat_store_dword v[0:1], v2
851; VI-NEXT:    s_endpgm
852;
853; EG-LABEL: v_ctlz_zero_undef_i32_with_select:
854; EG:       ; %bb.0:
855; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
856; EG-NEXT:    TEX 1 @6
857; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
858; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
859; EG-NEXT:    CF_END
860; EG-NEXT:    PAD
861; EG-NEXT:    Fetch clause starting at 6:
862; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
863; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
864; EG-NEXT:    ALU clause starting at 10:
865; EG-NEXT:     MOV * T0.X, KC0[2].Z,
866; EG-NEXT:    ALU clause starting at 11:
867; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
868; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
869; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
870; EG-NEXT:     FFBH_UINT * T1.W, PV.W,
871; EG-NEXT:     CNDE_INT T0.X, T0.W, literal.x, PV.W,
872; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
873; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
874;
875; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_with_select:
876; GFX9-GISEL:       ; %bb.0:
877; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
878; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
879; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
880; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
881; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
882; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
883; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
884; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
885; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
886; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
887; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
888; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
889; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
890; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
891; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
892; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
893; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
894; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
895; GFX9-GISEL-NEXT:    s_endpgm
896  %val = load i32, ptr addrspace(1) %arrayidx, align 1
897  %ctlz = tail call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
898  %ctlz_ret = icmp ne i32 %val, 0
899  %ret = select i1 %ctlz_ret, i32 %ctlz, i32 32
900  store i32 %ret, ptr addrspace(1) %out, align 4
901  ret void
902}
903
; Loads an i64 from %arrayidx with align 1, takes llvm.ctlz.i64 with the
; second argument set to true, and selects the constant 64 instead of the
; ctlz result when the loaded value is zero. The result is stored to %out.
; What the checks below expect
;   - the SI and VI runs assemble two dwords from eight byte loads, apply
;     v_ffbh_u32 to each, add 32 to one count, combine with v_min_u32 and
;     then clamp with a second v_min_u32 against 64
;   - the EG run uses four 16-bit reads and CNDE_INT selects
;   - the GFX9-GISEL run keeps an explicit v_cmp_ne_u64 plus v_cndmask_b32
904define amdgpu_kernel void @v_ctlz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
905; SI-LABEL: v_ctlz_zero_undef_i64_with_select:
906; SI:       ; %bb.0:
907; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
908; SI-NEXT:    s_mov_b32 s3, 0xf000
909; SI-NEXT:    s_mov_b32 s2, -1
910; SI-NEXT:    s_mov_b32 s10, s2
911; SI-NEXT:    s_mov_b32 s11, s3
912; SI-NEXT:    s_waitcnt lgkmcnt(0)
913; SI-NEXT:    s_mov_b32 s8, s6
914; SI-NEXT:    s_mov_b32 s9, s7
915; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:5
916; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:7
917; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0
918; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:1
919; SI-NEXT:    buffer_load_ubyte v4, off, s[8:11], 0 offset:2
920; SI-NEXT:    buffer_load_ubyte v5, off, s[8:11], 0 offset:3
921; SI-NEXT:    buffer_load_ubyte v6, off, s[8:11], 0 offset:4
922; SI-NEXT:    buffer_load_ubyte v7, off, s[8:11], 0 offset:6
923; SI-NEXT:    s_mov_b32 s0, s4
924; SI-NEXT:    s_mov_b32 s1, s5
925; SI-NEXT:    s_waitcnt vmcnt(7)
926; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
927; SI-NEXT:    s_waitcnt vmcnt(6)
928; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
929; SI-NEXT:    s_waitcnt vmcnt(4)
930; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
931; SI-NEXT:    s_waitcnt vmcnt(2)
932; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
933; SI-NEXT:    s_waitcnt vmcnt(1)
934; SI-NEXT:    v_or_b32_e32 v0, v0, v6
935; SI-NEXT:    s_waitcnt vmcnt(0)
936; SI-NEXT:    v_or_b32_e32 v1, v1, v7
937; SI-NEXT:    v_or_b32_e32 v2, v3, v2
938; SI-NEXT:    v_or_b32_e32 v3, v5, v4
939; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
940; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
941; SI-NEXT:    v_or_b32_e32 v0, v1, v0
942; SI-NEXT:    v_or_b32_e32 v1, v3, v2
943; SI-NEXT:    v_ffbh_u32_e32 v1, v1
944; SI-NEXT:    v_ffbh_u32_e32 v0, v0
945; SI-NEXT:    v_add_i32_e32 v1, vcc, 32, v1
946; SI-NEXT:    v_min_u32_e32 v0, v1, v0
947; SI-NEXT:    v_min_u32_e32 v0, 64, v0
948; SI-NEXT:    v_mov_b32_e32 v1, 0
949; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
950; SI-NEXT:    s_endpgm
951;
952; VI-LABEL: v_ctlz_zero_undef_i64_with_select:
953; VI:       ; %bb.0:
954; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
955; VI-NEXT:    s_waitcnt lgkmcnt(0)
956; VI-NEXT:    s_add_u32 s4, s2, 5
957; VI-NEXT:    s_addc_u32 s5, s3, 0
958; VI-NEXT:    v_mov_b32_e32 v0, s4
959; VI-NEXT:    v_mov_b32_e32 v1, s5
960; VI-NEXT:    s_add_u32 s4, s2, 4
961; VI-NEXT:    s_addc_u32 s5, s3, 0
962; VI-NEXT:    v_mov_b32_e32 v2, s4
963; VI-NEXT:    v_mov_b32_e32 v3, s5
964; VI-NEXT:    s_add_u32 s4, s2, 7
965; VI-NEXT:    s_addc_u32 s5, s3, 0
966; VI-NEXT:    v_mov_b32_e32 v4, s4
967; VI-NEXT:    v_mov_b32_e32 v5, s5
968; VI-NEXT:    s_add_u32 s4, s2, 6
969; VI-NEXT:    s_addc_u32 s5, s3, 0
970; VI-NEXT:    v_mov_b32_e32 v7, s5
971; VI-NEXT:    v_mov_b32_e32 v6, s4
972; VI-NEXT:    s_add_u32 s4, s2, 3
973; VI-NEXT:    s_addc_u32 s5, s3, 0
974; VI-NEXT:    v_mov_b32_e32 v9, s5
975; VI-NEXT:    v_mov_b32_e32 v8, s4
976; VI-NEXT:    s_add_u32 s4, s2, 2
977; VI-NEXT:    s_addc_u32 s5, s3, 0
978; VI-NEXT:    v_mov_b32_e32 v11, s5
979; VI-NEXT:    v_mov_b32_e32 v10, s4
980; VI-NEXT:    s_add_u32 s4, s2, 1
981; VI-NEXT:    flat_load_ubyte v12, v[0:1]
982; VI-NEXT:    flat_load_ubyte v13, v[2:3]
983; VI-NEXT:    flat_load_ubyte v4, v[4:5]
984; VI-NEXT:    flat_load_ubyte v5, v[6:7]
985; VI-NEXT:    s_addc_u32 s5, s3, 0
986; VI-NEXT:    v_mov_b32_e32 v0, s4
987; VI-NEXT:    flat_load_ubyte v6, v[8:9]
988; VI-NEXT:    v_mov_b32_e32 v2, s2
989; VI-NEXT:    v_mov_b32_e32 v1, s5
990; VI-NEXT:    v_mov_b32_e32 v3, s3
991; VI-NEXT:    flat_load_ubyte v7, v[10:11]
992; VI-NEXT:    flat_load_ubyte v0, v[0:1]
993; VI-NEXT:    flat_load_ubyte v2, v[2:3]
994; VI-NEXT:    v_mov_b32_e32 v1, 0
995; VI-NEXT:    s_waitcnt vmcnt(7)
996; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v12
997; VI-NEXT:    s_waitcnt vmcnt(6)
998; VI-NEXT:    v_or_b32_e32 v3, v3, v13
999; VI-NEXT:    s_waitcnt vmcnt(5)
1000; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
1001; VI-NEXT:    s_waitcnt vmcnt(4)
1002; VI-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1003; VI-NEXT:    v_or_b32_e32 v3, v4, v3
1004; VI-NEXT:    v_ffbh_u32_e32 v3, v3
1005; VI-NEXT:    s_waitcnt vmcnt(3)
1006; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
1007; VI-NEXT:    s_waitcnt vmcnt(2)
1008; VI-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1009; VI-NEXT:    s_waitcnt vmcnt(1)
1010; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1011; VI-NEXT:    s_waitcnt vmcnt(0)
1012; VI-NEXT:    v_or_b32_e32 v0, v0, v2
1013; VI-NEXT:    v_or_b32_e32 v0, v4, v0
1014; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1015; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1016; VI-NEXT:    v_min_u32_e32 v0, v0, v3
1017; VI-NEXT:    v_mov_b32_e32 v3, s1
1018; VI-NEXT:    v_min_u32_e32 v0, 64, v0
1019; VI-NEXT:    v_mov_b32_e32 v2, s0
1020; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1021; VI-NEXT:    s_endpgm
1022;
1023; EG-LABEL: v_ctlz_zero_undef_i64_with_select:
1024; EG:       ; %bb.0:
1025; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1026; EG-NEXT:    TEX 3 @6
1027; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
1028; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1029; EG-NEXT:    CF_END
1030; EG-NEXT:    PAD
1031; EG-NEXT:    Fetch clause starting at 6:
1032; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1033; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 4, #1
1034; EG-NEXT:     VTX_READ_16 T3.X, T0.X, 6, #1
1035; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1036; EG-NEXT:    ALU clause starting at 14:
1037; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1038; EG-NEXT:    ALU clause starting at 15:
1039; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1040; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1041; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
1042; EG-NEXT:     FFBH_UINT T1.W, PV.W,
1043; EG-NEXT:     LSHL * T2.W, T3.X, literal.x,
1044; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1045; EG-NEXT:     CNDE_INT T0.W, T0.W, literal.x, PV.W,
1046; EG-NEXT:     OR_INT * T1.W, PS, T2.X,
1047; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1048; EG-NEXT:     FFBH_UINT T2.W, PS,
1049; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
1050; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1051; EG-NEXT:     CNDE_INT T0.X, T1.W, PS, PV.W,
1052; EG-NEXT:     MOV T0.Y, 0.0,
1053; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1054; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1055;
1056; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_with_select:
1057; GFX9-GISEL:       ; %bb.0:
1058; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1059; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1060; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1061; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
1062; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
1063; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
1064; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
1065; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
1066; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
1067; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
1068; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
1069; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
1070; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
1071; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
1072; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1073; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
1074; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
1075; GFX9-GISEL-NEXT:    v_or3_b32 v2, v2, v3, v0
1076; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
1077; GFX9-GISEL-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
1078; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
1079; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
1080; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1081; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v8, 24, v5
1082; GFX9-GISEL-NEXT:    v_or3_b32 v3, v0, v4, 0
1083; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v2
1084; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v4, v3
1085; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
1086; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
1087; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v4, v0
1088; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
1089; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
1090; GFX9-GISEL-NEXT:    s_endpgm
1091  %val = load i64, ptr addrspace(1) %arrayidx, align 1
1092  %ctlz = tail call i64 @llvm.ctlz.i64(i64 %val, i1 true) nounwind readnone
1093  %ctlz_ret = icmp ne i64 %val, 0
1094  %ret = select i1 %ctlz_ret, i64 %ctlz, i64 64
1095  store i64 %ret, ptr addrspace(1) %out, align 4
1096  ret void
1097}
1098
; Loads one i8 from %valptr at the byte offset given by the workitem id x,
; takes llvm.ctlz.i8 with the second argument set to true, and stores the i8
; result to %out (the store address is not indexed by the workitem id).
; What the checks below expect
;   - the SI, VI and GFX9-GISEL runs shift the loaded byte left by 24 and
;     apply v_ffbh_u32, followed by a byte store
;   - the EG run does LSHL by 24 and FFBH_UINT, then writes the byte through
;     a MEM_RAT MSKOR store using a 255 mask
1099define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1100; SI-LABEL: v_ctlz_zero_undef_i8:
1101; SI:       ; %bb.0:
1102; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1103; SI-NEXT:    s_mov_b32 s7, 0xf000
1104; SI-NEXT:    v_mov_b32_e32 v1, 0
1105; SI-NEXT:    s_mov_b32 s10, 0
1106; SI-NEXT:    s_mov_b32 s11, s7
1107; SI-NEXT:    s_waitcnt lgkmcnt(0)
1108; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1109; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1110; SI-NEXT:    s_mov_b32 s6, -1
1111; SI-NEXT:    s_mov_b32 s4, s0
1112; SI-NEXT:    s_mov_b32 s5, s1
1113; SI-NEXT:    s_waitcnt vmcnt(0)
1114; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
1115; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1116; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1117; SI-NEXT:    s_endpgm
1118;
1119; VI-LABEL: v_ctlz_zero_undef_i8:
1120; VI:       ; %bb.0:
1121; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1122; VI-NEXT:    s_waitcnt lgkmcnt(0)
1123; VI-NEXT:    v_mov_b32_e32 v1, s3
1124; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1125; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1126; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1127; VI-NEXT:    s_waitcnt vmcnt(0)
1128; VI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
1129; VI-NEXT:    v_ffbh_u32_e32 v2, v0
1130; VI-NEXT:    v_mov_b32_e32 v0, s0
1131; VI-NEXT:    v_mov_b32_e32 v1, s1
1132; VI-NEXT:    flat_store_byte v[0:1], v2
1133; VI-NEXT:    s_endpgm
1134;
1135; EG-LABEL: v_ctlz_zero_undef_i8:
1136; EG:       ; %bb.0:
1137; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1138; EG-NEXT:    TEX 0 @6
1139; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
1140; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1141; EG-NEXT:    CF_END
1142; EG-NEXT:    PAD
1143; EG-NEXT:    Fetch clause starting at 6:
1144; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1145; EG-NEXT:    ALU clause starting at 8:
1146; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1147; EG-NEXT:    ALU clause starting at 9:
1148; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1149; EG-NEXT:    24(3.363116e-44), 0(0.000000e+00)
1150; EG-NEXT:     FFBH_UINT T0.W, PV.W,
1151; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1152; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1153; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1154; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1155; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1156; EG-NEXT:     LSHL T0.X, PV.W, PS,
1157; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1158; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1159; EG-NEXT:     MOV T0.Y, 0.0,
1160; EG-NEXT:     MOV * T0.Z, 0.0,
1161; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1162; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1163;
1164; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8:
1165; GFX9-GISEL:       ; %bb.0:
1166; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1167; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1168; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1169; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1170; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1171; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
1172; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
1173; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1174; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1175; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1176; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
1177; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
1178; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1179; GFX9-GISEL-NEXT:    s_endpgm
1180  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1181  %in.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1182  %val = load i8, ptr addrspace(1) %in.gep
1183  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
1184  store i8 %ctlz, ptr addrspace(1) %out
1185  ret void
1186}
1187
; Takes llvm.ctlz.i64 (second argument true) of an i64 kernel argument and
; stores the full i64 result to %out. An unnamed [8 x i32] argument sits
; between %out and %val in the signature.
; What the checks below expect
;   - the SI, VI and GFX9-GISEL runs use a single s_flbit_i32_b64 and store
;     it together with a zero second dword
;   - the EG run applies FFBH_UINT to two kernel-argument dwords, adds 32 to
;     one count, and picks between them with CNDE_INT
1188define amdgpu_kernel void @s_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
1189; SI-LABEL: s_ctlz_zero_undef_i64:
1190; SI:       ; %bb.0:
1191; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
1192; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
1193; SI-NEXT:    s_mov_b32 s3, 0xf000
1194; SI-NEXT:    s_mov_b32 s2, -1
1195; SI-NEXT:    s_waitcnt lgkmcnt(0)
1196; SI-NEXT:    s_flbit_i32_b64 s4, s[6:7]
1197; SI-NEXT:    v_mov_b32_e32 v1, 0
1198; SI-NEXT:    v_mov_b32_e32 v0, s4
1199; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
1200; SI-NEXT:    s_endpgm
1201;
1202; VI-LABEL: s_ctlz_zero_undef_i64:
1203; VI:       ; %bb.0:
1204; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
1205; VI-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1206; VI-NEXT:    v_mov_b32_e32 v1, 0
1207; VI-NEXT:    s_waitcnt lgkmcnt(0)
1208; VI-NEXT:    s_flbit_i32_b64 s0, s[0:1]
1209; VI-NEXT:    v_mov_b32_e32 v2, s2
1210; VI-NEXT:    v_mov_b32_e32 v0, s0
1211; VI-NEXT:    v_mov_b32_e32 v3, s3
1212; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
1213; VI-NEXT:    s_endpgm
1214;
1215; EG-LABEL: s_ctlz_zero_undef_i64:
1216; EG:       ; %bb.0:
1217; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
1218; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1219; EG-NEXT:    CF_END
1220; EG-NEXT:    PAD
1221; EG-NEXT:    ALU clause starting at 4:
1222; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
1223; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
1224; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
1225; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1226; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
1227; EG-NEXT:     MOV T0.Y, 0.0,
1228; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1229; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1230;
1231; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64:
1232; GFX9-GISEL:       ; %bb.0:
1233; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
1234; GFX9-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
1235; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
1236; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
1237; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1238; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s4, s[0:1]
1239; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
1240; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
1241; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
1242; GFX9-GISEL-NEXT:    s_endpgm
1243  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1244  store i64 %ctlz, ptr addrspace(1) %out
1245  ret void
1246}
1247
; Takes llvm.ctlz.i64 (second argument true) of an i64 kernel argument,
; truncates the result to i32, and stores that i32 to %out.
; What the checks below expect
;   - the SI, VI and GFX9-GISEL runs use a single s_flbit_i32_b64 and a
;     dword store of its result
;   - the EG run applies FFBH_UINT to two kernel-argument dwords, adds 32 to
;     one count, and picks between them with CNDE_INT
1248define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind {
1249; SI-LABEL: s_ctlz_zero_undef_i64_trunc:
1250; SI:       ; %bb.0:
1251; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1252; SI-NEXT:    s_mov_b32 s7, 0xf000
1253; SI-NEXT:    s_waitcnt lgkmcnt(0)
1254; SI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
1255; SI-NEXT:    s_mov_b32 s6, -1
1256; SI-NEXT:    s_mov_b32 s4, s0
1257; SI-NEXT:    s_mov_b32 s5, s1
1258; SI-NEXT:    v_mov_b32_e32 v0, s2
1259; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1260; SI-NEXT:    s_endpgm
1261;
1262; VI-LABEL: s_ctlz_zero_undef_i64_trunc:
1263; VI:       ; %bb.0:
1264; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1265; VI-NEXT:    s_waitcnt lgkmcnt(0)
1266; VI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
1267; VI-NEXT:    v_mov_b32_e32 v0, s0
1268; VI-NEXT:    v_mov_b32_e32 v1, s1
1269; VI-NEXT:    v_mov_b32_e32 v2, s2
1270; VI-NEXT:    flat_store_dword v[0:1], v2
1271; VI-NEXT:    s_endpgm
1272;
1273; EG-LABEL: s_ctlz_zero_undef_i64_trunc:
1274; EG:       ; %bb.0:
1275; EG-NEXT:    ALU 6, @4, KC0[CB0:0-32], KC1[]
1276; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1277; EG-NEXT:    CF_END
1278; EG-NEXT:    PAD
1279; EG-NEXT:    ALU clause starting at 4:
1280; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
1281; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
1282; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
1283; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1284; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
1285; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1286; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1287;
1288; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i64_trunc:
1289; GFX9-GISEL:       ; %bb.0:
1290; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1291; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1292; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1293; GFX9-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
1294; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
1295; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1296; GFX9-GISEL-NEXT:    s_endpgm
1297  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1298  %trunc = trunc i64 %ctlz to i32
1299  store i32 %trunc, ptr addrspace(1) %out
1300  ret void
1301}
1302
; Each workitem loads the i64 at index workitem-id-x from %in, takes
; llvm.ctlz.i64 (second argument true), and stores the i64 result to the
; same index of %out.
; What the checks below expect
;   - the SI, VI and GFX9-GISEL runs apply v_ffbh_u32 to both registers of
;     the loaded pair, add 32 to the count from the lower-numbered register,
;     and combine the two with v_min_u32; the second stored dword is zero
;   - the EG run uses two FFBH_UINT, an ADD_INT of 32 and a CNDE_INT select
1303define amdgpu_kernel void @v_ctlz_zero_undef_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1304; SI-LABEL: v_ctlz_zero_undef_i64:
1305; SI:       ; %bb.0:
1306; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1307; SI-NEXT:    s_mov_b32 s7, 0xf000
1308; SI-NEXT:    s_mov_b32 s6, 0
1309; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
1310; SI-NEXT:    v_mov_b32_e32 v1, 0
1311; SI-NEXT:    s_waitcnt lgkmcnt(0)
1312; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1313; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
1314; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1315; SI-NEXT:    s_waitcnt vmcnt(0)
1316; SI-NEXT:    v_ffbh_u32_e32 v2, v2
1317; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
1318; SI-NEXT:    v_ffbh_u32_e32 v3, v3
1319; SI-NEXT:    v_min_u32_e32 v2, v2, v3
1320; SI-NEXT:    v_mov_b32_e32 v3, v1
1321; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
1322; SI-NEXT:    s_endpgm
1323;
1324; VI-LABEL: v_ctlz_zero_undef_i64:
1325; VI:       ; %bb.0:
1326; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1327; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
1328; VI-NEXT:    v_mov_b32_e32 v2, 0
1329; VI-NEXT:    s_waitcnt lgkmcnt(0)
1330; VI-NEXT:    v_mov_b32_e32 v1, s3
1331; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
1332; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1333; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
1334; VI-NEXT:    v_mov_b32_e32 v4, s1
1335; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
1336; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
1337; VI-NEXT:    s_waitcnt vmcnt(0)
1338; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1339; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1340; VI-NEXT:    v_ffbh_u32_e32 v1, v1
1341; VI-NEXT:    v_min_u32_e32 v1, v0, v1
1342; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
1343; VI-NEXT:    s_endpgm
1344;
1345; EG-LABEL: v_ctlz_zero_undef_i64:
1346; EG:       ; %bb.0:
1347; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1348; EG-NEXT:    TEX 0 @6
1349; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
1350; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
1351; EG-NEXT:    CF_END
1352; EG-NEXT:    PAD
1353; EG-NEXT:    Fetch clause starting at 6:
1354; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
1355; EG-NEXT:    ALU clause starting at 8:
1356; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1357; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1358; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1359; EG-NEXT:    ALU clause starting at 11:
1360; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
1361; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
1362; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
1363; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1364; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
1365; EG-NEXT:     MOV T0.Y, 0.0,
1366; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
1367; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
1368; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1369;
1370; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64:
1371; GFX9-GISEL:       ; %bb.0:
1372; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1373; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
1374; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1375; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
1376; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1377; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
1378; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
1379; GFX9-GISEL-NEXT:    v_add_u32_e32 v0, 32, v0
1380; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
1381; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1382; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
1383; GFX9-GISEL-NEXT:    s_endpgm
1384  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1385  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
1386  %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
1387  %val = load i64, ptr addrspace(1) %in.gep
1388  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1389  store i64 %ctlz, ptr addrspace(1) %out.gep
1390  ret void
1391}
1392
; Each workitem loads the i64 at index workitem-id-x from %in, takes
; llvm.ctlz.i64 (second argument true), truncates the result to i32, and
; stores it to the i32 element at the same index of %out.
; What the checks below expect
;   - the SI, VI and GFX9-GISEL runs scale the workitem id by 8 for the load
;     and by 4 for the store, apply v_ffbh_u32 to both registers of the
;     loaded pair, add 32 to one count, and combine them with v_min_u32
;   - the EG run uses two FFBH_UINT, an ADD_INT of 32 and a CNDE_INT select
1393define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1394; SI-LABEL: v_ctlz_zero_undef_i64_trunc:
1395; SI:       ; %bb.0:
1396; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1397; SI-NEXT:    s_mov_b32 s7, 0xf000
1398; SI-NEXT:    s_mov_b32 s6, 0
1399; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
1400; SI-NEXT:    v_mov_b32_e32 v2, 0
1401; SI-NEXT:    s_waitcnt lgkmcnt(0)
1402; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
1403; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
1404; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
1405; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
1406; SI-NEXT:    s_waitcnt vmcnt(0)
1407; SI-NEXT:    v_ffbh_u32_e32 v0, v3
1408; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
1409; SI-NEXT:    v_ffbh_u32_e32 v3, v4
1410; SI-NEXT:    v_min_u32_e32 v0, v0, v3
1411; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
1412; SI-NEXT:    s_endpgm
1413;
1414; VI-LABEL: v_ctlz_zero_undef_i64_trunc:
1415; VI:       ; %bb.0:
1416; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1417; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
1418; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1419; VI-NEXT:    s_waitcnt lgkmcnt(0)
1420; VI-NEXT:    v_mov_b32_e32 v2, s3
1421; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
1422; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
1423; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
1424; VI-NEXT:    v_mov_b32_e32 v4, s1
1425; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
1426; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
1427; VI-NEXT:    s_waitcnt vmcnt(0)
1428; VI-NEXT:    v_ffbh_u32_e32 v0, v1
1429; VI-NEXT:    v_add_u32_e32 v0, vcc, 32, v0
1430; VI-NEXT:    v_ffbh_u32_e32 v1, v2
1431; VI-NEXT:    v_min_u32_e32 v0, v0, v1
1432; VI-NEXT:    flat_store_dword v[3:4], v0
1433; VI-NEXT:    s_endpgm
1434;
1435; EG-LABEL: v_ctlz_zero_undef_i64_trunc:
1436; EG:       ; %bb.0:
1437; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1438; EG-NEXT:    TEX 0 @6
1439; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
1440; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1441; EG-NEXT:    CF_END
1442; EG-NEXT:    PAD
1443; EG-NEXT:    Fetch clause starting at 6:
1444; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
1445; EG-NEXT:    ALU clause starting at 8:
1446; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1447; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1448; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
1449; EG-NEXT:    ALU clause starting at 11:
1450; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
1451; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
1452; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
1453; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
1454; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
1455; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
1456; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
1457; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
1458; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1459;
1460; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i64_trunc:
1461; GFX9-GISEL:       ; %bb.0:
1462; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1463; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
1464; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1465; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1466; GFX9-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
1467; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1468; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
1469; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
1470; GFX9-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
1471; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
1472; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1473; GFX9-GISEL-NEXT:    s_endpgm
1474  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1475  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
1476  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
1477  %val = load i64, ptr addrspace(1) %in.gep
1478  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
1479  %trunc = trunc i64 %ctlz to i32
1480  store i32 %trunc, ptr addrspace(1) %out.gep
1481  ret void
1482}
1483
; Per-workitem test: loads an i32 from %valptr[tid] and stores
; select(%val == 0, -1, ctlz(%val, is_zero_undef = true)) to %out.
; The SI/VI expectations below contain only v_ffbh_u32 (no compare or select),
; while the EG and GFX9-GISEL expectations still select against -1.
1484define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1485; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1486; SI:       ; %bb.0:
1487; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1488; SI-NEXT:    s_mov_b32 s7, 0xf000
1489; SI-NEXT:    s_mov_b32 s10, 0
1490; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1491; SI-NEXT:    v_mov_b32_e32 v1, 0
1492; SI-NEXT:    s_mov_b32 s11, s7
1493; SI-NEXT:    s_waitcnt lgkmcnt(0)
1494; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1495; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1496; SI-NEXT:    s_mov_b32 s6, -1
1497; SI-NEXT:    s_mov_b32 s4, s0
1498; SI-NEXT:    s_mov_b32 s5, s1
1499; SI-NEXT:    s_waitcnt vmcnt(0)
1500; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1501; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1502; SI-NEXT:    s_endpgm
1503;
1504; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1505; VI:       ; %bb.0:
1506; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1507; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1508; VI-NEXT:    s_waitcnt lgkmcnt(0)
1509; VI-NEXT:    v_mov_b32_e32 v1, s3
1510; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1511; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1512; VI-NEXT:    flat_load_dword v0, v[0:1]
1513; VI-NEXT:    s_waitcnt vmcnt(0)
1514; VI-NEXT:    v_ffbh_u32_e32 v2, v0
1515; VI-NEXT:    v_mov_b32_e32 v0, s0
1516; VI-NEXT:    v_mov_b32_e32 v1, s1
1517; VI-NEXT:    flat_store_dword v[0:1], v2
1518; VI-NEXT:    s_endpgm
1519;
1520; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1521; EG:       ; %bb.0:
1522; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1523; EG-NEXT:    TEX 0 @6
1524; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
1525; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1526; EG-NEXT:    CF_END
1527; EG-NEXT:    PAD
1528; EG-NEXT:    Fetch clause starting at 6:
1529; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1530; EG-NEXT:    ALU clause starting at 8:
1531; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1532; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1533; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1534; EG-NEXT:    ALU clause starting at 11:
1535; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1536; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1537; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1538; EG-NEXT:    -1(nan), 2(2.802597e-45)
1539;
1540; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1:
1541; GFX9-GISEL:       ; %bb.0:
1542; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1543; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1544; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1545; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1546; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1547; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1548; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1549; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc
1550; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1551; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1552; GFX9-GISEL-NEXT:    s_endpgm
; The input is indexed by the workitem id; the result is stored to %out itself
; (no per-workitem offset on the store).
1553  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1554  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1555  %val = load i32, ptr addrspace(1) %in.gep
1556  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; The -1 arm of the select is taken exactly when %val is zero.
1557  %cmp = icmp eq i32 %val, 0
1558  %sel = select i1 %cmp, i32 -1, i32 %ctlz
1559  store i32 %sel, ptr addrspace(1) %out
1560  ret void
1561}
1562
; Same pattern as the eq/-1 variant but with the compare inverted and the select
; arms swapped: stores select(%val != 0, ctlz(%val, is_zero_undef = true), -1).
; The SI/VI expectations below contain only v_ffbh_u32 (no compare or select),
; while the EG and GFX9-GISEL expectations still select against -1.
1563define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1564; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1565; SI:       ; %bb.0:
1566; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1567; SI-NEXT:    s_mov_b32 s7, 0xf000
1568; SI-NEXT:    s_mov_b32 s10, 0
1569; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1570; SI-NEXT:    v_mov_b32_e32 v1, 0
1571; SI-NEXT:    s_mov_b32 s11, s7
1572; SI-NEXT:    s_waitcnt lgkmcnt(0)
1573; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1574; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1575; SI-NEXT:    s_mov_b32 s6, -1
1576; SI-NEXT:    s_mov_b32 s4, s0
1577; SI-NEXT:    s_mov_b32 s5, s1
1578; SI-NEXT:    s_waitcnt vmcnt(0)
1579; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1580; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1581; SI-NEXT:    s_endpgm
1582;
1583; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1584; VI:       ; %bb.0:
1585; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1586; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1587; VI-NEXT:    s_waitcnt lgkmcnt(0)
1588; VI-NEXT:    v_mov_b32_e32 v1, s3
1589; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1590; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1591; VI-NEXT:    flat_load_dword v0, v[0:1]
1592; VI-NEXT:    s_waitcnt vmcnt(0)
1593; VI-NEXT:    v_ffbh_u32_e32 v2, v0
1594; VI-NEXT:    v_mov_b32_e32 v0, s0
1595; VI-NEXT:    v_mov_b32_e32 v1, s1
1596; VI-NEXT:    flat_store_dword v[0:1], v2
1597; VI-NEXT:    s_endpgm
1598;
1599; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1600; EG:       ; %bb.0:
1601; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1602; EG-NEXT:    TEX 0 @6
1603; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
1604; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1605; EG-NEXT:    CF_END
1606; EG-NEXT:    PAD
1607; EG-NEXT:    Fetch clause starting at 6:
1608; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1609; EG-NEXT:    ALU clause starting at 8:
1610; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1611; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1612; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1613; EG-NEXT:    ALU clause starting at 11:
1614; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1615; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1616; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1617; EG-NEXT:    -1(nan), 2(2.802597e-45)
1618;
1619; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_neg1:
1620; GFX9-GISEL:       ; %bb.0:
1621; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1622; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1623; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1624; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1625; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1626; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1627; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1628; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc
1629; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1630; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1631; GFX9-GISEL-NEXT:    s_endpgm
; The input is indexed by the workitem id; the result is stored to %out itself.
1632  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1633  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1634  %val = load i32, ptr addrspace(1) %in.gep
1635  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; Inverted form: ctlz is the true arm, -1 is the false arm (taken when %val == 0).
1636  %cmp = icmp ne i32 %val, 0
1637  %sel = select i1 %cmp, i32 %ctlz, i32 -1
1638  store i32 %sel, ptr addrspace(1) %out
1639  ret void
1640}
1641
; i8 variant of the eq/-1 pattern: loads a byte from %valptr[tid] and stores
; select(%val == 0, -1, ctlz.i8(%val, is_zero_undef = true)) as a byte to %out.
; NOTE(review): the SI/VI/EG expectations apply the 32-bit find-first-bit
; instruction directly to the unsigned byte load with no shift, whereas the
; GFX9-GISEL expectation shifts the value left by 24 first. Confirm the
; unshifted form yields the i8 count for non-zero inputs.
1642define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1643; SI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1644; SI:       ; %bb.0:
1645; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1646; SI-NEXT:    s_mov_b32 s7, 0xf000
1647; SI-NEXT:    v_mov_b32_e32 v1, 0
1648; SI-NEXT:    s_mov_b32 s10, 0
1649; SI-NEXT:    s_mov_b32 s11, s7
1650; SI-NEXT:    s_waitcnt lgkmcnt(0)
1651; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1652; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1653; SI-NEXT:    s_mov_b32 s6, -1
1654; SI-NEXT:    s_mov_b32 s4, s0
1655; SI-NEXT:    s_mov_b32 s5, s1
1656; SI-NEXT:    s_waitcnt vmcnt(0)
1657; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1658; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1659; SI-NEXT:    s_endpgm
1660;
1661; VI-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1662; VI:       ; %bb.0:
1663; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1664; VI-NEXT:    s_waitcnt lgkmcnt(0)
1665; VI-NEXT:    v_mov_b32_e32 v1, s3
1666; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1667; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1668; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1669; VI-NEXT:    s_waitcnt vmcnt(0)
1670; VI-NEXT:    v_ffbh_u32_e32 v2, v0
1671; VI-NEXT:    v_mov_b32_e32 v0, s0
1672; VI-NEXT:    v_mov_b32_e32 v1, s1
1673; VI-NEXT:    flat_store_byte v[0:1], v2
1674; VI-NEXT:    s_endpgm
1675;
1676; EG-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1677; EG:       ; %bb.0:
1678; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1679; EG-NEXT:    TEX 0 @6
1680; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1681; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1682; EG-NEXT:    CF_END
1683; EG-NEXT:    PAD
1684; EG-NEXT:    Fetch clause starting at 6:
1685; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1686; EG-NEXT:    ALU clause starting at 8:
1687; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1688; EG-NEXT:    ALU clause starting at 9:
1689; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1690; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1691; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1692; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1693; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1694; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1695; EG-NEXT:     LSHL T0.X, PV.W, PS,
1696; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1697; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1698; EG-NEXT:     MOV T0.Y, 0.0,
1699; EG-NEXT:     MOV * T0.Z, 0.0,
1700; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1701; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1702;
1703; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i8_sel_eq_neg1:
1704; GFX9-GISEL:       ; %bb.0:
1705; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1706; GFX9-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1707; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1708; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1709; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1710; GFX9-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, v1, v0
1711; GFX9-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, v2, v3, vcc
1712; GFX9-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1713; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1714; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff
1715; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1716; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 24, v0
1717; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
1718; GFX9-GISEL-NEXT:    v_cmp_eq_u32_sdwa vcc, v0, v1 src0_sel:BYTE_0 src1_sel:DWORD
1719; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
1720; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1721; GFX9-GISEL-NEXT:    s_endpgm
; Byte-sized element type, so the GEP index is the raw workitem id (no scaling).
1722  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1723  %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1724  %val = load i8, ptr addrspace(1) %valptr.gep
1725  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
; The -1 arm of the select is taken exactly when the loaded byte is zero.
1726  %cmp = icmp eq i8 %val, 0
1727  %sel = select i1 %cmp, i8 -1, i8 %ctlz
1728  store i8 %sel, ptr addrspace(1) %out
1729  ret void
1730}
1731
; eq/-1 pattern where the compare result has a second use: besides feeding the
; select, %cmp is itself stored (volatile) to an undef pointer.
; In the SI/VI expectations the dword store takes the v_ffbh_u32 result directly,
; and the compare survives only to produce the 0/1 byte for the second store.
; The GFX9-GISEL expectation keeps both a select against -1 and the 0/1 select.
1732define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1733; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1734; SI:       ; %bb.0:
1735; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1736; SI-NEXT:    s_mov_b32 s7, 0xf000
1737; SI-NEXT:    s_mov_b32 s10, 0
1738; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1739; SI-NEXT:    v_mov_b32_e32 v1, 0
1740; SI-NEXT:    s_mov_b32 s11, s7
1741; SI-NEXT:    s_waitcnt lgkmcnt(0)
1742; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1743; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1744; SI-NEXT:    s_mov_b32 s6, -1
1745; SI-NEXT:    s_mov_b32 s4, s0
1746; SI-NEXT:    s_mov_b32 s5, s1
1747; SI-NEXT:    s_waitcnt vmcnt(0)
1748; SI-NEXT:    v_ffbh_u32_e32 v1, v0
1749; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1750; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
1751; SI-NEXT:    buffer_store_dword v1, off, s[4:7], 0
1752; SI-NEXT:    s_waitcnt vmcnt(0)
1753; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1754; SI-NEXT:    s_waitcnt vmcnt(0)
1755; SI-NEXT:    s_endpgm
1756;
1757; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1758; VI:       ; %bb.0:
1759; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1760; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1761; VI-NEXT:    s_waitcnt lgkmcnt(0)
1762; VI-NEXT:    v_mov_b32_e32 v1, s3
1763; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1764; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1765; VI-NEXT:    flat_load_dword v2, v[0:1]
1766; VI-NEXT:    v_mov_b32_e32 v0, s0
1767; VI-NEXT:    v_mov_b32_e32 v1, s1
1768; VI-NEXT:    s_waitcnt vmcnt(0)
1769; VI-NEXT:    v_ffbh_u32_e32 v3, v2
1770; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
1771; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1772; VI-NEXT:    flat_store_dword v[0:1], v3
1773; VI-NEXT:    s_waitcnt vmcnt(0)
1774; VI-NEXT:    flat_store_byte v[0:1], v2
1775; VI-NEXT:    s_waitcnt vmcnt(0)
1776; VI-NEXT:    s_endpgm
1777;
1778; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1779; EG:       ; %bb.0:
1780; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1781; EG-NEXT:    TEX 0 @6
1782; EG-NEXT:    ALU 11, @11, KC0[CB0:0-32], KC1[]
1783; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T3.X, 0
1784; EG-NEXT:    MEM_RAT MSKOR T1.XW, T2.X
1785; EG-NEXT:    CF_END
1786; EG-NEXT:    Fetch clause starting at 6:
1787; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1788; EG-NEXT:    ALU clause starting at 8:
1789; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1790; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1791; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1792; EG-NEXT:    ALU clause starting at 11:
1793; EG-NEXT:     SETE_INT * T0.W, T0.X, 0.0,
1794; EG-NEXT:     AND_INT T1.X, PV.W, 1,
1795; EG-NEXT:     MOV * T1.W, literal.x,
1796; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1797; EG-NEXT:     MOV T1.Y, 0.0,
1798; EG-NEXT:     MOV * T1.Z, 0.0,
1799; EG-NEXT:     MOV T2.X, literal.x,
1800; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1801; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
1802; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1803; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.y,
1804; EG-NEXT:    -1(nan), 2(2.802597e-45)
1805;
1806; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_neg1_two_use:
1807; GFX9-GISEL:       ; %bb.0:
1808; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1809; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1810; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1811; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1812; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1813; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1814; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v0
1815; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1816; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v2, -1, vcc
1817; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
1818; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1819; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1820; GFX9-GISEL-NEXT:    global_store_byte v[0:1], v2, off
1821; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1822; GFX9-GISEL-NEXT:    s_endpgm
; The input is indexed by the workitem id; the result is stored to %out itself.
1823  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1824  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1825  %val = load i32, ptr addrspace(1) %in.gep
1826  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
1827  %cmp = icmp eq i32 %val, 0
1828  %sel = select i1 %cmp, i32 -1, i32 %ctlz
; Both stores are volatile; the second one gives %cmp its extra use.
1829  store volatile i32 %sel, ptr addrspace(1) %out
1830  store volatile i1 %cmp, ptr addrspace(1) undef
1831  ret void
1832}
1833
1834; Selected on wrong constant
; Variant of the eq pattern whose zero-input arm is 0 instead of -1:
; stores select(%val == 0, 0, ctlz(%val, is_zero_undef = true)).
; Every expectation below keeps an explicit compare/conditional-select after the
; find-first-bit instruction (v_cmp + v_cndmask, or CNDE_INT for EG).
1835define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1836; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1837; SI:       ; %bb.0:
1838; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1839; SI-NEXT:    s_mov_b32 s7, 0xf000
1840; SI-NEXT:    s_mov_b32 s10, 0
1841; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1842; SI-NEXT:    v_mov_b32_e32 v1, 0
1843; SI-NEXT:    s_mov_b32 s11, s7
1844; SI-NEXT:    s_waitcnt lgkmcnt(0)
1845; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1846; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1847; SI-NEXT:    s_mov_b32 s6, -1
1848; SI-NEXT:    s_mov_b32 s4, s0
1849; SI-NEXT:    s_mov_b32 s5, s1
1850; SI-NEXT:    s_waitcnt vmcnt(0)
1851; SI-NEXT:    v_ffbh_u32_e32 v1, v0
1852; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1853; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
1854; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1855; SI-NEXT:    s_endpgm
1856;
1857; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1858; VI:       ; %bb.0:
1859; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1860; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1861; VI-NEXT:    s_waitcnt lgkmcnt(0)
1862; VI-NEXT:    v_mov_b32_e32 v1, s3
1863; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1864; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1865; VI-NEXT:    flat_load_dword v0, v[0:1]
1866; VI-NEXT:    s_waitcnt vmcnt(0)
1867; VI-NEXT:    v_ffbh_u32_e32 v1, v0
1868; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1869; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
1870; VI-NEXT:    v_mov_b32_e32 v0, s0
1871; VI-NEXT:    v_mov_b32_e32 v1, s1
1872; VI-NEXT:    flat_store_dword v[0:1], v2
1873; VI-NEXT:    s_endpgm
1874;
1875; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1876; EG:       ; %bb.0:
1877; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1878; EG-NEXT:    TEX 0 @6
1879; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
1880; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1881; EG-NEXT:    CF_END
1882; EG-NEXT:    PAD
1883; EG-NEXT:    Fetch clause starting at 6:
1884; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1885; EG-NEXT:    ALU clause starting at 8:
1886; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1887; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1888; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1889; EG-NEXT:    ALU clause starting at 11:
1890; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1891; EG-NEXT:     CNDE_INT T0.X, T0.X, 0.0, PV.W,
1892; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1893; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1894;
1895; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_0:
1896; GFX9-GISEL:       ; %bb.0:
1897; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1898; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1899; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1900; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1901; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1902; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1903; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
1904; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
1905; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1906; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1907; GFX9-GISEL-NEXT:    s_endpgm
; The input is indexed by the workitem id; the result is stored to %out itself.
1908  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1909  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1910  %val = load i32, ptr addrspace(1) %in.gep
1911  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; The constant selected for a zero input is 0 here, not -1.
1912  %cmp = icmp eq i32 %val, 0
1913  %sel = select i1 %cmp, i32 0, i32 %ctlz
1914  store i32 %sel, ptr addrspace(1) %out
1915  ret void
1916}
1917
1918; Selected on wrong constant
; Inverted-compare form of the "select 0" variant:
; stores select(%val != 0, ctlz(%val, is_zero_undef = true), 0).
; Every expectation below keeps an explicit compare/conditional-select after the
; find-first-bit instruction (v_cmp + v_cndmask, or CNDE_INT for EG).
1919define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1920; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1921; SI:       ; %bb.0:
1922; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1923; SI-NEXT:    s_mov_b32 s7, 0xf000
1924; SI-NEXT:    s_mov_b32 s10, 0
1925; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1926; SI-NEXT:    v_mov_b32_e32 v1, 0
1927; SI-NEXT:    s_mov_b32 s11, s7
1928; SI-NEXT:    s_waitcnt lgkmcnt(0)
1929; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1930; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1931; SI-NEXT:    s_mov_b32 s6, -1
1932; SI-NEXT:    s_mov_b32 s4, s0
1933; SI-NEXT:    s_mov_b32 s5, s1
1934; SI-NEXT:    s_waitcnt vmcnt(0)
1935; SI-NEXT:    v_ffbh_u32_e32 v1, v0
1936; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1937; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
1938; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1939; SI-NEXT:    s_endpgm
1940;
1941; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1942; VI:       ; %bb.0:
1943; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1944; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1945; VI-NEXT:    s_waitcnt lgkmcnt(0)
1946; VI-NEXT:    v_mov_b32_e32 v1, s3
1947; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1948; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1949; VI-NEXT:    flat_load_dword v0, v[0:1]
1950; VI-NEXT:    s_waitcnt vmcnt(0)
1951; VI-NEXT:    v_ffbh_u32_e32 v1, v0
1952; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1953; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
1954; VI-NEXT:    v_mov_b32_e32 v0, s0
1955; VI-NEXT:    v_mov_b32_e32 v1, s1
1956; VI-NEXT:    flat_store_dword v[0:1], v2
1957; VI-NEXT:    s_endpgm
1958;
1959; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1960; EG:       ; %bb.0:
1961; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1962; EG-NEXT:    TEX 0 @6
1963; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
1964; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1965; EG-NEXT:    CF_END
1966; EG-NEXT:    PAD
1967; EG-NEXT:    Fetch clause starting at 6:
1968; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1969; EG-NEXT:    ALU clause starting at 8:
1970; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1971; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1972; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1973; EG-NEXT:    ALU clause starting at 11:
1974; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1975; EG-NEXT:     CNDE_INT T0.X, T0.X, 0.0, PV.W,
1976; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1977; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1978;
1979; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_0:
1980; GFX9-GISEL:       ; %bb.0:
1981; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1982; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1983; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1984; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1985; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1986; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1987; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1988; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
1989; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1990; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1991; GFX9-GISEL-NEXT:    s_endpgm
; The input is indexed by the workitem id; the result is stored to %out itself.
1992  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1993  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1994  %val = load i32, ptr addrspace(1) %in.gep
1995  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; ctlz is the true arm; the false arm (taken when %val == 0) is 0, not -1.
1996  %cmp = icmp ne i32 %val, 0
1997  %sel = select i1 %cmp, i32 %ctlz, i32 0
1998  store i32 %sel, ptr addrspace(1) %out
1999  ret void
2000}
2001
2002; Compare on wrong constant
; Variant where the compare is against 1 rather than 0:
; stores select(%val == 1, 0, ctlz(%val, is_zero_undef = true)).
; Every expectation below keeps an explicit compare against 1 and a conditional
; select after the find-first-bit instruction.
2003define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
2004; SI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2005; SI:       ; %bb.0:
2006; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2007; SI-NEXT:    s_mov_b32 s7, 0xf000
2008; SI-NEXT:    s_mov_b32 s10, 0
2009; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2010; SI-NEXT:    v_mov_b32_e32 v1, 0
2011; SI-NEXT:    s_mov_b32 s11, s7
2012; SI-NEXT:    s_waitcnt lgkmcnt(0)
2013; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2014; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2015; SI-NEXT:    s_mov_b32 s6, -1
2016; SI-NEXT:    s_mov_b32 s4, s0
2017; SI-NEXT:    s_mov_b32 s5, s1
2018; SI-NEXT:    s_waitcnt vmcnt(0)
2019; SI-NEXT:    v_ffbh_u32_e32 v1, v0
2020; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
2021; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
2022; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2023; SI-NEXT:    s_endpgm
2024;
2025; VI-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2026; VI:       ; %bb.0:
2027; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2028; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2029; VI-NEXT:    s_waitcnt lgkmcnt(0)
2030; VI-NEXT:    v_mov_b32_e32 v1, s3
2031; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2032; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2033; VI-NEXT:    flat_load_dword v0, v[0:1]
2034; VI-NEXT:    s_waitcnt vmcnt(0)
2035; VI-NEXT:    v_ffbh_u32_e32 v1, v0
2036; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
2037; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
2038; VI-NEXT:    v_mov_b32_e32 v0, s0
2039; VI-NEXT:    v_mov_b32_e32 v1, s1
2040; VI-NEXT:    flat_store_dword v[0:1], v2
2041; VI-NEXT:    s_endpgm
2042;
2043; EG-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2044; EG:       ; %bb.0:
2045; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
2046; EG-NEXT:    TEX 0 @6
2047; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
2048; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2049; EG-NEXT:    CF_END
2050; EG-NEXT:    PAD
2051; EG-NEXT:    Fetch clause starting at 6:
2052; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
2053; EG-NEXT:    ALU clause starting at 8:
2054; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
2055; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2056; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
2057; EG-NEXT:    ALU clause starting at 11:
2058; EG-NEXT:     FFBH_UINT T0.W, T0.X,
2059; EG-NEXT:     SETE_INT * T1.W, T0.X, 1,
2060; EG-NEXT:     CNDE_INT T0.X, PS, PV.W, 0.0,
2061; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2062; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2063;
2064; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_eq_cmp_non0:
2065; GFX9-GISEL:       ; %bb.0:
2066; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2067; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2068; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2069; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
2070; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
2071; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
2072; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
2073; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0, vcc
2074; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2075; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
2076; GFX9-GISEL-NEXT:    s_endpgm
; The input is indexed by the workitem id; the result is stored to %out itself.
2077  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2078  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
2079  %val = load i32, ptr addrspace(1) %in.gep
2080  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; The compare constant is 1, so the select does not guard the zero input.
2081  %cmp = icmp eq i32 %val, 1
2082  %sel = select i1 %cmp, i32 0, i32 %ctlz
2083  store i32 %sel, ptr addrspace(1) %out
2084  ret void
2085}
2086
2087; Compare on wrong constant
; Inverted-compare form of the "compare against 1" variant:
; stores select(%val != 1, ctlz(%val, is_zero_undef = true), 0).
; Every expectation below keeps an explicit compare against 1 and a conditional
; select after the find-first-bit instruction.
2088define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
2089; SI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2090; SI:       ; %bb.0:
2091; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
2092; SI-NEXT:    s_mov_b32 s7, 0xf000
2093; SI-NEXT:    s_mov_b32 s10, 0
2094; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2095; SI-NEXT:    v_mov_b32_e32 v1, 0
2096; SI-NEXT:    s_mov_b32 s11, s7
2097; SI-NEXT:    s_waitcnt lgkmcnt(0)
2098; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
2099; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
2100; SI-NEXT:    s_mov_b32 s6, -1
2101; SI-NEXT:    s_mov_b32 s4, s0
2102; SI-NEXT:    s_mov_b32 s5, s1
2103; SI-NEXT:    s_waitcnt vmcnt(0)
2104; SI-NEXT:    v_ffbh_u32_e32 v1, v0
2105; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
2106; SI-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
2107; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
2108; SI-NEXT:    s_endpgm
2109;
2110; VI-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2111; VI:       ; %bb.0:
2112; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2113; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2114; VI-NEXT:    s_waitcnt lgkmcnt(0)
2115; VI-NEXT:    v_mov_b32_e32 v1, s3
2116; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
2117; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
2118; VI-NEXT:    flat_load_dword v0, v[0:1]
2119; VI-NEXT:    s_waitcnt vmcnt(0)
2120; VI-NEXT:    v_ffbh_u32_e32 v1, v0
2121; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
2122; VI-NEXT:    v_cndmask_b32_e32 v2, 0, v1, vcc
2123; VI-NEXT:    v_mov_b32_e32 v0, s0
2124; VI-NEXT:    v_mov_b32_e32 v1, s1
2125; VI-NEXT:    flat_store_dword v[0:1], v2
2126; VI-NEXT:    s_endpgm
2127;
2128; EG-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2129; EG:       ; %bb.0:
2130; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
2131; EG-NEXT:    TEX 0 @6
2132; EG-NEXT:    ALU 4, @11, KC0[CB0:0-32], KC1[]
2133; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
2134; EG-NEXT:    CF_END
2135; EG-NEXT:    PAD
2136; EG-NEXT:    Fetch clause starting at 6:
2137; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
2138; EG-NEXT:    ALU clause starting at 8:
2139; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
2140; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2141; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
2142; EG-NEXT:    ALU clause starting at 11:
2143; EG-NEXT:     FFBH_UINT T0.W, T0.X,
2144; EG-NEXT:     SETNE_INT * T1.W, T0.X, 1,
2145; EG-NEXT:     CNDE_INT T0.X, PS, 0.0, PV.W,
2146; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
2147; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2148;
2149; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i32_sel_ne_cmp_non0:
2150; GFX9-GISEL:       ; %bb.0:
2151; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
2152; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
2153; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2154; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
2155; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
2156; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
2157; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
2158; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
2159; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
2160; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
2161; GFX9-GISEL-NEXT:    s_endpgm
; The input is indexed by the workitem id; the result is stored to %out itself.
2162  %tid = call i32 @llvm.amdgcn.workitem.id.x()
2163  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
2164  %val = load i32, ptr addrspace(1) %in.gep
2165  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
; The compare constant is 1, so the select does not guard the zero input.
2166  %cmp = icmp ne i32 %val, 1
2167  %sel = select i1 %cmp, i32 %ctlz, i32 0
2168  store i32 %sel, ptr addrspace(1) %out
2169  ret void
2170}
2171
; Non-kernel function returning ctlz of an i7 argument with is_zero_undef = true.
; The SI, VI and GFX9-GISEL expectations shift the value left by 25 and then
; apply v_ffbh_u32; the EG expectation contains only CF_END and PAD.
2172define i7 @v_ctlz_zero_undef_i7(i7 %val) {
2173; SI-LABEL: v_ctlz_zero_undef_i7:
2174; SI:       ; %bb.0:
2175; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2176; SI-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
2177; SI-NEXT:    v_ffbh_u32_e32 v0, v0
2178; SI-NEXT:    s_setpc_b64 s[30:31]
2179;
2180; VI-LABEL: v_ctlz_zero_undef_i7:
2181; VI:       ; %bb.0:
2182; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2183; VI-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
2184; VI-NEXT:    v_ffbh_u32_e32 v0, v0
2185; VI-NEXT:    s_setpc_b64 s[30:31]
2186;
2187; EG-LABEL: v_ctlz_zero_undef_i7:
2188; EG:       ; %bb.0:
2189; EG-NEXT:    CF_END
2190; EG-NEXT:    PAD
2191;
2192; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i7:
2193; GFX9-GISEL:       ; %bb.0:
2194; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2195; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
2196; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
2197; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
; No select here: the intrinsic result is returned directly.
2198  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 true)
2199  ret i7 %ctlz
2200}
2201
; Kernel variant: counts the leading zeros of an i18 kernel argument with the
; intrinsic's second operand set to true (a zero input need not give a defined
; result), then stores the i18 result to %out.
; The SI, VI and GFX9-GISEL check lines expect a left shift by 14 (32 - 18)
; ahead of s_flbit_i32_b32, and the store split into a 16-bit piece plus a
; byte at offset 2. The EG check lines expect LSHL by 14 ahead of FFBH_UINT
; and two MEM_RAT MSKOR writes.
; The check lines are autogenerated (see the note at the top of the file);
; regenerate them with the update script instead of editing by hand.
2202define amdgpu_kernel void @s_ctlz_zero_undef_i18(ptr addrspace(1) noalias %out, i18 %val) nounwind {
2203; SI-LABEL: s_ctlz_zero_undef_i18:
2204; SI:       ; %bb.0:
2205; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
2206; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
2207; SI-NEXT:    s_mov_b32 s3, 0xf000
2208; SI-NEXT:    s_waitcnt lgkmcnt(0)
2209; SI-NEXT:    s_lshl_b32 s2, s2, 14
2210; SI-NEXT:    s_flbit_i32_b32 s4, s2
2211; SI-NEXT:    s_mov_b32 s2, -1
2212; SI-NEXT:    v_mov_b32_e32 v0, s4
2213; SI-NEXT:    s_bfe_u32 s4, s4, 0x20010
2214; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
2215; SI-NEXT:    s_waitcnt expcnt(0)
2216; SI-NEXT:    v_mov_b32_e32 v0, s4
2217; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0 offset:2
2218; SI-NEXT:    s_endpgm
2219;
2220; VI-LABEL: s_ctlz_zero_undef_i18:
2221; VI:       ; %bb.0:
2222; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
2223; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2224; VI-NEXT:    s_waitcnt lgkmcnt(0)
2225; VI-NEXT:    s_lshl_b32 s2, s2, 14
2226; VI-NEXT:    v_mov_b32_e32 v0, s0
2227; VI-NEXT:    s_flbit_i32_b32 s2, s2
2228; VI-NEXT:    v_mov_b32_e32 v1, s1
2229; VI-NEXT:    s_add_u32 s0, s0, 2
2230; VI-NEXT:    v_mov_b32_e32 v2, s2
2231; VI-NEXT:    s_addc_u32 s1, s1, 0
2232; VI-NEXT:    flat_store_short v[0:1], v2
2233; VI-NEXT:    s_bfe_u32 s2, s2, 0x20010
2234; VI-NEXT:    v_mov_b32_e32 v0, s0
2235; VI-NEXT:    v_mov_b32_e32 v1, s1
2236; VI-NEXT:    v_mov_b32_e32 v2, s2
2237; VI-NEXT:    flat_store_byte v[0:1], v2
2238; VI-NEXT:    s_endpgm
2239;
2240; EG-LABEL: s_ctlz_zero_undef_i18:
2241; EG:       ; %bb.0:
2242; EG-NEXT:    ALU 28, @4, KC0[CB0:0-32], KC1[]
2243; EG-NEXT:    MEM_RAT MSKOR T1.XW, T3.X
2244; EG-NEXT:    MEM_RAT MSKOR T0.XW, T2.X
2245; EG-NEXT:    CF_END
2246; EG-NEXT:    ALU clause starting at 4:
2247; EG-NEXT:     LSHL * T0.W, KC0[2].Z, literal.x,
2248; EG-NEXT:    14(1.961818e-44), 0(0.000000e+00)
2249; EG-NEXT:     FFBH_UINT T0.W, PV.W,
2250; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
2251; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
2252; EG-NEXT:     AND_INT T2.W, PV.W, literal.x,
2253; EG-NEXT:     LSHL * T1.W, PS, literal.y,
2254; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
2255; EG-NEXT:     LSHL T1.X, PV.W, PS,
2256; EG-NEXT:     LSHL * T1.W, literal.x, PS,
2257; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
2258; EG-NEXT:     MOV T1.Y, 0.0,
2259; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.x,
2260; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2261; EG-NEXT:     AND_INT T3.W, PV.W, literal.x,
2262; EG-NEXT:     MOV * T4.W, literal.y,
2263; EG-NEXT:    3(4.203895e-45), 2(2.802597e-45)
2264; EG-NEXT:     BFE_UINT T0.W, T0.W, literal.x, PS,
2265; EG-NEXT:     LSHL * T3.W, PV.W, literal.y,
2266; EG-NEXT:    16(2.242078e-44), 3(4.203895e-45)
2267; EG-NEXT:     LSHL T0.X, PV.W, PS,
2268; EG-NEXT:     LSHL * T0.W, literal.x, PS,
2269; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
2270; EG-NEXT:     MOV T0.Y, 0.0,
2271; EG-NEXT:     MOV T1.Z, 0.0,
2272; EG-NEXT:     MOV * T0.Z, 0.0,
2273; EG-NEXT:     LSHR T2.X, T2.W, literal.x,
2274; EG-NEXT:     LSHR * T3.X, KC0[2].Y, literal.x,
2275; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
2276;
2277; GFX9-GISEL-LABEL: s_ctlz_zero_undef_i18:
2278; GFX9-GISEL:       ; %bb.0:
2279; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
2280; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
2281; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
2282; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
2283; GFX9-GISEL-NEXT:    s_lshl_b32 s2, s2, 14
2284; GFX9-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
2285; GFX9-GISEL-NEXT:    s_and_b32 s2, s2, 0x3ffff
2286; GFX9-GISEL-NEXT:    s_lshr_b32 s3, s2, 16
2287; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s2
2288; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
2289; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s3
2290; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1] offset:2
2291; GFX9-GISEL-NEXT:    s_endpgm
2292  %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true) nounwind readnone
2293  store i18 %ctlz, ptr addrspace(1) %out, align 4
2294  ret void
2295}
2296
; Non-kernel variant: takes an i18 argument and returns its leading-zero count,
; with the intrinsic's second operand set to true.
; The SI, VI and GFX9-GISEL check lines all expect the same sequence: shift
; left by 14 (32 - 18), then v_ffbh_u32, with no further fix-up of the result.
; The EG check lines for this function contain only CF_END and PAD.
; The check lines are autogenerated; regenerate them instead of hand-editing.
2297define i18 @v_ctlz_zero_undef_i18(i18 %val) {
2298; SI-LABEL: v_ctlz_zero_undef_i18:
2299; SI:       ; %bb.0:
2300; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2301; SI-NEXT:    v_lshlrev_b32_e32 v0, 14, v0
2302; SI-NEXT:    v_ffbh_u32_e32 v0, v0
2303; SI-NEXT:    s_setpc_b64 s[30:31]
2304;
2305; VI-LABEL: v_ctlz_zero_undef_i18:
2306; VI:       ; %bb.0:
2307; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2308; VI-NEXT:    v_lshlrev_b32_e32 v0, 14, v0
2309; VI-NEXT:    v_ffbh_u32_e32 v0, v0
2310; VI-NEXT:    s_setpc_b64 s[30:31]
2311;
2312; EG-LABEL: v_ctlz_zero_undef_i18:
2313; EG:       ; %bb.0:
2314; EG-NEXT:    CF_END
2315; EG-NEXT:    PAD
2316;
2317; GFX9-GISEL-LABEL: v_ctlz_zero_undef_i18:
2318; GFX9-GISEL:       ; %bb.0:
2319; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2320; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 14, v0
2321; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
2322; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
2323  %ctlz = call i18 @llvm.ctlz.i18(i18 %val, i1 true)
2324  ret i18 %ctlz
2325}
2326
; Two-element i18 vector variant: returns the per-element leading-zero count,
; with the intrinsic's second operand set to true.
; The SI, VI and GFX9-GISEL check lines expect each element (v0 and v1) to be
; shifted left by 14 (32 - 18) and passed through v_ffbh_u32 independently.
; The EG check lines for this function contain only CF_END and PAD.
; NOTE(review): no `declare` for @llvm.ctlz.v2i18 is visible among the
; declarations at the top of the file - presumably the IR parser declares the
; intrinsic implicitly; confirm.
; The check lines are autogenerated; regenerate them instead of hand-editing.
2327define <2 x i18> @v_ctlz_zero_undef_v2i18(<2 x i18> %val) {
2328; SI-LABEL: v_ctlz_zero_undef_v2i18:
2329; SI:       ; %bb.0:
2330; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2331; SI-NEXT:    v_lshlrev_b32_e32 v0, 14, v0
2332; SI-NEXT:    v_lshlrev_b32_e32 v1, 14, v1
2333; SI-NEXT:    v_ffbh_u32_e32 v0, v0
2334; SI-NEXT:    v_ffbh_u32_e32 v1, v1
2335; SI-NEXT:    s_setpc_b64 s[30:31]
2336;
2337; VI-LABEL: v_ctlz_zero_undef_v2i18:
2338; VI:       ; %bb.0:
2339; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2340; VI-NEXT:    v_lshlrev_b32_e32 v0, 14, v0
2341; VI-NEXT:    v_lshlrev_b32_e32 v1, 14, v1
2342; VI-NEXT:    v_ffbh_u32_e32 v0, v0
2343; VI-NEXT:    v_ffbh_u32_e32 v1, v1
2344; VI-NEXT:    s_setpc_b64 s[30:31]
2345;
2346; EG-LABEL: v_ctlz_zero_undef_v2i18:
2347; EG:       ; %bb.0:
2348; EG-NEXT:    CF_END
2349; EG-NEXT:    PAD
2350;
2351; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i18:
2352; GFX9-GISEL:       ; %bb.0:
2353; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2354; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 14, v0
2355; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 14, v1
2356; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
2357; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
2358; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
2359  %ctlz = call <2 x i18> @llvm.ctlz.v2i18(<2 x i18> %val, i1 true)
2360  ret <2 x i18> %ctlz
2361}
2362
; Two-element i16 vector variant: returns the per-element leading-zero count,
; with the intrinsic's second operand set to true.
; In the SI check lines the elements are read from v0 and v1, each shifted left
; by 16 ahead of v_ffbh_u32, and the second count is shifted up and OR-ed into
; v0. In the VI and GFX9-GISEL check lines both elements are read from v0 (the
; high half isolated via `and 0xffff0000` or a right shift by 16) and the two
; counts are recombined into v0 (v_or_b32_sdwa / v_lshl_or_b32).
; The EG check lines for this function contain only CF_END and PAD.
; The check lines are autogenerated; regenerate them instead of hand-editing.
2363define <2 x i16> @v_ctlz_zero_undef_v2i16(<2 x i16> %val) {
2364; SI-LABEL: v_ctlz_zero_undef_v2i16:
2365; SI:       ; %bb.0:
2366; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2367; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2368; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2369; SI-NEXT:    v_ffbh_u32_e32 v1, v1
2370; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
2371; SI-NEXT:    v_ffbh_u32_e32 v0, v0
2372; SI-NEXT:    v_or_b32_e32 v0, v0, v2
2373; SI-NEXT:    s_setpc_b64 s[30:31]
2374;
2375; VI-LABEL: v_ctlz_zero_undef_v2i16:
2376; VI:       ; %bb.0:
2377; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2378; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v0
2379; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2380; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2381; VI-NEXT:    v_ffbh_u32_e32 v0, v0
2382; VI-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2383; VI-NEXT:    s_setpc_b64 s[30:31]
2384;
2385; EG-LABEL: v_ctlz_zero_undef_v2i16:
2386; EG:       ; %bb.0:
2387; EG-NEXT:    CF_END
2388; EG-NEXT:    PAD
2389;
2390; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i16:
2391; GFX9-GISEL:       ; %bb.0:
2392; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2393; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
2394; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2395; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
2396; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2397; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
2398; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2399; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
2400; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
2401  %ctlz = call <2 x i16> @llvm.ctlz.v2i16(<2 x i16> %val, i1 true)
2402  ret <2 x i16> %ctlz
2403}
2404
; Three-element i16 vector variant: returns the per-element leading-zero count,
; with the intrinsic's second operand set to true.
; In the SI check lines the elements are read from v0, v1 and v2, each shifted
; left by 16 ahead of v_ffbh_u32. In the VI and GFX9-GISEL check lines the first
; two elements are read from v0 and the third from v1; the first two counts are
; recombined into v0 and the third count stays in v1.
; The EG check lines for this function contain only CF_END and PAD.
; The check lines are autogenerated; regenerate them instead of hand-editing.
2405define <3 x i16> @v_ctlz_zero_undef_v3i16(<3 x i16> %val) {
2406; SI-LABEL: v_ctlz_zero_undef_v3i16:
2407; SI:       ; %bb.0:
2408; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2409; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2410; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2411; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2412; SI-NEXT:    v_ffbh_u32_e32 v1, v1
2413; SI-NEXT:    v_ffbh_u32_e32 v0, v0
2414; SI-NEXT:    v_ffbh_u32_e32 v3, v2
2415; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2416; SI-NEXT:    v_or_b32_e32 v0, v0, v1
2417; SI-NEXT:    v_or_b32_e32 v2, 0x200000, v3
2418; SI-NEXT:    v_alignbit_b32 v1, v3, v0, 16
2419; SI-NEXT:    s_setpc_b64 s[30:31]
2420;
2421; VI-LABEL: v_ctlz_zero_undef_v3i16:
2422; VI:       ; %bb.0:
2423; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2424; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
2425; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2426; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2427; VI-NEXT:    v_ffbh_u32_e32 v2, v2
2428; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2429; VI-NEXT:    v_ffbh_u32_e32 v1, v1
2430; VI-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2431; VI-NEXT:    s_setpc_b64 s[30:31]
2432;
2433; EG-LABEL: v_ctlz_zero_undef_v3i16:
2434; EG:       ; %bb.0:
2435; EG-NEXT:    CF_END
2436; EG-NEXT:    PAD
2437;
2438; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v3i16:
2439; GFX9-GISEL:       ; %bb.0:
2440; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2441; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2442; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2443; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
2444; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2445; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
2446; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2447; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2448; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
2449; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
2450; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
2451  %ctlz = call <3 x i16> @llvm.ctlz.v3i16(<3 x i16> %val, i1 true)
2452  ret <3 x i16> %ctlz
2453}
2454
; Four-element i16 vector variant: returns the per-element leading-zero count,
; with the intrinsic's second operand set to true.
; In the SI check lines the elements are read from v0..v3, each shifted left by
; 16 ahead of v_ffbh_u32. In the VI and GFX9-GISEL check lines the elements are
; read as two halves each of v0 and v1, and the four counts are recombined
; pairwise into v0 and v1.
; The EG check lines for this function contain only CF_END and PAD.
; The check lines are autogenerated; regenerate them instead of hand-editing.
2455define <4 x i16> @v_ctlz_zero_undef_v4i16(<4 x i16> %val) {
2456; SI-LABEL: v_ctlz_zero_undef_v4i16:
2457; SI:       ; %bb.0:
2458; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2459; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2460; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2461; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2462; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2463; SI-NEXT:    v_ffbh_u32_e32 v3, v3
2464; SI-NEXT:    v_ffbh_u32_e32 v2, v2
2465; SI-NEXT:    v_ffbh_u32_e32 v1, v1
2466; SI-NEXT:    v_ffbh_u32_e32 v0, v0
2467; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2468; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2469; SI-NEXT:    v_or_b32_e32 v2, v2, v3
2470; SI-NEXT:    v_or_b32_e32 v0, v0, v1
2471; SI-NEXT:    v_alignbit_b32 v1, v2, v0, 16
2472; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
2473; SI-NEXT:    s_setpc_b64 s[30:31]
2474;
2475; VI-LABEL: v_ctlz_zero_undef_v4i16:
2476; VI:       ; %bb.0:
2477; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2478; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v1
2479; VI-NEXT:    v_and_b32_e32 v1, 0xffff0000, v1
2480; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v0
2481; VI-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
2482; VI-NEXT:    v_ffbh_u32_e32 v2, v2
2483; VI-NEXT:    v_ffbh_u32_sdwa v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2484; VI-NEXT:    v_ffbh_u32_e32 v3, v3
2485; VI-NEXT:    v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
2486; VI-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2487; VI-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
2488; VI-NEXT:    s_setpc_b64 s[30:31]
2489;
2490; EG-LABEL: v_ctlz_zero_undef_v4i16:
2491; EG:       ; %bb.0:
2492; EG-NEXT:    CF_END
2493; EG-NEXT:    PAD
2494;
2495; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v4i16:
2496; GFX9-GISEL:       ; %bb.0:
2497; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2498; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
2499; GFX9-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
2500; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
2501; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
2502; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
2503; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
2504; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
2505; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
2506; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
2507; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
2508; GFX9-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
2509; GFX9-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
2510; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
2511; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
2512; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
2513  %ctlz = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %val, i1 true)
2514  ret <4 x i16> %ctlz
2515}
2516
; Two-element i8 vector variant: returns the per-element leading-zero count,
; with the intrinsic's second operand set to true.
; All three AMDGCN check groups expect each element (v0 and v1) to be shifted
; left by 24 (32 - 8) ahead of v_ffbh_u32. The SI and VI check lines also
; expect the second count shifted left by 8 and OR-ed into v0 (VI additionally
; masks v1 with 0xff); the GFX9-GISEL check lines expect no such recombination.
; The EG check lines for this function contain only CF_END and PAD.
; The check lines are autogenerated; regenerate them instead of hand-editing.
2517define <2 x i8> @v_ctlz_zero_undef_v2i8(<2 x i8> %val) {
2518; SI-LABEL: v_ctlz_zero_undef_v2i8:
2519; SI:       ; %bb.0:
2520; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2521; SI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
2522; SI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
2523; SI-NEXT:    v_ffbh_u32_e32 v1, v1
2524; SI-NEXT:    v_lshlrev_b32_e32 v2, 8, v1
2525; SI-NEXT:    v_ffbh_u32_e32 v0, v0
2526; SI-NEXT:    v_or_b32_e32 v0, v0, v2
2527; SI-NEXT:    s_setpc_b64 s[30:31]
2528;
2529; VI-LABEL: v_ctlz_zero_undef_v2i8:
2530; VI:       ; %bb.0:
2531; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2532; VI-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
2533; VI-NEXT:    v_ffbh_u32_e32 v1, v1
2534; VI-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
2535; VI-NEXT:    v_lshlrev_b16_e32 v2, 8, v1
2536; VI-NEXT:    v_ffbh_u32_e32 v0, v0
2537; VI-NEXT:    v_or_b32_e32 v0, v0, v2
2538; VI-NEXT:    v_and_b32_e32 v1, 0xff, v1
2539; VI-NEXT:    s_setpc_b64 s[30:31]
2540;
2541; EG-LABEL: v_ctlz_zero_undef_v2i8:
2542; EG:       ; %bb.0:
2543; EG-NEXT:    CF_END
2544; EG-NEXT:    PAD
2545;
2546; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i8:
2547; GFX9-GISEL:       ; %bb.0:
2548; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2549; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
2550; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
2551; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
2552; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
2553; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
2554  %ctlz = call <2 x i8> @llvm.ctlz.v2i8(<2 x i8> %val, i1 true)
2555  ret <2 x i8> %ctlz
2556}
2557
; Two-element i7 vector variant: returns the per-element leading-zero count,
; with the intrinsic's second operand set to true.
; The SI, VI and GFX9-GISEL check lines all expect the same sequence: each
; element (v0 and v1) is shifted left by 25 (32 - 7) and passed through
; v_ffbh_u32 independently.
; The EG check lines for this function contain only CF_END and PAD.
; The check lines are autogenerated; regenerate them instead of hand-editing.
2558define <2 x i7> @v_ctlz_zero_undef_v2i7(<2 x i7> %val) {
2559; SI-LABEL: v_ctlz_zero_undef_v2i7:
2560; SI:       ; %bb.0:
2561; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2562; SI-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
2563; SI-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
2564; SI-NEXT:    v_ffbh_u32_e32 v0, v0
2565; SI-NEXT:    v_ffbh_u32_e32 v1, v1
2566; SI-NEXT:    s_setpc_b64 s[30:31]
2567;
2568; VI-LABEL: v_ctlz_zero_undef_v2i7:
2569; VI:       ; %bb.0:
2570; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2571; VI-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
2572; VI-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
2573; VI-NEXT:    v_ffbh_u32_e32 v0, v0
2574; VI-NEXT:    v_ffbh_u32_e32 v1, v1
2575; VI-NEXT:    s_setpc_b64 s[30:31]
2576;
2577; EG-LABEL: v_ctlz_zero_undef_v2i7:
2578; EG:       ; %bb.0:
2579; EG-NEXT:    CF_END
2580; EG-NEXT:    PAD
2581;
2582; GFX9-GISEL-LABEL: v_ctlz_zero_undef_v2i7:
2583; GFX9-GISEL:       ; %bb.0:
2584; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
2585; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 25, v0
2586; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 25, v1
2587; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
2588; GFX9-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
2589; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
2590  %ctlz = call <2 x i7> @llvm.ctlz.v2i7(<2 x i7> %val, i1 true)
2591  ret <2 x i7> %ctlz
2592}
2593