xref: /llvm-project/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=VI %s
4; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=EG %s
5; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9-GISEL %s
6
; Intrinsic declarations used by the tests. The cttz calls visible in this part
; of the file all pass `i1 true` as the second operand; per the LLVM LangRef
; that flag makes the result poison when the input is zero, which is the
; "zero_undef" form these tests are named after.
7declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
8declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
9declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
10declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
11declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
12declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
13declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
14declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
15
; Scalar i32 case: %val is passed directly as a kernel argument, run through
; llvm.cttz.i32 with the zero-is-poison flag set, and stored to %out.
; The three amdgcn runs expect a single s_ff1_i32_b32 on the loaded argument;
; the r600 run expects a single FFBL_INT reading the constant-buffer operand.
16define amdgpu_kernel void @s_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
17; SI-LABEL: s_cttz_zero_undef_i32:
18; SI:       ; %bb.0:
19; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
20; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
21; SI-NEXT:    s_mov_b32 s3, 0xf000
22; SI-NEXT:    s_waitcnt lgkmcnt(0)
23; SI-NEXT:    s_ff1_i32_b32 s4, s2
24; SI-NEXT:    s_mov_b32 s2, -1
25; SI-NEXT:    v_mov_b32_e32 v0, s4
26; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
27; SI-NEXT:    s_endpgm
28;
29; VI-LABEL: s_cttz_zero_undef_i32:
30; VI:       ; %bb.0:
31; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
32; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
33; VI-NEXT:    s_waitcnt lgkmcnt(0)
34; VI-NEXT:    s_ff1_i32_b32 s2, s2
35; VI-NEXT:    v_mov_b32_e32 v0, s0
36; VI-NEXT:    v_mov_b32_e32 v1, s1
37; VI-NEXT:    v_mov_b32_e32 v2, s2
38; VI-NEXT:    flat_store_dword v[0:1], v2
39; VI-NEXT:    s_endpgm
40;
41; EG-LABEL: s_cttz_zero_undef_i32:
42; EG:       ; %bb.0:
43; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
44; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
45; EG-NEXT:    CF_END
46; EG-NEXT:    PAD
47; EG-NEXT:    ALU clause starting at 4:
48; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
49; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
50; EG-NEXT:     FFBL_INT * T1.X, KC0[2].Z,
51;
52; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32:
53; GFX9-GISEL:       ; %bb.0:
54; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
55; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
56; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
57; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
58; GFX9-GISEL-NEXT:    s_ff1_i32_b32 s2, s2
59; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
60; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
61; GFX9-GISEL-NEXT:    s_endpgm
62  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
63  store i32 %cttz, ptr addrspace(1) %out, align 4
64  ret void
65}
66
; Per-workitem i32 case: each workitem loads one i32 from %valptr indexed by
; its workitem id (x), applies llvm.cttz.i32 with the zero-is-poison flag set,
; and stores the result to %out (every workitem writes the same address).
; The amdgcn runs expect one v_ffbl_b32 on the loaded value; the r600 run
; expects one FFBL_INT.
67define amdgpu_kernel void @v_cttz_zero_undef_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
68; SI-LABEL: v_cttz_zero_undef_i32:
69; SI:       ; %bb.0:
70; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
71; SI-NEXT:    s_mov_b32 s7, 0xf000
72; SI-NEXT:    s_mov_b32 s10, 0
73; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
74; SI-NEXT:    v_mov_b32_e32 v1, 0
75; SI-NEXT:    s_mov_b32 s11, s7
76; SI-NEXT:    s_waitcnt lgkmcnt(0)
77; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
78; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
79; SI-NEXT:    s_mov_b32 s6, -1
80; SI-NEXT:    s_mov_b32 s4, s0
81; SI-NEXT:    s_mov_b32 s5, s1
82; SI-NEXT:    s_waitcnt vmcnt(0)
83; SI-NEXT:    v_ffbl_b32_e32 v0, v0
84; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
85; SI-NEXT:    s_endpgm
86;
87; VI-LABEL: v_cttz_zero_undef_i32:
88; VI:       ; %bb.0:
89; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
90; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
91; VI-NEXT:    s_waitcnt lgkmcnt(0)
92; VI-NEXT:    v_mov_b32_e32 v1, s3
93; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
94; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
95; VI-NEXT:    flat_load_dword v0, v[0:1]
96; VI-NEXT:    s_waitcnt vmcnt(0)
97; VI-NEXT:    v_ffbl_b32_e32 v2, v0
98; VI-NEXT:    v_mov_b32_e32 v0, s0
99; VI-NEXT:    v_mov_b32_e32 v1, s1
100; VI-NEXT:    flat_store_dword v[0:1], v2
101; VI-NEXT:    s_endpgm
102;
103; EG-LABEL: v_cttz_zero_undef_i32:
104; EG:       ; %bb.0:
105; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
106; EG-NEXT:    TEX 0 @6
107; EG-NEXT:    ALU 2, @11, KC0[CB0:0-32], KC1[]
108; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
109; EG-NEXT:    CF_END
110; EG-NEXT:    PAD
111; EG-NEXT:    Fetch clause starting at 6:
112; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
113; EG-NEXT:    ALU clause starting at 8:
114; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
115; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
116; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
117; EG-NEXT:    ALU clause starting at 11:
118; EG-NEXT:     FFBL_INT T0.X, T0.X,
119; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
120; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
121;
122; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32:
123; GFX9-GISEL:       ; %bb.0:
124; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
125; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
126; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
127; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
128; GFX9-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
129; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
130; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
131; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
132; GFX9-GISEL-NEXT:    s_endpgm
133  %tid = call i32 @llvm.amdgcn.workitem.id.x()
134  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
135  %val = load i32, ptr addrspace(1) %in.gep, align 4
136  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
137  store i32 %cttz, ptr addrspace(1) %out, align 4
138  ret void
139}
140
; <2 x i32> case: each workitem loads one two-element vector from %valptr
; indexed by its workitem id (x), applies llvm.cttz.v2i32 with the
; zero-is-poison flag set, and stores the vector result to %out.
; The expected output has one v_ffbl_b32 (amdgcn) or one FFBL_INT (r600) per
; element, i.e. two in total.
141define amdgpu_kernel void @v_cttz_zero_undef_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
142; SI-LABEL: v_cttz_zero_undef_v2i32:
143; SI:       ; %bb.0:
144; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
145; SI-NEXT:    s_mov_b32 s7, 0xf000
146; SI-NEXT:    s_mov_b32 s10, 0
147; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
148; SI-NEXT:    v_mov_b32_e32 v1, 0
149; SI-NEXT:    s_mov_b32 s11, s7
150; SI-NEXT:    s_waitcnt lgkmcnt(0)
151; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
152; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
153; SI-NEXT:    s_mov_b32 s6, -1
154; SI-NEXT:    s_mov_b32 s4, s0
155; SI-NEXT:    s_mov_b32 s5, s1
156; SI-NEXT:    s_waitcnt vmcnt(0)
157; SI-NEXT:    v_ffbl_b32_e32 v1, v1
158; SI-NEXT:    v_ffbl_b32_e32 v0, v0
159; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
160; SI-NEXT:    s_endpgm
161;
162; VI-LABEL: v_cttz_zero_undef_v2i32:
163; VI:       ; %bb.0:
164; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
165; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
166; VI-NEXT:    s_waitcnt lgkmcnt(0)
167; VI-NEXT:    v_mov_b32_e32 v1, s3
168; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
169; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
170; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
171; VI-NEXT:    v_mov_b32_e32 v3, s1
172; VI-NEXT:    v_mov_b32_e32 v2, s0
173; VI-NEXT:    s_waitcnt vmcnt(0)
174; VI-NEXT:    v_ffbl_b32_e32 v1, v1
175; VI-NEXT:    v_ffbl_b32_e32 v0, v0
176; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
177; VI-NEXT:    s_endpgm
178;
179; EG-LABEL: v_cttz_zero_undef_v2i32:
180; EG:       ; %bb.0:
181; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
182; EG-NEXT:    TEX 0 @6
183; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
184; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
185; EG-NEXT:    CF_END
186; EG-NEXT:    PAD
187; EG-NEXT:    Fetch clause starting at 6:
188; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
189; EG-NEXT:    ALU clause starting at 8:
190; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
191; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
192; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
193; EG-NEXT:    ALU clause starting at 11:
194; EG-NEXT:     FFBL_INT * T0.Y, T0.Y,
195; EG-NEXT:     FFBL_INT T0.X, T0.X,
196; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
197; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
198;
199; GFX9-GISEL-LABEL: v_cttz_zero_undef_v2i32:
200; GFX9-GISEL:       ; %bb.0:
201; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
202; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
203; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
204; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
205; GFX9-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
206; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
207; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
208; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
209; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
210; GFX9-GISEL-NEXT:    s_endpgm
211  %tid = call i32 @llvm.amdgcn.workitem.id.x()
212  %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
213  %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
214  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
215  store <2 x i32> %cttz, ptr addrspace(1) %out, align 8
216  ret void
217}
218
; <4 x i32> case: each workitem loads one four-element vector from %valptr
; indexed by its workitem id (x), applies llvm.cttz.v4i32 with the
; zero-is-poison flag set, and stores the vector result to %out.
; The expected output has one v_ffbl_b32 (amdgcn) or one FFBL_INT (r600) per
; element, i.e. four in total.
219define amdgpu_kernel void @v_cttz_zero_undef_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
220; SI-LABEL: v_cttz_zero_undef_v4i32:
221; SI:       ; %bb.0:
222; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
223; SI-NEXT:    s_mov_b32 s7, 0xf000
224; SI-NEXT:    s_mov_b32 s10, 0
225; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
226; SI-NEXT:    v_mov_b32_e32 v1, 0
227; SI-NEXT:    s_mov_b32 s11, s7
228; SI-NEXT:    s_waitcnt lgkmcnt(0)
229; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
230; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
231; SI-NEXT:    s_mov_b32 s6, -1
232; SI-NEXT:    s_mov_b32 s4, s0
233; SI-NEXT:    s_mov_b32 s5, s1
234; SI-NEXT:    s_waitcnt vmcnt(0)
235; SI-NEXT:    v_ffbl_b32_e32 v3, v3
236; SI-NEXT:    v_ffbl_b32_e32 v2, v2
237; SI-NEXT:    v_ffbl_b32_e32 v1, v1
238; SI-NEXT:    v_ffbl_b32_e32 v0, v0
239; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
240; SI-NEXT:    s_endpgm
241;
242; VI-LABEL: v_cttz_zero_undef_v4i32:
243; VI:       ; %bb.0:
244; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
245; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
246; VI-NEXT:    s_waitcnt lgkmcnt(0)
247; VI-NEXT:    v_mov_b32_e32 v1, s3
248; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
249; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
250; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
251; VI-NEXT:    v_mov_b32_e32 v5, s1
252; VI-NEXT:    v_mov_b32_e32 v4, s0
253; VI-NEXT:    s_waitcnt vmcnt(0)
254; VI-NEXT:    v_ffbl_b32_e32 v3, v3
255; VI-NEXT:    v_ffbl_b32_e32 v2, v2
256; VI-NEXT:    v_ffbl_b32_e32 v1, v1
257; VI-NEXT:    v_ffbl_b32_e32 v0, v0
258; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
259; VI-NEXT:    s_endpgm
260;
261; EG-LABEL: v_cttz_zero_undef_v4i32:
262; EG:       ; %bb.0:
263; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
264; EG-NEXT:    TEX 0 @6
265; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
266; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
267; EG-NEXT:    CF_END
268; EG-NEXT:    PAD
269; EG-NEXT:    Fetch clause starting at 6:
270; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
271; EG-NEXT:    ALU clause starting at 8:
272; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
273; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
274; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
275; EG-NEXT:    ALU clause starting at 11:
276; EG-NEXT:     FFBL_INT * T0.W, T0.W,
277; EG-NEXT:     FFBL_INT * T0.Z, T0.Z,
278; EG-NEXT:     FFBL_INT * T0.Y, T0.Y,
279; EG-NEXT:     FFBL_INT T0.X, T0.X,
280; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
281; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
282;
283; GFX9-GISEL-LABEL: v_cttz_zero_undef_v4i32:
284; GFX9-GISEL:       ; %bb.0:
285; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
286; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
287; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0
288; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
289; GFX9-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
290; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
291; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
292; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
293; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
294; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
295; GFX9-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
296; GFX9-GISEL-NEXT:    s_endpgm
297  %tid = call i32 @llvm.amdgcn.workitem.id.x()
298  %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
299  %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
300  %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
301  store <4 x i32> %cttz, ptr addrspace(1) %out, align 16
302  ret void
303}
304
; Scalar i8 case with a zero-guard select: %val is an i8 kernel argument; the
; IR computes llvm.cttz.i8 (zero-is-poison flag set), compares %val against 0
; and selects between the cttz result and the constant 32.
; NOTE(review): the store writes %cttz, not %ret, so the icmp/select pair has
; no user. The expected output accordingly contains only the find-first-bit
; instruction followed by a byte store, with no compare or select. Confirm
; whether storing %cttz is intentional before changing it; the generated
; assertions would need to be regenerated if the store operand changes.
305define amdgpu_kernel void @s_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, i8 %val) nounwind {
306; SI-LABEL: s_cttz_zero_undef_i8_with_select:
307; SI:       ; %bb.0:
308; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
309; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
310; SI-NEXT:    s_mov_b32 s3, 0xf000
311; SI-NEXT:    s_waitcnt lgkmcnt(0)
312; SI-NEXT:    s_ff1_i32_b32 s4, s2
313; SI-NEXT:    s_mov_b32 s2, -1
314; SI-NEXT:    v_mov_b32_e32 v0, s4
315; SI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
316; SI-NEXT:    s_endpgm
317;
318; VI-LABEL: s_cttz_zero_undef_i8_with_select:
319; VI:       ; %bb.0:
320; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
321; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
322; VI-NEXT:    s_waitcnt lgkmcnt(0)
323; VI-NEXT:    s_ff1_i32_b32 s2, s2
324; VI-NEXT:    v_mov_b32_e32 v0, s0
325; VI-NEXT:    v_mov_b32_e32 v1, s1
326; VI-NEXT:    v_mov_b32_e32 v2, s2
327; VI-NEXT:    flat_store_byte v[0:1], v2
328; VI-NEXT:    s_endpgm
329;
330; EG-LABEL: s_cttz_zero_undef_i8_with_select:
331; EG:       ; %bb.0:
332; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
333; EG-NEXT:    TEX 0 @6
334; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
335; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
336; EG-NEXT:    CF_END
337; EG-NEXT:    PAD
338; EG-NEXT:    Fetch clause starting at 6:
339; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 40, #3
340; EG-NEXT:    ALU clause starting at 8:
341; EG-NEXT:     MOV * T0.X, 0.0,
342; EG-NEXT:    ALU clause starting at 9:
343; EG-NEXT:     FFBL_INT T0.W, T0.X,
344; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
345; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
346; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
347; EG-NEXT:     LSHL * T1.W, PS, literal.y,
348; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
349; EG-NEXT:     LSHL T0.X, PV.W, PS,
350; EG-NEXT:     LSHL * T0.W, literal.x, PS,
351; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
352; EG-NEXT:     MOV T0.Y, 0.0,
353; EG-NEXT:     MOV * T0.Z, 0.0,
354; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
355; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
356;
357; GFX9-GISEL-LABEL: s_cttz_zero_undef_i8_with_select:
358; GFX9-GISEL:       ; %bb.0:
359; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
360; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
361; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
362; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
363; GFX9-GISEL-NEXT:    s_ff1_i32_b32 s2, s2
364; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
365; GFX9-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
366; GFX9-GISEL-NEXT:    s_endpgm
367  %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
368  %cttz_ret = icmp ne i8 %val, 0
369  %ret = select i1 %cttz_ret, i8 %cttz, i8 32
370  store i8 %cttz, ptr addrspace(1) %out, align 4
371  ret void
372}
373
; Scalar i16 case with a zero-guard select: %val is an i16 kernel argument; the
; IR computes llvm.cttz.i16 (zero-is-poison flag set), compares %val against 0
; and selects between the cttz result and the constant 32.
; NOTE(review): the store writes %cttz, not %ret, so the icmp/select pair has
; no user. The expected output accordingly contains only the find-first-bit
; instruction followed by a 16-bit store, with no compare or select. Confirm
; whether storing %cttz is intentional before changing it; the generated
; assertions would need to be regenerated if the store operand changes.
374define amdgpu_kernel void @s_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, i16 %val) nounwind {
375; SI-LABEL: s_cttz_zero_undef_i16_with_select:
376; SI:       ; %bb.0:
377; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
378; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
379; SI-NEXT:    s_mov_b32 s3, 0xf000
380; SI-NEXT:    s_waitcnt lgkmcnt(0)
381; SI-NEXT:    s_ff1_i32_b32 s4, s2
382; SI-NEXT:    s_mov_b32 s2, -1
383; SI-NEXT:    v_mov_b32_e32 v0, s4
384; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
385; SI-NEXT:    s_endpgm
386;
387; VI-LABEL: s_cttz_zero_undef_i16_with_select:
388; VI:       ; %bb.0:
389; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
390; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
391; VI-NEXT:    s_waitcnt lgkmcnt(0)
392; VI-NEXT:    s_ff1_i32_b32 s2, s2
393; VI-NEXT:    v_mov_b32_e32 v0, s0
394; VI-NEXT:    v_mov_b32_e32 v1, s1
395; VI-NEXT:    v_mov_b32_e32 v2, s2
396; VI-NEXT:    flat_store_short v[0:1], v2
397; VI-NEXT:    s_endpgm
398;
399; EG-LABEL: s_cttz_zero_undef_i16_with_select:
400; EG:       ; %bb.0:
401; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
402; EG-NEXT:    TEX 0 @6
403; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
404; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
405; EG-NEXT:    CF_END
406; EG-NEXT:    PAD
407; EG-NEXT:    Fetch clause starting at 6:
408; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
409; EG-NEXT:    ALU clause starting at 8:
410; EG-NEXT:     MOV * T0.X, 0.0,
411; EG-NEXT:    ALU clause starting at 9:
412; EG-NEXT:     FFBL_INT T0.W, T0.X,
413; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
414; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
415; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
416; EG-NEXT:     LSHL * T1.W, PS, literal.y,
417; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
418; EG-NEXT:     LSHL T0.X, PV.W, PS,
419; EG-NEXT:     LSHL * T0.W, literal.x, PS,
420; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
421; EG-NEXT:     MOV T0.Y, 0.0,
422; EG-NEXT:     MOV * T0.Z, 0.0,
423; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
424; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
425;
426; GFX9-GISEL-LABEL: s_cttz_zero_undef_i16_with_select:
427; GFX9-GISEL:       ; %bb.0:
428; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
429; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
430; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
431; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
432; GFX9-GISEL-NEXT:    s_ff1_i32_b32 s2, s2
433; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
434; GFX9-GISEL-NEXT:    global_store_short v1, v0, s[0:1]
435; GFX9-GISEL-NEXT:    s_endpgm
436  %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
437  %cttz_ret = icmp ne i16 %val, 0
438  %ret = select i1 %cttz_ret, i16 %cttz, i16 32
439  store i16 %cttz, ptr addrspace(1) %out, align 4
440  ret void
441}
442
; Scalar i32 case with a zero-guard select: %val is an i32 kernel argument; the
; IR computes llvm.cttz.i32 (zero-is-poison flag set), compares %val against 0
; and selects between the cttz result and the constant 32.
; NOTE(review): the store writes %cttz, not %ret, so the icmp/select pair has
; no user. The expected output is the same instruction sequence as in
; s_cttz_zero_undef_i32, with no compare or select. Confirm whether storing
; %cttz is intentional before changing it; the generated assertions would need
; to be regenerated if the store operand changes.
443define amdgpu_kernel void @s_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, i32 %val) nounwind {
444; SI-LABEL: s_cttz_zero_undef_i32_with_select:
445; SI:       ; %bb.0:
446; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
447; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
448; SI-NEXT:    s_mov_b32 s3, 0xf000
449; SI-NEXT:    s_waitcnt lgkmcnt(0)
450; SI-NEXT:    s_ff1_i32_b32 s4, s2
451; SI-NEXT:    s_mov_b32 s2, -1
452; SI-NEXT:    v_mov_b32_e32 v0, s4
453; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
454; SI-NEXT:    s_endpgm
455;
456; VI-LABEL: s_cttz_zero_undef_i32_with_select:
457; VI:       ; %bb.0:
458; VI-NEXT:    s_load_dword s2, s[4:5], 0x2c
459; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
460; VI-NEXT:    s_waitcnt lgkmcnt(0)
461; VI-NEXT:    s_ff1_i32_b32 s2, s2
462; VI-NEXT:    v_mov_b32_e32 v0, s0
463; VI-NEXT:    v_mov_b32_e32 v1, s1
464; VI-NEXT:    v_mov_b32_e32 v2, s2
465; VI-NEXT:    flat_store_dword v[0:1], v2
466; VI-NEXT:    s_endpgm
467;
468; EG-LABEL: s_cttz_zero_undef_i32_with_select:
469; EG:       ; %bb.0:
470; EG-NEXT:    ALU 2, @4, KC0[CB0:0-32], KC1[]
471; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1
472; EG-NEXT:    CF_END
473; EG-NEXT:    PAD
474; EG-NEXT:    ALU clause starting at 4:
475; EG-NEXT:     LSHR * T0.X, KC0[2].Y, literal.x,
476; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
477; EG-NEXT:     FFBL_INT * T1.X, KC0[2].Z,
478;
479; GFX9-GISEL-LABEL: s_cttz_zero_undef_i32_with_select:
480; GFX9-GISEL:       ; %bb.0:
481; GFX9-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
482; GFX9-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
483; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
484; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
485; GFX9-GISEL-NEXT:    s_ff1_i32_b32 s2, s2
486; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s2
487; GFX9-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
488; GFX9-GISEL-NEXT:    s_endpgm
489  %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
490  %cttz_ret = icmp ne i32 %val, 0
491  %ret = select i1 %cttz_ret, i32 %cttz, i32 32
492  store i32 %cttz, ptr addrspace(1) %out, align 4
493  ret void
494}
495
; Scalar i64 case with a zero-guard select: %val is an i64 kernel argument; the
; IR computes llvm.cttz.i64 (zero-is-poison flag set), compares %val against 0
; and selects between the cttz result and the constant 32.
; The amdgcn runs expect a single 64-bit s_ff1_i32_b64 with a zero high dword
; in the stored value; the r600 run expects FFBL_INT on both 32-bit halves
; combined through ADD_INT 32 and CNDE_INT.
; NOTE(review): the store writes %cttz, not %ret, so the icmp/select pair has
; no user. Confirm whether storing %cttz is intentional before changing it;
; the generated assertions would need to be regenerated if the store operand
; changes.
496define amdgpu_kernel void @s_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, i64 %val) nounwind {
497; SI-LABEL: s_cttz_zero_undef_i64_with_select:
498; SI:       ; %bb.0:
499; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
500; SI-NEXT:    s_mov_b32 s7, 0xf000
501; SI-NEXT:    s_mov_b32 s6, -1
502; SI-NEXT:    s_waitcnt lgkmcnt(0)
503; SI-NEXT:    s_ff1_i32_b64 s2, s[2:3]
504; SI-NEXT:    v_mov_b32_e32 v1, 0
505; SI-NEXT:    s_mov_b32 s4, s0
506; SI-NEXT:    s_mov_b32 s5, s1
507; SI-NEXT:    v_mov_b32_e32 v0, s2
508; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
509; SI-NEXT:    s_endpgm
510;
511; VI-LABEL: s_cttz_zero_undef_i64_with_select:
512; VI:       ; %bb.0:
513; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
514; VI-NEXT:    v_mov_b32_e32 v1, 0
515; VI-NEXT:    s_waitcnt lgkmcnt(0)
516; VI-NEXT:    s_ff1_i32_b64 s2, s[2:3]
517; VI-NEXT:    v_mov_b32_e32 v3, s1
518; VI-NEXT:    v_mov_b32_e32 v0, s2
519; VI-NEXT:    v_mov_b32_e32 v2, s0
520; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
521; VI-NEXT:    s_endpgm
522;
523; EG-LABEL: s_cttz_zero_undef_i64_with_select:
524; EG:       ; %bb.0:
525; EG-NEXT:    ALU 7, @4, KC0[CB0:0-32], KC1[]
526; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
527; EG-NEXT:    CF_END
528; EG-NEXT:    PAD
529; EG-NEXT:    ALU clause starting at 4:
530; EG-NEXT:     FFBL_INT * T0.W, KC0[3].X,
531; EG-NEXT:     FFBL_INT T1.W, KC0[2].W,
532; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
533; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
534; EG-NEXT:     CNDE_INT T0.X, KC0[2].W, PS, PV.W,
535; EG-NEXT:     MOV T0.Y, 0.0,
536; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
537; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
538;
539; GFX9-GISEL-LABEL: s_cttz_zero_undef_i64_with_select:
540; GFX9-GISEL:       ; %bb.0:
541; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
542; GFX9-GISEL-NEXT:    s_mov_b32 s5, 0
543; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0
544; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
545; GFX9-GISEL-NEXT:    s_ff1_i32_b64 s4, s[2:3]
546; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, s4
547; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, s5
548; GFX9-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
549; GFX9-GISEL-NEXT:    s_endpgm
550  %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
551  %cttz_ret = icmp ne i64 %val, 0
552  %ret = select i1 %cttz_ret, i64 %cttz, i64 32
553  store i64 %cttz, ptr addrspace(1) %out, align 4
554  ret void
555}
556
; Loaded i8 case with a live zero-guard select: the value is loaded from
; %arrayidx, passed to llvm.cttz.i8 with the zero-is-poison flag set, and the
; stored result %ret is the cttz value when the input is non-zero and the
; constant 32 otherwise.
; Unlike the s_* variants above, the select result is what gets stored, so the
; expected output keeps the guard: v_ffbl_b32 + v_cmp_ne_u32 + v_cndmask_b32
; on the amdgcn runs and FFBL_INT + CNDE_INT on the r600 run.
557define amdgpu_kernel void @v_cttz_zero_undef_i8_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
558; SI-LABEL: v_cttz_zero_undef_i8_with_select:
559; SI:       ; %bb.0:
560; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
561; SI-NEXT:    s_mov_b32 s7, 0xf000
562; SI-NEXT:    s_mov_b32 s6, -1
563; SI-NEXT:    s_mov_b32 s10, s6
564; SI-NEXT:    s_mov_b32 s11, s7
565; SI-NEXT:    s_waitcnt lgkmcnt(0)
566; SI-NEXT:    s_mov_b32 s8, s2
567; SI-NEXT:    s_mov_b32 s9, s3
568; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
569; SI-NEXT:    s_mov_b32 s4, s0
570; SI-NEXT:    s_mov_b32 s5, s1
571; SI-NEXT:    s_waitcnt vmcnt(0)
572; SI-NEXT:    v_ffbl_b32_e32 v1, v0
573; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
574; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
575; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
576; SI-NEXT:    s_endpgm
577;
578; VI-LABEL: v_cttz_zero_undef_i8_with_select:
579; VI:       ; %bb.0:
580; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
581; VI-NEXT:    s_waitcnt lgkmcnt(0)
582; VI-NEXT:    v_mov_b32_e32 v0, s2
583; VI-NEXT:    v_mov_b32_e32 v1, s3
584; VI-NEXT:    flat_load_ubyte v0, v[0:1]
585; VI-NEXT:    s_waitcnt vmcnt(0)
586; VI-NEXT:    v_ffbl_b32_e32 v1, v0
587; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
588; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
589; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
590; VI-NEXT:    v_mov_b32_e32 v0, s0
591; VI-NEXT:    v_mov_b32_e32 v1, s1
592; VI-NEXT:    flat_store_byte v[0:1], v2
593; VI-NEXT:    s_endpgm
594;
595; EG-LABEL: v_cttz_zero_undef_i8_with_select:
596; EG:       ; %bb.0:
597; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
598; EG-NEXT:    TEX 0 @6
599; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
600; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
601; EG-NEXT:    CF_END
602; EG-NEXT:    PAD
603; EG-NEXT:    Fetch clause starting at 6:
604; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
605; EG-NEXT:    ALU clause starting at 8:
606; EG-NEXT:     MOV * T0.X, KC0[2].Z,
607; EG-NEXT:    ALU clause starting at 9:
608; EG-NEXT:     FFBL_INT T0.W, T0.X,
609; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
610; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
611; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
612; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
613; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
614; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
615; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
616; EG-NEXT:     LSHL T0.X, PV.W, PS,
617; EG-NEXT:     LSHL * T0.W, literal.x, PS,
618; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
619; EG-NEXT:     MOV T0.Y, 0.0,
620; EG-NEXT:     MOV * T0.Z, 0.0,
621; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
622; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
623;
624; GFX9-GISEL-LABEL: v_cttz_zero_undef_i8_with_select:
625; GFX9-GISEL:       ; %bb.0:
626; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
627; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
628; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
629; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
630; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
631; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
632; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xff, v2
633; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
634; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
635; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
636; GFX9-GISEL-NEXT:    s_endpgm
637  %val = load i8, ptr addrspace(1) %arrayidx, align 1
638  %cttz = tail call i8 @llvm.cttz.i8(i8 %val, i1 true) nounwind readnone
639  %cttz_ret = icmp ne i8 %val, 0
640  %ret = select i1 %cttz_ret, i8 %cttz, i8 32
641  store i8 %ret, ptr addrspace(1) %out, align 4
642  ret void
643}
644
; Loaded i16 case with a live zero-guard select: the value is loaded from
; %arrayidx with `align 1`, passed to llvm.cttz.i16 with the zero-is-poison
; flag set, and the stored result %ret is the cttz value when the input is
; non-zero and the constant 32 otherwise.
; In the expected amdgcn output the under-aligned i16 load appears as two byte
; loads that are shifted and OR-ed together before the find-first-bit,
; compare and conditional-move sequence; the r600 run uses a single
; VTX_READ_16 followed by FFBL_INT + CNDE_INT.
645define amdgpu_kernel void @v_cttz_zero_undef_i16_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
646; SI-LABEL: v_cttz_zero_undef_i16_with_select:
647; SI:       ; %bb.0:
648; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
649; SI-NEXT:    s_mov_b32 s7, 0xf000
650; SI-NEXT:    s_mov_b32 s6, -1
651; SI-NEXT:    s_mov_b32 s10, s6
652; SI-NEXT:    s_mov_b32 s11, s7
653; SI-NEXT:    s_waitcnt lgkmcnt(0)
654; SI-NEXT:    s_mov_b32 s8, s2
655; SI-NEXT:    s_mov_b32 s9, s3
656; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
657; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
658; SI-NEXT:    s_mov_b32 s4, s0
659; SI-NEXT:    s_mov_b32 s5, s1
660; SI-NEXT:    s_waitcnt vmcnt(1)
661; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
662; SI-NEXT:    s_waitcnt vmcnt(0)
663; SI-NEXT:    v_or_b32_e32 v0, v0, v1
664; SI-NEXT:    v_ffbl_b32_e32 v1, v0
665; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
666; SI-NEXT:    v_cndmask_b32_e32 v0, 32, v1, vcc
667; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
668; SI-NEXT:    s_endpgm
669;
670; VI-LABEL: v_cttz_zero_undef_i16_with_select:
671; VI:       ; %bb.0:
672; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
673; VI-NEXT:    s_waitcnt lgkmcnt(0)
674; VI-NEXT:    s_add_u32 s4, s2, 1
675; VI-NEXT:    s_addc_u32 s5, s3, 0
676; VI-NEXT:    v_mov_b32_e32 v2, s4
677; VI-NEXT:    v_mov_b32_e32 v0, s2
678; VI-NEXT:    v_mov_b32_e32 v3, s5
679; VI-NEXT:    v_mov_b32_e32 v1, s3
680; VI-NEXT:    flat_load_ubyte v2, v[2:3]
681; VI-NEXT:    flat_load_ubyte v0, v[0:1]
682; VI-NEXT:    s_waitcnt vmcnt(1)
683; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
684; VI-NEXT:    s_waitcnt vmcnt(0)
685; VI-NEXT:    v_or_b32_e32 v0, v1, v0
686; VI-NEXT:    v_ffbl_b32_e32 v1, v0
687; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
688; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
689; VI-NEXT:    v_cndmask_b32_e32 v2, 32, v1, vcc
690; VI-NEXT:    v_mov_b32_e32 v0, s0
691; VI-NEXT:    v_mov_b32_e32 v1, s1
692; VI-NEXT:    flat_store_short v[0:1], v2
693; VI-NEXT:    s_endpgm
694;
695; EG-LABEL: v_cttz_zero_undef_i16_with_select:
696; EG:       ; %bb.0:
697; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
698; EG-NEXT:    TEX 0 @6
699; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
700; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
701; EG-NEXT:    CF_END
702; EG-NEXT:    PAD
703; EG-NEXT:    Fetch clause starting at 6:
704; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
705; EG-NEXT:    ALU clause starting at 8:
706; EG-NEXT:     MOV * T0.X, KC0[2].Z,
707; EG-NEXT:    ALU clause starting at 9:
708; EG-NEXT:     FFBL_INT T0.W, T0.X,
709; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
710; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
711; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
712; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
713; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
714; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
715; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
716; EG-NEXT:     LSHL T0.X, PV.W, PS,
717; EG-NEXT:     LSHL * T0.W, literal.x, PS,
718; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
719; EG-NEXT:     MOV T0.Y, 0.0,
720; EG-NEXT:     MOV * T0.Z, 0.0,
721; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
722; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
723;
724; GFX9-GISEL-LABEL: v_cttz_zero_undef_i16_with_select:
725; GFX9-GISEL:       ; %bb.0:
726; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
727; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
728; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
729; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
730; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
731; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
732; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
733; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
734; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
735; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
736; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
737; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
738; GFX9-GISEL-NEXT:    s_endpgm
739  %val = load i16, ptr addrspace(1) %arrayidx, align 1
740  %cttz = tail call i16 @llvm.cttz.i16(i16 %val, i1 true) nounwind readnone
741  %cttz_ret = icmp ne i16 %val, 0
742  %ret = select i1 %cttz_ret, i16 %cttz, i16 32
743  store i16 %ret, ptr addrspace(1) %out, align 4
744  ret void
745}
746
; cttz.i32 with the zero-is-poison flag set (i1 true), guarded by a select that
; substitutes 32 when the loaded value is zero. The i32 is loaded with align 1;
; in the expected output it appears as four single-byte loads on the amdgcn
; runs and two 16-bit reads on r600.
; Expected lowering per run line:
;   - SI and VI runs: v_ffbl_b32 followed by v_min_u32 against 32, with no
;     separate compare/select.
;   - r600 run: FFBL_INT followed by a CNDE_INT that picks the literal 32.
;   - GlobalISel run: v_ffbl_b32, then an explicit v_cmp_ne_u32 + v_cndmask_b32.
747define amdgpu_kernel void @v_cttz_zero_undef_i32_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
748; SI-LABEL: v_cttz_zero_undef_i32_with_select:
749; SI:       ; %bb.0:
750; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
751; SI-NEXT:    s_mov_b32 s7, 0xf000
752; SI-NEXT:    s_mov_b32 s6, -1
753; SI-NEXT:    s_mov_b32 s10, s6
754; SI-NEXT:    s_mov_b32 s11, s7
755; SI-NEXT:    s_waitcnt lgkmcnt(0)
756; SI-NEXT:    s_mov_b32 s8, s2
757; SI-NEXT:    s_mov_b32 s9, s3
758; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
759; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:3
760; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0
761; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:2
762; SI-NEXT:    s_mov_b32 s4, s0
763; SI-NEXT:    s_mov_b32 s5, s1
764; SI-NEXT:    s_waitcnt vmcnt(3)
765; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
766; SI-NEXT:    s_waitcnt vmcnt(2)
767; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
768; SI-NEXT:    s_waitcnt vmcnt(1)
769; SI-NEXT:    v_or_b32_e32 v0, v0, v2
770; SI-NEXT:    s_waitcnt vmcnt(0)
771; SI-NEXT:    v_or_b32_e32 v1, v1, v3
772; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
773; SI-NEXT:    v_or_b32_e32 v0, v1, v0
774; SI-NEXT:    v_ffbl_b32_e32 v0, v0
775; SI-NEXT:    v_min_u32_e32 v0, 32, v0
776; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
777; SI-NEXT:    s_endpgm
778;
779; VI-LABEL: v_cttz_zero_undef_i32_with_select:
780; VI:       ; %bb.0:
781; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
782; VI-NEXT:    s_waitcnt lgkmcnt(0)
783; VI-NEXT:    s_add_u32 s4, s2, 3
784; VI-NEXT:    s_addc_u32 s5, s3, 0
785; VI-NEXT:    v_mov_b32_e32 v2, s4
786; VI-NEXT:    v_mov_b32_e32 v3, s5
787; VI-NEXT:    s_add_u32 s4, s2, 2
788; VI-NEXT:    v_mov_b32_e32 v0, s2
789; VI-NEXT:    s_addc_u32 s5, s3, 0
790; VI-NEXT:    v_mov_b32_e32 v1, s3
791; VI-NEXT:    s_add_u32 s2, s2, 1
792; VI-NEXT:    s_addc_u32 s3, s3, 0
793; VI-NEXT:    v_mov_b32_e32 v4, s4
794; VI-NEXT:    v_mov_b32_e32 v7, s3
795; VI-NEXT:    v_mov_b32_e32 v5, s5
796; VI-NEXT:    v_mov_b32_e32 v6, s2
797; VI-NEXT:    flat_load_ubyte v2, v[2:3]
798; VI-NEXT:    flat_load_ubyte v3, v[4:5]
799; VI-NEXT:    flat_load_ubyte v4, v[6:7]
800; VI-NEXT:    flat_load_ubyte v0, v[0:1]
801; VI-NEXT:    s_waitcnt vmcnt(3)
802; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
803; VI-NEXT:    s_waitcnt vmcnt(2)
804; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
805; VI-NEXT:    s_waitcnt vmcnt(1)
806; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
807; VI-NEXT:    s_waitcnt vmcnt(0)
808; VI-NEXT:    v_or_b32_e32 v0, v2, v0
809; VI-NEXT:    v_or_b32_e32 v0, v1, v0
810; VI-NEXT:    v_ffbl_b32_e32 v0, v0
811; VI-NEXT:    v_min_u32_e32 v2, 32, v0
812; VI-NEXT:    v_mov_b32_e32 v0, s0
813; VI-NEXT:    v_mov_b32_e32 v1, s1
814; VI-NEXT:    flat_store_dword v[0:1], v2
815; VI-NEXT:    s_endpgm
816;
817; EG-LABEL: v_cttz_zero_undef_i32_with_select:
818; EG:       ; %bb.0:
819; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
820; EG-NEXT:    TEX 1 @6
821; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
822; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
823; EG-NEXT:    CF_END
824; EG-NEXT:    PAD
825; EG-NEXT:    Fetch clause starting at 6:
826; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
827; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
828; EG-NEXT:    ALU clause starting at 10:
829; EG-NEXT:     MOV * T0.X, KC0[2].Z,
830; EG-NEXT:    ALU clause starting at 11:
831; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
832; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
833; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
834; EG-NEXT:     FFBL_INT * T1.W, PV.W,
835; EG-NEXT:     CNDE_INT T0.X, T0.W, literal.x, PV.W,
836; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
837; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
838;
839; GFX9-GISEL-LABEL: v_cttz_zero_undef_i32_with_select:
840; GFX9-GISEL:       ; %bb.0:
841; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
842; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
843; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
845; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
846; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
847; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
848; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
849; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
850; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
851; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
852; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
853; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
854; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
855; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
856; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
857; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, 32, v2, vcc
858; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
859; GFX9-GISEL-NEXT:    s_endpgm
860  %val = load i32, ptr addrspace(1) %arrayidx, align 1
  ; i1 true: the intrinsic result is poison for a zero input, so the select
  ; below provides the value (32, the bit width) for that case.
861  %cttz = tail call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
862  %cttz_ret = icmp ne i32 %val, 0
863  %ret = select i1 %cttz_ret, i32 %cttz, i32 32
864  store i32 %ret, ptr addrspace(1) %out, align 4
865  ret void
866}
867
; cttz.i64 with the zero-is-poison flag set (i1 true), guarded by a select that
; substitutes 64 when the loaded value is zero. The i64 is loaded with align 1;
; in the expected output it appears as eight single-byte loads on the amdgcn
; runs and four 16-bit reads on r600.
; Expected lowering per run line:
;   - SI and VI runs: v_ffbl_b32 on each 32-bit half, 32 added to the count of
;     the half built from byte offsets 4..7, v_min_u32 of the two counts, then
;     v_min_u32 against 64. The upper dword of the stored result is a constant 0.
;   - r600 run: FFBL_INT on each half combined with CNDE_INT / ADD_INT.
;   - GlobalISel run: the same two v_ffbl_b32 + add + v_min_u32, but the zero
;     case is handled by an explicit v_cmp_ne_u64 + v_cndmask_b32 with 64.
868define amdgpu_kernel void @v_cttz_zero_undef_i64_with_select(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
869; SI-LABEL: v_cttz_zero_undef_i64_with_select:
870; SI:       ; %bb.0:
871; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
872; SI-NEXT:    s_mov_b32 s3, 0xf000
873; SI-NEXT:    s_mov_b32 s2, -1
874; SI-NEXT:    s_mov_b32 s10, s2
875; SI-NEXT:    s_mov_b32 s11, s3
876; SI-NEXT:    s_waitcnt lgkmcnt(0)
877; SI-NEXT:    s_mov_b32 s8, s6
878; SI-NEXT:    s_mov_b32 s9, s7
879; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:5
880; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:7
881; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0
882; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:1
883; SI-NEXT:    buffer_load_ubyte v4, off, s[8:11], 0 offset:2
884; SI-NEXT:    buffer_load_ubyte v5, off, s[8:11], 0 offset:3
885; SI-NEXT:    buffer_load_ubyte v6, off, s[8:11], 0 offset:4
886; SI-NEXT:    buffer_load_ubyte v7, off, s[8:11], 0 offset:6
887; SI-NEXT:    s_mov_b32 s0, s4
888; SI-NEXT:    s_mov_b32 s1, s5
889; SI-NEXT:    s_waitcnt vmcnt(7)
890; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
891; SI-NEXT:    s_waitcnt vmcnt(6)
892; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
893; SI-NEXT:    s_waitcnt vmcnt(4)
894; SI-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
895; SI-NEXT:    s_waitcnt vmcnt(2)
896; SI-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
897; SI-NEXT:    s_waitcnt vmcnt(1)
898; SI-NEXT:    v_or_b32_e32 v0, v0, v6
899; SI-NEXT:    s_waitcnt vmcnt(0)
900; SI-NEXT:    v_or_b32_e32 v1, v1, v7
901; SI-NEXT:    v_or_b32_e32 v2, v3, v2
902; SI-NEXT:    v_or_b32_e32 v3, v5, v4
903; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
904; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
905; SI-NEXT:    v_or_b32_e32 v0, v1, v0
906; SI-NEXT:    v_or_b32_e32 v1, v3, v2
907; SI-NEXT:    v_ffbl_b32_e32 v1, v1
908; SI-NEXT:    v_ffbl_b32_e32 v0, v0
909; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
910; SI-NEXT:    v_min_u32_e32 v0, v0, v1
911; SI-NEXT:    v_min_u32_e32 v0, 64, v0
912; SI-NEXT:    v_mov_b32_e32 v1, 0
913; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
914; SI-NEXT:    s_endpgm
915;
916; VI-LABEL: v_cttz_zero_undef_i64_with_select:
917; VI:       ; %bb.0:
918; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
919; VI-NEXT:    s_waitcnt lgkmcnt(0)
920; VI-NEXT:    s_add_u32 s4, s2, 5
921; VI-NEXT:    s_addc_u32 s5, s3, 0
922; VI-NEXT:    v_mov_b32_e32 v0, s4
923; VI-NEXT:    v_mov_b32_e32 v1, s5
924; VI-NEXT:    s_add_u32 s4, s2, 4
925; VI-NEXT:    s_addc_u32 s5, s3, 0
926; VI-NEXT:    v_mov_b32_e32 v2, s4
927; VI-NEXT:    v_mov_b32_e32 v3, s5
928; VI-NEXT:    s_add_u32 s4, s2, 7
929; VI-NEXT:    s_addc_u32 s5, s3, 0
930; VI-NEXT:    v_mov_b32_e32 v4, s4
931; VI-NEXT:    v_mov_b32_e32 v5, s5
932; VI-NEXT:    s_add_u32 s4, s2, 6
933; VI-NEXT:    s_addc_u32 s5, s3, 0
934; VI-NEXT:    v_mov_b32_e32 v7, s5
935; VI-NEXT:    v_mov_b32_e32 v6, s4
936; VI-NEXT:    s_add_u32 s4, s2, 3
937; VI-NEXT:    s_addc_u32 s5, s3, 0
938; VI-NEXT:    v_mov_b32_e32 v9, s5
939; VI-NEXT:    v_mov_b32_e32 v8, s4
940; VI-NEXT:    s_add_u32 s4, s2, 2
941; VI-NEXT:    s_addc_u32 s5, s3, 0
942; VI-NEXT:    v_mov_b32_e32 v11, s5
943; VI-NEXT:    v_mov_b32_e32 v10, s4
944; VI-NEXT:    flat_load_ubyte v12, v[0:1]
945; VI-NEXT:    flat_load_ubyte v13, v[2:3]
946; VI-NEXT:    flat_load_ubyte v4, v[4:5]
947; VI-NEXT:    flat_load_ubyte v5, v[6:7]
948; VI-NEXT:    s_add_u32 s4, s2, 1
949; VI-NEXT:    flat_load_ubyte v6, v[8:9]
950; VI-NEXT:    s_addc_u32 s5, s3, 0
951; VI-NEXT:    v_mov_b32_e32 v0, s4
952; VI-NEXT:    v_mov_b32_e32 v2, s2
953; VI-NEXT:    v_mov_b32_e32 v1, s5
954; VI-NEXT:    v_mov_b32_e32 v3, s3
955; VI-NEXT:    flat_load_ubyte v7, v[10:11]
956; VI-NEXT:    flat_load_ubyte v0, v[0:1]
957; VI-NEXT:    flat_load_ubyte v2, v[2:3]
958; VI-NEXT:    v_mov_b32_e32 v1, 0
959; VI-NEXT:    s_waitcnt vmcnt(7)
960; VI-NEXT:    v_lshlrev_b32_e32 v3, 8, v12
961; VI-NEXT:    s_waitcnt vmcnt(6)
962; VI-NEXT:    v_or_b32_e32 v3, v3, v13
963; VI-NEXT:    s_waitcnt vmcnt(5)
964; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v4
965; VI-NEXT:    s_waitcnt vmcnt(4)
966; VI-NEXT:    v_or_b32_sdwa v4, v4, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
967; VI-NEXT:    v_or_b32_e32 v3, v4, v3
968; VI-NEXT:    s_waitcnt vmcnt(3)
969; VI-NEXT:    v_lshlrev_b32_e32 v4, 8, v6
970; VI-NEXT:    v_ffbl_b32_e32 v3, v3
971; VI-NEXT:    v_add_u32_e32 v3, vcc, 32, v3
972; VI-NEXT:    s_waitcnt vmcnt(2)
973; VI-NEXT:    v_or_b32_sdwa v4, v4, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
974; VI-NEXT:    s_waitcnt vmcnt(1)
975; VI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
976; VI-NEXT:    s_waitcnt vmcnt(0)
977; VI-NEXT:    v_or_b32_e32 v0, v0, v2
978; VI-NEXT:    v_or_b32_e32 v0, v4, v0
979; VI-NEXT:    v_ffbl_b32_e32 v0, v0
980; VI-NEXT:    v_min_u32_e32 v0, v3, v0
981; VI-NEXT:    v_mov_b32_e32 v3, s1
982; VI-NEXT:    v_min_u32_e32 v0, 64, v0
983; VI-NEXT:    v_mov_b32_e32 v2, s0
984; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
985; VI-NEXT:    s_endpgm
986;
987; EG-LABEL: v_cttz_zero_undef_i64_with_select:
988; EG:       ; %bb.0:
989; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
990; EG-NEXT:    TEX 3 @6
991; EG-NEXT:    ALU 15, @15, KC0[CB0:0-32], KC1[]
992; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
993; EG-NEXT:    CF_END
994; EG-NEXT:    PAD
995; EG-NEXT:    Fetch clause starting at 6:
996; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 6, #1
997; EG-NEXT:     VTX_READ_16 T2.X, T0.X, 0, #1
998; EG-NEXT:     VTX_READ_16 T3.X, T0.X, 2, #1
999; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 4, #1
1000; EG-NEXT:    ALU clause starting at 14:
1001; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1002; EG-NEXT:    ALU clause starting at 15:
1003; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1004; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1005; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
1006; EG-NEXT:     FFBL_INT T1.W, PV.W,
1007; EG-NEXT:     LSHL * T2.W, T3.X, literal.x,
1008; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1009; EG-NEXT:     CNDE_INT T0.W, T0.W, literal.x, PV.W,
1010; EG-NEXT:     OR_INT * T1.W, PS, T2.X,
1011; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1012; EG-NEXT:     FFBL_INT T2.W, PS,
1013; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
1014; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1015; EG-NEXT:     CNDE_INT T0.X, T1.W, PS, PV.W,
1016; EG-NEXT:     MOV T0.Y, 0.0,
1017; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1018; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1019;
1020; GFX9-GISEL-LABEL: v_cttz_zero_undef_i64_with_select:
1021; GFX9-GISEL:       ; %bb.0:
1022; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1023; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1024; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1025; GFX9-GISEL-NEXT:    global_load_ubyte v0, v1, s[2:3]
1026; GFX9-GISEL-NEXT:    global_load_ubyte v2, v1, s[2:3] offset:1
1027; GFX9-GISEL-NEXT:    global_load_ubyte v3, v1, s[2:3] offset:2
1028; GFX9-GISEL-NEXT:    global_load_ubyte v4, v1, s[2:3] offset:3
1029; GFX9-GISEL-NEXT:    global_load_ubyte v5, v1, s[2:3] offset:4
1030; GFX9-GISEL-NEXT:    global_load_ubyte v6, v1, s[2:3] offset:5
1031; GFX9-GISEL-NEXT:    global_load_ubyte v7, v1, s[2:3] offset:6
1032; GFX9-GISEL-NEXT:    global_load_ubyte v8, v1, s[2:3] offset:7
1033; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(6)
1034; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v2, 8, v0
1035; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(5)
1036; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
1037; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(4)
1038; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
1039; GFX9-GISEL-NEXT:    v_or3_b32 v2, v2, v3, v0
1040; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
1041; GFX9-GISEL-NEXT:    v_lshl_or_b32 v4, v6, 8, v5
1042; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
1043; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v7
1044; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1045; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v8, 24, v5
1046; GFX9-GISEL-NEXT:    v_or3_b32 v3, v0, v4, 0
1047; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v4, v3
1048; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v0, v2
1049; GFX9-GISEL-NEXT:    v_add_u32_e32 v4, 32, v4
1050; GFX9-GISEL-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
1051; GFX9-GISEL-NEXT:    v_min_u32_e32 v0, v0, v4
1052; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v0, 64, v0, vcc
1053; GFX9-GISEL-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
1054; GFX9-GISEL-NEXT:    s_endpgm
1055  %val = load i64, ptr addrspace(1) %arrayidx, align 1
  ; i1 true: the intrinsic result is poison for a zero input, so the select
  ; below provides the value (64, the bit width) for that case.
1056  %cttz = tail call i64 @llvm.cttz.i64(i64 %val, i1 true) nounwind readnone
1057  %cttz_ret = icmp ne i64 %val, 0
1058  %ret = select i1 %cttz_ret, i64 %cttz, i64 64
1059  store i64 %ret, ptr addrspace(1) %out, align 4
1060  ret void
1061}
1062
; cttz.i32 with a defined result for zero (i1 false), followed by a select that
; returns -1 when the loaded value equals zero and the cttz result otherwise.
; The i32 is loaded with align 1.
; Expected lowering per run line:
;   - SI and VI runs: the result of v_ffbl_b32 is stored directly; no compare or
;     select remains. NOTE(review): this matches v_ffbl_b32 producing -1 for a
;     zero input -- confirm against the ISA documentation.
;   - r600 run: FFBL_INT followed by two CNDE_INT (literal 32, then literal -1).
;   - GlobalISel run: v_ffbl_b32, v_min_u32 against 32, then v_cmp_eq_u32 +
;     v_cndmask_b32 selecting -1.
1063define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1064; SI-LABEL: v_cttz_i32_sel_eq_neg1:
1065; SI:       ; %bb.0:
1066; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1067; SI-NEXT:    s_mov_b32 s7, 0xf000
1068; SI-NEXT:    s_mov_b32 s6, -1
1069; SI-NEXT:    s_mov_b32 s10, s6
1070; SI-NEXT:    s_mov_b32 s11, s7
1071; SI-NEXT:    s_waitcnt lgkmcnt(0)
1072; SI-NEXT:    s_mov_b32 s8, s2
1073; SI-NEXT:    s_mov_b32 s9, s3
1074; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1075; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:3
1076; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0
1077; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:2
1078; SI-NEXT:    s_mov_b32 s4, s0
1079; SI-NEXT:    s_mov_b32 s5, s1
1080; SI-NEXT:    s_waitcnt vmcnt(3)
1081; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1082; SI-NEXT:    s_waitcnt vmcnt(2)
1083; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1084; SI-NEXT:    s_waitcnt vmcnt(1)
1085; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1086; SI-NEXT:    s_waitcnt vmcnt(0)
1087; SI-NEXT:    v_or_b32_e32 v1, v1, v3
1088; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1089; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1090; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1091; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1092; SI-NEXT:    s_endpgm
1093;
1094; VI-LABEL: v_cttz_i32_sel_eq_neg1:
1095; VI:       ; %bb.0:
1096; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1097; VI-NEXT:    s_waitcnt lgkmcnt(0)
1098; VI-NEXT:    s_add_u32 s4, s2, 3
1099; VI-NEXT:    s_addc_u32 s5, s3, 0
1100; VI-NEXT:    v_mov_b32_e32 v2, s4
1101; VI-NEXT:    v_mov_b32_e32 v3, s5
1102; VI-NEXT:    s_add_u32 s4, s2, 2
1103; VI-NEXT:    v_mov_b32_e32 v0, s2
1104; VI-NEXT:    s_addc_u32 s5, s3, 0
1105; VI-NEXT:    v_mov_b32_e32 v1, s3
1106; VI-NEXT:    s_add_u32 s2, s2, 1
1107; VI-NEXT:    s_addc_u32 s3, s3, 0
1108; VI-NEXT:    v_mov_b32_e32 v4, s4
1109; VI-NEXT:    v_mov_b32_e32 v7, s3
1110; VI-NEXT:    v_mov_b32_e32 v5, s5
1111; VI-NEXT:    v_mov_b32_e32 v6, s2
1112; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1113; VI-NEXT:    flat_load_ubyte v3, v[4:5]
1114; VI-NEXT:    flat_load_ubyte v4, v[6:7]
1115; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1116; VI-NEXT:    s_waitcnt vmcnt(3)
1117; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
1118; VI-NEXT:    s_waitcnt vmcnt(2)
1119; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1120; VI-NEXT:    s_waitcnt vmcnt(1)
1121; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
1122; VI-NEXT:    s_waitcnt vmcnt(0)
1123; VI-NEXT:    v_or_b32_e32 v0, v2, v0
1124; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1125; VI-NEXT:    v_ffbl_b32_e32 v2, v0
1126; VI-NEXT:    v_mov_b32_e32 v0, s0
1127; VI-NEXT:    v_mov_b32_e32 v1, s1
1128; VI-NEXT:    flat_store_dword v[0:1], v2
1129; VI-NEXT:    s_endpgm
1130;
1131; EG-LABEL: v_cttz_i32_sel_eq_neg1:
1132; EG:       ; %bb.0:
1133; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1134; EG-NEXT:    TEX 1 @6
1135; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
1136; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1137; EG-NEXT:    CF_END
1138; EG-NEXT:    PAD
1139; EG-NEXT:    Fetch clause starting at 6:
1140; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1141; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1142; EG-NEXT:    ALU clause starting at 10:
1143; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1144; EG-NEXT:    ALU clause starting at 11:
1145; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1146; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1147; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
1148; EG-NEXT:     FFBL_INT * T1.W, PV.W,
1149; EG-NEXT:     CNDE_INT * T1.W, T0.W, literal.x, PV.W,
1150; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1151; EG-NEXT:     CNDE_INT T0.X, T0.W, literal.x, PV.W,
1152; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1153; EG-NEXT:    -1(nan), 2(2.802597e-45)
1154;
1155; GFX9-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
1156; GFX9-GISEL:       ; %bb.0:
1157; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1158; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1159; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1160; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1161; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
1162; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
1163; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
1164; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
1165; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
1166; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
1167; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
1168; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1169; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
1170; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
1171; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
1172; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
1173; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1174; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, -1, vcc
1175; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1176; GFX9-GISEL-NEXT:    s_endpgm
1177  %val = load i32, ptr addrspace(1) %arrayidx, align 1
  ; i1 false: the intrinsic itself is defined for a zero input; the select
  ; overrides that case with -1.
1178  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1179  %cmp = icmp eq i32 %val, 0
1180  %sel = select i1 %cmp, i32 -1, i32 %cttz
1181  store i32 %sel, ptr addrspace(1) %out
1182  ret void
1183}
1184
; Same pattern as the eq/-1 variant but with the compare inverted: cttz.i32
; with a defined result for zero (i1 false), then a select that keeps the cttz
; result when the loaded value is non-zero and returns -1 otherwise.
; The i32 is loaded with align 1.
; Expected lowering per run line:
;   - SI and VI runs: the result of v_ffbl_b32 is stored directly; no compare or
;     select remains. NOTE(review): this matches v_ffbl_b32 producing -1 for a
;     zero input -- confirm against the ISA documentation.
;   - r600 run: FFBL_INT followed by two CNDE_INT (literal 32, then literal -1).
;   - GlobalISel run: v_ffbl_b32, v_min_u32 against 32, then v_cmp_ne_u32 +
;     v_cndmask_b32 selecting -1.
1185define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1186; SI-LABEL: v_cttz_i32_sel_ne_neg1:
1187; SI:       ; %bb.0:
1188; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1189; SI-NEXT:    s_mov_b32 s7, 0xf000
1190; SI-NEXT:    s_mov_b32 s6, -1
1191; SI-NEXT:    s_mov_b32 s10, s6
1192; SI-NEXT:    s_mov_b32 s11, s7
1193; SI-NEXT:    s_waitcnt lgkmcnt(0)
1194; SI-NEXT:    s_mov_b32 s8, s2
1195; SI-NEXT:    s_mov_b32 s9, s3
1196; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1197; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:3
1198; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0
1199; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:2
1200; SI-NEXT:    s_mov_b32 s4, s0
1201; SI-NEXT:    s_mov_b32 s5, s1
1202; SI-NEXT:    s_waitcnt vmcnt(3)
1203; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1204; SI-NEXT:    s_waitcnt vmcnt(2)
1205; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1206; SI-NEXT:    s_waitcnt vmcnt(1)
1207; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1208; SI-NEXT:    s_waitcnt vmcnt(0)
1209; SI-NEXT:    v_or_b32_e32 v1, v1, v3
1210; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1211; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1212; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1213; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1214; SI-NEXT:    s_endpgm
1215;
1216; VI-LABEL: v_cttz_i32_sel_ne_neg1:
1217; VI:       ; %bb.0:
1218; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1219; VI-NEXT:    s_waitcnt lgkmcnt(0)
1220; VI-NEXT:    s_add_u32 s4, s2, 3
1221; VI-NEXT:    s_addc_u32 s5, s3, 0
1222; VI-NEXT:    v_mov_b32_e32 v2, s4
1223; VI-NEXT:    v_mov_b32_e32 v3, s5
1224; VI-NEXT:    s_add_u32 s4, s2, 2
1225; VI-NEXT:    v_mov_b32_e32 v0, s2
1226; VI-NEXT:    s_addc_u32 s5, s3, 0
1227; VI-NEXT:    v_mov_b32_e32 v1, s3
1228; VI-NEXT:    s_add_u32 s2, s2, 1
1229; VI-NEXT:    s_addc_u32 s3, s3, 0
1230; VI-NEXT:    v_mov_b32_e32 v4, s4
1231; VI-NEXT:    v_mov_b32_e32 v7, s3
1232; VI-NEXT:    v_mov_b32_e32 v5, s5
1233; VI-NEXT:    v_mov_b32_e32 v6, s2
1234; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1235; VI-NEXT:    flat_load_ubyte v3, v[4:5]
1236; VI-NEXT:    flat_load_ubyte v4, v[6:7]
1237; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1238; VI-NEXT:    s_waitcnt vmcnt(3)
1239; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
1240; VI-NEXT:    s_waitcnt vmcnt(2)
1241; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1242; VI-NEXT:    s_waitcnt vmcnt(1)
1243; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
1244; VI-NEXT:    s_waitcnt vmcnt(0)
1245; VI-NEXT:    v_or_b32_e32 v0, v2, v0
1246; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1247; VI-NEXT:    v_ffbl_b32_e32 v2, v0
1248; VI-NEXT:    v_mov_b32_e32 v0, s0
1249; VI-NEXT:    v_mov_b32_e32 v1, s1
1250; VI-NEXT:    flat_store_dword v[0:1], v2
1251; VI-NEXT:    s_endpgm
1252;
1253; EG-LABEL: v_cttz_i32_sel_ne_neg1:
1254; EG:       ; %bb.0:
1255; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1256; EG-NEXT:    TEX 1 @6
1257; EG-NEXT:    ALU 8, @11, KC0[CB0:0-32], KC1[]
1258; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1259; EG-NEXT:    CF_END
1260; EG-NEXT:    PAD
1261; EG-NEXT:    Fetch clause starting at 6:
1262; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1263; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1264; EG-NEXT:    ALU clause starting at 10:
1265; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1266; EG-NEXT:    ALU clause starting at 11:
1267; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1268; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1269; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
1270; EG-NEXT:     FFBL_INT * T1.W, PV.W,
1271; EG-NEXT:     CNDE_INT * T1.W, T0.W, literal.x, PV.W,
1272; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1273; EG-NEXT:     CNDE_INT T0.X, T0.W, literal.x, PV.W,
1274; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1275; EG-NEXT:    -1(nan), 2(2.802597e-45)
1276;
1277; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
1278; GFX9-GISEL:       ; %bb.0:
1279; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1280; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1281; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1282; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1283; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
1284; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
1285; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
1286; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
1287; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
1288; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
1289; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
1290; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1291; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
1292; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
1293; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v1
1294; GFX9-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
1295; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
1296; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v2, vcc
1297; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1298; GFX9-GISEL-NEXT:    s_endpgm
1299  %val = load i32, ptr addrspace(1) %arrayidx, align 1
  ; i1 false: the intrinsic itself is defined for a zero input; the select
  ; overrides that case with -1.
1300  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1301  %cmp = icmp ne i32 %val, 0
1302  %sel = select i1 %cmp, i32 %cttz, i32 -1
1303  store i32 %sel, ptr addrspace(1) %out
1304  ret void
1305}
1306
; cttz.i32 with a defined result for zero (i1 false), followed by a select
; keyed on the cttz RESULT rather than on the input: the result is kept when it
; differs from 32 (the bit width) and replaced by -1 otherwise.
; The i32 is loaded with align 1.
; Expected lowering: unlike the variants that compare the input against zero,
; the SI, VI and GlobalISel runs all keep v_ffbl_b32, v_min_u32 against 32,
; v_cmp_ne_u32 against 32 and a v_cndmask_b32 selecting -1. The r600 run uses
; FFBL_INT, CNDE_INT, SETNE_INT and a final CNDE_INT.
1307define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1308; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1309; SI:       ; %bb.0:
1310; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1311; SI-NEXT:    s_mov_b32 s7, 0xf000
1312; SI-NEXT:    s_mov_b32 s6, -1
1313; SI-NEXT:    s_mov_b32 s10, s6
1314; SI-NEXT:    s_mov_b32 s11, s7
1315; SI-NEXT:    s_waitcnt lgkmcnt(0)
1316; SI-NEXT:    s_mov_b32 s8, s2
1317; SI-NEXT:    s_mov_b32 s9, s3
1318; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1319; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0 offset:3
1320; SI-NEXT:    buffer_load_ubyte v2, off, s[8:11], 0
1321; SI-NEXT:    buffer_load_ubyte v3, off, s[8:11], 0 offset:2
1322; SI-NEXT:    s_mov_b32 s4, s0
1323; SI-NEXT:    s_mov_b32 s5, s1
1324; SI-NEXT:    s_waitcnt vmcnt(3)
1325; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1326; SI-NEXT:    s_waitcnt vmcnt(2)
1327; SI-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
1328; SI-NEXT:    s_waitcnt vmcnt(1)
1329; SI-NEXT:    v_or_b32_e32 v0, v0, v2
1330; SI-NEXT:    s_waitcnt vmcnt(0)
1331; SI-NEXT:    v_or_b32_e32 v1, v1, v3
1332; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
1333; SI-NEXT:    v_or_b32_e32 v0, v1, v0
1334; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1335; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1336; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1337; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1338; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1339; SI-NEXT:    s_endpgm
1340;
1341; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1342; VI:       ; %bb.0:
1343; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1344; VI-NEXT:    s_waitcnt lgkmcnt(0)
1345; VI-NEXT:    s_add_u32 s4, s2, 3
1346; VI-NEXT:    s_addc_u32 s5, s3, 0
1347; VI-NEXT:    v_mov_b32_e32 v2, s4
1348; VI-NEXT:    v_mov_b32_e32 v3, s5
1349; VI-NEXT:    s_add_u32 s4, s2, 2
1350; VI-NEXT:    v_mov_b32_e32 v0, s2
1351; VI-NEXT:    s_addc_u32 s5, s3, 0
1352; VI-NEXT:    v_mov_b32_e32 v1, s3
1353; VI-NEXT:    s_add_u32 s2, s2, 1
1354; VI-NEXT:    s_addc_u32 s3, s3, 0
1355; VI-NEXT:    v_mov_b32_e32 v4, s4
1356; VI-NEXT:    v_mov_b32_e32 v7, s3
1357; VI-NEXT:    v_mov_b32_e32 v5, s5
1358; VI-NEXT:    v_mov_b32_e32 v6, s2
1359; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1360; VI-NEXT:    flat_load_ubyte v3, v[4:5]
1361; VI-NEXT:    flat_load_ubyte v4, v[6:7]
1362; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1363; VI-NEXT:    s_waitcnt vmcnt(3)
1364; VI-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
1365; VI-NEXT:    s_waitcnt vmcnt(2)
1366; VI-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
1367; VI-NEXT:    s_waitcnt vmcnt(1)
1368; VI-NEXT:    v_lshlrev_b32_e32 v2, 8, v4
1369; VI-NEXT:    s_waitcnt vmcnt(0)
1370; VI-NEXT:    v_or_b32_e32 v0, v2, v0
1371; VI-NEXT:    v_or_b32_e32 v0, v1, v0
1372; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1373; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1374; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1375; VI-NEXT:    v_cndmask_b32_e32 v2, -1, v0, vcc
1376; VI-NEXT:    v_mov_b32_e32 v0, s0
1377; VI-NEXT:    v_mov_b32_e32 v1, s1
1378; VI-NEXT:    flat_store_dword v[0:1], v2
1379; VI-NEXT:    s_endpgm
1380;
1381; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
1382; EG:       ; %bb.0:
1383; EG-NEXT:    ALU 0, @10, KC0[CB0:0-32], KC1[]
1384; EG-NEXT:    TEX 1 @6
1385; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
1386; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1387; EG-NEXT:    CF_END
1388; EG-NEXT:    PAD
1389; EG-NEXT:    Fetch clause starting at 6:
1390; EG-NEXT:     VTX_READ_16 T1.X, T0.X, 2, #1
1391; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1392; EG-NEXT:    ALU clause starting at 10:
1393; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1394; EG-NEXT:    ALU clause starting at 11:
1395; EG-NEXT:     LSHL * T0.W, T1.X, literal.x,
1396; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
1397; EG-NEXT:     OR_INT * T0.W, PV.W, T0.X,
1398; EG-NEXT:     FFBL_INT * T1.W, PV.W,
1399; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W,
1400; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1401; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
1402; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1403; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
1404; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1405; EG-NEXT:    -1(nan), 2(2.802597e-45)
1406;
1407; GFX9-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
1408; GFX9-GISEL:       ; %bb.0:
1409; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1410; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1411; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1412; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1413; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
1414; GFX9-GISEL-NEXT:    global_load_ubyte v3, v0, s[2:3] offset:3
1415; GFX9-GISEL-NEXT:    global_load_ubyte v4, v0, s[2:3] offset:2
1416; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(2)
1417; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
1418; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(1)
1419; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
1420; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1421; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v4
1422; GFX9-GISEL-NEXT:    v_or3_b32 v1, v2, v3, v1
1423; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
1424; GFX9-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1425; GFX9-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v1
1426; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, -1, v1, vcc
1427; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
1428; GFX9-GISEL-NEXT:    s_endpgm
1429  %val = load i32, ptr addrspace(1) %arrayidx, align 1
  ; i1 false: the intrinsic is defined for a zero input, so the comparison of
  ; its result against 32 below is well-defined.
1430  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1431  %cmp = icmp ne i32 %cttz, 32
1432  %sel = select i1 %cmp, i32 %cttz, i32 -1
1433  store i32 %sel, ptr addrspace(1) %out
1434  ret void
1435}
1436
; i8 variant of the eq/-1 pattern: cttz.i8 with a defined result for zero
; (i1 false), followed by a select that returns -1 when the loaded byte equals
; zero and the cttz result otherwise.
; Expected lowering per run line:
;   - SI run: a single byte load, v_ffbl_b32 and a byte store; no compare or
;     select remains.
;   - VI run: the value is OR-ed with 0x100 before v_ffbl_b32, then
;     v_cmp_ne_u32 + v_cndmask_b32 pick 0xff for a zero input.
;   - r600 run: FFBL_INT, then mask/shift arithmetic feeding a MEM_RAT MSKOR
;     store.
;   - GlobalISel run: OR with 0x100, v_ffbl_b32, AND with 0xff, then
;     v_cmp_eq_u32 + v_cndmask_b32 selecting 0xff.
1437define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1438; SI-LABEL: v_cttz_i8_sel_eq_neg1:
1439; SI:       ; %bb.0:
1440; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1441; SI-NEXT:    s_mov_b32 s7, 0xf000
1442; SI-NEXT:    s_mov_b32 s6, -1
1443; SI-NEXT:    s_mov_b32 s10, s6
1444; SI-NEXT:    s_mov_b32 s11, s7
1445; SI-NEXT:    s_waitcnt lgkmcnt(0)
1446; SI-NEXT:    s_mov_b32 s8, s2
1447; SI-NEXT:    s_mov_b32 s9, s3
1448; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
1449; SI-NEXT:    s_mov_b32 s4, s0
1450; SI-NEXT:    s_mov_b32 s5, s1
1451; SI-NEXT:    s_waitcnt vmcnt(0)
1452; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1453; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1454; SI-NEXT:    s_endpgm
1455;
1456; VI-LABEL: v_cttz_i8_sel_eq_neg1:
1457; VI:       ; %bb.0:
1458; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1459; VI-NEXT:    s_waitcnt lgkmcnt(0)
1460; VI-NEXT:    v_mov_b32_e32 v0, s2
1461; VI-NEXT:    v_mov_b32_e32 v1, s3
1462; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1463; VI-NEXT:    v_mov_b32_e32 v1, 0xff
1464; VI-NEXT:    s_waitcnt vmcnt(0)
1465; VI-NEXT:    v_or_b32_e32 v2, 0x100, v0
1466; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1467; VI-NEXT:    v_ffbl_b32_e32 v2, v2
1468; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1469; VI-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
1470; VI-NEXT:    v_mov_b32_e32 v0, s0
1471; VI-NEXT:    v_mov_b32_e32 v1, s1
1472; VI-NEXT:    flat_store_byte v[0:1], v2
1473; VI-NEXT:    s_endpgm
1474;
1475; EG-LABEL: v_cttz_i8_sel_eq_neg1:
1476; EG:       ; %bb.0:
1477; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1478; EG-NEXT:    TEX 0 @6
1479; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1480; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1481; EG-NEXT:    CF_END
1482; EG-NEXT:    PAD
1483; EG-NEXT:    Fetch clause starting at 6:
1484; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1485; EG-NEXT:    ALU clause starting at 8:
1486; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1487; EG-NEXT:    ALU clause starting at 9:
1488; EG-NEXT:     FFBL_INT T0.W, T0.X,
1489; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1490; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1491; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1492; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1493; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1494; EG-NEXT:     LSHL T0.X, PV.W, PS,
1495; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1496; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1497; EG-NEXT:     MOV T0.Y, 0.0,
1498; EG-NEXT:     MOV * T0.Z, 0.0,
1499; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1500; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1501;
1502; GFX9-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
1503; GFX9-GISEL:       ; %bb.0:
1504; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1505; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1506; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xff
1507; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1508; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1509; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1510; GFX9-GISEL-NEXT:    v_or_b32_e32 v3, 0x100, v1
1511; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
1512; GFX9-GISEL-NEXT:    v_and_b32_e32 v3, 0xff, v3
1513; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1514; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v2, vcc
1515; GFX9-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
1516; GFX9-GISEL-NEXT:    s_endpgm
1517  %val = load i8, ptr addrspace(1) %arrayidx, align 1
  ; i1 false: the intrinsic itself is defined for a zero input; the select
  ; overrides that case with -1.
1518  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
1519  %cmp = icmp eq i8 %val, 0
1520  %sel = select i1 %cmp, i8 -1, i8 %cttz
1521  store i8 %sel, ptr addrspace(1) %out
1522  ret void
1523}
1524
; Codegen test: cttz of an i16 that is loaded with align 1, where a select
; replaces the result with -1 when the loaded value is zero. The intrinsic is
; called with its second (i1) operand set to false.
; In the expected output below, the SI run stores the v_ffbl_b32 result
; directly with no compare/select, whereas the VI and GFX9-GISEL runs OR
; 0x10000 into the value before the find-first-bit and keep an explicit
; compare against zero plus a select of 0xffff.
; Review note - the SI form presumably relies on v_ffbl_b32 producing -1 for a
; zero source operand; confirm against the ISA documentation.
1525 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) nocapture readonly %arrayidx) nounwind {
1526; SI-LABEL: v_cttz_i16_sel_eq_neg1:
1527; SI:       ; %bb.0:
1528; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1529; SI-NEXT:    s_mov_b32 s7, 0xf000
1530; SI-NEXT:    s_mov_b32 s6, -1
1531; SI-NEXT:    s_mov_b32 s10, s6
1532; SI-NEXT:    s_mov_b32 s11, s7
1533; SI-NEXT:    s_waitcnt lgkmcnt(0)
1534; SI-NEXT:    s_mov_b32 s8, s2
1535; SI-NEXT:    s_mov_b32 s9, s3
1536; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0 offset:1
1537; SI-NEXT:    buffer_load_ubyte v1, off, s[8:11], 0
1538; SI-NEXT:    s_mov_b32 s4, s0
1539; SI-NEXT:    s_mov_b32 s5, s1
1540; SI-NEXT:    s_waitcnt vmcnt(1)
1541; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v0
1542; SI-NEXT:    s_waitcnt vmcnt(0)
1543; SI-NEXT:    v_or_b32_e32 v0, v0, v1
1544; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1545; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1546; SI-NEXT:    s_endpgm
1547;
1548; VI-LABEL: v_cttz_i16_sel_eq_neg1:
1549; VI:       ; %bb.0:
1550; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1551; VI-NEXT:    s_waitcnt lgkmcnt(0)
1552; VI-NEXT:    s_add_u32 s4, s2, 1
1553; VI-NEXT:    s_addc_u32 s5, s3, 0
1554; VI-NEXT:    v_mov_b32_e32 v2, s4
1555; VI-NEXT:    v_mov_b32_e32 v0, s2
1556; VI-NEXT:    v_mov_b32_e32 v3, s5
1557; VI-NEXT:    v_mov_b32_e32 v1, s3
1558; VI-NEXT:    flat_load_ubyte v2, v[2:3]
1559; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1560; VI-NEXT:    s_waitcnt vmcnt(1)
1561; VI-NEXT:    v_readfirstlane_b32 s2, v2
1562; VI-NEXT:    s_waitcnt vmcnt(0)
1563; VI-NEXT:    v_readfirstlane_b32 s3, v0
1564; VI-NEXT:    s_lshl_b32 s2, s2, 8
1565; VI-NEXT:    s_or_b32 s2, s2, s3
1566; VI-NEXT:    s_or_b32 s3, s2, 0x10000
1567; VI-NEXT:    s_and_b32 s2, s2, 0xffff
1568; VI-NEXT:    s_ff1_i32_b32 s3, s3
1569; VI-NEXT:    s_cmp_lg_u32 s2, 0
1570; VI-NEXT:    s_cselect_b32 s2, s3, 0xffff
1571; VI-NEXT:    v_mov_b32_e32 v0, s0
1572; VI-NEXT:    v_mov_b32_e32 v1, s1
1573; VI-NEXT:    v_mov_b32_e32 v2, s2
1574; VI-NEXT:    flat_store_short v[0:1], v2
1575; VI-NEXT:    s_endpgm
1576;
1577; EG-LABEL: v_cttz_i16_sel_eq_neg1:
1578; EG:       ; %bb.0:
1579; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1580; EG-NEXT:    TEX 0 @6
1581; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1582; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1583; EG-NEXT:    CF_END
1584; EG-NEXT:    PAD
1585; EG-NEXT:    Fetch clause starting at 6:
1586; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1587; EG-NEXT:    ALU clause starting at 8:
1588; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1589; EG-NEXT:    ALU clause starting at 9:
1590; EG-NEXT:     FFBL_INT T0.W, T0.X,
1591; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1592; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1593; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1594; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1595; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1596; EG-NEXT:     LSHL T0.X, PV.W, PS,
1597; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1598; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1599; EG-NEXT:     MOV T0.Y, 0.0,
1600; EG-NEXT:     MOV * T0.Z, 0.0,
1601; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1602; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1603;
1604; GFX9-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
1605; GFX9-GISEL:       ; %bb.0:
1606; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1607; GFX9-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1608; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff
1609; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1610; GFX9-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
1611; GFX9-GISEL-NEXT:    global_load_ubyte v2, v0, s[2:3] offset:1
1612; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
1613; GFX9-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
1614; GFX9-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
1615; GFX9-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
1616; GFX9-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1617; GFX9-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
1618; GFX9-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
1619; GFX9-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1620; GFX9-GISEL-NEXT:    s_endpgm
  ; align 1 on an i16 load: the SI, VI and GFX9-GISEL expectations above use
  ; two byte loads that are recombined with a shift-by-8 and OR.
1621  %val = load i16, ptr addrspace(1) %arrayidx, align 1
  ; Despite its name, %ctlz holds the result of llvm.cttz (count trailing
  ; zeros), not ctlz.
1622  %ctlz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
  ; Zero input selects the constant -1; any other input keeps the cttz result.
1623  %cmp = icmp eq i16 %val, 0
1624  %sel = select i1 %cmp, i16 -1, i16 %ctlz
1625  store i16 %sel, ptr addrspace(1) %out
1626  ret void
1627}
1628
1629
1630