xref: /llvm-project/llvm/test/CodeGen/AMDGPU/cttz.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4; RUN: llc < %s -mtriple=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
7
8declare i7 @llvm.cttz.i7(i7, i1) nounwind readnone
9declare i8 @llvm.cttz.i8(i8, i1) nounwind readnone
10declare i16 @llvm.cttz.i16(i16, i1) nounwind readnone
11
12declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
13declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>, i1) nounwind readnone
14declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
15
16declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
17declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>, i1) nounwind readnone
18declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>, i1) nounwind readnone
19
20declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
21
; Kernel-argument case: %val is passed directly to llvm.cttz.i32 with the
; second operand set to false, and the i32 result is stored to %out.
; The amdgcn checks below expect s_ff1_i32_b32 followed by s_min_u32 with 32;
; the r600 (EG prefix) checks expect FFBL_INT followed by a CNDE_INT that
; selects the literal 32.
22define amdgpu_kernel void @s_cttz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
23; SI-LABEL: s_cttz_i32:
24; SI:       ; %bb.0:
25; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
26; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
27; SI-NEXT:    s_mov_b32 s3, 0xf000
28; SI-NEXT:    s_waitcnt lgkmcnt(0)
29; SI-NEXT:    s_ff1_i32_b32 s2, s2
30; SI-NEXT:    s_min_u32 s4, s2, 32
31; SI-NEXT:    s_mov_b32 s2, -1
32; SI-NEXT:    v_mov_b32_e32 v0, s4
33; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
34; SI-NEXT:    s_endpgm
35;
36; VI-LABEL: s_cttz_i32:
37; VI:       ; %bb.0:
38; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
39; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
40; VI-NEXT:    s_mov_b32 s3, 0xf000
41; VI-NEXT:    s_mov_b32 s2, -1
42; VI-NEXT:    s_waitcnt lgkmcnt(0)
43; VI-NEXT:    s_ff1_i32_b32 s4, s6
44; VI-NEXT:    s_min_u32 s4, s4, 32
45; VI-NEXT:    v_mov_b32_e32 v0, s4
46; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
47; VI-NEXT:    s_endpgm
48;
49; EG-LABEL: s_cttz_i32:
50; EG:       ; %bb.0:
51; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
52; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
53; EG-NEXT:    CF_END
54; EG-NEXT:    PAD
55; EG-NEXT:    ALU clause starting at 4:
56; EG-NEXT:     FFBL_INT * T0.W, KC0[2].Z,
57; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
58; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
59; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
60;
61; GFX10-LABEL: s_cttz_i32:
62; GFX10:       ; %bb.0:
63; GFX10-NEXT:    s_clause 0x1
64; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
65; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
66; GFX10-NEXT:    v_mov_b32_e32 v0, 0
67; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
68; GFX10-NEXT:    s_ff1_i32_b32 s2, s2
69; GFX10-NEXT:    s_min_u32 s2, s2, 32
70; GFX10-NEXT:    v_mov_b32_e32 v1, s2
71; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
72; GFX10-NEXT:    s_endpgm
73;
74; GFX10-GISEL-LABEL: s_cttz_i32:
75; GFX10-GISEL:       ; %bb.0:
76; GFX10-GISEL-NEXT:    s_clause 0x1
77; GFX10-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
78; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
79; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
80; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
81; GFX10-GISEL-NEXT:    s_ff1_i32_b32 s2, s2
82; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 32
83; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
84; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
85; GFX10-GISEL-NEXT:    s_endpgm
86  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
87  store i32 %cttz, ptr addrspace(1) %out, align 4
88  ret void
89}
90
; Loaded-value case: an i32 is loaded from %valptr at the index given by
; llvm.amdgcn.workitem.id.x, passed to llvm.cttz.i32 with the second operand
; set to false, and the result is stored to %out (not indexed by the work-item).
; The amdgcn checks below expect v_ffbl_b32 followed by v_min_u32 with 32; the
; r600 (EG prefix) checks expect FFBL_INT followed by a CNDE_INT selecting 32.
91define amdgpu_kernel void @v_cttz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
92; SI-LABEL: v_cttz_i32:
93; SI:       ; %bb.0:
94; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
95; SI-NEXT:    s_mov_b32 s7, 0xf000
96; SI-NEXT:    s_mov_b32 s10, 0
97; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
98; SI-NEXT:    v_mov_b32_e32 v1, 0
99; SI-NEXT:    s_mov_b32 s11, s7
100; SI-NEXT:    s_waitcnt lgkmcnt(0)
101; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
102; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
103; SI-NEXT:    s_mov_b32 s6, -1
104; SI-NEXT:    s_mov_b32 s4, s0
105; SI-NEXT:    s_mov_b32 s5, s1
106; SI-NEXT:    s_waitcnt vmcnt(0)
107; SI-NEXT:    v_ffbl_b32_e32 v0, v0
108; SI-NEXT:    v_min_u32_e32 v0, 32, v0
109; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
110; SI-NEXT:    s_endpgm
111;
112; VI-LABEL: v_cttz_i32:
113; VI:       ; %bb.0:
114; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
115; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
116; VI-NEXT:    s_waitcnt lgkmcnt(0)
117; VI-NEXT:    v_mov_b32_e32 v1, s3
118; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
119; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
120; VI-NEXT:    flat_load_dword v0, v[0:1]
121; VI-NEXT:    s_mov_b32 s3, 0xf000
122; VI-NEXT:    s_mov_b32 s2, -1
123; VI-NEXT:    s_waitcnt vmcnt(0)
124; VI-NEXT:    v_ffbl_b32_e32 v0, v0
125; VI-NEXT:    v_min_u32_e32 v0, 32, v0
126; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
127; VI-NEXT:    s_endpgm
128;
129; EG-LABEL: v_cttz_i32:
130; EG:       ; %bb.0:
131; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
132; EG-NEXT:    TEX 0 @6
133; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
134; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
135; EG-NEXT:    CF_END
136; EG-NEXT:    PAD
137; EG-NEXT:    Fetch clause starting at 6:
138; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
139; EG-NEXT:    ALU clause starting at 8:
140; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
141; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
142; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
143; EG-NEXT:    ALU clause starting at 11:
144; EG-NEXT:     FFBL_INT * T0.W, T0.X,
145; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
146; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
147; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
148;
149; GFX10-LABEL: v_cttz_i32:
150; GFX10:       ; %bb.0:
151; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
152; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
153; GFX10-NEXT:    v_mov_b32_e32 v1, 0
154; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
155; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
156; GFX10-NEXT:    s_waitcnt vmcnt(0)
157; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
158; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
159; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
160; GFX10-NEXT:    s_endpgm
161;
162; GFX10-GISEL-LABEL: v_cttz_i32:
163; GFX10-GISEL:       ; %bb.0:
164; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
165; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
166; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
167; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
168; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
169; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
170; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
171; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
172; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
173; GFX10-GISEL-NEXT:    s_endpgm
174  %tid = call i32 @llvm.amdgcn.workitem.id.x()
175  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
176  %val = load i32, ptr addrspace(1) %in.gep, align 4
177  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
178  store i32 %cttz, ptr addrspace(1) %out, align 4
179  ret void
180}
181
; <2 x i32> vector case: a vector is loaded from %valptr at the index given by
; llvm.amdgcn.workitem.id.x, passed to llvm.cttz.v2i32 with the second operand
; set to false, and the vector result is stored to %out.
; The amdgcn checks below expect one v_ffbl_b32 / v_min_u32 (with 32) pair per
; element and a single dwordx2 store.
182define amdgpu_kernel void @v_cttz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
183; SI-LABEL: v_cttz_v2i32:
184; SI:       ; %bb.0:
185; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
186; SI-NEXT:    s_mov_b32 s7, 0xf000
187; SI-NEXT:    s_mov_b32 s10, 0
188; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
189; SI-NEXT:    v_mov_b32_e32 v1, 0
190; SI-NEXT:    s_mov_b32 s11, s7
191; SI-NEXT:    s_waitcnt lgkmcnt(0)
192; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
193; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
194; SI-NEXT:    s_mov_b32 s6, -1
195; SI-NEXT:    s_mov_b32 s4, s0
196; SI-NEXT:    s_mov_b32 s5, s1
197; SI-NEXT:    s_waitcnt vmcnt(0)
198; SI-NEXT:    v_ffbl_b32_e32 v1, v1
199; SI-NEXT:    v_ffbl_b32_e32 v0, v0
200; SI-NEXT:    v_min_u32_e32 v1, 32, v1
201; SI-NEXT:    v_min_u32_e32 v0, 32, v0
202; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
203; SI-NEXT:    s_endpgm
204;
205; VI-LABEL: v_cttz_v2i32:
206; VI:       ; %bb.0:
207; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
208; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
209; VI-NEXT:    s_waitcnt lgkmcnt(0)
210; VI-NEXT:    v_mov_b32_e32 v1, s3
211; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
212; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
213; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
214; VI-NEXT:    s_mov_b32 s3, 0xf000
215; VI-NEXT:    s_mov_b32 s2, -1
216; VI-NEXT:    s_waitcnt vmcnt(0)
217; VI-NEXT:    v_ffbl_b32_e32 v1, v1
218; VI-NEXT:    v_ffbl_b32_e32 v0, v0
219; VI-NEXT:    v_min_u32_e32 v1, 32, v1
220; VI-NEXT:    v_min_u32_e32 v0, 32, v0
221; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
222; VI-NEXT:    s_endpgm
223;
224; EG-LABEL: v_cttz_v2i32:
225; EG:       ; %bb.0:
226; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
227; EG-NEXT:    TEX 0 @6
228; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
229; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
230; EG-NEXT:    CF_END
231; EG-NEXT:    PAD
232; EG-NEXT:    Fetch clause starting at 6:
233; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
234; EG-NEXT:    ALU clause starting at 8:
235; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
236; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
237; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
238; EG-NEXT:    ALU clause starting at 11:
239; EG-NEXT:     FFBL_INT * T0.W, T0.Y,
240; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
241; EG-NEXT:     FFBL_INT * T0.W, T0.X,
242; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
243; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
244; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
245; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
246;
247; GFX10-LABEL: v_cttz_v2i32:
248; GFX10:       ; %bb.0:
249; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
250; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
251; GFX10-NEXT:    v_mov_b32_e32 v2, 0
252; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
253; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
254; GFX10-NEXT:    s_waitcnt vmcnt(0)
255; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
256; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
257; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
258; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
259; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
260; GFX10-NEXT:    s_endpgm
261;
262; GFX10-GISEL-LABEL: v_cttz_v2i32:
263; GFX10-GISEL:       ; %bb.0:
264; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
265; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
266; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
267; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
268; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
269; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
270; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
271; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
272; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
273; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
274; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
275; GFX10-GISEL-NEXT:    s_endpgm
276  %tid = call i32 @llvm.amdgcn.workitem.id.x()
277  %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
278  %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
279  %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
280  store <2 x i32> %cttz, ptr addrspace(1) %out, align 8
281  ret void
282}
283
; <4 x i32> vector case: a vector is loaded from %valptr at the index given by
; llvm.amdgcn.workitem.id.x, passed to llvm.cttz.v4i32 with the second operand
; set to false, and the vector result is stored to %out.
; The amdgcn checks below expect four v_ffbl_b32 / v_min_u32 (with 32) pairs
; and a single dwordx4 store.
284define amdgpu_kernel void @v_cttz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
285; SI-LABEL: v_cttz_v4i32:
286; SI:       ; %bb.0:
287; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
288; SI-NEXT:    s_mov_b32 s7, 0xf000
289; SI-NEXT:    s_mov_b32 s10, 0
290; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
291; SI-NEXT:    v_mov_b32_e32 v1, 0
292; SI-NEXT:    s_mov_b32 s11, s7
293; SI-NEXT:    s_waitcnt lgkmcnt(0)
294; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
295; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
296; SI-NEXT:    s_mov_b32 s6, -1
297; SI-NEXT:    s_mov_b32 s4, s0
298; SI-NEXT:    s_mov_b32 s5, s1
299; SI-NEXT:    s_waitcnt vmcnt(0)
300; SI-NEXT:    v_ffbl_b32_e32 v3, v3
301; SI-NEXT:    v_ffbl_b32_e32 v2, v2
302; SI-NEXT:    v_ffbl_b32_e32 v1, v1
303; SI-NEXT:    v_ffbl_b32_e32 v0, v0
304; SI-NEXT:    v_min_u32_e32 v3, 32, v3
305; SI-NEXT:    v_min_u32_e32 v2, 32, v2
306; SI-NEXT:    v_min_u32_e32 v1, 32, v1
307; SI-NEXT:    v_min_u32_e32 v0, 32, v0
308; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
309; SI-NEXT:    s_endpgm
310;
311; VI-LABEL: v_cttz_v4i32:
312; VI:       ; %bb.0:
313; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
314; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
315; VI-NEXT:    s_waitcnt lgkmcnt(0)
316; VI-NEXT:    v_mov_b32_e32 v1, s3
317; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
318; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
319; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
320; VI-NEXT:    s_mov_b32 s3, 0xf000
321; VI-NEXT:    s_mov_b32 s2, -1
322; VI-NEXT:    s_waitcnt vmcnt(0)
323; VI-NEXT:    v_ffbl_b32_e32 v3, v3
324; VI-NEXT:    v_ffbl_b32_e32 v2, v2
325; VI-NEXT:    v_ffbl_b32_e32 v1, v1
326; VI-NEXT:    v_ffbl_b32_e32 v0, v0
327; VI-NEXT:    v_min_u32_e32 v3, 32, v3
328; VI-NEXT:    v_min_u32_e32 v2, 32, v2
329; VI-NEXT:    v_min_u32_e32 v1, 32, v1
330; VI-NEXT:    v_min_u32_e32 v0, 32, v0
331; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
332; VI-NEXT:    s_endpgm
333;
334; EG-LABEL: v_cttz_v4i32:
335; EG:       ; %bb.0:
336; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
337; EG-NEXT:    TEX 0 @6
338; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
339; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
340; EG-NEXT:    CF_END
341; EG-NEXT:    PAD
342; EG-NEXT:    Fetch clause starting at 6:
343; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
344; EG-NEXT:    ALU clause starting at 8:
345; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
346; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
347; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
348; EG-NEXT:    ALU clause starting at 11:
349; EG-NEXT:     FFBL_INT * T1.W, T0.W,
350; EG-NEXT:     FFBL_INT T2.W, T0.Z,
351; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
352; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
353; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
354; EG-NEXT:     FFBL_INT * T1.W, T0.Y,
355; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
356; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
357; EG-NEXT:     FFBL_INT * T1.W, T0.X,
358; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
359; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
360; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
361; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
362;
363; GFX10-LABEL: v_cttz_v4i32:
364; GFX10:       ; %bb.0:
365; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
366; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
367; GFX10-NEXT:    v_mov_b32_e32 v4, 0
368; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
369; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
370; GFX10-NEXT:    s_waitcnt vmcnt(0)
371; GFX10-NEXT:    v_ffbl_b32_e32 v3, v3
372; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
373; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
374; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
375; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
376; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
377; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
378; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
379; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
380; GFX10-NEXT:    s_endpgm
381;
382; GFX10-GISEL-LABEL: v_cttz_v4i32:
383; GFX10-GISEL:       ; %bb.0:
384; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
385; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
386; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
387; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
388; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
389; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
390; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
391; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
392; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
393; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v3, v3
394; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
395; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
396; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
397; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
398; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
399; GFX10-GISEL-NEXT:    s_endpgm
400  %tid = call i32 @llvm.amdgcn.workitem.id.x()
401  %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
402  %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
403  %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
404  store <4 x i32> %cttz, ptr addrspace(1) %out, align 16
405  ret void
406}
407
; i8 case: a single byte is loaded directly from %valptr (no work-item
; indexing), passed to llvm.cttz.i8 with the second operand set to false, and
; the i8 result is stored to %out.
; The checks below expect the loaded byte to be OR'd with 0x100 (256) before
; the 32-bit find-first-bit instruction, with no min/select afterwards.
408define amdgpu_kernel void @v_cttz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
409; SI-LABEL: v_cttz_i8:
410; SI:       ; %bb.0:
411; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
412; SI-NEXT:    s_mov_b32 s7, 0xf000
413; SI-NEXT:    s_mov_b32 s6, -1
414; SI-NEXT:    s_mov_b32 s10, s6
415; SI-NEXT:    s_mov_b32 s11, s7
416; SI-NEXT:    s_waitcnt lgkmcnt(0)
417; SI-NEXT:    s_mov_b32 s8, s2
418; SI-NEXT:    s_mov_b32 s9, s3
419; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
420; SI-NEXT:    s_mov_b32 s4, s0
421; SI-NEXT:    s_mov_b32 s5, s1
422; SI-NEXT:    s_waitcnt vmcnt(0)
423; SI-NEXT:    v_or_b32_e32 v0, 0x100, v0
424; SI-NEXT:    v_ffbl_b32_e32 v0, v0
425; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
426; SI-NEXT:    s_endpgm
427;
428; VI-LABEL: v_cttz_i8:
429; VI:       ; %bb.0:
430; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
431; VI-NEXT:    s_mov_b32 s7, 0xf000
432; VI-NEXT:    s_mov_b32 s6, -1
433; VI-NEXT:    s_mov_b32 s10, s6
434; VI-NEXT:    s_mov_b32 s11, s7
435; VI-NEXT:    s_waitcnt lgkmcnt(0)
436; VI-NEXT:    s_mov_b32 s8, s2
437; VI-NEXT:    s_mov_b32 s9, s3
438; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
439; VI-NEXT:    s_mov_b32 s4, s0
440; VI-NEXT:    s_mov_b32 s5, s1
441; VI-NEXT:    s_waitcnt vmcnt(0)
442; VI-NEXT:    v_or_b32_e32 v0, 0x100, v0
443; VI-NEXT:    v_ffbl_b32_e32 v0, v0
444; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
445; VI-NEXT:    s_endpgm
446;
447; EG-LABEL: v_cttz_i8:
448; EG:       ; %bb.0:
449; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
450; EG-NEXT:    TEX 0 @6
451; EG-NEXT:    ALU 14, @9, KC0[CB0:0-32], KC1[]
452; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
453; EG-NEXT:    CF_END
454; EG-NEXT:    PAD
455; EG-NEXT:    Fetch clause starting at 6:
456; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
457; EG-NEXT:    ALU clause starting at 8:
458; EG-NEXT:     MOV * T0.X, KC0[2].Z,
459; EG-NEXT:    ALU clause starting at 9:
460; EG-NEXT:     OR_INT * T0.W, T0.X, literal.x,
461; EG-NEXT:    256(3.587324e-43), 0(0.000000e+00)
462; EG-NEXT:     FFBL_INT T0.W, PV.W,
463; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
464; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
465; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
466; EG-NEXT:     LSHL * T1.W, PS, literal.y,
467; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
468; EG-NEXT:     LSHL T0.X, PV.W, PS,
469; EG-NEXT:     LSHL * T0.W, literal.x, PS,
470; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
471; EG-NEXT:     MOV T0.Y, 0.0,
472; EG-NEXT:     MOV * T0.Z, 0.0,
473; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
474; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
475;
476; GFX10-LABEL: v_cttz_i8:
477; GFX10:       ; %bb.0:
478; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
479; GFX10-NEXT:    v_mov_b32_e32 v0, 0
480; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
481; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
482; GFX10-NEXT:    s_waitcnt vmcnt(0)
483; GFX10-NEXT:    v_or_b32_e32 v1, 0x100, v1
484; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
485; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
486; GFX10-NEXT:    s_endpgm
487;
488; GFX10-GISEL-LABEL: v_cttz_i8:
489; GFX10-GISEL:       ; %bb.0:
490; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
491; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
492; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
493; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
494; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
495; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v1
496; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
497; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
498; GFX10-GISEL-NEXT:    s_endpgm
499  %val = load i8, ptr addrspace(1) %valptr
500  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
501  store i8 %cttz, ptr addrspace(1) %out
502  ret void
503}
504
; i64 kernel-argument case: %val follows an unnamed [8 x i32] argument in the
; signature, is passed to llvm.cttz.i64 with the second operand set to false,
; and the i64 result is stored to %out.
; The amdgcn checks below expect s_ff1_i32_b64 followed by s_min_u32 with 64,
; with the upper dword of the stored value set to 0; the r600 (EG prefix)
; checks expect two FFBL_INT results combined with CNDE_INT / ADD_INT of 32.
505define amdgpu_kernel void @s_cttz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
506; SI-LABEL: s_cttz_i64:
507; SI:       ; %bb.0:
508; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
509; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
510; SI-NEXT:    s_mov_b32 s3, 0xf000
511; SI-NEXT:    s_mov_b32 s2, -1
512; SI-NEXT:    s_waitcnt lgkmcnt(0)
513; SI-NEXT:    s_ff1_i32_b64 s4, s[6:7]
514; SI-NEXT:    s_min_u32 s4, s4, 64
515; SI-NEXT:    v_mov_b32_e32 v1, 0
516; SI-NEXT:    v_mov_b32_e32 v0, s4
517; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
518; SI-NEXT:    s_endpgm
519;
520; VI-LABEL: s_cttz_i64:
521; VI:       ; %bb.0:
522; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
523; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
524; VI-NEXT:    s_mov_b32 s3, 0xf000
525; VI-NEXT:    s_mov_b32 s2, -1
526; VI-NEXT:    v_mov_b32_e32 v1, 0
527; VI-NEXT:    s_waitcnt lgkmcnt(0)
528; VI-NEXT:    s_ff1_i32_b64 s4, s[6:7]
529; VI-NEXT:    s_min_u32 s4, s4, 64
530; VI-NEXT:    v_mov_b32_e32 v0, s4
531; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
532; VI-NEXT:    s_endpgm
533;
534; EG-LABEL: s_cttz_i64:
535; EG:       ; %bb.0:
536; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
537; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
538; EG-NEXT:    CF_END
539; EG-NEXT:    PAD
540; EG-NEXT:    ALU clause starting at 4:
541; EG-NEXT:     FFBL_INT * T0.W, KC0[5].X,
542; EG-NEXT:     CNDE_INT * T0.W, KC0[5].X, literal.x, PV.W,
543; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
544; EG-NEXT:     FFBL_INT T1.W, KC0[4].W,
545; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
546; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
547; EG-NEXT:     CNDE_INT T0.X, KC0[4].W, PS, PV.W,
548; EG-NEXT:     MOV T0.Y, 0.0,
549; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
550; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
551;
552; GFX10-LABEL: s_cttz_i64:
553; GFX10:       ; %bb.0:
554; GFX10-NEXT:    s_clause 0x1
555; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
556; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
557; GFX10-NEXT:    v_mov_b32_e32 v1, 0
558; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
559; GFX10-NEXT:    s_ff1_i32_b64 s0, s[0:1]
560; GFX10-NEXT:    s_min_u32 s0, s0, 64
561; GFX10-NEXT:    v_mov_b32_e32 v0, s0
562; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[2:3]
563; GFX10-NEXT:    s_endpgm
564;
565; GFX10-GISEL-LABEL: s_cttz_i64:
566; GFX10-GISEL:       ; %bb.0:
567; GFX10-GISEL-NEXT:    s_clause 0x1
568; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
569; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
570; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
571; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
572; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s0, s[0:1]
573; GFX10-GISEL-NEXT:    s_mov_b32 s1, 0
574; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
575; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
576; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
577; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
578; GFX10-GISEL-NEXT:    s_endpgm
579  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
580  store i64 %cttz, ptr addrspace(1) %out
581  ret void
582}
583
; i64 kernel-argument case with a truncated result: %val is passed to
; llvm.cttz.i64 with the second operand set to false, the i64 result is
; truncated to i32, and only that i32 is stored to %out.
; The amdgcn checks below expect s_ff1_i32_b64 followed by s_min_u32 with 64
; and a single-dword store.
584define amdgpu_kernel void @s_cttz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind {
585; SI-LABEL: s_cttz_i64_trunc:
586; SI:       ; %bb.0:
587; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
588; SI-NEXT:    s_mov_b32 s7, 0xf000
589; SI-NEXT:    s_waitcnt lgkmcnt(0)
590; SI-NEXT:    s_ff1_i32_b64 s2, s[2:3]
591; SI-NEXT:    s_min_u32 s2, s2, 64
592; SI-NEXT:    s_mov_b32 s6, -1
593; SI-NEXT:    s_mov_b32 s4, s0
594; SI-NEXT:    s_mov_b32 s5, s1
595; SI-NEXT:    v_mov_b32_e32 v0, s2
596; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
597; SI-NEXT:    s_endpgm
598;
599; VI-LABEL: s_cttz_i64_trunc:
600; VI:       ; %bb.0:
601; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
602; VI-NEXT:    s_mov_b32 s7, 0xf000
603; VI-NEXT:    s_mov_b32 s6, -1
604; VI-NEXT:    s_waitcnt lgkmcnt(0)
605; VI-NEXT:    s_mov_b32 s4, s0
606; VI-NEXT:    s_ff1_i32_b64 s0, s[2:3]
607; VI-NEXT:    s_min_u32 s0, s0, 64
608; VI-NEXT:    s_mov_b32 s5, s1
609; VI-NEXT:    v_mov_b32_e32 v0, s0
610; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
611; VI-NEXT:    s_endpgm
612;
613; EG-LABEL: s_cttz_i64_trunc:
614; EG:       ; %bb.0:
615; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
616; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
617; EG-NEXT:    CF_END
618; EG-NEXT:    PAD
619; EG-NEXT:    ALU clause starting at 4:
620; EG-NEXT:     FFBL_INT * T0.W, KC0[3].X,
621; EG-NEXT:     CNDE_INT * T0.W, KC0[3].X, literal.x, PV.W,
622; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
623; EG-NEXT:     FFBL_INT T1.W, KC0[2].W,
624; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
625; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
626; EG-NEXT:     CNDE_INT T0.X, KC0[2].W, PS, PV.W,
627; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
628; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
629;
630; GFX10-LABEL: s_cttz_i64_trunc:
631; GFX10:       ; %bb.0:
632; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
633; GFX10-NEXT:    v_mov_b32_e32 v0, 0
634; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
635; GFX10-NEXT:    s_ff1_i32_b64 s2, s[2:3]
636; GFX10-NEXT:    s_min_u32 s2, s2, 64
637; GFX10-NEXT:    v_mov_b32_e32 v1, s2
638; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
639; GFX10-NEXT:    s_endpgm
640;
641; GFX10-GISEL-LABEL: s_cttz_i64_trunc:
642; GFX10-GISEL:       ; %bb.0:
643; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
644; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
645; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
646; GFX10-GISEL-NEXT:    s_ff1_i32_b64 s2, s[2:3]
647; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
648; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
649; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
650; GFX10-GISEL-NEXT:    s_endpgm
651  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
652  %trunc = trunc i64 %cttz to i32
653  store i32 %trunc, ptr addrspace(1) %out
654  ret void
655}
656
; i64 loaded-value case: an i64 is loaded from %in at the index given by
; llvm.amdgcn.workitem.id.x, passed to llvm.cttz.i64 with the second operand
; set to false, and the i64 result is stored to %out at the same index.
; The amdgcn checks below expect v_ffbl_b32 on each 32-bit half, 32 added to
; the count from the second loaded dword (via "clamp" on VI/GFX10, or a
; v_min_u32 with 0xffffffdf before the add on SI), and a final min against 64
; (v_min3_u32, or two v_min_u32 under the GFX10-GISEL prefix).
657define amdgpu_kernel void @v_cttz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
658; SI-LABEL: v_cttz_i64:
659; SI:       ; %bb.0:
660; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
661; SI-NEXT:    s_mov_b32 s7, 0xf000
662; SI-NEXT:    s_mov_b32 s6, 0
663; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
664; SI-NEXT:    v_mov_b32_e32 v1, 0
665; SI-NEXT:    s_waitcnt lgkmcnt(0)
666; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
667; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
668; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
669; SI-NEXT:    s_waitcnt vmcnt(0)
670; SI-NEXT:    v_ffbl_b32_e32 v3, v3
671; SI-NEXT:    v_min_u32_e32 v3, 0xffffffdf, v3
672; SI-NEXT:    v_add_i32_e32 v3, vcc, 32, v3
673; SI-NEXT:    v_ffbl_b32_e32 v2, v2
674; SI-NEXT:    v_min3_u32 v2, v2, v3, 64
675; SI-NEXT:    v_mov_b32_e32 v3, v1
676; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
677; SI-NEXT:    s_endpgm
678;
679; VI-LABEL: v_cttz_i64:
680; VI:       ; %bb.0:
681; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
682; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
683; VI-NEXT:    v_mov_b32_e32 v2, 0
684; VI-NEXT:    s_waitcnt lgkmcnt(0)
685; VI-NEXT:    v_mov_b32_e32 v1, s3
686; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
687; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
688; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
689; VI-NEXT:    v_mov_b32_e32 v4, s1
690; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
691; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
692; VI-NEXT:    s_waitcnt vmcnt(0)
693; VI-NEXT:    v_ffbl_b32_e32 v1, v1
694; VI-NEXT:    v_add_u32_e64 v1, s[0:1], v1, 32 clamp
695; VI-NEXT:    v_ffbl_b32_e32 v0, v0
696; VI-NEXT:    v_min3_u32 v1, v0, v1, 64
697; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
698; VI-NEXT:    s_endpgm
699;
700; EG-LABEL: v_cttz_i64:
701; EG:       ; %bb.0:
702; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
703; EG-NEXT:    TEX 0 @6
704; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
705; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
706; EG-NEXT:    CF_END
707; EG-NEXT:    PAD
708; EG-NEXT:    Fetch clause starting at 6:
709; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
710; EG-NEXT:    ALU clause starting at 8:
711; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
712; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
713; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
714; EG-NEXT:    ALU clause starting at 11:
715; EG-NEXT:     FFBL_INT * T1.W, T0.Y,
716; EG-NEXT:     CNDE_INT * T1.W, T0.Y, literal.x, PV.W,
717; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
718; EG-NEXT:     FFBL_INT T2.W, T0.X,
719; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
720; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
721; EG-NEXT:     CNDE_INT T0.X, T0.X, PS, PV.W,
722; EG-NEXT:     MOV T0.Y, 0.0,
723; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
724; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
725; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
726;
727; GFX10-LABEL: v_cttz_i64:
728; GFX10:       ; %bb.0:
729; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
730; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
731; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
732; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
733; GFX10-NEXT:    s_waitcnt vmcnt(0)
734; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
735; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
736; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
737; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
738; GFX10-NEXT:    v_mov_b32_e32 v1, 0
739; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
740; GFX10-NEXT:    s_endpgm
741;
742; GFX10-GISEL-LABEL: v_cttz_i64:
743; GFX10-GISEL:       ; %bb.0:
744; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
745; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
746; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
747; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
748; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
749; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
750; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
751; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
752; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
753; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
754; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
755; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
756; GFX10-GISEL-NEXT:    s_endpgm
757  %tid = call i32 @llvm.amdgcn.workitem.id.x()
758  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
759  %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
760  %val = load i64, ptr addrspace(1) %in.gep
761  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
762  store i64 %cttz, ptr addrspace(1) %out.gep
763  ret void
764}
765
; v_cttz_i64_trunc - each workitem loads the i64 at %in[tid], calls
; llvm.cttz.i64 with the i1 flag set to false (a zero input is not poison, see
; the LangRef entry for llvm.cttz), truncates the count to i32 and stores it to
; %out[tid]. The amdgcn check blocks below count trailing zeros in each 32-bit
; half, add 32 (saturating) to the high-half count and take the minimum with 64.
766define amdgpu_kernel void @v_cttz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
767; SI-LABEL: v_cttz_i64_trunc:
768; SI:       ; %bb.0:
769; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
770; SI-NEXT:    s_mov_b32 s7, 0xf000
771; SI-NEXT:    s_mov_b32 s6, 0
772; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
773; SI-NEXT:    v_mov_b32_e32 v2, 0
774; SI-NEXT:    s_waitcnt lgkmcnt(0)
775; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
776; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
777; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
778; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
779; SI-NEXT:    s_waitcnt vmcnt(0)
780; SI-NEXT:    v_ffbl_b32_e32 v0, v4
781; SI-NEXT:    v_min_u32_e32 v0, 0xffffffdf, v0
782; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
783; SI-NEXT:    v_ffbl_b32_e32 v3, v3
784; SI-NEXT:    v_min3_u32 v0, v3, v0, 64
785; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
786; SI-NEXT:    s_endpgm
787;
788; VI-LABEL: v_cttz_i64_trunc:
789; VI:       ; %bb.0:
790; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
791; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
792; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
793; VI-NEXT:    s_waitcnt lgkmcnt(0)
794; VI-NEXT:    v_mov_b32_e32 v2, s3
795; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
796; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
797; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
798; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
799; VI-NEXT:    v_mov_b32_e32 v4, s1
800; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
801; VI-NEXT:    s_waitcnt vmcnt(0)
802; VI-NEXT:    v_ffbl_b32_e32 v0, v2
803; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
804; VI-NEXT:    v_ffbl_b32_e32 v1, v1
805; VI-NEXT:    v_min3_u32 v0, v1, v0, 64
806; VI-NEXT:    flat_store_dword v[3:4], v0
807; VI-NEXT:    s_endpgm
808;
809; EG-LABEL: v_cttz_i64_trunc:
810; EG:       ; %bb.0:
811; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
812; EG-NEXT:    TEX 0 @6
813; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
814; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
815; EG-NEXT:    CF_END
816; EG-NEXT:    PAD
817; EG-NEXT:    Fetch clause starting at 6:
818; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
819; EG-NEXT:    ALU clause starting at 8:
820; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
821; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
822; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
823; EG-NEXT:    ALU clause starting at 11:
824; EG-NEXT:     FFBL_INT * T0.W, T1.Y,
825; EG-NEXT:     CNDE_INT * T0.W, T1.Y, literal.x, PV.W,
826; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
827; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
828; EG-NEXT:     FFBL_INT T1.W, T1.X, BS:VEC_120/SCL_212
829; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
830; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
831; EG-NEXT:     CNDE_INT T0.X, T1.X, PS, PV.W,
832; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
833; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
834; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
835;
836; GFX10-LABEL: v_cttz_i64_trunc:
837; GFX10:       ; %bb.0:
838; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
839; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
840; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
841; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
842; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
843; GFX10-NEXT:    s_waitcnt vmcnt(0)
844; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
845; GFX10-NEXT:    v_ffbl_b32_e32 v1, v1
846; GFX10-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
847; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
848; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
849; GFX10-NEXT:    s_endpgm
850;
851; GFX10-GISEL-LABEL: v_cttz_i64_trunc:
852; GFX10-GISEL:       ; %bb.0:
853; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
854; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
855; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
856; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
857; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
858; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
859; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
860; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
861; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v2, v2, 32 clamp
862; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
863; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
864; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
865; GFX10-GISEL-NEXT:    s_endpgm
; Input is indexed as i64 and output as i32, both by the workitem id; only the
; truncated (low 32-bit) count is written.
866  %tid = call i32 @llvm.amdgcn.workitem.id.x()
867  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
868  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
869  %val = load i64, ptr addrspace(1) %in.gep
870  %cttz = call i64 @llvm.cttz.i64(i64 %val, i1 false)
871  %trunc = trunc i64 %cttz to i32
872  store i32 %trunc, ptr addrspace(1) %out.gep
873  ret void
874}
875
; v_cttz_i32_sel_eq_neg1 - cttz.i32 of the i32 at %valptr[tid], with the result
; replaced by -1 when the loaded value is zero (icmp eq 0 + select). The value
; is stored to %out itself (not indexed by tid).
; In the SI, VI and GFX10 (SelectionDAG) check blocks only v_ffbl_b32 remains,
; i.e. the compare/select pair has been folded away; the GFX10-GISEL block still
; contains v_min_u32, v_cmp_eq_u32 and v_cndmask_b32.
876define amdgpu_kernel void @v_cttz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
877; SI-LABEL: v_cttz_i32_sel_eq_neg1:
878; SI:       ; %bb.0:
879; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
880; SI-NEXT:    s_mov_b32 s7, 0xf000
881; SI-NEXT:    s_mov_b32 s10, 0
882; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
883; SI-NEXT:    v_mov_b32_e32 v1, 0
884; SI-NEXT:    s_mov_b32 s11, s7
885; SI-NEXT:    s_waitcnt lgkmcnt(0)
886; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
887; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
888; SI-NEXT:    s_mov_b32 s6, -1
889; SI-NEXT:    s_mov_b32 s4, s0
890; SI-NEXT:    s_mov_b32 s5, s1
891; SI-NEXT:    s_waitcnt vmcnt(0)
892; SI-NEXT:    v_ffbl_b32_e32 v0, v0
893; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
894; SI-NEXT:    s_endpgm
895;
896; VI-LABEL: v_cttz_i32_sel_eq_neg1:
897; VI:       ; %bb.0:
898; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
899; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
900; VI-NEXT:    s_waitcnt lgkmcnt(0)
901; VI-NEXT:    v_mov_b32_e32 v1, s3
902; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
903; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
904; VI-NEXT:    flat_load_dword v0, v[0:1]
905; VI-NEXT:    s_mov_b32 s3, 0xf000
906; VI-NEXT:    s_mov_b32 s2, -1
907; VI-NEXT:    s_waitcnt vmcnt(0)
908; VI-NEXT:    v_ffbl_b32_e32 v0, v0
909; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
910; VI-NEXT:    s_endpgm
911;
912; EG-LABEL: v_cttz_i32_sel_eq_neg1:
913; EG:       ; %bb.0:
914; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
915; EG-NEXT:    TEX 0 @6
916; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
917; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
918; EG-NEXT:    CF_END
919; EG-NEXT:    PAD
920; EG-NEXT:    Fetch clause starting at 6:
921; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
922; EG-NEXT:    ALU clause starting at 8:
923; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
924; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
925; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
926; EG-NEXT:    ALU clause starting at 11:
927; EG-NEXT:     FFBL_INT * T0.W, T0.X,
928; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
929; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
930; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
931; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
932; EG-NEXT:    -1(nan), 2(2.802597e-45)
933;
934; GFX10-LABEL: v_cttz_i32_sel_eq_neg1:
935; GFX10:       ; %bb.0:
936; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
937; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
938; GFX10-NEXT:    v_mov_b32_e32 v1, 0
939; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
940; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
941; GFX10-NEXT:    s_waitcnt vmcnt(0)
942; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
943; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
944; GFX10-NEXT:    s_endpgm
945;
946; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_neg1:
947; GFX10-GISEL:       ; %bb.0:
948; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
949; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
950; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
951; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
952; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
953; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
954; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
955; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
956; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
957; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
958; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
959; GFX10-GISEL-NEXT:    s_endpgm
; The compare is on the loaded input (%val == 0), not on the cttz result.
960  %tid = call i32 @llvm.amdgcn.workitem.id.x()
961  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
962  %val = load i32, ptr addrspace(1) %in.gep
963  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
964  %cmp = icmp eq i32 %val, 0
965  %sel = select i1 %cmp, i32 -1, i32 %cttz
966  store i32 %sel, ptr addrspace(1) %out
967  ret void
968}
969
; v_cttz_i32_sel_ne_neg1 - same computation as the "eq" form of this test, but
; written with the inverted predicate and swapped select operands: the cttz.i32
; result is kept when the loaded value is non-zero, otherwise -1 is stored.
; The SI, VI and GFX10 (SelectionDAG) check blocks contain only v_ffbl_b32; the
; GFX10-GISEL block still contains v_min_u32, v_cmp_ne_u32 and v_cndmask_b32.
970define amdgpu_kernel void @v_cttz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
971; SI-LABEL: v_cttz_i32_sel_ne_neg1:
972; SI:       ; %bb.0:
973; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
974; SI-NEXT:    s_mov_b32 s7, 0xf000
975; SI-NEXT:    s_mov_b32 s10, 0
976; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
977; SI-NEXT:    v_mov_b32_e32 v1, 0
978; SI-NEXT:    s_mov_b32 s11, s7
979; SI-NEXT:    s_waitcnt lgkmcnt(0)
980; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
981; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
982; SI-NEXT:    s_mov_b32 s6, -1
983; SI-NEXT:    s_mov_b32 s4, s0
984; SI-NEXT:    s_mov_b32 s5, s1
985; SI-NEXT:    s_waitcnt vmcnt(0)
986; SI-NEXT:    v_ffbl_b32_e32 v0, v0
987; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
988; SI-NEXT:    s_endpgm
989;
990; VI-LABEL: v_cttz_i32_sel_ne_neg1:
991; VI:       ; %bb.0:
992; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
993; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
994; VI-NEXT:    s_waitcnt lgkmcnt(0)
995; VI-NEXT:    v_mov_b32_e32 v1, s3
996; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
997; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
998; VI-NEXT:    flat_load_dword v0, v[0:1]
999; VI-NEXT:    s_mov_b32 s3, 0xf000
1000; VI-NEXT:    s_mov_b32 s2, -1
1001; VI-NEXT:    s_waitcnt vmcnt(0)
1002; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1003; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1004; VI-NEXT:    s_endpgm
1005;
1006; EG-LABEL: v_cttz_i32_sel_ne_neg1:
1007; EG:       ; %bb.0:
1008; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1009; EG-NEXT:    TEX 0 @6
1010; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
1011; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1012; EG-NEXT:    CF_END
1013; EG-NEXT:    PAD
1014; EG-NEXT:    Fetch clause starting at 6:
1015; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1016; EG-NEXT:    ALU clause starting at 8:
1017; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1018; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1019; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1020; EG-NEXT:    ALU clause starting at 11:
1021; EG-NEXT:     FFBL_INT * T0.W, T0.X,
1022; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1023; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1024; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1025; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1026; EG-NEXT:    -1(nan), 2(2.802597e-45)
1027;
1028; GFX10-LABEL: v_cttz_i32_sel_ne_neg1:
1029; GFX10:       ; %bb.0:
1030; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1031; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1032; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1033; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1034; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1035; GFX10-NEXT:    s_waitcnt vmcnt(0)
1036; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1037; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1038; GFX10-NEXT:    s_endpgm
1039;
1040; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_neg1:
1041; GFX10-GISEL:       ; %bb.0:
1042; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1043; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1044; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1045; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1046; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1047; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v0
1048; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1049; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1050; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
1051; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1052; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1053; GFX10-GISEL-NEXT:    s_endpgm
; Inverted predicate (%val != 0) with the select arms swapped accordingly.
1054  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1055  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1056  %val = load i32, ptr addrspace(1) %in.gep
1057  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1058  %cmp = icmp ne i32 %val, 0
1059  %sel = select i1 %cmp, i32 %cttz, i32 -1
1060  store i32 %sel, ptr addrspace(1) %out
1061  ret void
1062}
1063
1064; TODO: Should be able to eliminate select here as well.
; v_cttz_i32_sel_eq_bitwidth - cttz.i32 of the i32 at %valptr[tid]; the select
; here tests the cttz result against the bit width (32) instead of testing the
; input against zero, and yields -1 in that case. As the check blocks below
; record, every amdgcn run line still emits v_min_u32 followed by a compare and
; v_cndmask_b32 for this form.
1065define amdgpu_kernel void @v_cttz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1066; SI-LABEL: v_cttz_i32_sel_eq_bitwidth:
1067; SI:       ; %bb.0:
1068; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1069; SI-NEXT:    s_mov_b32 s7, 0xf000
1070; SI-NEXT:    s_mov_b32 s10, 0
1071; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1072; SI-NEXT:    v_mov_b32_e32 v1, 0
1073; SI-NEXT:    s_mov_b32 s11, s7
1074; SI-NEXT:    s_waitcnt lgkmcnt(0)
1075; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1076; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1077; SI-NEXT:    s_mov_b32 s6, -1
1078; SI-NEXT:    s_mov_b32 s4, s0
1079; SI-NEXT:    s_mov_b32 s5, s1
1080; SI-NEXT:    s_waitcnt vmcnt(0)
1081; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1082; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1083; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1084; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1085; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1086; SI-NEXT:    s_endpgm
1087;
1088; VI-LABEL: v_cttz_i32_sel_eq_bitwidth:
1089; VI:       ; %bb.0:
1090; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1091; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1092; VI-NEXT:    s_waitcnt lgkmcnt(0)
1093; VI-NEXT:    v_mov_b32_e32 v1, s3
1094; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1095; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1096; VI-NEXT:    flat_load_dword v0, v[0:1]
1097; VI-NEXT:    s_mov_b32 s3, 0xf000
1098; VI-NEXT:    s_mov_b32 s2, -1
1099; VI-NEXT:    s_waitcnt vmcnt(0)
1100; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1101; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1102; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1103; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1104; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1105; VI-NEXT:    s_endpgm
1106;
1107; EG-LABEL: v_cttz_i32_sel_eq_bitwidth:
1108; EG:       ; %bb.0:
1109; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1110; EG-NEXT:    TEX 0 @6
1111; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1112; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1113; EG-NEXT:    CF_END
1114; EG-NEXT:    PAD
1115; EG-NEXT:    Fetch clause starting at 6:
1116; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1117; EG-NEXT:    ALU clause starting at 8:
1118; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1119; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1120; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1121; EG-NEXT:    ALU clause starting at 11:
1122; EG-NEXT:     FFBL_INT * T0.W, T0.X,
1123; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1124; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1125; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
1126; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1127; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
1128; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1129; EG-NEXT:    -1(nan), 2(2.802597e-45)
1130;
1131; GFX10-LABEL: v_cttz_i32_sel_eq_bitwidth:
1132; GFX10:       ; %bb.0:
1133; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1134; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1135; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1136; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1137; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1138; GFX10-NEXT:    s_waitcnt vmcnt(0)
1139; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1140; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1141; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1142; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1143; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1144; GFX10-NEXT:    s_endpgm
1145;
1146; GFX10-GISEL-LABEL: v_cttz_i32_sel_eq_bitwidth:
1147; GFX10-GISEL:       ; %bb.0:
1148; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1149; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1150; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1151; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1152; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1153; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1154; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
1155; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1156; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
1157; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
1158; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1159; GFX10-GISEL-NEXT:    s_endpgm
; The compare operand is %cttz (the intrinsic result), not %val.
1160  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1161  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1162  %val = load i32, ptr addrspace(1) %in.gep
1163  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1164  %cmp = icmp eq i32 %cttz, 32
1165  %sel = select i1 %cmp, i32 -1, i32 %cttz
1166  store i32 %sel, ptr addrspace(1) %out
1167  ret void
1168}
1169
; v_cttz_i32_sel_ne_bitwidth - inverted-predicate form of the bit-width test:
; the cttz.i32 result is kept when it differs from 32, otherwise -1 is stored.
; All amdgcn check blocks below still contain v_min_u32, v_cmp_ne_u32 and
; v_cndmask_b32, i.e. the select is not eliminated for this form.
1170define amdgpu_kernel void @v_cttz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1171; SI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1172; SI:       ; %bb.0:
1173; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1174; SI-NEXT:    s_mov_b32 s7, 0xf000
1175; SI-NEXT:    s_mov_b32 s10, 0
1176; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1177; SI-NEXT:    v_mov_b32_e32 v1, 0
1178; SI-NEXT:    s_mov_b32 s11, s7
1179; SI-NEXT:    s_waitcnt lgkmcnt(0)
1180; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1181; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1182; SI-NEXT:    s_mov_b32 s6, -1
1183; SI-NEXT:    s_mov_b32 s4, s0
1184; SI-NEXT:    s_mov_b32 s5, s1
1185; SI-NEXT:    s_waitcnt vmcnt(0)
1186; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1187; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1188; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1189; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1190; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1191; SI-NEXT:    s_endpgm
1192;
1193; VI-LABEL: v_cttz_i32_sel_ne_bitwidth:
1194; VI:       ; %bb.0:
1195; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1196; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1197; VI-NEXT:    s_waitcnt lgkmcnt(0)
1198; VI-NEXT:    v_mov_b32_e32 v1, s3
1199; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1200; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1201; VI-NEXT:    flat_load_dword v0, v[0:1]
1202; VI-NEXT:    s_mov_b32 s3, 0xf000
1203; VI-NEXT:    s_mov_b32 s2, -1
1204; VI-NEXT:    s_waitcnt vmcnt(0)
1205; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1206; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1207; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1208; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1209; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1210; VI-NEXT:    s_endpgm
1211;
1212; EG-LABEL: v_cttz_i32_sel_ne_bitwidth:
1213; EG:       ; %bb.0:
1214; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1215; EG-NEXT:    TEX 0 @6
1216; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1217; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1218; EG-NEXT:    CF_END
1219; EG-NEXT:    PAD
1220; EG-NEXT:    Fetch clause starting at 6:
1221; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1222; EG-NEXT:    ALU clause starting at 8:
1223; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1224; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1225; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1226; EG-NEXT:    ALU clause starting at 11:
1227; EG-NEXT:     FFBL_INT * T0.W, T0.X,
1228; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1229; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1230; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
1231; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1232; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
1233; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1234; EG-NEXT:    -1(nan), 2(2.802597e-45)
1235;
1236; GFX10-LABEL: v_cttz_i32_sel_ne_bitwidth:
1237; GFX10:       ; %bb.0:
1238; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1239; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1240; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1241; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1242; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1243; GFX10-NEXT:    s_waitcnt vmcnt(0)
1244; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1245; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1246; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1247; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1248; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1249; GFX10-NEXT:    s_endpgm
1250;
1251; GFX10-GISEL-LABEL: v_cttz_i32_sel_ne_bitwidth:
1252; GFX10-GISEL:       ; %bb.0:
1253; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1254; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1255; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1256; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1257; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1258; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1259; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v0, v0
1260; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1261; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1262; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1263; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1264; GFX10-GISEL-NEXT:    s_endpgm
; The compare operand is %cttz (the intrinsic result), with the select arms
; ordered for the "ne" predicate.
1265  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1266  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1267  %val = load i32, ptr addrspace(1) %in.gep
1268  %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 false) nounwind readnone
1269  %cmp = icmp ne i32 %cttz, 32
1270  %sel = select i1 %cmp, i32 %cttz, i32 -1
1271  store i32 %sel, ptr addrspace(1) %out
1272  ret void
1273}
1274
; v_cttz_i8_sel_eq_neg1 - i8 variant of the "select -1 on zero input" pattern:
; loads the byte at %valptr[tid], calls llvm.cttz.i8 with the i1 flag false,
; selects i8 -1 when the byte is zero, and stores the i8 result to %out.
; The SI, VI and GFX10 (SelectionDAG) check blocks contain only v_ffbl_b32 before
; the byte store; the GFX10-GISEL block ORs in 0x100 before v_ffbl_b32 and keeps
; a compare plus v_cndmask_b32.
1275 define amdgpu_kernel void @v_cttz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1276; SI-LABEL: v_cttz_i8_sel_eq_neg1:
1277; SI:       ; %bb.0:
1278; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1279; SI-NEXT:    s_mov_b32 s7, 0xf000
1280; SI-NEXT:    v_mov_b32_e32 v1, 0
1281; SI-NEXT:    s_mov_b32 s10, 0
1282; SI-NEXT:    s_mov_b32 s11, s7
1283; SI-NEXT:    s_waitcnt lgkmcnt(0)
1284; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1285; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1286; SI-NEXT:    s_mov_b32 s6, -1
1287; SI-NEXT:    s_mov_b32 s4, s0
1288; SI-NEXT:    s_mov_b32 s5, s1
1289; SI-NEXT:    s_waitcnt vmcnt(0)
1290; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1291; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1292; SI-NEXT:    s_endpgm
1293;
1294; VI-LABEL: v_cttz_i8_sel_eq_neg1:
1295; VI:       ; %bb.0:
1296; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1297; VI-NEXT:    s_waitcnt lgkmcnt(0)
1298; VI-NEXT:    v_mov_b32_e32 v1, s3
1299; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1300; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1301; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1302; VI-NEXT:    s_mov_b32 s3, 0xf000
1303; VI-NEXT:    s_mov_b32 s2, -1
1304; VI-NEXT:    s_waitcnt vmcnt(0)
1305; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1306; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1307; VI-NEXT:    s_endpgm
1308;
1309; EG-LABEL: v_cttz_i8_sel_eq_neg1:
1310; EG:       ; %bb.0:
1311; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1312; EG-NEXT:    TEX 0 @6
1313; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1314; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1315; EG-NEXT:    CF_END
1316; EG-NEXT:    PAD
1317; EG-NEXT:    Fetch clause starting at 6:
1318; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1319; EG-NEXT:    ALU clause starting at 8:
1320; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1321; EG-NEXT:    ALU clause starting at 9:
1322; EG-NEXT:     FFBL_INT T0.W, T0.X,
1323; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1324; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1325; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1326; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1327; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1328; EG-NEXT:     LSHL T0.X, PV.W, PS,
1329; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1330; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1331; EG-NEXT:     MOV T0.Y, 0.0,
1332; EG-NEXT:     MOV * T0.Z, 0.0,
1333; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1334; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1335;
1336; GFX10-LABEL: v_cttz_i8_sel_eq_neg1:
1337; GFX10:       ; %bb.0:
1338; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1339; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1340; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1341; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1342; GFX10-NEXT:    s_waitcnt vmcnt(0)
1343; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1344; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1345; GFX10-NEXT:    s_endpgm
1346;
1347; GFX10-GISEL-LABEL: v_cttz_i8_sel_eq_neg1:
1348; GFX10-GISEL:       ; %bb.0:
1349; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1350; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1351; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1352; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1353; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1354; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1355; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1356; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
1357; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1358; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1359; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x100, v0
1360; GFX10-GISEL-NEXT:    v_cmp_eq_u32_sdwa s2, v0, v2 src0_sel:BYTE_0 src1_sel:DWORD
1361; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
1362; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, s2
1363; GFX10-GISEL-NEXT:    global_store_byte v2, v0, s[0:1]
1364; GFX10-GISEL-NEXT:    s_endpgm
; The load address depends on the workitem id (byte-indexed GEP on %valptr).
1365  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1366  %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1367  %val = load i8, ptr addrspace(1) %valptr.gep
1368  %cttz = call i8 @llvm.cttz.i8(i8 %val, i1 false) nounwind readnone
1369  %cmp = icmp eq i8 %val, 0
1370  %sel = select i1 %cmp, i8 -1, i8 %cttz
1371  store i8 %sel, ptr addrspace(1) %out
1372  ret void
1373}
1374
; v_cttz_i16_sel_eq_neg1 - i16 variant of the "select -1 on zero input" pattern.
; Unlike the i8 and i32 variants, the IR below loads directly from %valptr with
; no workitem-id GEP, so every workitem reads the same address.
; In the SI check block only v_ffbl_b32 precedes the short store; the VI, GFX10
; and GFX10-GISEL blocks OR in 0x10000 before v_ffbl_b32 and keep a compare plus
; v_cndmask_b32 with 0xffff.
1375 define amdgpu_kernel void @v_cttz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1376; SI-LABEL: v_cttz_i16_sel_eq_neg1:
1377; SI:       ; %bb.0:
1378; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1379; SI-NEXT:    s_mov_b32 s7, 0xf000
1380; SI-NEXT:    s_mov_b32 s6, -1
1381; SI-NEXT:    s_mov_b32 s10, s6
1382; SI-NEXT:    s_mov_b32 s11, s7
1383; SI-NEXT:    s_waitcnt lgkmcnt(0)
1384; SI-NEXT:    s_mov_b32 s8, s2
1385; SI-NEXT:    s_mov_b32 s9, s3
1386; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1387; SI-NEXT:    s_mov_b32 s4, s0
1388; SI-NEXT:    s_mov_b32 s5, s1
1389; SI-NEXT:    s_waitcnt vmcnt(0)
1390; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1391; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1392; SI-NEXT:    s_endpgm
1393;
1394; VI-LABEL: v_cttz_i16_sel_eq_neg1:
1395; VI:       ; %bb.0:
1396; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1397; VI-NEXT:    s_mov_b32 s7, 0xf000
1398; VI-NEXT:    s_mov_b32 s6, -1
1399; VI-NEXT:    s_mov_b32 s10, s6
1400; VI-NEXT:    s_mov_b32 s11, s7
1401; VI-NEXT:    s_waitcnt lgkmcnt(0)
1402; VI-NEXT:    s_mov_b32 s8, s2
1403; VI-NEXT:    s_mov_b32 s9, s3
1404; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1405; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
1406; VI-NEXT:    s_mov_b32 s4, s0
1407; VI-NEXT:    s_mov_b32 s5, s1
1408; VI-NEXT:    s_waitcnt vmcnt(0)
1409; VI-NEXT:    v_or_b32_e32 v2, 0x10000, v0
1410; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
1411; VI-NEXT:    v_ffbl_b32_e32 v2, v2
1412; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1413; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
1414; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1415; VI-NEXT:    s_endpgm
1416;
1417; EG-LABEL: v_cttz_i16_sel_eq_neg1:
1418; EG:       ; %bb.0:
1419; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1420; EG-NEXT:    TEX 0 @6
1421; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1422; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1423; EG-NEXT:    CF_END
1424; EG-NEXT:    PAD
1425; EG-NEXT:    Fetch clause starting at 6:
1426; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1427; EG-NEXT:    ALU clause starting at 8:
1428; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1429; EG-NEXT:    ALU clause starting at 9:
1430; EG-NEXT:     FFBL_INT T0.W, T0.X,
1431; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1432; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1433; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1434; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1435; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1436; EG-NEXT:     LSHL T0.X, PV.W, PS,
1437; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1438; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1439; EG-NEXT:     MOV T0.Y, 0.0,
1440; EG-NEXT:     MOV * T0.Z, 0.0,
1441; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1442; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1443;
1444; GFX10-LABEL: v_cttz_i16_sel_eq_neg1:
1445; GFX10:       ; %bb.0:
1446; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1447; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1448; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1449; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
1450; GFX10-NEXT:    s_waitcnt vmcnt(0)
1451; GFX10-NEXT:    v_or_b32_e32 v2, 0x10000, v1
1452; GFX10-NEXT:    v_cmp_ne_u32_sdwa vcc_lo, v1, v0 src0_sel:WORD_0 src1_sel:DWORD
1453; GFX10-NEXT:    v_ffbl_b32_e32 v2, v2
1454; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1455; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1456; GFX10-NEXT:    s_endpgm
1457;
1458; GFX10-GISEL-LABEL: v_cttz_i16_sel_eq_neg1:
1459; GFX10-GISEL:       ; %bb.0:
1460; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1461; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1462; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1463; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
1464; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1465; GFX10-GISEL-NEXT:    v_or_b32_e32 v2, 0x10000, v1
1466; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1467; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v2, v2
1468; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1469; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
1470; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1471; GFX10-GISEL-NEXT:    s_endpgm
; No %tid / GEP here: the i16 is loaded straight from %valptr.
1472  %val = load i16, ptr addrspace(1) %valptr
1473  %cttz = call i16 @llvm.cttz.i16(i16 %val, i1 false) nounwind readnone
1474  %cmp = icmp eq i16 %val, 0
1475  %sel = select i1 %cmp, i16 -1, i16 %cttz
1476  store i16 %sel, ptr addrspace(1) %out
1477  ret void
1478}
1479
1480; FIXME: Need to handle non-uniform case for function below (load without gep).
; NOTE(review): this function does index %valptr through a getelementptr on the
; workitem id, so the "load without gep" wording may not apply here; confirm
; whether the FIXME was meant for a different variant.
;
; Loads one i7 per workitem from %valptr, computes llvm.cttz.i7 with the second
; operand set to false, and stores -1 instead when the loaded value is zero.
; In the checks below, the SelectionDAG runs for SI, VI and GFX10 show only
; v_ffbl_b32 followed by a 0x7f mask, with no compare or select, while the
; GlobalISel run still shows v_cmp_eq_u32 and v_cndmask_b32.
; Presumably the folded form relies on v_ffbl_b32 returning -1 for a zero
; input; confirm against the ISA documentation.
; The check lines are autogenerated (see the NOTE at the top of this file);
; regenerate them with utils/update_llc_test_checks.py instead of editing them
; by hand.
1481define amdgpu_kernel void @v_cttz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1482; SI-LABEL: v_cttz_i7_sel_eq_neg1:
1483; SI:       ; %bb.0:
1484; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1485; SI-NEXT:    s_mov_b32 s7, 0xf000
1486; SI-NEXT:    v_mov_b32_e32 v1, 0
1487; SI-NEXT:    s_mov_b32 s10, 0
1488; SI-NEXT:    s_mov_b32 s11, s7
1489; SI-NEXT:    s_waitcnt lgkmcnt(0)
1490; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1491; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1492; SI-NEXT:    s_mov_b32 s6, -1
1493; SI-NEXT:    s_mov_b32 s4, s0
1494; SI-NEXT:    s_mov_b32 s5, s1
1495; SI-NEXT:    s_waitcnt vmcnt(0)
1496; SI-NEXT:    v_ffbl_b32_e32 v0, v0
1497; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1498; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1499; SI-NEXT:    s_endpgm
1500;
1501; VI-LABEL: v_cttz_i7_sel_eq_neg1:
1502; VI:       ; %bb.0:
1503; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1504; VI-NEXT:    s_waitcnt lgkmcnt(0)
1505; VI-NEXT:    v_mov_b32_e32 v1, s3
1506; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1507; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1508; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1509; VI-NEXT:    s_mov_b32 s3, 0xf000
1510; VI-NEXT:    s_mov_b32 s2, -1
1511; VI-NEXT:    s_waitcnt vmcnt(0)
1512; VI-NEXT:    v_ffbl_b32_e32 v0, v0
1513; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1514; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1515; VI-NEXT:    s_endpgm
1516;
1517; EG-LABEL: v_cttz_i7_sel_eq_neg1:
1518; EG:       ; %bb.0:
1519; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1520; EG-NEXT:    TEX 0 @6
1521; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1522; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1523; EG-NEXT:    CF_END
1524; EG-NEXT:    PAD
1525; EG-NEXT:    Fetch clause starting at 6:
1526; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1527; EG-NEXT:    ALU clause starting at 8:
1528; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1529; EG-NEXT:    ALU clause starting at 9:
1530; EG-NEXT:     FFBL_INT T0.W, T0.X,
1531; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1532; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1533; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1534; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1535; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1536; EG-NEXT:     LSHL T0.X, PV.W, PS,
1537; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1538; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1539; EG-NEXT:     MOV T0.Y, 0.0,
1540; EG-NEXT:     MOV * T0.Z, 0.0,
1541; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1542; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1543;
1544; GFX10-LABEL: v_cttz_i7_sel_eq_neg1:
1545; GFX10:       ; %bb.0:
1546; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1547; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1548; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1549; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1550; GFX10-NEXT:    s_waitcnt vmcnt(0)
1551; GFX10-NEXT:    v_ffbl_b32_e32 v0, v0
1552; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1553; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1554; GFX10-NEXT:    s_endpgm
1555;
1556; GFX10-GISEL-LABEL: v_cttz_i7_sel_eq_neg1:
1557; GFX10-GISEL:       ; %bb.0:
1558; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1559; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1560; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1561; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1562; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1563; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1564; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1565; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1566; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1567; GFX10-GISEL-NEXT:    v_or_b32_e32 v1, 0x80, v0
1568; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1569; GFX10-GISEL-NEXT:    v_ffbl_b32_e32 v1, v1
1570; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1571; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1572; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1573; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1574; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1575; GFX10-GISEL-NEXT:    s_endpgm
  ; Address of this workitem's element: %valptr indexed by workitem.id.x.
1576  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1577  %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
1578  %val = load i7, ptr addrspace(1) %valptr.gep
  ; Second operand false: per the LangRef a zero input then yields a defined
  ; result (the bit width) rather than poison.
1579  %cttz = call i7 @llvm.cttz.i7(i7 %val, i1 false) nounwind readnone
  ; Substitute -1 (all ones in i7, i.e. 0x7f) when the loaded value is zero.
1580  %cmp = icmp eq i7 %val, 0
1581  %sel = select i1 %cmp, i7 -1, i7 %cttz
  ; Every workitem stores to the same %out address (no per-workitem index).
1582  store i7 %sel, ptr addrspace(1) %out
1583  ret void
1584}
1585