xref: /llvm-project/llvm/test/CodeGen/AMDGPU/ctlz.ll (revision 7c58d6363a40fc6d1cdf6a147da8f3bb0d4f96ec)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=amdgcn -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=SI
3; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=VI
4; RUN: llc < %s -mtriple=r600 -mcpu=cypress -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=EG
5; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10
6; RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX10-GISEL
7; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s -enable-var-scope --check-prefix=GFX11
8
; Declarations of the llvm.ctlz intrinsic overloads (scalar i7, i8, i16, i32,
; i64 and the 2- and 4-element i32 / i64 vectors) plus the workitem-id query.
; Each ctlz overload takes the value and an i1 flag operand.
; Not every overload is referenced in the part of the file reviewed here.
9declare i7 @llvm.ctlz.i7(i7, i1) nounwind readnone
10declare i8 @llvm.ctlz.i8(i8, i1) nounwind readnone
11declare i16 @llvm.ctlz.i16(i16, i1) nounwind readnone
12
13declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
14declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>, i1) nounwind readnone
15declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) nounwind readnone
16
17declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
18declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readnone
19declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) nounwind readnone
20
; Returns the x component of the workitem id; used below to index input loads.
21declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
22
; Kernel s_ctlz_i32 -- counts the leading zeros of the i32 kernel argument %val
; and stores the i32 result to %out.
; The intrinsic is called with its i1 flag operand set to false. The expected
; amdgcn code below is a scalar flbit/clz instruction followed by s_min_u32
; against 32; the expected r600 code is FFBH_UINT followed by a CNDE_INT that
; can select the literal 32.
; NOTE(review): presumably the min/select exists so a zero input yields 32 --
; confirm against the llvm.ctlz entry in the LangRef.
23define amdgpu_kernel void @s_ctlz_i32(ptr addrspace(1) noalias %out, i32 %val) nounwind {
24; SI-LABEL: s_ctlz_i32:
25; SI:       ; %bb.0:
26; SI-NEXT:    s_load_dword s2, s[4:5], 0xb
27; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
28; SI-NEXT:    s_mov_b32 s3, 0xf000
29; SI-NEXT:    s_waitcnt lgkmcnt(0)
30; SI-NEXT:    s_flbit_i32_b32 s2, s2
31; SI-NEXT:    s_min_u32 s4, s2, 32
32; SI-NEXT:    s_mov_b32 s2, -1
33; SI-NEXT:    v_mov_b32_e32 v0, s4
34; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
35; SI-NEXT:    s_endpgm
36;
37; VI-LABEL: s_ctlz_i32:
38; VI:       ; %bb.0:
39; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
40; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
41; VI-NEXT:    s_mov_b32 s3, 0xf000
42; VI-NEXT:    s_mov_b32 s2, -1
43; VI-NEXT:    s_waitcnt lgkmcnt(0)
44; VI-NEXT:    s_flbit_i32_b32 s4, s6
45; VI-NEXT:    s_min_u32 s4, s4, 32
46; VI-NEXT:    v_mov_b32_e32 v0, s4
47; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
48; VI-NEXT:    s_endpgm
49;
50; EG-LABEL: s_ctlz_i32:
51; EG:       ; %bb.0:
52; EG-NEXT:    ALU 3, @4, KC0[CB0:0-32], KC1[]
53; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
54; EG-NEXT:    CF_END
55; EG-NEXT:    PAD
56; EG-NEXT:    ALU clause starting at 4:
57; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].Z,
58; EG-NEXT:     CNDE_INT T0.X, KC0[2].Z, literal.x, PV.W,
59; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
60; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
61;
62; GFX10-LABEL: s_ctlz_i32:
63; GFX10:       ; %bb.0:
64; GFX10-NEXT:    s_clause 0x1
65; GFX10-NEXT:    s_load_dword s2, s[4:5], 0x2c
66; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
67; GFX10-NEXT:    v_mov_b32_e32 v0, 0
68; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
69; GFX10-NEXT:    s_flbit_i32_b32 s2, s2
70; GFX10-NEXT:    s_min_u32 s2, s2, 32
71; GFX10-NEXT:    v_mov_b32_e32 v1, s2
72; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
73; GFX10-NEXT:    s_endpgm
74;
75; GFX10-GISEL-LABEL: s_ctlz_i32:
76; GFX10-GISEL:       ; %bb.0:
77; GFX10-GISEL-NEXT:    s_clause 0x1
78; GFX10-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x2c
79; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
80; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
81; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
82; GFX10-GISEL-NEXT:    s_flbit_i32_b32 s2, s2
83; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 32
84; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
85; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
86; GFX10-GISEL-NEXT:    s_endpgm
87;
88; GFX11-LABEL: s_ctlz_i32:
89; GFX11:       ; %bb.0:
90; GFX11-NEXT:    s_clause 0x1
91; GFX11-NEXT:    s_load_b32 s2, s[4:5], 0x2c
92; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
93; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
94; GFX11-NEXT:    s_clz_i32_u32 s2, s2
95; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
96; GFX11-NEXT:    s_min_u32 s2, s2, 32
97; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
98; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
99; GFX11-NEXT:    s_endpgm
; IR under test: ctlz of the argument (flag operand false), stored 4-byte aligned.
100  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
101  store i32 %ctlz, ptr addrspace(1) %out, align 4
102  ret void
103}
104
; Kernel v_ctlz_i32 -- loads one i32 from %valptr at the element index given by
; the workitem id x, counts its leading zeros (i1 flag operand false) and
; stores the i32 result to %out.
; No index is applied to %out, so the store always targets the base address.
; The expected amdgcn code uses a vector ffbh/clz instruction followed by
; v_min_u32 against 32; the expected r600 code uses FFBH_UINT plus a CNDE_INT
; that can select the literal 32.
105define amdgpu_kernel void @v_ctlz_i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
106; SI-LABEL: v_ctlz_i32:
107; SI:       ; %bb.0:
108; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
109; SI-NEXT:    s_mov_b32 s7, 0xf000
110; SI-NEXT:    s_mov_b32 s10, 0
111; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
112; SI-NEXT:    v_mov_b32_e32 v1, 0
113; SI-NEXT:    s_mov_b32 s11, s7
114; SI-NEXT:    s_waitcnt lgkmcnt(0)
115; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
116; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
117; SI-NEXT:    s_mov_b32 s6, -1
118; SI-NEXT:    s_mov_b32 s4, s0
119; SI-NEXT:    s_mov_b32 s5, s1
120; SI-NEXT:    s_waitcnt vmcnt(0)
121; SI-NEXT:    v_ffbh_u32_e32 v0, v0
122; SI-NEXT:    v_min_u32_e32 v0, 32, v0
123; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
124; SI-NEXT:    s_endpgm
125;
126; VI-LABEL: v_ctlz_i32:
127; VI:       ; %bb.0:
128; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
129; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
130; VI-NEXT:    s_waitcnt lgkmcnt(0)
131; VI-NEXT:    v_mov_b32_e32 v1, s3
132; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
133; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
134; VI-NEXT:    flat_load_dword v0, v[0:1]
135; VI-NEXT:    s_mov_b32 s3, 0xf000
136; VI-NEXT:    s_mov_b32 s2, -1
137; VI-NEXT:    s_waitcnt vmcnt(0)
138; VI-NEXT:    v_ffbh_u32_e32 v0, v0
139; VI-NEXT:    v_min_u32_e32 v0, 32, v0
140; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
141; VI-NEXT:    s_endpgm
142;
143; EG-LABEL: v_ctlz_i32:
144; EG:       ; %bb.0:
145; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
146; EG-NEXT:    TEX 0 @6
147; EG-NEXT:    ALU 3, @11, KC0[CB0:0-32], KC1[]
148; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
149; EG-NEXT:    CF_END
150; EG-NEXT:    PAD
151; EG-NEXT:    Fetch clause starting at 6:
152; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
153; EG-NEXT:    ALU clause starting at 8:
154; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
155; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
156; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
157; EG-NEXT:    ALU clause starting at 11:
158; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
159; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
160; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
161; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
162;
163; GFX10-LABEL: v_ctlz_i32:
164; GFX10:       ; %bb.0:
165; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
166; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
167; GFX10-NEXT:    v_mov_b32_e32 v1, 0
168; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
169; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
170; GFX10-NEXT:    s_waitcnt vmcnt(0)
171; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
172; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
173; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
174; GFX10-NEXT:    s_endpgm
175;
176; GFX10-GISEL-LABEL: v_ctlz_i32:
177; GFX10-GISEL:       ; %bb.0:
178; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
179; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
180; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
181; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
182; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
183; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
184; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
185; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
186; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
187; GFX10-GISEL-NEXT:    s_endpgm
188;
189; GFX11-LABEL: v_ctlz_i32:
190; GFX11:       ; %bb.0:
191; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
192; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
193; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
194; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
195; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
196; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
197; GFX11-NEXT:    s_waitcnt vmcnt(0)
198; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
199; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
200; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
201; GFX11-NEXT:    s_endpgm
; IR under test: per-workitem load from %valptr[tid], ctlz, store to %out.
202  %tid = call i32 @llvm.amdgcn.workitem.id.x()
203  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
204  %val = load i32, ptr addrspace(1) %in.gep, align 4
205  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
206  store i32 %ctlz, ptr addrspace(1) %out, align 4
207  ret void
208}
209
; Kernel v_ctlz_v2i32 -- loads a <2 x i32> from %valptr at the element index
; given by the workitem id x, applies the vector ctlz intrinsic (i1 flag
; operand false) and stores the <2 x i32> result to %out with 8-byte alignment.
; The expected code handles each 32-bit lane separately: one ffbh/clz plus one
; min against 32 per lane on amdgcn, one FFBH_UINT/CNDE_INT pair per lane on
; r600. The two GFX10 variants differ only in the order the lanes are issued.
210define amdgpu_kernel void @v_ctlz_v2i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
211; SI-LABEL: v_ctlz_v2i32:
212; SI:       ; %bb.0:
213; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
214; SI-NEXT:    s_mov_b32 s7, 0xf000
215; SI-NEXT:    s_mov_b32 s10, 0
216; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
217; SI-NEXT:    v_mov_b32_e32 v1, 0
218; SI-NEXT:    s_mov_b32 s11, s7
219; SI-NEXT:    s_waitcnt lgkmcnt(0)
220; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
221; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
222; SI-NEXT:    s_mov_b32 s6, -1
223; SI-NEXT:    s_mov_b32 s4, s0
224; SI-NEXT:    s_mov_b32 s5, s1
225; SI-NEXT:    s_waitcnt vmcnt(0)
226; SI-NEXT:    v_ffbh_u32_e32 v1, v1
227; SI-NEXT:    v_ffbh_u32_e32 v0, v0
228; SI-NEXT:    v_min_u32_e32 v1, 32, v1
229; SI-NEXT:    v_min_u32_e32 v0, 32, v0
230; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
231; SI-NEXT:    s_endpgm
232;
233; VI-LABEL: v_ctlz_v2i32:
234; VI:       ; %bb.0:
235; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
236; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
237; VI-NEXT:    s_waitcnt lgkmcnt(0)
238; VI-NEXT:    v_mov_b32_e32 v1, s3
239; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
240; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
241; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
242; VI-NEXT:    s_mov_b32 s3, 0xf000
243; VI-NEXT:    s_mov_b32 s2, -1
244; VI-NEXT:    s_waitcnt vmcnt(0)
245; VI-NEXT:    v_ffbh_u32_e32 v1, v1
246; VI-NEXT:    v_ffbh_u32_e32 v0, v0
247; VI-NEXT:    v_min_u32_e32 v1, 32, v1
248; VI-NEXT:    v_min_u32_e32 v0, 32, v0
249; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
250; VI-NEXT:    s_endpgm
251;
252; EG-LABEL: v_ctlz_v2i32:
253; EG:       ; %bb.0:
254; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
255; EG-NEXT:    TEX 0 @6
256; EG-NEXT:    ALU 6, @11, KC0[CB0:0-32], KC1[]
257; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
258; EG-NEXT:    CF_END
259; EG-NEXT:    PAD
260; EG-NEXT:    Fetch clause starting at 6:
261; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
262; EG-NEXT:    ALU clause starting at 8:
263; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
264; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
265; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
266; EG-NEXT:    ALU clause starting at 11:
267; EG-NEXT:     FFBH_UINT * T0.W, T0.Y,
268; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
269; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
270; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
271; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
272; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
273; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
274;
275; GFX10-LABEL: v_ctlz_v2i32:
276; GFX10:       ; %bb.0:
277; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
278; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
279; GFX10-NEXT:    v_mov_b32_e32 v2, 0
280; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
281; GFX10-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
282; GFX10-NEXT:    s_waitcnt vmcnt(0)
283; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
284; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
285; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
286; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
287; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
288; GFX10-NEXT:    s_endpgm
289;
290; GFX10-GISEL-LABEL: v_ctlz_v2i32:
291; GFX10-GISEL:       ; %bb.0:
292; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
293; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
294; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
295; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
296; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
297; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
298; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
299; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
300; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
301; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
302; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
303; GFX10-GISEL-NEXT:    s_endpgm
304;
305; GFX11-LABEL: v_ctlz_v2i32:
306; GFX11:       ; %bb.0:
307; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
308; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
309; GFX11-NEXT:    v_mov_b32_e32 v2, 0
310; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
311; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
312; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
313; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
314; GFX11-NEXT:    s_waitcnt vmcnt(0)
315; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
316; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
317; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
318; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
319; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
320; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
321; GFX11-NEXT:    s_endpgm
; IR under test: per-workitem <2 x i32> load, vector ctlz, store to %out.
322  %tid = call i32 @llvm.amdgcn.workitem.id.x()
323  %in.gep = getelementptr <2 x i32>, ptr addrspace(1) %valptr, i32 %tid
324  %val = load <2 x i32>, ptr addrspace(1) %in.gep, align 8
325  %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
326  store <2 x i32> %ctlz, ptr addrspace(1) %out, align 8
327  ret void
328}
329
; Kernel v_ctlz_v4i32 -- loads a <4 x i32> from %valptr at the element index
; given by the workitem id x, applies the vector ctlz intrinsic (i1 flag
; operand false) and stores the <4 x i32> result to %out with 16-byte
; alignment.
; The expected code issues one ffbh/clz and one min against 32 for each of the
; four lanes on amdgcn, and one FFBH_UINT/CNDE_INT pair per lane on r600, then
; writes all four lanes with a single 128-bit store.
330define amdgpu_kernel void @v_ctlz_v4i32(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
331; SI-LABEL: v_ctlz_v4i32:
332; SI:       ; %bb.0:
333; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
334; SI-NEXT:    s_mov_b32 s7, 0xf000
335; SI-NEXT:    s_mov_b32 s10, 0
336; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
337; SI-NEXT:    v_mov_b32_e32 v1, 0
338; SI-NEXT:    s_mov_b32 s11, s7
339; SI-NEXT:    s_waitcnt lgkmcnt(0)
340; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
341; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
342; SI-NEXT:    s_mov_b32 s6, -1
343; SI-NEXT:    s_mov_b32 s4, s0
344; SI-NEXT:    s_mov_b32 s5, s1
345; SI-NEXT:    s_waitcnt vmcnt(0)
346; SI-NEXT:    v_ffbh_u32_e32 v3, v3
347; SI-NEXT:    v_ffbh_u32_e32 v2, v2
348; SI-NEXT:    v_ffbh_u32_e32 v1, v1
349; SI-NEXT:    v_ffbh_u32_e32 v0, v0
350; SI-NEXT:    v_min_u32_e32 v3, 32, v3
351; SI-NEXT:    v_min_u32_e32 v2, 32, v2
352; SI-NEXT:    v_min_u32_e32 v1, 32, v1
353; SI-NEXT:    v_min_u32_e32 v0, 32, v0
354; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
355; SI-NEXT:    s_endpgm
356;
357; VI-LABEL: v_ctlz_v4i32:
358; VI:       ; %bb.0:
359; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
360; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
361; VI-NEXT:    s_waitcnt lgkmcnt(0)
362; VI-NEXT:    v_mov_b32_e32 v1, s3
363; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
364; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
365; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
366; VI-NEXT:    s_mov_b32 s3, 0xf000
367; VI-NEXT:    s_mov_b32 s2, -1
368; VI-NEXT:    s_waitcnt vmcnt(0)
369; VI-NEXT:    v_ffbh_u32_e32 v3, v3
370; VI-NEXT:    v_ffbh_u32_e32 v2, v2
371; VI-NEXT:    v_ffbh_u32_e32 v1, v1
372; VI-NEXT:    v_ffbh_u32_e32 v0, v0
373; VI-NEXT:    v_min_u32_e32 v3, 32, v3
374; VI-NEXT:    v_min_u32_e32 v2, 32, v2
375; VI-NEXT:    v_min_u32_e32 v1, 32, v1
376; VI-NEXT:    v_min_u32_e32 v0, 32, v0
377; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
378; VI-NEXT:    s_endpgm
379;
380; EG-LABEL: v_ctlz_v4i32:
381; EG:       ; %bb.0:
382; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
383; EG-NEXT:    TEX 0 @6
384; EG-NEXT:    ALU 12, @11, KC0[CB0:0-32], KC1[]
385; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1
386; EG-NEXT:    CF_END
387; EG-NEXT:    PAD
388; EG-NEXT:    Fetch clause starting at 6:
389; EG-NEXT:     VTX_READ_128 T0.XYZW, T0.X, 0, #1
390; EG-NEXT:    ALU clause starting at 8:
391; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
392; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
393; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
394; EG-NEXT:    ALU clause starting at 11:
395; EG-NEXT:     FFBH_UINT * T1.W, T0.W,
396; EG-NEXT:     FFBH_UINT T2.W, T0.Z,
397; EG-NEXT:     CNDE_INT * T0.W, T0.W, literal.x, PV.W, BS:VEC_021/SCL_122
398; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
399; EG-NEXT:     CNDE_INT T0.Z, T0.Z, literal.x, PV.W,
400; EG-NEXT:     FFBH_UINT * T1.W, T0.Y,
401; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
402; EG-NEXT:     CNDE_INT T0.Y, T0.Y, literal.x, PV.W,
403; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
404; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
405; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
406; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
407; EG-NEXT:    32(4.484155e-44), 2(2.802597e-45)
408;
409; GFX10-LABEL: v_ctlz_v4i32:
410; GFX10:       ; %bb.0:
411; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
412; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
413; GFX10-NEXT:    v_mov_b32_e32 v4, 0
414; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
415; GFX10-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
416; GFX10-NEXT:    s_waitcnt vmcnt(0)
417; GFX10-NEXT:    v_ffbh_u32_e32 v3, v3
418; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
419; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
420; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
421; GFX10-NEXT:    v_min_u32_e32 v3, 32, v3
422; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
423; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
424; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
425; GFX10-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
426; GFX10-NEXT:    s_endpgm
427;
428; GFX10-GISEL-LABEL: v_ctlz_v4i32:
429; GFX10-GISEL:       ; %bb.0:
430; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
431; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
432; GFX10-GISEL-NEXT:    v_mov_b32_e32 v4, 0
433; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
434; GFX10-GISEL-NEXT:    global_load_dwordx4 v[0:3], v0, s[2:3]
435; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
436; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
437; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
438; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
439; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v3, v3
440; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
441; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
442; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
443; GFX10-GISEL-NEXT:    v_min_u32_e32 v3, 32, v3
444; GFX10-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
445; GFX10-GISEL-NEXT:    s_endpgm
446;
447; GFX11-LABEL: v_ctlz_v4i32:
448; GFX11:       ; %bb.0:
449; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
450; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
451; GFX11-NEXT:    v_mov_b32_e32 v4, 0
452; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
453; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
454; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
455; GFX11-NEXT:    global_load_b128 v[0:3], v0, s[2:3]
456; GFX11-NEXT:    s_waitcnt vmcnt(0)
457; GFX11-NEXT:    v_clz_i32_u32_e32 v3, v3
458; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v2
459; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
460; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
461; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
462; GFX11-NEXT:    v_min_u32_e32 v3, 32, v3
463; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
464; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
465; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
466; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
467; GFX11-NEXT:    global_store_b128 v4, v[0:3], s[0:1]
468; GFX11-NEXT:    s_endpgm
; IR under test: per-workitem <4 x i32> load, vector ctlz, store to %out.
469  %tid = call i32 @llvm.amdgcn.workitem.id.x()
470  %in.gep = getelementptr <4 x i32>, ptr addrspace(1) %valptr, i32 %tid
471  %val = load <4 x i32>, ptr addrspace(1) %in.gep, align 16
472  %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
473  store <4 x i32> %ctlz, ptr addrspace(1) %out, align 16
474  ret void
475}
476
; Kernel v_ctlz_i8 -- loads a single i8 directly from %valptr (no workitem-id
; indexing), counts its leading zeros with the i8 overload of ctlz (i1 flag
; operand false) and stores the i8 result to %out.
; The expected code performs a byte load, runs the same 32-bit ffbh/clz and
; min-against-32 (or CNDE_INT select of 32) sequence as the i32 tests, then
; adjusts the count by 24 -- emitted as a subtract of 24, or as an add of
; 0xffffffe8 / -24 -- before a byte-sized store.
; NOTE(review): the 24 is presumably 32 - 8, rebasing the 32-bit count onto the
; 8-bit width; confirm against the lowering code.
477define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
478; SI-LABEL: v_ctlz_i8:
479; SI:       ; %bb.0:
480; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
481; SI-NEXT:    s_mov_b32 s7, 0xf000
482; SI-NEXT:    s_mov_b32 s6, -1
483; SI-NEXT:    s_mov_b32 s10, s6
484; SI-NEXT:    s_mov_b32 s11, s7
485; SI-NEXT:    s_waitcnt lgkmcnt(0)
486; SI-NEXT:    s_mov_b32 s8, s2
487; SI-NEXT:    s_mov_b32 s9, s3
488; SI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
489; SI-NEXT:    s_mov_b32 s4, s0
490; SI-NEXT:    s_mov_b32 s5, s1
491; SI-NEXT:    s_waitcnt vmcnt(0)
492; SI-NEXT:    v_ffbh_u32_e32 v0, v0
493; SI-NEXT:    v_min_u32_e32 v0, 32, v0
494; SI-NEXT:    v_subrev_i32_e32 v0, vcc, 24, v0
495; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
496; SI-NEXT:    s_endpgm
497;
498; VI-LABEL: v_ctlz_i8:
499; VI:       ; %bb.0:
500; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
501; VI-NEXT:    s_mov_b32 s7, 0xf000
502; VI-NEXT:    s_mov_b32 s6, -1
503; VI-NEXT:    s_mov_b32 s10, s6
504; VI-NEXT:    s_mov_b32 s11, s7
505; VI-NEXT:    s_waitcnt lgkmcnt(0)
506; VI-NEXT:    s_mov_b32 s8, s2
507; VI-NEXT:    s_mov_b32 s9, s3
508; VI-NEXT:    buffer_load_ubyte v0, off, s[8:11], 0
509; VI-NEXT:    s_mov_b32 s4, s0
510; VI-NEXT:    s_mov_b32 s5, s1
511; VI-NEXT:    s_waitcnt vmcnt(0)
512; VI-NEXT:    v_ffbh_u32_e32 v0, v0
513; VI-NEXT:    v_min_u32_e32 v0, 32, v0
514; VI-NEXT:    v_subrev_u32_e32 v0, vcc, 24, v0
515; VI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
516; VI-NEXT:    s_endpgm
517;
518; EG-LABEL: v_ctlz_i8:
519; EG:       ; %bb.0:
520; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
521; EG-NEXT:    TEX 0 @6
522; EG-NEXT:    ALU 15, @9, KC0[CB0:0-32], KC1[]
523; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
524; EG-NEXT:    CF_END
525; EG-NEXT:    PAD
526; EG-NEXT:    Fetch clause starting at 6:
527; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
528; EG-NEXT:    ALU clause starting at 8:
529; EG-NEXT:     MOV * T0.X, KC0[2].Z,
530; EG-NEXT:    ALU clause starting at 9:
531; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
532; EG-NEXT:     CNDE_INT T0.W, T0.X, literal.x, PV.W,
533; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.y,
534; EG-NEXT:    32(4.484155e-44), 3(4.203895e-45)
535; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
536; EG-NEXT:    -24(nan), 0(0.000000e+00)
537; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
538; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
539; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
540; EG-NEXT:     LSHL T0.X, PV.W, PS,
541; EG-NEXT:     LSHL * T0.W, literal.x, PS,
542; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
543; EG-NEXT:     MOV T0.Y, 0.0,
544; EG-NEXT:     MOV * T0.Z, 0.0,
545; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
546; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
547;
548; GFX10-LABEL: v_ctlz_i8:
549; GFX10:       ; %bb.0:
550; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
551; GFX10-NEXT:    v_mov_b32_e32 v0, 0
552; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
553; GFX10-NEXT:    global_load_ubyte v1, v0, s[2:3]
554; GFX10-NEXT:    s_waitcnt vmcnt(0)
555; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
556; GFX10-NEXT:    v_min_u32_e32 v1, 32, v1
557; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
558; GFX10-NEXT:    global_store_byte v0, v1, s[0:1]
559; GFX10-NEXT:    s_endpgm
560;
561; GFX10-GISEL-LABEL: v_ctlz_i8:
562; GFX10-GISEL:       ; %bb.0:
563; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
564; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
565; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
566; GFX10-GISEL-NEXT:    global_load_ubyte v1, v0, s[2:3]
567; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
568; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
569; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
570; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v1, 0xffffffe8, v1
571; GFX10-GISEL-NEXT:    global_store_byte v0, v1, s[0:1]
572; GFX10-GISEL-NEXT:    s_endpgm
573;
574; GFX11-LABEL: v_ctlz_i8:
575; GFX11:       ; %bb.0:
576; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
577; GFX11-NEXT:    v_mov_b32_e32 v0, 0
578; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
579; GFX11-NEXT:    global_load_u8 v1, v0, s[2:3]
580; GFX11-NEXT:    s_waitcnt vmcnt(0)
581; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
582; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
583; GFX11-NEXT:    v_min_u32_e32 v1, 32, v1
584; GFX11-NEXT:    v_subrev_nc_u32_e32 v1, 24, v1
585; GFX11-NEXT:    global_store_b8 v0, v1, s[0:1]
586; GFX11-NEXT:    s_endpgm
; IR under test: i8 load from the base pointer, i8 ctlz, i8 store.
587  %val = load i8, ptr addrspace(1) %valptr
588  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
589  store i8 %ctlz, ptr addrspace(1) %out
590  ret void
591}
592
; Kernel s_ctlz_i64 -- counts the leading zeros of the i64 kernel argument %val
; (i1 flag operand false) and stores the i64 result to %out.
; An unnamed [8 x i32] argument sits between %out and %val, so the expected
; loads fetch %val from a later kernarg offset (0x13 / 0x4c) than %out
; (0x9 / 0x24).
; The expected amdgcn code uses a single 64-bit scalar flbit/clz followed by
; s_min_u32 against 64 and writes zero as the upper dword of the result. The
; expected r600 code builds the count from two 32-bit FFBH_UINT operations,
; adding 32 to one of them and selecting between the two with CNDE_INT.
593define amdgpu_kernel void @s_ctlz_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
594; SI-LABEL: s_ctlz_i64:
595; SI:       ; %bb.0:
596; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
597; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
598; SI-NEXT:    s_mov_b32 s3, 0xf000
599; SI-NEXT:    s_mov_b32 s2, -1
600; SI-NEXT:    s_waitcnt lgkmcnt(0)
601; SI-NEXT:    s_flbit_i32_b64 s4, s[6:7]
602; SI-NEXT:    s_min_u32 s4, s4, 64
603; SI-NEXT:    v_mov_b32_e32 v1, 0
604; SI-NEXT:    v_mov_b32_e32 v0, s4
605; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
606; SI-NEXT:    s_endpgm
607;
608; VI-LABEL: s_ctlz_i64:
609; VI:       ; %bb.0:
610; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
611; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
612; VI-NEXT:    s_mov_b32 s3, 0xf000
613; VI-NEXT:    s_mov_b32 s2, -1
614; VI-NEXT:    v_mov_b32_e32 v1, 0
615; VI-NEXT:    s_waitcnt lgkmcnt(0)
616; VI-NEXT:    s_flbit_i32_b64 s4, s[6:7]
617; VI-NEXT:    s_min_u32 s4, s4, 64
618; VI-NEXT:    v_mov_b32_e32 v0, s4
619; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
620; VI-NEXT:    s_endpgm
621;
622; EG-LABEL: s_ctlz_i64:
623; EG:       ; %bb.0:
624; EG-NEXT:    ALU 9, @4, KC0[CB0:0-32], KC1[]
625; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
626; EG-NEXT:    CF_END
627; EG-NEXT:    PAD
628; EG-NEXT:    ALU clause starting at 4:
629; EG-NEXT:     FFBH_UINT * T0.W, KC0[4].W,
630; EG-NEXT:     CNDE_INT * T0.W, KC0[4].W, literal.x, PV.W,
631; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
632; EG-NEXT:     FFBH_UINT T1.W, KC0[5].X,
633; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
634; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
635; EG-NEXT:     CNDE_INT T0.X, KC0[5].X, PS, PV.W,
636; EG-NEXT:     MOV T0.Y, 0.0,
637; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
638; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
639;
640; GFX10-LABEL: s_ctlz_i64:
641; GFX10:       ; %bb.0:
642; GFX10-NEXT:    s_clause 0x1
643; GFX10-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
644; GFX10-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
645; GFX10-NEXT:    v_mov_b32_e32 v1, 0
646; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
647; GFX10-NEXT:    s_flbit_i32_b64 s0, s[0:1]
648; GFX10-NEXT:    s_min_u32 s0, s0, 64
649; GFX10-NEXT:    v_mov_b32_e32 v0, s0
650; GFX10-NEXT:    global_store_dwordx2 v1, v[0:1], s[2:3]
651; GFX10-NEXT:    s_endpgm
652;
653; GFX10-GISEL-LABEL: s_ctlz_i64:
654; GFX10-GISEL:       ; %bb.0:
655; GFX10-GISEL-NEXT:    s_clause 0x1
656; GFX10-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x4c
657; GFX10-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
658; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, 0
659; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
660; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s0, s[0:1]
661; GFX10-GISEL-NEXT:    s_mov_b32 s1, 0
662; GFX10-GISEL-NEXT:    s_min_u32 s0, s0, 64
663; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s0
664; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s1
665; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
666; GFX10-GISEL-NEXT:    s_endpgm
667;
668; GFX11-LABEL: s_ctlz_i64:
669; GFX11:       ; %bb.0:
670; GFX11-NEXT:    s_clause 0x1
671; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x4c
672; GFX11-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
673; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
674; GFX11-NEXT:    s_clz_i32_u64 s0, s[0:1]
675; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
676; GFX11-NEXT:    s_min_u32 s0, s0, 64
677; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s0
678; GFX11-NEXT:    global_store_b64 v1, v[0:1], s[2:3]
679; GFX11-NEXT:    s_endpgm
; IR under test: i64 ctlz of the argument, full 64-bit result stored.
680  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
681  store i64 %ctlz, ptr addrspace(1) %out
682  ret void
683}
684
; Kernel s_ctlz_i64_trunc -- counts the leading zeros of the i64 kernel
; argument %val (i1 flag operand false), truncates the i64 count to i32 and
; stores only that i32 to %out.
; Unlike s_ctlz_i64 there is no padding argument, so %out and %val are fetched
; together with one four-dword load on the amdgcn targets.
; The expected code keeps the 64-bit flbit/clz plus min-against-64 sequence
; but emits a single dword store and no zero for an upper half.
685define amdgpu_kernel void @s_ctlz_i64_trunc(ptr addrspace(1) noalias %out, i64 %val) nounwind {
686; SI-LABEL: s_ctlz_i64_trunc:
687; SI:       ; %bb.0:
688; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
689; SI-NEXT:    s_mov_b32 s7, 0xf000
690; SI-NEXT:    s_waitcnt lgkmcnt(0)
691; SI-NEXT:    s_flbit_i32_b64 s2, s[2:3]
692; SI-NEXT:    s_min_u32 s2, s2, 64
693; SI-NEXT:    s_mov_b32 s6, -1
694; SI-NEXT:    s_mov_b32 s4, s0
695; SI-NEXT:    s_mov_b32 s5, s1
696; SI-NEXT:    v_mov_b32_e32 v0, s2
697; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
698; SI-NEXT:    s_endpgm
699;
700; VI-LABEL: s_ctlz_i64_trunc:
701; VI:       ; %bb.0:
702; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
703; VI-NEXT:    s_mov_b32 s7, 0xf000
704; VI-NEXT:    s_mov_b32 s6, -1
705; VI-NEXT:    s_waitcnt lgkmcnt(0)
706; VI-NEXT:    s_mov_b32 s4, s0
707; VI-NEXT:    s_flbit_i32_b64 s0, s[2:3]
708; VI-NEXT:    s_min_u32 s0, s0, 64
709; VI-NEXT:    s_mov_b32 s5, s1
710; VI-NEXT:    v_mov_b32_e32 v0, s0
711; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
712; VI-NEXT:    s_endpgm
713;
714; EG-LABEL: s_ctlz_i64_trunc:
715; EG:       ; %bb.0:
716; EG-NEXT:    ALU 8, @4, KC0[CB0:0-32], KC1[]
717; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
718; EG-NEXT:    CF_END
719; EG-NEXT:    PAD
720; EG-NEXT:    ALU clause starting at 4:
721; EG-NEXT:     FFBH_UINT * T0.W, KC0[2].W,
722; EG-NEXT:     CNDE_INT * T0.W, KC0[2].W, literal.x, PV.W,
723; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
724; EG-NEXT:     FFBH_UINT T1.W, KC0[3].X,
725; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.x,
726; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
727; EG-NEXT:     CNDE_INT T0.X, KC0[3].X, PS, PV.W,
728; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
729; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
730;
731; GFX10-LABEL: s_ctlz_i64_trunc:
732; GFX10:       ; %bb.0:
733; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
734; GFX10-NEXT:    v_mov_b32_e32 v0, 0
735; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
736; GFX10-NEXT:    s_flbit_i32_b64 s2, s[2:3]
737; GFX10-NEXT:    s_min_u32 s2, s2, 64
738; GFX10-NEXT:    v_mov_b32_e32 v1, s2
739; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
740; GFX10-NEXT:    s_endpgm
741;
742; GFX10-GISEL-LABEL: s_ctlz_i64_trunc:
743; GFX10-GISEL:       ; %bb.0:
744; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
745; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
746; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
747; GFX10-GISEL-NEXT:    s_flbit_i32_b64 s2, s[2:3]
748; GFX10-GISEL-NEXT:    s_min_u32 s2, s2, 64
749; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, s2
750; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
751; GFX10-GISEL-NEXT:    s_endpgm
752;
753; GFX11-LABEL: s_ctlz_i64_trunc:
754; GFX11:       ; %bb.0:
755; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
756; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
757; GFX11-NEXT:    s_clz_i32_u64 s2, s[2:3]
758; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
759; GFX11-NEXT:    s_min_u32 s2, s2, 64
760; GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
761; GFX11-NEXT:    global_store_b32 v0, v1, s[0:1]
762; GFX11-NEXT:    s_endpgm
; IR under test: i64 ctlz, truncate to i32, store the truncated value.
763  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
764  %trunc = trunc i64 %ctlz to i32
765  store i32 %trunc, ptr addrspace(1) %out
766  ret void
767}
768
; Per-workitem i64 ctlz test: each workitem loads an i64 from %in at index
; workitem.id.x, calls llvm.ctlz.i64 with the second operand `i1 false`, and
; stores the full i64 result to %out at the same index.
; The amdgcn check blocks below expect the 64-bit count to be assembled from two
; 32-bit ffbh/clz results (one of them gets 32 added) combined with an unsigned
; min against 64; the r600 (EG) block uses FFBH_UINT plus CNDE_INT selects.
; NOTE(review): `i1 false` presumably means a zero input is not poison (see the
; LLVM LangRef entry for llvm.ctlz), which would explain the clamp to 64.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
769define amdgpu_kernel void @v_ctlz_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
770; SI-LABEL: v_ctlz_i64:
771; SI:       ; %bb.0:
772; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
773; SI-NEXT:    s_mov_b32 s7, 0xf000
774; SI-NEXT:    s_mov_b32 s6, 0
775; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
776; SI-NEXT:    v_mov_b32_e32 v1, 0
777; SI-NEXT:    s_waitcnt lgkmcnt(0)
778; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
779; SI-NEXT:    buffer_load_dwordx2 v[2:3], v[0:1], s[4:7], 0 addr64
780; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
781; SI-NEXT:    s_waitcnt vmcnt(0)
782; SI-NEXT:    v_ffbh_u32_e32 v2, v2
783; SI-NEXT:    v_min_u32_e32 v2, 0xffffffdf, v2
784; SI-NEXT:    v_add_i32_e32 v2, vcc, 32, v2
785; SI-NEXT:    v_ffbh_u32_e32 v3, v3
786; SI-NEXT:    v_min3_u32 v2, v2, v3, 64
787; SI-NEXT:    v_mov_b32_e32 v3, v1
788; SI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
789; SI-NEXT:    s_endpgm
790;
791; VI-LABEL: v_ctlz_i64:
792; VI:       ; %bb.0:
793; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
794; VI-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
795; VI-NEXT:    v_mov_b32_e32 v2, 0
796; VI-NEXT:    s_waitcnt lgkmcnt(0)
797; VI-NEXT:    v_mov_b32_e32 v1, s3
798; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v3
799; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
800; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
801; VI-NEXT:    v_mov_b32_e32 v4, s1
802; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v3
803; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
804; VI-NEXT:    s_waitcnt vmcnt(0)
805; VI-NEXT:    v_ffbh_u32_e32 v0, v0
806; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
807; VI-NEXT:    v_ffbh_u32_e32 v1, v1
808; VI-NEXT:    v_min3_u32 v1, v0, v1, 64
809; VI-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
810; VI-NEXT:    s_endpgm
811;
812; EG-LABEL: v_ctlz_i64:
813; EG:       ; %bb.0:
814; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
815; EG-NEXT:    TEX 0 @6
816; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
817; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
818; EG-NEXT:    CF_END
819; EG-NEXT:    PAD
820; EG-NEXT:    Fetch clause starting at 6:
821; EG-NEXT:     VTX_READ_64 T0.XY, T0.X, 0, #1
822; EG-NEXT:    ALU clause starting at 8:
823; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
824; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
825; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
826; EG-NEXT:    ALU clause starting at 11:
827; EG-NEXT:     FFBH_UINT * T1.W, T0.X,
828; EG-NEXT:     CNDE_INT * T1.W, T0.X, literal.x, PV.W,
829; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
830; EG-NEXT:     FFBH_UINT T2.W, T0.Y,
831; EG-NEXT:     ADD_INT * T1.W, PV.W, literal.x,
832; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
833; EG-NEXT:     CNDE_INT T0.X, T0.Y, PS, PV.W,
834; EG-NEXT:     MOV T0.Y, 0.0,
835; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, T0.W,
836; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
837; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
838;
839; GFX10-LABEL: v_ctlz_i64:
840; GFX10:       ; %bb.0:
841; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
842; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
843; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
844; GFX10-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
845; GFX10-NEXT:    s_waitcnt vmcnt(0)
846; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
847; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
848; GFX10-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
849; GFX10-NEXT:    v_min3_u32 v0, v0, v1, 64
850; GFX10-NEXT:    v_mov_b32_e32 v1, 0
851; GFX10-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
852; GFX10-NEXT:    s_endpgm
853;
854; GFX10-GISEL-LABEL: v_ctlz_i64:
855; GFX10-GISEL:       ; %bb.0:
856; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
857; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
858; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
859; GFX10-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
860; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
861; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
862; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
863; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
864; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, v1, v0
865; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
866; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 64, v0
867; GFX10-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
868; GFX10-GISEL-NEXT:    s_endpgm
869;
870; GFX11-LABEL: v_ctlz_i64:
871; GFX11:       ; %bb.0:
872; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
873; GFX11-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
874; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
875; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
876; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
877; GFX11-NEXT:    global_load_b64 v[0:1], v2, s[2:3]
878; GFX11-NEXT:    s_waitcnt vmcnt(0)
879; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
880; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
881; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
882; GFX11-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
883; GFX11-NEXT:    v_min3_u32 v0, v0, v1, 64
884; GFX11-NEXT:    v_mov_b32_e32 v1, 0
885; GFX11-NEXT:    global_store_b64 v2, v[0:1], s[0:1]
886; GFX11-NEXT:    s_endpgm
887  %tid = call i32 @llvm.amdgcn.workitem.id.x()
888  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
889  %out.gep = getelementptr i64, ptr addrspace(1) %out, i32 %tid
890  %val = load i64, ptr addrspace(1) %in.gep
891  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
892  store i64 %ctlz, ptr addrspace(1) %out.gep
893  ret void
894}
895
; Per-workitem i64 ctlz whose result is truncated to i32: each workitem loads an
; i64 from %in at index workitem.id.x, calls llvm.ctlz.i64 (second operand
; `i1 false`), truncates the count to i32 and stores it to %out, which is
; indexed as an i32 array.
; Every check block below ends in a single-dword store, and the expected code
; shifts the workitem id by 3 for the load address and by 2 for the store
; address.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
896define amdgpu_kernel void @v_ctlz_i64_trunc(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
897; SI-LABEL: v_ctlz_i64_trunc:
898; SI:       ; %bb.0:
899; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
900; SI-NEXT:    s_mov_b32 s7, 0xf000
901; SI-NEXT:    s_mov_b32 s6, 0
902; SI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
903; SI-NEXT:    v_mov_b32_e32 v2, 0
904; SI-NEXT:    s_waitcnt lgkmcnt(0)
905; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
906; SI-NEXT:    buffer_load_dwordx2 v[3:4], v[1:2], s[4:7], 0 addr64
907; SI-NEXT:    v_lshlrev_b32_e32 v1, 2, v0
908; SI-NEXT:    s_mov_b64 s[2:3], s[6:7]
909; SI-NEXT:    s_waitcnt vmcnt(0)
910; SI-NEXT:    v_ffbh_u32_e32 v0, v3
911; SI-NEXT:    v_min_u32_e32 v0, 0xffffffdf, v0
912; SI-NEXT:    v_add_i32_e32 v0, vcc, 32, v0
913; SI-NEXT:    v_ffbh_u32_e32 v3, v4
914; SI-NEXT:    v_min3_u32 v0, v0, v3, 64
915; SI-NEXT:    buffer_store_dword v0, v[1:2], s[0:3], 0 addr64
916; SI-NEXT:    s_endpgm
917;
918; VI-LABEL: v_ctlz_i64_trunc:
919; VI:       ; %bb.0:
920; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
921; VI-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
922; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
923; VI-NEXT:    s_waitcnt lgkmcnt(0)
924; VI-NEXT:    v_mov_b32_e32 v2, s3
925; VI-NEXT:    v_add_u32_e32 v1, vcc, s2, v1
926; VI-NEXT:    v_addc_u32_e32 v2, vcc, 0, v2, vcc
927; VI-NEXT:    flat_load_dwordx2 v[1:2], v[1:2]
928; VI-NEXT:    v_add_u32_e32 v3, vcc, s0, v0
929; VI-NEXT:    v_mov_b32_e32 v4, s1
930; VI-NEXT:    v_addc_u32_e32 v4, vcc, 0, v4, vcc
931; VI-NEXT:    s_waitcnt vmcnt(0)
932; VI-NEXT:    v_ffbh_u32_e32 v0, v1
933; VI-NEXT:    v_add_u32_e64 v0, s[0:1], v0, 32 clamp
934; VI-NEXT:    v_ffbh_u32_e32 v1, v2
935; VI-NEXT:    v_min3_u32 v0, v0, v1, 64
936; VI-NEXT:    flat_store_dword v[3:4], v0
937; VI-NEXT:    s_endpgm
938;
939; EG-LABEL: v_ctlz_i64_trunc:
940; EG:       ; %bb.0:
941; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
942; EG-NEXT:    TEX 0 @6
943; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
944; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
945; EG-NEXT:    CF_END
946; EG-NEXT:    PAD
947; EG-NEXT:    Fetch clause starting at 6:
948; EG-NEXT:     VTX_READ_64 T1.XY, T1.X, 0, #1
949; EG-NEXT:    ALU clause starting at 8:
950; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
951; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
952; EG-NEXT:     ADD_INT * T1.X, KC0[2].Z, PV.W,
953; EG-NEXT:    ALU clause starting at 11:
954; EG-NEXT:     FFBH_UINT * T0.W, T1.X,
955; EG-NEXT:     CNDE_INT * T0.W, T1.X, literal.x, PV.W,
956; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
957; EG-NEXT:     LSHL T0.Z, T0.X, literal.x,
958; EG-NEXT:     FFBH_UINT T1.W, T1.Y,
959; EG-NEXT:     ADD_INT * T0.W, PV.W, literal.y,
960; EG-NEXT:    2(2.802597e-45), 32(4.484155e-44)
961; EG-NEXT:     CNDE_INT T0.X, T1.Y, PS, PV.W,
962; EG-NEXT:     ADD_INT * T0.W, KC0[2].Y, PV.Z,
963; EG-NEXT:     LSHR * T1.X, PV.W, literal.x,
964; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
965;
966; GFX10-LABEL: v_ctlz_i64_trunc:
967; GFX10:       ; %bb.0:
968; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
969; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
970; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
971; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
972; GFX10-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
973; GFX10-NEXT:    s_waitcnt vmcnt(0)
974; GFX10-NEXT:    v_ffbh_u32_e32 v1, v1
975; GFX10-NEXT:    v_ffbh_u32_e32 v2, v2
976; GFX10-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
977; GFX10-NEXT:    v_min3_u32 v1, v1, v2, 64
978; GFX10-NEXT:    global_store_dword v0, v1, s[0:1]
979; GFX10-NEXT:    s_endpgm
980;
981; GFX10-GISEL-LABEL: v_ctlz_i64_trunc:
982; GFX10-GISEL:       ; %bb.0:
983; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
984; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 3, v0
985; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
986; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
987; GFX10-GISEL-NEXT:    global_load_dwordx2 v[1:2], v1, s[2:3]
988; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
989; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v1
990; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v2
991; GFX10-GISEL-NEXT:    v_add_nc_u32_e64 v1, v1, 32 clamp
992; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, v2, v1
993; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 64, v1
994; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
995; GFX10-GISEL-NEXT:    s_endpgm
996;
997; GFX11-LABEL: v_ctlz_i64_trunc:
998; GFX11:       ; %bb.0:
999; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1000; GFX11-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
1001; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1002; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 3, v2
1003; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 2, v2
1004; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1005; GFX11-NEXT:    global_load_b64 v[0:1], v0, s[2:3]
1006; GFX11-NEXT:    s_waitcnt vmcnt(0)
1007; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1008; GFX11-NEXT:    v_clz_i32_u32_e32 v1, v1
1009; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1010; GFX11-NEXT:    v_add_nc_u32_e64 v0, v0, 32 clamp
1011; GFX11-NEXT:    v_min3_u32 v0, v0, v1, 64
1012; GFX11-NEXT:    global_store_b32 v2, v0, s[0:1]
1013; GFX11-NEXT:    s_endpgm
1014  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1015  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
1016  %out.gep = getelementptr i32, ptr addrspace(1) %out, i32 %tid
1017  %val = load i64, ptr addrspace(1) %in.gep
1018  %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
1019  %trunc = trunc i64 %ctlz to i32
1020  store i32 %trunc, ptr addrspace(1) %out.gep
1021  ret void
1022}
1023
; Exercises `select (icmp eq %val, 0), -1, (ctlz %val)`: an i32 is loaded from
; %valptr at index workitem.id.x, passed to llvm.ctlz.i32 (second operand
; `i1 false`), and the result is replaced by -1 when the input is zero. The
; store goes to %out itself; only the load is indexed by the workitem id.
; In the SI, VI, GFX10 and GFX11 check blocks no compare or select is expected,
; only a bare v_ffbh_u32 / v_clz_i32_u32 (presumably because that instruction
; already yields -1 for a zero input -- TODO confirm against the ISA docs).
; The GFX10-GISEL block still expects v_cmp_eq_u32 + v_cndmask_b32, and the EG
; block expects two CNDE_INT selects.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1024define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1025; SI-LABEL: v_ctlz_i32_sel_eq_neg1:
1026; SI:       ; %bb.0:
1027; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1028; SI-NEXT:    s_mov_b32 s7, 0xf000
1029; SI-NEXT:    s_mov_b32 s10, 0
1030; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1031; SI-NEXT:    v_mov_b32_e32 v1, 0
1032; SI-NEXT:    s_mov_b32 s11, s7
1033; SI-NEXT:    s_waitcnt lgkmcnt(0)
1034; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1035; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1036; SI-NEXT:    s_mov_b32 s6, -1
1037; SI-NEXT:    s_mov_b32 s4, s0
1038; SI-NEXT:    s_mov_b32 s5, s1
1039; SI-NEXT:    s_waitcnt vmcnt(0)
1040; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1041; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1042; SI-NEXT:    s_endpgm
1043;
1044; VI-LABEL: v_ctlz_i32_sel_eq_neg1:
1045; VI:       ; %bb.0:
1046; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1047; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1048; VI-NEXT:    s_waitcnt lgkmcnt(0)
1049; VI-NEXT:    v_mov_b32_e32 v1, s3
1050; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1051; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1052; VI-NEXT:    flat_load_dword v0, v[0:1]
1053; VI-NEXT:    s_mov_b32 s3, 0xf000
1054; VI-NEXT:    s_mov_b32 s2, -1
1055; VI-NEXT:    s_waitcnt vmcnt(0)
1056; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1057; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1058; VI-NEXT:    s_endpgm
1059;
1060; EG-LABEL: v_ctlz_i32_sel_eq_neg1:
1061; EG:       ; %bb.0:
1062; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1063; EG-NEXT:    TEX 0 @6
1064; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
1065; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1066; EG-NEXT:    CF_END
1067; EG-NEXT:    PAD
1068; EG-NEXT:    Fetch clause starting at 6:
1069; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1070; EG-NEXT:    ALU clause starting at 8:
1071; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1072; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1073; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1074; EG-NEXT:    ALU clause starting at 11:
1075; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1076; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1077; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1078; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1079; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1080; EG-NEXT:    -1(nan), 2(2.802597e-45)
1081;
1082; GFX10-LABEL: v_ctlz_i32_sel_eq_neg1:
1083; GFX10:       ; %bb.0:
1084; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1085; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1086; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1087; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1088; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1089; GFX10-NEXT:    s_waitcnt vmcnt(0)
1090; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1091; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1092; GFX10-NEXT:    s_endpgm
1093;
1094; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_neg1:
1095; GFX10-GISEL:       ; %bb.0:
1096; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1097; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1098; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1099; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1100; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1101; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1102; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1103; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1104; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, -1, vcc_lo
1105; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1106; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1107; GFX10-GISEL-NEXT:    s_endpgm
1108;
1109; GFX11-LABEL: v_ctlz_i32_sel_eq_neg1:
1110; GFX11:       ; %bb.0:
1111; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1112; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1113; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1114; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1115; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1116; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1117; GFX11-NEXT:    s_waitcnt vmcnt(0)
1118; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1119; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1120; GFX11-NEXT:    s_endpgm
1121  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1122  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1123  %val = load i32, ptr addrspace(1) %in.gep
1124  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1125  %cmp = icmp eq i32 %val, 0
1126  %sel = select i1 %cmp, i32 -1, i32 %ctlz
1127  store i32 %sel, ptr addrspace(1) %out
1128  ret void
1129}
1130
; Exercises `select (icmp ne %val, 0), (ctlz %val), -1`, i.e. the same "-1 for a
; zero input" pattern written with an inverted compare and swapped select
; operands. An i32 is loaded from %valptr at index workitem.id.x and passed to
; llvm.ctlz.i32 (second operand `i1 false`); the store goes to %out itself.
; In the SI, VI, GFX10 and GFX11 check blocks no compare or select is expected,
; only a bare v_ffbh_u32 / v_clz_i32_u32 (presumably because that instruction
; already yields -1 for a zero input -- TODO confirm against the ISA docs).
; The GFX10-GISEL block still expects v_cmp_ne_u32 + v_cndmask_b32, and the EG
; block expects two CNDE_INT selects.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1131define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1132; SI-LABEL: v_ctlz_i32_sel_ne_neg1:
1133; SI:       ; %bb.0:
1134; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1135; SI-NEXT:    s_mov_b32 s7, 0xf000
1136; SI-NEXT:    s_mov_b32 s10, 0
1137; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1138; SI-NEXT:    v_mov_b32_e32 v1, 0
1139; SI-NEXT:    s_mov_b32 s11, s7
1140; SI-NEXT:    s_waitcnt lgkmcnt(0)
1141; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1142; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1143; SI-NEXT:    s_mov_b32 s6, -1
1144; SI-NEXT:    s_mov_b32 s4, s0
1145; SI-NEXT:    s_mov_b32 s5, s1
1146; SI-NEXT:    s_waitcnt vmcnt(0)
1147; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1148; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1149; SI-NEXT:    s_endpgm
1150;
1151; VI-LABEL: v_ctlz_i32_sel_ne_neg1:
1152; VI:       ; %bb.0:
1153; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1154; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1155; VI-NEXT:    s_waitcnt lgkmcnt(0)
1156; VI-NEXT:    v_mov_b32_e32 v1, s3
1157; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1158; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1159; VI-NEXT:    flat_load_dword v0, v[0:1]
1160; VI-NEXT:    s_mov_b32 s3, 0xf000
1161; VI-NEXT:    s_mov_b32 s2, -1
1162; VI-NEXT:    s_waitcnt vmcnt(0)
1163; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1164; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1165; VI-NEXT:    s_endpgm
1166;
1167; EG-LABEL: v_ctlz_i32_sel_ne_neg1:
1168; EG:       ; %bb.0:
1169; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1170; EG-NEXT:    TEX 0 @6
1171; EG-NEXT:    ALU 5, @11, KC0[CB0:0-32], KC1[]
1172; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1173; EG-NEXT:    CF_END
1174; EG-NEXT:    PAD
1175; EG-NEXT:    Fetch clause starting at 6:
1176; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1177; EG-NEXT:    ALU clause starting at 8:
1178; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1179; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1180; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1181; EG-NEXT:    ALU clause starting at 11:
1182; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1183; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1184; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1185; EG-NEXT:     CNDE_INT T0.X, T0.X, literal.x, PV.W,
1186; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1187; EG-NEXT:    -1(nan), 2(2.802597e-45)
1188;
1189; GFX10-LABEL: v_ctlz_i32_sel_ne_neg1:
1190; GFX10:       ; %bb.0:
1191; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1192; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1193; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1194; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1195; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1196; GFX10-NEXT:    s_waitcnt vmcnt(0)
1197; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1198; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1199; GFX10-NEXT:    s_endpgm
1200;
1201; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_neg1:
1202; GFX10-GISEL:       ; %bb.0:
1203; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1204; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1205; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1206; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1207; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1208; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1209; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
1210; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1211; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v1, vcc_lo
1212; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1213; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1214; GFX10-GISEL-NEXT:    s_endpgm
1215;
1216; GFX11-LABEL: v_ctlz_i32_sel_ne_neg1:
1217; GFX11:       ; %bb.0:
1218; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1219; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1220; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1221; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1222; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1223; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1224; GFX11-NEXT:    s_waitcnt vmcnt(0)
1225; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1226; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1227; GFX11-NEXT:    s_endpgm
1228  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1229  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1230  %val = load i32, ptr addrspace(1) %in.gep
1231  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1232  %cmp = icmp ne i32 %val, 0
1233  %sel = select i1 %cmp, i32 %ctlz, i32 -1
1234  store i32 %sel, ptr addrspace(1) %out
1235  ret void
1236}
1237
1238; TODO: Should be able to eliminate select here as well.
; Exercises `select (icmp eq (ctlz %val), 32), -1, (ctlz %val)`: here the
; compare is on the ctlz result against the bit width rather than on the input
; against zero. An i32 is loaded from %valptr at index workitem.id.x and passed
; to llvm.ctlz.i32 (second operand `i1 false`); the store goes to %out itself.
; Every check block below still expects the clamp to 32 followed by a compare
; and a select (v_min_u32 / v_cmp / v_cndmask on amdgcn, CNDE_INT / SETE_INT /
; CNDE_INT on EG), so the select is not eliminated on any checked target.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1239define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1240; SI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1241; SI:       ; %bb.0:
1242; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1243; SI-NEXT:    s_mov_b32 s7, 0xf000
1244; SI-NEXT:    s_mov_b32 s10, 0
1245; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1246; SI-NEXT:    v_mov_b32_e32 v1, 0
1247; SI-NEXT:    s_mov_b32 s11, s7
1248; SI-NEXT:    s_waitcnt lgkmcnt(0)
1249; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1250; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1251; SI-NEXT:    s_mov_b32 s6, -1
1252; SI-NEXT:    s_mov_b32 s4, s0
1253; SI-NEXT:    s_mov_b32 s5, s1
1254; SI-NEXT:    s_waitcnt vmcnt(0)
1255; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1256; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1257; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1258; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1259; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1260; SI-NEXT:    s_endpgm
1261;
1262; VI-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1263; VI:       ; %bb.0:
1264; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1265; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1266; VI-NEXT:    s_waitcnt lgkmcnt(0)
1267; VI-NEXT:    v_mov_b32_e32 v1, s3
1268; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1269; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1270; VI-NEXT:    flat_load_dword v0, v[0:1]
1271; VI-NEXT:    s_mov_b32 s3, 0xf000
1272; VI-NEXT:    s_mov_b32 s2, -1
1273; VI-NEXT:    s_waitcnt vmcnt(0)
1274; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1275; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1276; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1277; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1278; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1279; VI-NEXT:    s_endpgm
1280;
1281; EG-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1282; EG:       ; %bb.0:
1283; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1284; EG-NEXT:    TEX 0 @6
1285; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1286; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1287; EG-NEXT:    CF_END
1288; EG-NEXT:    PAD
1289; EG-NEXT:    Fetch clause starting at 6:
1290; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1291; EG-NEXT:    ALU clause starting at 8:
1292; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1293; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1294; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1295; EG-NEXT:    ALU clause starting at 11:
1296; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1297; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1298; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1299; EG-NEXT:     SETE_INT * T1.W, PV.W, literal.x,
1300; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1301; EG-NEXT:     CNDE_INT T0.X, PV.W, T0.W, literal.x,
1302; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1303; EG-NEXT:    -1(nan), 2(2.802597e-45)
1304;
1305; GFX10-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1306; GFX10:       ; %bb.0:
1307; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1308; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1309; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1310; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1311; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1312; GFX10-NEXT:    s_waitcnt vmcnt(0)
1313; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1314; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1315; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1316; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1317; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1318; GFX10-NEXT:    s_endpgm
1319;
1320; GFX10-GISEL-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1321; GFX10-GISEL:       ; %bb.0:
1322; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1323; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1324; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1325; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1326; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1327; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1328; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
1329; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1330; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 32, v0
1331; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, -1, vcc_lo
1332; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1333; GFX10-GISEL-NEXT:    s_endpgm
1334;
1335; GFX11-LABEL: v_ctlz_i32_sel_eq_bitwidth:
1336; GFX11:       ; %bb.0:
1337; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1338; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1339; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1340; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1341; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1342; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1343; GFX11-NEXT:    s_waitcnt vmcnt(0)
1344; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1345; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
1346; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1347; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1348; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1349; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1350; GFX11-NEXT:    s_endpgm
1351  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1352  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1353  %val = load i32, ptr addrspace(1) %in.gep
1354  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1355  %cmp = icmp eq i32 %ctlz, 32
1356  %sel = select i1 %cmp, i32 -1, i32 %ctlz
1357  store i32 %sel, ptr addrspace(1) %out
1358  ret void
1359}
1360
; Exercises `select (icmp ne (ctlz %val), 32), (ctlz %val), -1`, i.e. the
; "compare the ctlz result against the bit width" pattern written with an
; inverted compare and swapped select operands. An i32 is loaded from %valptr
; at index workitem.id.x and passed to llvm.ctlz.i32 (second operand
; `i1 false`); the store goes to %out itself.
; Every amdgcn check block below, including GFX10-GISEL, expects v_min_u32 with
; 32 followed by v_cmp_ne_u32 and v_cndmask_b32; the EG block expects CNDE_INT,
; SETNE_INT and a second CNDE_INT. The select is therefore not eliminated on
; any checked target.
; The assertions are autogenerated; regenerate them with
; utils/update_llc_test_checks.py instead of editing them by hand.
1361define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1362; SI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1363; SI:       ; %bb.0:
1364; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1365; SI-NEXT:    s_mov_b32 s7, 0xf000
1366; SI-NEXT:    s_mov_b32 s10, 0
1367; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1368; SI-NEXT:    v_mov_b32_e32 v1, 0
1369; SI-NEXT:    s_mov_b32 s11, s7
1370; SI-NEXT:    s_waitcnt lgkmcnt(0)
1371; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1372; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
1373; SI-NEXT:    s_mov_b32 s6, -1
1374; SI-NEXT:    s_mov_b32 s4, s0
1375; SI-NEXT:    s_mov_b32 s5, s1
1376; SI-NEXT:    s_waitcnt vmcnt(0)
1377; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1378; SI-NEXT:    v_min_u32_e32 v0, 32, v0
1379; SI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1380; SI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1381; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
1382; SI-NEXT:    s_endpgm
1383;
1384; VI-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1385; VI:       ; %bb.0:
1386; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1387; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1388; VI-NEXT:    s_waitcnt lgkmcnt(0)
1389; VI-NEXT:    v_mov_b32_e32 v1, s3
1390; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1391; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1392; VI-NEXT:    flat_load_dword v0, v[0:1]
1393; VI-NEXT:    s_mov_b32 s3, 0xf000
1394; VI-NEXT:    s_mov_b32 s2, -1
1395; VI-NEXT:    s_waitcnt vmcnt(0)
1396; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1397; VI-NEXT:    v_min_u32_e32 v0, 32, v0
1398; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 32, v0
1399; VI-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
1400; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
1401; VI-NEXT:    s_endpgm
1402;
1403; EG-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1404; EG:       ; %bb.0:
1405; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
1406; EG-NEXT:    TEX 0 @6
1407; EG-NEXT:    ALU 7, @11, KC0[CB0:0-32], KC1[]
1408; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
1409; EG-NEXT:    CF_END
1410; EG-NEXT:    PAD
1411; EG-NEXT:    Fetch clause starting at 6:
1412; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
1413; EG-NEXT:    ALU clause starting at 8:
1414; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
1415; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1416; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1417; EG-NEXT:    ALU clause starting at 11:
1418; EG-NEXT:     FFBH_UINT * T0.W, T0.X,
1419; EG-NEXT:     CNDE_INT * T0.W, T0.X, literal.x, PV.W,
1420; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1421; EG-NEXT:     SETNE_INT * T1.W, PV.W, literal.x,
1422; EG-NEXT:    32(4.484155e-44), 0(0.000000e+00)
1423; EG-NEXT:     CNDE_INT T0.X, PV.W, literal.x, T0.W,
1424; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.y,
1425; EG-NEXT:    -1(nan), 2(2.802597e-45)
1426;
1427; GFX10-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1428; GFX10:       ; %bb.0:
1429; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1430; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1431; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1432; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1433; GFX10-NEXT:    global_load_dword v0, v0, s[2:3]
1434; GFX10-NEXT:    s_waitcnt vmcnt(0)
1435; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1436; GFX10-NEXT:    v_min_u32_e32 v0, 32, v0
1437; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1438; GFX10-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1439; GFX10-NEXT:    global_store_dword v1, v0, s[0:1]
1440; GFX10-NEXT:    s_endpgm
1441;
1442; GFX10-GISEL-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1443; GFX10-GISEL:       ; %bb.0:
1444; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1445; GFX10-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1446; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1447; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1448; GFX10-GISEL-NEXT:    global_load_dword v0, v0, s[2:3]
1449; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1450; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v0, v0
1451; GFX10-GISEL-NEXT:    v_min_u32_e32 v0, 32, v0
1452; GFX10-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1453; GFX10-GISEL-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1454; GFX10-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
1455; GFX10-GISEL-NEXT:    s_endpgm
1456;
1457; GFX11-LABEL: v_ctlz_i32_sel_ne_bitwidth:
1458; GFX11:       ; %bb.0:
1459; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1460; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1461; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
1462; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
1463; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1464; GFX11-NEXT:    global_load_b32 v0, v0, s[2:3]
1465; GFX11-NEXT:    s_waitcnt vmcnt(0)
1466; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1467; GFX11-NEXT:    v_min_u32_e32 v0, 32, v0
1468; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1469; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 32, v0
1470; GFX11-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc_lo
1471; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
1472; GFX11-NEXT:    s_endpgm
1473  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1474  %in.gep = getelementptr i32, ptr addrspace(1) %valptr, i32 %tid
1475  %val = load i32, ptr addrspace(1) %in.gep
1476  %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
1477  %cmp = icmp ne i32 %ctlz, 32
1478  %sel = select i1 %cmp, i32 %ctlz, i32 -1
1479  store i32 %sel, ptr addrspace(1) %out
1480  ret void
1481}
1482
; Codegen test for the pattern "select (val == 0), -1, ctlz(val)" on i8.
; The IR at the bottom of this function loads one i8 from %valptr indexed by
; the workitem id, calls llvm.ctlz.i8 with its second operand set to false,
; and stores the selected i8 to %out.
;
; In the expected output below, the runs without -global-isel pass the loaded
; byte straight to the find-first-bit instruction and store the result, with no
; compare or select left. The -global-isel run keeps an explicit min / add /
; compare / cndmask sequence.
;
; NOTE(review): the -global-isel sequence adds 0xffe8 (-24 as a 16-bit value)
; after the 32-bit count, while the other amdgcn sequences show no comparable
; adjustment before the byte store. Presumably the loaded byte is
; zero-extended to 32 bits in both cases; confirm the unadjusted results are
; the intended output.
1483 define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1484; SI-LABEL: v_ctlz_i8_sel_eq_neg1:
1485; SI:       ; %bb.0:
1486; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1487; SI-NEXT:    s_mov_b32 s7, 0xf000
1488; SI-NEXT:    v_mov_b32_e32 v1, 0
1489; SI-NEXT:    s_mov_b32 s10, 0
1490; SI-NEXT:    s_mov_b32 s11, s7
1491; SI-NEXT:    s_waitcnt lgkmcnt(0)
1492; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1493; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1494; SI-NEXT:    s_mov_b32 s6, -1
1495; SI-NEXT:    s_mov_b32 s4, s0
1496; SI-NEXT:    s_mov_b32 s5, s1
1497; SI-NEXT:    s_waitcnt vmcnt(0)
1498; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1499; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1500; SI-NEXT:    s_endpgm
1501;
1502; VI-LABEL: v_ctlz_i8_sel_eq_neg1:
1503; VI:       ; %bb.0:
1504; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1505; VI-NEXT:    s_waitcnt lgkmcnt(0)
1506; VI-NEXT:    v_mov_b32_e32 v1, s3
1507; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1508; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1509; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1510; VI-NEXT:    s_mov_b32 s3, 0xf000
1511; VI-NEXT:    s_mov_b32 s2, -1
1512; VI-NEXT:    s_waitcnt vmcnt(0)
1513; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1514; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1515; VI-NEXT:    s_endpgm
1516;
1517; EG-LABEL: v_ctlz_i8_sel_eq_neg1:
1518; EG:       ; %bb.0:
1519; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1520; EG-NEXT:    TEX 0 @6
1521; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1522; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1523; EG-NEXT:    CF_END
1524; EG-NEXT:    PAD
1525; EG-NEXT:    Fetch clause starting at 6:
1526; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1527; EG-NEXT:    ALU clause starting at 8:
1528; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1529; EG-NEXT:    ALU clause starting at 9:
1530; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1531; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1532; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1533; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1534; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1535; EG-NEXT:    255(3.573311e-43), 3(4.203895e-45)
1536; EG-NEXT:     LSHL T0.X, PV.W, PS,
1537; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1538; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1539; EG-NEXT:     MOV T0.Y, 0.0,
1540; EG-NEXT:     MOV * T0.Z, 0.0,
1541; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1542; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1543;
1544; GFX10-LABEL: v_ctlz_i8_sel_eq_neg1:
1545; GFX10:       ; %bb.0:
1546; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1547; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1548; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1549; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1550; GFX10-NEXT:    s_waitcnt vmcnt(0)
1551; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1552; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1553; GFX10-NEXT:    s_endpgm
1554;
1555; GFX10-GISEL-LABEL: v_ctlz_i8_sel_eq_neg1:
1556; GFX10-GISEL:       ; %bb.0:
1557; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1558; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1559; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1560; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1561; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1562; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1563; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1564; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1565; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1566; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1567; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1568; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1569; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffe8, v1
1570; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0xffff, vcc_lo
1571; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1572; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1573; GFX10-GISEL-NEXT:    s_endpgm
1574;
1575; GFX11-LABEL: v_ctlz_i8_sel_eq_neg1:
1576; GFX11:       ; %bb.0:
1577; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1578; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1579; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1580; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
1581; GFX11-NEXT:    s_waitcnt vmcnt(0)
1582; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1583; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
1584; GFX11-NEXT:    s_endpgm
1585  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1586  %valptr.gep = getelementptr i8, ptr addrspace(1) %valptr, i32 %tid
1587  %val = load i8, ptr addrspace(1) %valptr.gep
1588  %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
1589  %cmp = icmp eq i8 %val, 0
1590  %sel = select i1 %cmp, i8 -1, i8 %ctlz
1591  store i8 %sel, ptr addrspace(1) %out
1592  ret void
1593}
1594
; Codegen test for the pattern "select (val == 0), -1, ctlz(val)" on i16.
; Unlike the i8 and i7 variants in this file, the IR at the bottom of this
; function loads directly from the %valptr kernel argument: there is no
; workitem-id call and no getelementptr, so every workitem reads the same
; address. The selected i16 is stored to %out.
;
; In the expected output below, the SI checks consist of a single v_ffbh_u32
; followed by the store. The VI, GFX10 and GFX11 checks instead keep the
; min(32) / add -16 / compare-with-zero / cndmask 0xffff sequence, and the
; -global-isel run additionally masks the adjusted count with 0xffff.
;
; NOTE(review): the SI sequence stores the 32-bit count with no -16
; adjustment, whereas the other amdgcn sequences apply one; confirm that the
; SI expectation is intended.
1595 define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1596; SI-LABEL: v_ctlz_i16_sel_eq_neg1:
1597; SI:       ; %bb.0:
1598; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1599; SI-NEXT:    s_mov_b32 s7, 0xf000
1600; SI-NEXT:    s_mov_b32 s6, -1
1601; SI-NEXT:    s_mov_b32 s10, s6
1602; SI-NEXT:    s_mov_b32 s11, s7
1603; SI-NEXT:    s_waitcnt lgkmcnt(0)
1604; SI-NEXT:    s_mov_b32 s8, s2
1605; SI-NEXT:    s_mov_b32 s9, s3
1606; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1607; SI-NEXT:    s_mov_b32 s4, s0
1608; SI-NEXT:    s_mov_b32 s5, s1
1609; SI-NEXT:    s_waitcnt vmcnt(0)
1610; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1611; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1612; SI-NEXT:    s_endpgm
1613;
1614; VI-LABEL: v_ctlz_i16_sel_eq_neg1:
1615; VI:       ; %bb.0:
1616; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1617; VI-NEXT:    s_mov_b32 s7, 0xf000
1618; VI-NEXT:    s_mov_b32 s6, -1
1619; VI-NEXT:    s_mov_b32 s10, s6
1620; VI-NEXT:    s_mov_b32 s11, s7
1621; VI-NEXT:    s_waitcnt lgkmcnt(0)
1622; VI-NEXT:    s_mov_b32 s8, s2
1623; VI-NEXT:    s_mov_b32 s9, s3
1624; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0
1625; VI-NEXT:    v_mov_b32_e32 v1, 0xffff
1626; VI-NEXT:    s_mov_b32 s4, s0
1627; VI-NEXT:    s_mov_b32 s5, s1
1628; VI-NEXT:    s_waitcnt vmcnt(0)
1629; VI-NEXT:    v_ffbh_u32_e32 v2, v0
1630; VI-NEXT:    v_min_u32_e32 v2, 32, v2
1631; VI-NEXT:    v_add_u32_e32 v2, vcc, -16, v2
1632; VI-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
1633; VI-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
1634; VI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1635; VI-NEXT:    s_endpgm
1636;
1637; EG-LABEL: v_ctlz_i16_sel_eq_neg1:
1638; EG:       ; %bb.0:
1639; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1640; EG-NEXT:    TEX 0 @6
1641; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1642; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1643; EG-NEXT:    CF_END
1644; EG-NEXT:    PAD
1645; EG-NEXT:    Fetch clause starting at 6:
1646; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1647; EG-NEXT:    ALU clause starting at 8:
1648; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1649; EG-NEXT:    ALU clause starting at 9:
1650; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1651; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1652; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1653; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1654; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1655; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1656; EG-NEXT:     LSHL T0.X, PV.W, PS,
1657; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1658; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1659; EG-NEXT:     MOV T0.Y, 0.0,
1660; EG-NEXT:     MOV * T0.Z, 0.0,
1661; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1662; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1663;
1664; GFX10-LABEL: v_ctlz_i16_sel_eq_neg1:
1665; GFX10:       ; %bb.0:
1666; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1667; GFX10-NEXT:    v_mov_b32_e32 v0, 0
1668; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1669; GFX10-NEXT:    global_load_ushort v1, v0, s[2:3]
1670; GFX10-NEXT:    s_waitcnt vmcnt(0)
1671; GFX10-NEXT:    v_ffbh_u32_e32 v2, v1
1672; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
1673; GFX10-NEXT:    v_min_u32_e32 v2, 32, v2
1674; GFX10-NEXT:    v_add_nc_u32_e32 v2, -16, v2
1675; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1676; GFX10-NEXT:    global_store_short v0, v1, s[0:1]
1677; GFX10-NEXT:    s_endpgm
1678;
1679; GFX10-GISEL-LABEL: v_ctlz_i16_sel_eq_neg1:
1680; GFX10-GISEL:       ; %bb.0:
1681; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1682; GFX10-GISEL-NEXT:    v_mov_b32_e32 v0, 0
1683; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1684; GFX10-GISEL-NEXT:    global_load_ushort v1, v0, s[2:3]
1685; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1686; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v2, v1
1687; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
1688; GFX10-GISEL-NEXT:    v_min_u32_e32 v2, 32, v2
1689; GFX10-GISEL-NEXT:    v_add_nc_u32_e32 v2, -16, v2
1690; GFX10-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
1691; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, 0xffff, vcc_lo
1692; GFX10-GISEL-NEXT:    global_store_short v0, v1, s[0:1]
1693; GFX10-GISEL-NEXT:    s_endpgm
1694;
1695; GFX11-LABEL: v_ctlz_i16_sel_eq_neg1:
1696; GFX11:       ; %bb.0:
1697; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1698; GFX11-NEXT:    v_mov_b32_e32 v0, 0
1699; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1700; GFX11-NEXT:    global_load_u16 v1, v0, s[2:3]
1701; GFX11-NEXT:    s_waitcnt vmcnt(0)
1702; GFX11-NEXT:    v_clz_i32_u32_e32 v2, v1
1703; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
1704; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
1705; GFX11-NEXT:    v_min_u32_e32 v2, 32, v2
1706; GFX11-NEXT:    v_add_nc_u32_e32 v2, -16, v2
1707; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1708; GFX11-NEXT:    v_cndmask_b32_e32 v1, 0xffff, v2, vcc_lo
1709; GFX11-NEXT:    global_store_b16 v0, v1, s[0:1]
1710; GFX11-NEXT:    s_endpgm
1711  %val = load i16, ptr addrspace(1) %valptr
1712  %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
1713  %cmp = icmp eq i16 %val, 0
1714  %sel = select i1 %cmp, i16 -1, i16 %ctlz
1715  store i16 %sel, ptr addrspace(1) %out
1716  ret void
1717}
1718
1719; FIXME: Need to handle non-uniform case for function below (load without gep).
; NOTE(review): the IR of the function below does index %valptr by the
; workitem id through a getelementptr, so "load without gep" does not describe
; it as written. The nearby function that loads straight from %valptr is
; v_ctlz_i16_sel_eq_neg1. Confirm whether this FIXME is stale or belongs there.
;
; Codegen test for the pattern "select (val == 0), -1, ctlz(val)" on the
; non-power-of-two type i7. Each workitem loads one i7 from %valptr indexed by
; its workitem id, and the selected i7 is stored to %out.
;
; In the expected output below, the runs without -global-isel reduce to the
; find-first-bit instruction followed by a 0x7f mask (127 in the r600 output).
; The -global-isel run masks the input with 0x7f first and keeps a
; min / add 0xffe7 / compare / cndmask sequence before the final 0x7f mask.
;
; NOTE(review): 0xffe7 is -25 as a 16-bit value; the other amdgcn sequences
; show no comparable adjustment of the 32-bit count. Confirm that the
; unadjusted results are the intended output.
1720define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %valptr) nounwind {
1721; SI-LABEL: v_ctlz_i7_sel_eq_neg1:
1722; SI:       ; %bb.0:
1723; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1724; SI-NEXT:    s_mov_b32 s7, 0xf000
1725; SI-NEXT:    v_mov_b32_e32 v1, 0
1726; SI-NEXT:    s_mov_b32 s10, 0
1727; SI-NEXT:    s_mov_b32 s11, s7
1728; SI-NEXT:    s_waitcnt lgkmcnt(0)
1729; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1730; SI-NEXT:    buffer_load_ubyte v0, v[0:1], s[8:11], 0 addr64
1731; SI-NEXT:    s_mov_b32 s6, -1
1732; SI-NEXT:    s_mov_b32 s4, s0
1733; SI-NEXT:    s_mov_b32 s5, s1
1734; SI-NEXT:    s_waitcnt vmcnt(0)
1735; SI-NEXT:    v_ffbh_u32_e32 v0, v0
1736; SI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1737; SI-NEXT:    buffer_store_byte v0, off, s[4:7], 0
1738; SI-NEXT:    s_endpgm
1739;
1740; VI-LABEL: v_ctlz_i7_sel_eq_neg1:
1741; VI:       ; %bb.0:
1742; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1743; VI-NEXT:    s_waitcnt lgkmcnt(0)
1744; VI-NEXT:    v_mov_b32_e32 v1, s3
1745; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1746; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1747; VI-NEXT:    flat_load_ubyte v0, v[0:1]
1748; VI-NEXT:    s_mov_b32 s3, 0xf000
1749; VI-NEXT:    s_mov_b32 s2, -1
1750; VI-NEXT:    s_waitcnt vmcnt(0)
1751; VI-NEXT:    v_ffbh_u32_e32 v0, v0
1752; VI-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1753; VI-NEXT:    buffer_store_byte v0, off, s[0:3], 0
1754; VI-NEXT:    s_endpgm
1755;
1756; EG-LABEL: v_ctlz_i7_sel_eq_neg1:
1757; EG:       ; %bb.0:
1758; EG-NEXT:    ALU 0, @8, KC0[CB0:0-32], KC1[]
1759; EG-NEXT:    TEX 0 @6
1760; EG-NEXT:    ALU 12, @9, KC0[CB0:0-32], KC1[]
1761; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1762; EG-NEXT:    CF_END
1763; EG-NEXT:    PAD
1764; EG-NEXT:    Fetch clause starting at 6:
1765; EG-NEXT:     VTX_READ_8 T0.X, T0.X, 0, #1
1766; EG-NEXT:    ALU clause starting at 8:
1767; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, T0.X,
1768; EG-NEXT:    ALU clause starting at 9:
1769; EG-NEXT:     FFBH_UINT T0.W, T0.X,
1770; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1771; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1772; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1773; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1774; EG-NEXT:    127(1.779649e-43), 3(4.203895e-45)
1775; EG-NEXT:     LSHL T0.X, PV.W, PS,
1776; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1777; EG-NEXT:    255(3.573311e-43), 0(0.000000e+00)
1778; EG-NEXT:     MOV T0.Y, 0.0,
1779; EG-NEXT:     MOV * T0.Z, 0.0,
1780; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1781; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1782;
1783; GFX10-LABEL: v_ctlz_i7_sel_eq_neg1:
1784; GFX10:       ; %bb.0:
1785; GFX10-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1786; GFX10-NEXT:    v_mov_b32_e32 v1, 0
1787; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
1788; GFX10-NEXT:    global_load_ubyte v0, v0, s[2:3]
1789; GFX10-NEXT:    s_waitcnt vmcnt(0)
1790; GFX10-NEXT:    v_ffbh_u32_e32 v0, v0
1791; GFX10-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1792; GFX10-NEXT:    global_store_byte v1, v0, s[0:1]
1793; GFX10-NEXT:    s_endpgm
1794;
1795; GFX10-GISEL-LABEL: v_ctlz_i7_sel_eq_neg1:
1796; GFX10-GISEL:       ; %bb.0:
1797; GFX10-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1798; GFX10-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
1799; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
1800; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, s2
1801; GFX10-GISEL-NEXT:    v_mov_b32_e32 v2, s3
1802; GFX10-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v1, v0
1803; GFX10-GISEL-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, v2, v3, vcc_lo
1804; GFX10-GISEL-NEXT:    global_load_ubyte v0, v[0:1], off
1805; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
1806; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1807; GFX10-GISEL-NEXT:    v_ffbh_u32_e32 v1, v0
1808; GFX10-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
1809; GFX10-GISEL-NEXT:    v_min_u32_e32 v1, 32, v1
1810; GFX10-GISEL-NEXT:    v_add_nc_u16 v1, 0xffe7, v1
1811; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, 0x7f, vcc_lo
1812; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
1813; GFX10-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1814; GFX10-GISEL-NEXT:    global_store_byte v1, v0, s[0:1]
1815; GFX10-GISEL-NEXT:    s_endpgm
1816;
1817; GFX11-LABEL: v_ctlz_i7_sel_eq_neg1:
1818; GFX11:       ; %bb.0:
1819; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
1820; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0
1821; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
1822; GFX11-NEXT:    global_load_u8 v0, v0, s[2:3]
1823; GFX11-NEXT:    s_waitcnt vmcnt(0)
1824; GFX11-NEXT:    v_clz_i32_u32_e32 v0, v0
1825; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
1826; GFX11-NEXT:    v_and_b32_e32 v0, 0x7f, v0
1827; GFX11-NEXT:    global_store_b8 v1, v0, s[0:1]
1828; GFX11-NEXT:    s_endpgm
1829  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1830  %valptr.gep = getelementptr i7, ptr addrspace(1) %valptr, i32 %tid
1831  %val = load i7, ptr addrspace(1) %valptr.gep
1832  %ctlz = call i7 @llvm.ctlz.i7(i7 %val, i1 false) nounwind readnone
1833  %cmp = icmp eq i7 %val, 0
1834  %sel = select i1 %cmp, i7 -1, i7 %ctlz
1835  store i7 %sel, ptr addrspace(1) %out
1836  ret void
1837}
1838