xref: /llvm-project/llvm/test/CodeGen/AMDGPU/ctpop16.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=VI %s
4; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=EG %s
5
; Population-count intrinsics for scalar i16 and for the <2/4/8/16 x i16>
; vector widths declared for the kernels in this file.
6declare i16 @llvm.ctpop.i16(i16) nounwind readnone
7declare <2 x i16> @llvm.ctpop.v2i16(<2 x i16>) nounwind readnone
8declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>) nounwind readnone
9declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>) nounwind readnone
10declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>) nounwind readnone
11
; Workitem id in the x dimension; the v_* kernels use it to index their input pointers.
12declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
13
; Popcount of an i16 that arrives directly as a kernel argument (%val); the
; i16 result is stored to %out. The two amdgcn check blocks below expect the
; argument to be masked to 16 bits (s_and_b32 ..., 0xffff) and counted with
; the scalar s_bcnt1_i32_b32 before being moved to a VGPR for the short store.
; The r600 check block expects BCNT_INT followed by a masked (MSKOR) store.
14define amdgpu_kernel void @s_ctpop_i16(ptr addrspace(1) noalias %out, i16 %val) nounwind {
15; SI-LABEL: s_ctpop_i16:
16; SI:       ; %bb.0:
17; SI-NEXT:    s_load_dword s6, s[4:5], 0xb
18; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
19; SI-NEXT:    s_mov_b32 s3, 0xf000
20; SI-NEXT:    s_mov_b32 s2, -1
21; SI-NEXT:    s_waitcnt lgkmcnt(0)
22; SI-NEXT:    s_and_b32 s4, s6, 0xffff
23; SI-NEXT:    s_bcnt1_i32_b32 s4, s4
24; SI-NEXT:    v_mov_b32_e32 v0, s4
25; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
26; SI-NEXT:    s_endpgm
27;
28; VI-LABEL: s_ctpop_i16:
29; VI:       ; %bb.0:
30; VI-NEXT:    s_load_dword s6, s[4:5], 0x2c
31; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
32; VI-NEXT:    s_mov_b32 s3, 0xf000
33; VI-NEXT:    s_mov_b32 s2, -1
34; VI-NEXT:    s_waitcnt lgkmcnt(0)
35; VI-NEXT:    s_and_b32 s4, s6, 0xffff
36; VI-NEXT:    s_bcnt1_i32_b32 s4, s4
37; VI-NEXT:    v_mov_b32_e32 v0, s4
38; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
39; VI-NEXT:    s_endpgm
40;
41; EG-LABEL: s_ctpop_i16:
42; EG:       ; %bb.0:
43; EG-NEXT:    ALU 0, @8, KC0[], KC1[]
44; EG-NEXT:    TEX 0 @6
45; EG-NEXT:    ALU 11, @9, KC0[CB0:0-32], KC1[]
46; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
47; EG-NEXT:    CF_END
48; EG-NEXT:    PAD
49; EG-NEXT:    Fetch clause starting at 6:
50; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 40, #3
51; EG-NEXT:    ALU clause starting at 8:
52; EG-NEXT:     MOV * T0.X, 0.0,
53; EG-NEXT:    ALU clause starting at 9:
54; EG-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
55; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
56; EG-NEXT:     BCNT_INT T1.W, T0.X,
57; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
58; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
59; EG-NEXT:     LSHL T0.X, PV.W, PS,
60; EG-NEXT:     LSHL * T0.W, literal.x, PS,
61; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
62; EG-NEXT:     MOV T0.Y, 0.0,
63; EG-NEXT:     MOV * T0.Z, 0.0,
64; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
65; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
66  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
67  store i16 %ctpop, ptr addrspace(1) %out, align 4
68  ret void
69}
70
71; XXX - Why 0 in register?
; Popcount of an i16 loaded from %in at the element selected by the workitem
; id; the i16 result is stored to %out. The amdgcn check blocks expect the
; vector v_bcnt_u32_b32 with an inline 0 as its last operand, applied directly
; to the ushort load result. The r600 check block expects BCNT_INT followed by
; a masked (MSKOR) store.
72define amdgpu_kernel void @v_ctpop_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
73; SI-LABEL: v_ctpop_i16:
74; SI:       ; %bb.0:
75; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
76; SI-NEXT:    s_mov_b32 s7, 0xf000
77; SI-NEXT:    s_mov_b32 s10, 0
78; SI-NEXT:    s_mov_b32 s11, s7
79; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
80; SI-NEXT:    s_waitcnt lgkmcnt(0)
81; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
82; SI-NEXT:    v_mov_b32_e32 v1, 0
83; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
84; SI-NEXT:    s_mov_b32 s6, -1
85; SI-NEXT:    s_mov_b32 s4, s0
86; SI-NEXT:    s_mov_b32 s5, s1
87; SI-NEXT:    s_waitcnt vmcnt(0)
88; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
89; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
90; SI-NEXT:    s_endpgm
91;
92; VI-LABEL: v_ctpop_i16:
93; VI:       ; %bb.0:
94; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
95; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
96; VI-NEXT:    s_waitcnt lgkmcnt(0)
97; VI-NEXT:    v_mov_b32_e32 v1, s3
98; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
99; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
100; VI-NEXT:    flat_load_ushort v0, v[0:1]
101; VI-NEXT:    s_mov_b32 s3, 0xf000
102; VI-NEXT:    s_mov_b32 s2, -1
103; VI-NEXT:    s_waitcnt vmcnt(0)
104; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
105; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
106; VI-NEXT:    s_endpgm
107;
108; EG-LABEL: v_ctpop_i16:
109; EG:       ; %bb.0:
110; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
111; EG-NEXT:    TEX 0 @6
112; EG-NEXT:    ALU 11, @10, KC0[CB0:0-32], KC1[]
113; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
114; EG-NEXT:    CF_END
115; EG-NEXT:    PAD
116; EG-NEXT:    Fetch clause starting at 6:
117; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
118; EG-NEXT:    ALU clause starting at 8:
119; EG-NEXT:     LSHL * T0.W, T0.X, 1,
120; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
121; EG-NEXT:    ALU clause starting at 10:
122; EG-NEXT:     AND_INT * T0.W, KC0[2].Y, literal.x,
123; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
124; EG-NEXT:     BCNT_INT T1.W, T0.X,
125; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
126; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
127; EG-NEXT:     LSHL T0.X, PV.W, PS,
128; EG-NEXT:     LSHL * T0.W, literal.x, PS,
129; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
130; EG-NEXT:     MOV T0.Y, 0.0,
131; EG-NEXT:     MOV * T0.Z, 0.0,
132; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
133; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
134  %tid = call i32 @llvm.amdgcn.workitem.id.x()
135  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
136  %val = load i16, ptr addrspace(1) %in.gep, align 4
137  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
138  store i16 %ctpop, ptr addrspace(1) %out, align 4
139  ret void
140}
141
; Sum of two popcounts: one i16 is loaded from %in0 and one from %in1 (both
; volatile, both indexed by the workitem id), each is counted, and the i16 sum
; is stored to %out. The amdgcn check blocks expect no separate add
; instruction: the first v_bcnt_u32_b32 uses 0 as its last operand and the
; second one takes the first count as its last operand. The volatile loads
; show up as glc loads, each followed by its own s_waitcnt. The r600 check
; block expects two BCNT_INT results combined with ADD_INT.
142define amdgpu_kernel void @v_ctpop_add_chain_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in0, ptr addrspace(1) noalias %in1) nounwind {
143; SI-LABEL: v_ctpop_add_chain_i16:
144; SI:       ; %bb.0:
145; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
146; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
147; SI-NEXT:    s_mov_b32 s11, 0xf000
148; SI-NEXT:    s_mov_b32 s14, 0
149; SI-NEXT:    s_mov_b32 s15, s11
150; SI-NEXT:    s_waitcnt lgkmcnt(0)
151; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
152; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
153; SI-NEXT:    v_mov_b32_e32 v1, 0
154; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
155; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64 glc
156; SI-NEXT:    s_waitcnt vmcnt(0)
157; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 glc
158; SI-NEXT:    s_waitcnt vmcnt(0)
159; SI-NEXT:    s_mov_b32 s10, -1
160; SI-NEXT:    s_mov_b32 s8, s0
161; SI-NEXT:    s_mov_b32 s9, s1
162; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
163; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v2, v0
164; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
165; SI-NEXT:    s_endpgm
166;
167; VI-LABEL: v_ctpop_add_chain_i16:
168; VI:       ; %bb.0:
169; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
170; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
171; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
172; VI-NEXT:    s_waitcnt lgkmcnt(0)
173; VI-NEXT:    v_mov_b32_e32 v1, s3
174; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
175; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
176; VI-NEXT:    v_mov_b32_e32 v3, s5
177; VI-NEXT:    v_add_u32_e32 v2, vcc, s4, v2
178; VI-NEXT:    v_addc_u32_e32 v3, vcc, 0, v3, vcc
179; VI-NEXT:    flat_load_ushort v0, v[0:1] glc
180; VI-NEXT:    s_waitcnt vmcnt(0)
181; VI-NEXT:    flat_load_ushort v1, v[2:3] glc
182; VI-NEXT:    s_waitcnt vmcnt(0)
183; VI-NEXT:    s_mov_b32 s3, 0xf000
184; VI-NEXT:    s_mov_b32 s2, -1
185; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
186; VI-NEXT:    v_bcnt_u32_b32 v0, v0, v1
187; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
188; VI-NEXT:    s_endpgm
189;
190; EG-LABEL: v_ctpop_add_chain_i16:
191; EG:       ; %bb.0:
192; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
193; EG-NEXT:    TEX 0 @8
194; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
195; EG-NEXT:    TEX 0 @10
196; EG-NEXT:    ALU 16, @15, KC0[CB0:0-32], KC1[]
197; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
198; EG-NEXT:    CF_END
199; EG-NEXT:    PAD
200; EG-NEXT:    Fetch clause starting at 8:
201; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
202; EG-NEXT:    Fetch clause starting at 10:
203; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
204; EG-NEXT:    ALU clause starting at 12:
205; EG-NEXT:     LSHL * T0.W, T0.X, 1,
206; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
207; EG-NEXT:    ALU clause starting at 14:
208; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, T0.W,
209; EG-NEXT:    ALU clause starting at 15:
210; EG-NEXT:     AND_INT T0.W, T0.X, literal.x,
211; EG-NEXT:     AND_INT * T1.W, T1.X, literal.x,
212; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
213; EG-NEXT:     BCNT_INT T0.Z, PS,
214; EG-NEXT:     BCNT_INT T0.W, PV.W,
215; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
216; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
217; EG-NEXT:     ADD_INT T0.W, PV.W, PV.Z,
218; EG-NEXT:     LSHL * T1.W, PS, literal.x,
219; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
220; EG-NEXT:     LSHL T0.X, PV.W, PS,
221; EG-NEXT:     LSHL * T0.W, literal.x, PS,
222; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
223; EG-NEXT:     MOV T0.Y, 0.0,
224; EG-NEXT:     MOV * T0.Z, 0.0,
225; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
226; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
227  %tid = call i32 @llvm.amdgcn.workitem.id.x()
228  %in0.gep = getelementptr i16, ptr addrspace(1) %in0, i32 %tid
229  %in1.gep = getelementptr i16, ptr addrspace(1) %in1, i32 %tid
230  %val0 = load volatile i16, ptr addrspace(1) %in0.gep, align 4
231  %val1 = load volatile i16, ptr addrspace(1) %in1.gep, align 4
232  %ctpop0 = call i16 @llvm.ctpop.i16(i16 %val0) nounwind readnone
233  %ctpop1 = call i16 @llvm.ctpop.i16(i16 %val1) nounwind readnone
234  %add = add i16 %ctpop0, %ctpop1
235  store i16 %add, ptr addrspace(1) %out, align 4
236  ret void
237}
238
; Popcount of an i16 loaded from %in (indexed by the workitem id) added to the
; i16 kernel argument %sval; the i16 sum is stored to %out. The amdgcn check
; blocks expect the SGPR holding the loaded third argument to be passed as the
; last operand of v_bcnt_u32_b32, with no separate add instruction. The r600
; check block expects BCNT_INT followed by ADD_INT and a 16-bit mask.
239define amdgpu_kernel void @v_ctpop_add_sgpr_i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %sval) nounwind {
240; SI-LABEL: v_ctpop_add_sgpr_i16:
241; SI:       ; %bb.0:
242; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
243; SI-NEXT:    s_load_dword s12, s[4:5], 0xd
244; SI-NEXT:    s_mov_b32 s7, 0xf000
245; SI-NEXT:    s_mov_b32 s10, 0
246; SI-NEXT:    s_mov_b32 s11, s7
247; SI-NEXT:    s_waitcnt lgkmcnt(0)
248; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
249; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
250; SI-NEXT:    v_mov_b32_e32 v1, 0
251; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
252; SI-NEXT:    s_mov_b32 s6, -1
253; SI-NEXT:    s_mov_b32 s4, s0
254; SI-NEXT:    s_mov_b32 s5, s1
255; SI-NEXT:    s_waitcnt vmcnt(0)
256; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s12
257; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
258; SI-NEXT:    s_endpgm
259;
260; VI-LABEL: v_ctpop_add_sgpr_i16:
261; VI:       ; %bb.0:
262; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
263; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
264; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
265; VI-NEXT:    s_waitcnt lgkmcnt(0)
266; VI-NEXT:    v_mov_b32_e32 v1, s3
267; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
268; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
269; VI-NEXT:    flat_load_ushort v0, v[0:1]
270; VI-NEXT:    s_mov_b32 s3, 0xf000
271; VI-NEXT:    s_mov_b32 s2, -1
272; VI-NEXT:    s_waitcnt vmcnt(0)
273; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
274; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
275; VI-NEXT:    s_endpgm
276;
277; EG-LABEL: v_ctpop_add_sgpr_i16:
278; EG:       ; %bb.0:
279; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
280; EG-NEXT:    TEX 0 @8
281; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
282; EG-NEXT:    TEX 0 @10
283; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
284; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
285; EG-NEXT:    CF_END
286; EG-NEXT:    PAD
287; EG-NEXT:    Fetch clause starting at 8:
288; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
289; EG-NEXT:    Fetch clause starting at 10:
290; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 44, #3
291; EG-NEXT:    ALU clause starting at 12:
292; EG-NEXT:     LSHL * T0.W, T0.X, 1,
293; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
294; EG-NEXT:    ALU clause starting at 14:
295; EG-NEXT:     MOV * T1.X, 0.0,
296; EG-NEXT:    ALU clause starting at 15:
297; EG-NEXT:     BCNT_INT T0.W, T0.X,
298; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
299; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
300; EG-NEXT:     ADD_INT * T0.W, PV.W, T1.X,
301; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
302; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
303; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
304; EG-NEXT:     LSHL T0.X, PV.W, PS,
305; EG-NEXT:     LSHL * T0.W, literal.x, PS,
306; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
307; EG-NEXT:     MOV T0.Y, 0.0,
308; EG-NEXT:     MOV * T0.Z, 0.0,
309; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
310; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
311  %tid = call i32 @llvm.amdgcn.workitem.id.x()
312  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
313  %val = load i16, ptr addrspace(1) %in.gep, align 4
314  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
315  %add = add i16 %ctpop, %sval
316  store i16 %add, ptr addrspace(1) %out, align 4
317  ret void
318}
319
; Per-element popcount of a <2 x i16> loaded from %in at the element selected
; by the workitem id; the vector result is stored to %out. The amdgcn check
; blocks expect one dword load, a split into the low half (and with 0xffff)
; and the high half (shift right by 16), one v_bcnt_u32_b32 per half, and a
; repack with a shift left by 16 plus an or before a single dword store. The
; r600 check block expects the same split/count/repack shape with BCNT_INT.
320define amdgpu_kernel void @v_ctpop_v2i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
321; SI-LABEL: v_ctpop_v2i16:
322; SI:       ; %bb.0:
323; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
324; SI-NEXT:    s_mov_b32 s7, 0xf000
325; SI-NEXT:    s_mov_b32 s10, 0
326; SI-NEXT:    s_mov_b32 s11, s7
327; SI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
328; SI-NEXT:    s_waitcnt lgkmcnt(0)
329; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
330; SI-NEXT:    v_mov_b32_e32 v1, 0
331; SI-NEXT:    buffer_load_dword v0, v[0:1], s[8:11], 0 addr64
332; SI-NEXT:    s_mov_b32 s6, -1
333; SI-NEXT:    s_mov_b32 s4, s0
334; SI-NEXT:    s_mov_b32 s5, s1
335; SI-NEXT:    s_waitcnt vmcnt(0)
336; SI-NEXT:    v_and_b32_e32 v1, 0xffff, v0
337; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
338; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
339; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
340; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
341; SI-NEXT:    v_or_b32_e32 v0, v1, v0
342; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
343; SI-NEXT:    s_endpgm
344;
345; VI-LABEL: v_ctpop_v2i16:
346; VI:       ; %bb.0:
347; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
348; VI-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
349; VI-NEXT:    s_waitcnt lgkmcnt(0)
350; VI-NEXT:    v_mov_b32_e32 v1, s3
351; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
352; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
353; VI-NEXT:    flat_load_dword v0, v[0:1]
354; VI-NEXT:    s_mov_b32 s3, 0xf000
355; VI-NEXT:    s_mov_b32 s2, -1
356; VI-NEXT:    s_waitcnt vmcnt(0)
357; VI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
358; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
359; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
360; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
361; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
362; VI-NEXT:    v_or_b32_e32 v0, v0, v1
363; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
364; VI-NEXT:    s_endpgm
365;
366; EG-LABEL: v_ctpop_v2i16:
367; EG:       ; %bb.0:
368; EG-NEXT:    ALU 2, @8, KC0[CB0:0-32], KC1[]
369; EG-NEXT:    TEX 0 @6
370; EG-NEXT:    ALU 10, @11, KC0[CB0:0-32], KC1[]
371; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.X, T6.X, 1
372; EG-NEXT:    CF_END
373; EG-NEXT:    PAD
374; EG-NEXT:    Fetch clause starting at 6:
375; EG-NEXT:     VTX_READ_32 T0.X, T0.X, 0, #1
376; EG-NEXT:    ALU clause starting at 8:
377; EG-NEXT:     LSHL * T0.W, T0.X, literal.x,
378; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
379; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
380; EG-NEXT:    ALU clause starting at 11:
381; EG-NEXT:     LSHR * T0.W, T0.X, literal.x,
382; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
383; EG-NEXT:     BCNT_INT T0.W, PV.W,
384; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
385; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
386; EG-NEXT:     BCNT_INT T1.W, PS,
387; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
388; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
389; EG-NEXT:     OR_INT T0.X, PV.W, PS,
390; EG-NEXT:     LSHR * T6.X, KC0[2].Y, literal.x,
391; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
392  %tid = call i32 @llvm.amdgcn.workitem.id.x()
393  %in.gep = getelementptr <2 x i16>, ptr addrspace(1) %in, i32 %tid
394  %val = load <2 x i16>, ptr addrspace(1) %in.gep, align 8
395  %ctpop = call <2 x i16> @llvm.ctpop.v2i16(<2 x i16> %val) nounwind readnone
396  store <2 x i16> %ctpop, ptr addrspace(1) %out, align 8
397  ret void
398}
399
; Per-element popcount of a <4 x i16> loaded from %in at the element selected
; by the workitem id; the vector result is stored to %out. The amdgcn check
; blocks expect one dwordx2 load, each dword split into low/high 16-bit
; halves, four v_bcnt_u32_b32 instructions, and a shift-left-16 plus or repack
; into two dwords for a single dwordx2 store. The r600 check block expects
; four BCNT_INT results merged into the output with AND_INT/OR_INT masking.
400define amdgpu_kernel void @v_ctpop_v4i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
401; SI-LABEL: v_ctpop_v4i16:
402; SI:       ; %bb.0:
403; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
404; SI-NEXT:    s_mov_b32 s7, 0xf000
405; SI-NEXT:    s_mov_b32 s10, 0
406; SI-NEXT:    s_mov_b32 s11, s7
407; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
408; SI-NEXT:    s_waitcnt lgkmcnt(0)
409; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
410; SI-NEXT:    v_mov_b32_e32 v1, 0
411; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
412; SI-NEXT:    s_mov_b32 s6, -1
413; SI-NEXT:    s_mov_b32 s4, s0
414; SI-NEXT:    s_mov_b32 s5, s1
415; SI-NEXT:    s_waitcnt vmcnt(0)
416; SI-NEXT:    v_and_b32_e32 v2, 0xffff, v0
417; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
418; SI-NEXT:    v_and_b32_e32 v3, 0xffff, v1
419; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
420; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
421; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
422; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
423; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
424; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
425; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
426; SI-NEXT:    v_or_b32_e32 v1, v3, v1
427; SI-NEXT:    v_or_b32_e32 v0, v2, v0
428; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
429; SI-NEXT:    s_endpgm
430;
431; VI-LABEL: v_ctpop_v4i16:
432; VI:       ; %bb.0:
433; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
434; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
435; VI-NEXT:    s_waitcnt lgkmcnt(0)
436; VI-NEXT:    v_mov_b32_e32 v1, s3
437; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
438; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
439; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
440; VI-NEXT:    s_mov_b32 s3, 0xf000
441; VI-NEXT:    s_mov_b32 s2, -1
442; VI-NEXT:    s_waitcnt vmcnt(0)
443; VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
444; VI-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
445; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
446; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
447; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
448; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
449; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
450; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
451; VI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
452; VI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
453; VI-NEXT:    v_or_b32_e32 v1, v1, v2
454; VI-NEXT:    v_or_b32_e32 v0, v0, v3
455; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
456; VI-NEXT:    s_endpgm
457;
458; EG-LABEL: v_ctpop_v4i16:
459; EG:       ; %bb.0:
460; EG-NEXT:    ALU 3, @8, KC0[CB0:0-32], KC1[]
461; EG-NEXT:    TEX 0 @6
462; EG-NEXT:    ALU 37, @12, KC0[CB0:0-32], KC1[]
463; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T8.XY, T0.X, 1
464; EG-NEXT:    CF_END
465; EG-NEXT:    PAD
466; EG-NEXT:    Fetch clause starting at 6:
467; EG-NEXT:     VTX_READ_64 T8.XY, T0.X, 0, #1
468; EG-NEXT:    ALU clause starting at 8:
469; EG-NEXT:     MOV T0.Y, T4.X,
470; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
471; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
472; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
473; EG-NEXT:    ALU clause starting at 12:
474; EG-NEXT:     AND_INT * T0.W, T8.X, literal.x,
475; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
476; EG-NEXT:     BCNT_INT T0.W, PV.W,
477; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.x,
478; EG-NEXT:    -65536(nan), 0(0.000000e+00)
479; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
480; EG-NEXT:     MOV * T4.X, PV.W,
481; EG-NEXT:     MOV T0.X, PV.X,
482; EG-NEXT:     LSHR * T0.W, T8.X, literal.x,
483; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
484; EG-NEXT:     BCNT_INT T0.W, PV.W,
485; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
486; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
487; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
488; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
489; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
490; EG-NEXT:     MOV T4.X, PV.W,
491; EG-NEXT:     MOV * T0.X, T5.X,
492; EG-NEXT:     AND_INT * T0.W, T8.Y, literal.x,
493; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
494; EG-NEXT:     BCNT_INT T0.W, PV.W,
495; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
496; EG-NEXT:    -65536(nan), 0(0.000000e+00)
497; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
498; EG-NEXT:     MOV * T5.X, PV.W,
499; EG-NEXT:     MOV T0.X, PV.X,
500; EG-NEXT:     LSHR * T0.W, T8.Y, literal.x,
501; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
502; EG-NEXT:     BCNT_INT T0.W, PV.W,
503; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
504; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
505; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
506; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
507; EG-NEXT:     LSHR T0.X, KC0[2].Y, literal.x,
508; EG-NEXT:     OR_INT * T8.Y, T1.W, PV.W,
509; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
510; EG-NEXT:     MOV T5.X, PV.Y,
511; EG-NEXT:     MOV * T8.X, T4.X,
512  %tid = call i32 @llvm.amdgcn.workitem.id.x()
513  %in.gep = getelementptr <4 x i16>, ptr addrspace(1) %in, i32 %tid
514  %val = load <4 x i16>, ptr addrspace(1) %in.gep, align 16
515  %ctpop = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %val) nounwind readnone
516  store <4 x i16> %ctpop, ptr addrspace(1) %out, align 16
517  ret void
518}
519
; Per-element popcount of an <8 x i16> loaded from %in at the element selected
; by the workitem id; the vector result is stored to %out. The amdgcn check
; blocks expect one dwordx4 load, each of the four dwords split into low/high
; 16-bit halves, eight v_bcnt_u32_b32 instructions, and a shift-left-16 plus
; or repack into four dwords for a single dwordx4 store. The r600 check block
; expects eight BCNT_INT results merged with AND_INT/OR_INT masking.
520define amdgpu_kernel void @v_ctpop_v8i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
521; SI-LABEL: v_ctpop_v8i16:
522; SI:       ; %bb.0:
523; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
524; SI-NEXT:    s_mov_b32 s3, 0xf000
525; SI-NEXT:    s_mov_b32 s10, 0
526; SI-NEXT:    s_mov_b32 s11, s3
527; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
528; SI-NEXT:    s_waitcnt lgkmcnt(0)
529; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
530; SI-NEXT:    v_mov_b32_e32 v1, 0
531; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
532; SI-NEXT:    s_mov_b32 s2, -1
533; SI-NEXT:    s_mov_b32 s0, s4
534; SI-NEXT:    s_mov_b32 s1, s5
535; SI-NEXT:    s_waitcnt vmcnt(0)
536; SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
537; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
538; SI-NEXT:    v_and_b32_e32 v5, 0xffff, v1
539; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
540; SI-NEXT:    v_and_b32_e32 v6, 0xffff, v2
541; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
542; SI-NEXT:    v_and_b32_e32 v7, 0xffff, v3
543; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
544; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
545; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
546; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
547; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
548; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
549; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
550; SI-NEXT:    v_bcnt_u32_b32_e64 v5, v5, 0
551; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
552; SI-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
553; SI-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
554; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
555; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
556; SI-NEXT:    v_or_b32_e32 v3, v7, v3
557; SI-NEXT:    v_or_b32_e32 v2, v6, v2
558; SI-NEXT:    v_or_b32_e32 v1, v5, v1
559; SI-NEXT:    v_or_b32_e32 v0, v4, v0
560; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
561; SI-NEXT:    s_endpgm
562;
563; VI-LABEL: v_ctpop_v8i16:
564; VI:       ; %bb.0:
565; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
566; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
567; VI-NEXT:    s_waitcnt lgkmcnt(0)
568; VI-NEXT:    v_mov_b32_e32 v1, s3
569; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
570; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
571; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
572; VI-NEXT:    s_mov_b32 s3, 0xf000
573; VI-NEXT:    s_mov_b32 s2, -1
574; VI-NEXT:    s_waitcnt vmcnt(0)
575; VI-NEXT:    v_lshrrev_b32_e32 v4, 16, v3
576; VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v2
577; VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
578; VI-NEXT:    v_lshrrev_b32_e32 v7, 16, v0
579; VI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
580; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
581; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
582; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
583; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
584; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
585; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
586; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
587; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
588; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
589; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
590; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
591; VI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
592; VI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
593; VI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
594; VI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
595; VI-NEXT:    v_or_b32_e32 v3, v3, v4
596; VI-NEXT:    v_or_b32_e32 v2, v2, v5
597; VI-NEXT:    v_or_b32_e32 v1, v1, v6
598; VI-NEXT:    v_or_b32_e32 v0, v0, v7
599; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
600; VI-NEXT:    s_endpgm
601;
602; EG-LABEL: v_ctpop_v8i16:
603; EG:       ; %bb.0:
604; EG-NEXT:    ALU 3, @8, KC0[CB0:0-32], KC1[]
605; EG-NEXT:    TEX 0 @6
606; EG-NEXT:    ALU 73, @12, KC0[CB0:0-32], KC1[]
607; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T12.X, 1
608; EG-NEXT:    CF_END
609; EG-NEXT:    PAD
610; EG-NEXT:    Fetch clause starting at 6:
611; EG-NEXT:     VTX_READ_128 T12.XYZW, T0.X, 0, #1
612; EG-NEXT:    ALU clause starting at 8:
613; EG-NEXT:     MOV T0.Y, T4.X,
614; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
615; EG-NEXT:    4(5.605194e-45), 0(0.000000e+00)
616; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
617; EG-NEXT:    ALU clause starting at 12:
618; EG-NEXT:     LSHR * T0.W, T12.X, literal.x,
619; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
620; EG-NEXT:     BCNT_INT * T0.W, PV.W,
621; EG-NEXT:     LSHL T0.W, PV.W, literal.x,
622; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
623; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
624; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
625; EG-NEXT:     MOV * T4.X, PV.W,
626; EG-NEXT:     MOV T0.X, PV.X,
627; EG-NEXT:     AND_INT * T0.W, T12.X, literal.x,
628; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
629; EG-NEXT:     BCNT_INT T0.W, PV.W,
630; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
631; EG-NEXT:    -65536(nan), 0(0.000000e+00)
632; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
633; EG-NEXT:     MOV T4.X, PV.W,
634; EG-NEXT:     MOV * T0.X, T5.X,
635; EG-NEXT:     LSHR * T0.W, T12.Y, literal.x,
636; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
637; EG-NEXT:     BCNT_INT T0.W, PV.W,
638; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
639; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
640; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
641; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
642; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
643; EG-NEXT:     MOV * T5.X, PV.W,
644; EG-NEXT:     MOV T0.X, PV.X,
645; EG-NEXT:     AND_INT * T0.W, T12.Y, literal.x,
646; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
647; EG-NEXT:     BCNT_INT T0.W, PV.W,
648; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
649; EG-NEXT:    -65536(nan), 0(0.000000e+00)
650; EG-NEXT:     OR_INT * T0.Y, PS, PV.W,
651; EG-NEXT:     MOV T5.X, PV.Y,
652; EG-NEXT:     MOV * T0.X, T8.X,
653; EG-NEXT:     LSHR * T0.W, T12.Z, literal.x,
654; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
655; EG-NEXT:     BCNT_INT T0.W, PV.W,
656; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
657; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
658; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
659; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
660; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
661; EG-NEXT:     MOV * T8.X, PV.W,
662; EG-NEXT:     MOV T0.X, PV.X,
663; EG-NEXT:     AND_INT * T0.W, T12.Z, literal.x,
664; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
665; EG-NEXT:     BCNT_INT T0.W, PV.W,
666; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
667; EG-NEXT:    -65536(nan), 0(0.000000e+00)
668; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
669; EG-NEXT:     MOV T8.X, PV.W,
670; EG-NEXT:     MOV * T0.X, T9.X,
671; EG-NEXT:     LSHR * T0.W, T12.W, literal.x,
672; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
673; EG-NEXT:     BCNT_INT T0.W, PV.W,
674; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
675; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
676; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
677; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
678; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
679; EG-NEXT:     MOV * T9.X, PV.W,
680; EG-NEXT:     MOV T0.X, PV.X,
681; EG-NEXT:     AND_INT * T0.W, T12.W, literal.x,
682; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
683; EG-NEXT:     BCNT_INT T0.W, PV.W,
684; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
685; EG-NEXT:    -65536(nan), 0(0.000000e+00)
686; EG-NEXT:     LSHR T12.X, KC0[2].Y, literal.x,
687; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
688; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
689; EG-NEXT:     MOV T9.X, PV.W,
690; EG-NEXT:     MOV * T0.X, T4.X,
691; EG-NEXT:     MOV * T0.Z, T8.X,
692  %tid = call i32 @llvm.amdgcn.workitem.id.x()
693  %in.gep = getelementptr <8 x i16>, ptr addrspace(1) %in, i32 %tid
694  %val = load <8 x i16>, ptr addrspace(1) %in.gep, align 32
695  %ctpop = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %val) nounwind readnone
696  store <8 x i16> %ctpop, ptr addrspace(1) %out, align 32
697  ret void
698}
699
700define amdgpu_kernel void @v_ctpop_v16i16(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
701; SI-LABEL: v_ctpop_v16i16:
702; SI:       ; %bb.0:
703; SI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
704; SI-NEXT:    s_mov_b32 s3, 0xf000
705; SI-NEXT:    s_mov_b32 s10, 0
706; SI-NEXT:    s_mov_b32 s11, s3
707; SI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
708; SI-NEXT:    s_waitcnt lgkmcnt(0)
709; SI-NEXT:    s_mov_b64 s[8:9], s[6:7]
710; SI-NEXT:    v_mov_b32_e32 v5, 0
711; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64 offset:16
712; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[8:11], 0 addr64
713; SI-NEXT:    s_mov_b32 s2, -1
714; SI-NEXT:    s_mov_b32 s0, s4
715; SI-NEXT:    s_mov_b32 s1, s5
716; SI-NEXT:    s_waitcnt vmcnt(1)
717; SI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
718; SI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
719; SI-NEXT:    v_and_b32_e32 v9, 0xffff, v1
720; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
721; SI-NEXT:    v_and_b32_e32 v10, 0xffff, v2
722; SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
723; SI-NEXT:    v_and_b32_e32 v11, 0xffff, v3
724; SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
725; SI-NEXT:    s_waitcnt vmcnt(0)
726; SI-NEXT:    v_and_b32_e32 v12, 0xffff, v4
727; SI-NEXT:    v_lshrrev_b32_e32 v4, 16, v4
728; SI-NEXT:    v_and_b32_e32 v13, 0xffff, v5
729; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v5
730; SI-NEXT:    v_and_b32_e32 v14, 0xffff, v6
731; SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v6
732; SI-NEXT:    v_and_b32_e32 v15, 0xffff, v7
733; SI-NEXT:    v_lshrrev_b32_e32 v7, 16, v7
734; SI-NEXT:    v_bcnt_u32_b32_e64 v7, v7, 0
735; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
736; SI-NEXT:    v_bcnt_u32_b32_e64 v5, v5, 0
737; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
738; SI-NEXT:    v_bcnt_u32_b32_e64 v3, v3, 0
739; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
740; SI-NEXT:    v_bcnt_u32_b32_e64 v1, v1, 0
741; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
742; SI-NEXT:    v_bcnt_u32_b32_e64 v15, v15, 0
743; SI-NEXT:    v_bcnt_u32_b32_e64 v14, v14, 0
744; SI-NEXT:    v_bcnt_u32_b32_e64 v13, v13, 0
745; SI-NEXT:    v_bcnt_u32_b32_e64 v12, v12, 0
746; SI-NEXT:    v_bcnt_u32_b32_e64 v11, v11, 0
747; SI-NEXT:    v_bcnt_u32_b32_e64 v10, v10, 0
748; SI-NEXT:    v_bcnt_u32_b32_e64 v9, v9, 0
749; SI-NEXT:    v_bcnt_u32_b32_e64 v8, v8, 0
750; SI-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
751; SI-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
752; SI-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
753; SI-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
754; SI-NEXT:    v_lshlrev_b32_e32 v16, 16, v3
755; SI-NEXT:    v_lshlrev_b32_e32 v17, 16, v2
756; SI-NEXT:    v_lshlrev_b32_e32 v18, 16, v1
757; SI-NEXT:    v_lshlrev_b32_e32 v19, 16, v0
758; SI-NEXT:    v_or_b32_e32 v3, v15, v7
759; SI-NEXT:    v_or_b32_e32 v2, v14, v6
760; SI-NEXT:    v_or_b32_e32 v1, v13, v5
761; SI-NEXT:    v_or_b32_e32 v0, v12, v4
762; SI-NEXT:    v_or_b32_e32 v7, v11, v16
763; SI-NEXT:    v_or_b32_e32 v6, v10, v17
764; SI-NEXT:    v_or_b32_e32 v5, v9, v18
765; SI-NEXT:    v_or_b32_e32 v4, v8, v19
766; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
767; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
768; SI-NEXT:    s_endpgm
769;
770; VI-LABEL: v_ctpop_v16i16:
771; VI:       ; %bb.0:
772; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
773; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
774; VI-NEXT:    s_waitcnt lgkmcnt(0)
775; VI-NEXT:    v_mov_b32_e32 v1, s3
776; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
777; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
778; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
779; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
780; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
781; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
782; VI-NEXT:    s_mov_b32 s3, 0xf000
783; VI-NEXT:    s_mov_b32 s2, -1
784; VI-NEXT:    s_waitcnt vmcnt(1)
785; VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v3
786; VI-NEXT:    v_lshrrev_b32_e32 v9, 16, v2
787; VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
788; VI-NEXT:    v_lshrrev_b32_e32 v11, 16, v0
789; VI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
790; VI-NEXT:    v_and_b32_e32 v2, 0xffff, v2
791; VI-NEXT:    v_and_b32_e32 v1, 0xffff, v1
792; VI-NEXT:    v_and_b32_e32 v0, 0xffff, v0
793; VI-NEXT:    s_waitcnt vmcnt(0)
794; VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v7
795; VI-NEXT:    v_lshrrev_b32_e32 v13, 16, v6
796; VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v5
797; VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v4
798; VI-NEXT:    v_bcnt_u32_b32 v8, v8, 0
799; VI-NEXT:    v_bcnt_u32_b32 v9, v9, 0
800; VI-NEXT:    v_bcnt_u32_b32 v10, v10, 0
801; VI-NEXT:    v_bcnt_u32_b32 v11, v11, 0
802; VI-NEXT:    v_and_b32_e32 v7, 0xffff, v7
803; VI-NEXT:    v_and_b32_e32 v6, 0xffff, v6
804; VI-NEXT:    v_and_b32_e32 v5, 0xffff, v5
805; VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
806; VI-NEXT:    v_bcnt_u32_b32 v3, v3, 0
807; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
808; VI-NEXT:    v_bcnt_u32_b32 v1, v1, 0
809; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
810; VI-NEXT:    v_bcnt_u32_b32 v12, v12, 0
811; VI-NEXT:    v_bcnt_u32_b32 v13, v13, 0
812; VI-NEXT:    v_bcnt_u32_b32 v14, v14, 0
813; VI-NEXT:    v_bcnt_u32_b32 v15, v15, 0
814; VI-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
815; VI-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
816; VI-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
817; VI-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
818; VI-NEXT:    v_bcnt_u32_b32 v7, v7, 0
819; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
820; VI-NEXT:    v_bcnt_u32_b32 v5, v5, 0
821; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
822; VI-NEXT:    v_lshlrev_b32_e32 v12, 16, v12
823; VI-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
824; VI-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
825; VI-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
826; VI-NEXT:    v_or_b32_e32 v3, v3, v8
827; VI-NEXT:    v_or_b32_e32 v2, v2, v9
828; VI-NEXT:    v_or_b32_e32 v1, v1, v10
829; VI-NEXT:    v_or_b32_e32 v0, v0, v11
830; VI-NEXT:    v_or_b32_e32 v7, v7, v12
831; VI-NEXT:    v_or_b32_e32 v6, v6, v13
832; VI-NEXT:    v_or_b32_e32 v5, v5, v14
833; VI-NEXT:    v_or_b32_e32 v4, v4, v15
834; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
835; VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
836; VI-NEXT:    s_endpgm
837;
838; EG-LABEL: v_ctpop_v16i16:
839; EG:       ; %bb.0:
840; EG-NEXT:    ALU 3, @12, KC0[CB0:0-32], KC1[]
841; EG-NEXT:    TEX 1 @8
842; EG-NEXT:    ALU 114, @16, KC0[], KC1[]
843; EG-NEXT:    ALU 34, @131, KC0[CB0:0-32], KC1[]
844; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T22.X, 0
845; EG-NEXT:    MEM_RAT_CACHELESS STORE_RAW T20.XYZW, T21.X, 1
846; EG-NEXT:    CF_END
847; EG-NEXT:    PAD
848; EG-NEXT:    Fetch clause starting at 8:
849; EG-NEXT:     VTX_READ_128 T20.XYZW, T0.X, 16, #1
850; EG-NEXT:     VTX_READ_128 T21.XYZW, T0.X, 0, #1
851; EG-NEXT:    ALU clause starting at 12:
852; EG-NEXT:     MOV T0.Y, T4.X,
853; EG-NEXT:     LSHL * T0.W, T0.X, literal.x, BS:VEC_120/SCL_212
854; EG-NEXT:    5(7.006492e-45), 0(0.000000e+00)
855; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
856; EG-NEXT:    ALU clause starting at 16:
857; EG-NEXT:     LSHR * T0.W, T20.X, literal.x,
858; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
859; EG-NEXT:     BCNT_INT * T0.W, PV.W,
860; EG-NEXT:     LSHL T0.W, PV.W, literal.x,
861; EG-NEXT:     AND_INT * T1.W, T0.Y, literal.y,
862; EG-NEXT:    16(2.242078e-44), 65535(9.183409e-41)
863; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
864; EG-NEXT:     MOV * T4.X, PV.W,
865; EG-NEXT:     MOV T0.X, PV.X,
866; EG-NEXT:     AND_INT * T0.W, T20.X, literal.x,
867; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
868; EG-NEXT:     BCNT_INT T0.W, PV.W,
869; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
870; EG-NEXT:    -65536(nan), 0(0.000000e+00)
871; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
872; EG-NEXT:     MOV T4.X, PV.W,
873; EG-NEXT:     MOV * T0.X, T5.X,
874; EG-NEXT:     LSHR * T0.W, T20.Y, literal.x,
875; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
876; EG-NEXT:     BCNT_INT T0.W, PV.W,
877; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
878; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
879; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
880; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
881; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
882; EG-NEXT:     MOV * T5.X, PV.W,
883; EG-NEXT:     MOV T0.X, PV.X,
884; EG-NEXT:     AND_INT * T0.W, T20.Y, literal.x,
885; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
886; EG-NEXT:     BCNT_INT T0.W, PV.W,
887; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
888; EG-NEXT:    -65536(nan), 0(0.000000e+00)
889; EG-NEXT:     OR_INT * T0.Y, PS, PV.W,
890; EG-NEXT:     MOV T5.X, PV.Y,
891; EG-NEXT:     MOV * T0.X, T8.X,
892; EG-NEXT:     LSHR * T0.W, T20.Z, literal.x,
893; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
894; EG-NEXT:     BCNT_INT T0.W, PV.W,
895; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
896; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
897; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
898; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
899; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
900; EG-NEXT:     MOV * T8.X, PV.W,
901; EG-NEXT:     MOV T0.X, PV.X,
902; EG-NEXT:     AND_INT * T0.W, T20.Z, literal.x,
903; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
904; EG-NEXT:     BCNT_INT T0.W, PV.W,
905; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
906; EG-NEXT:    -65536(nan), 0(0.000000e+00)
907; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
908; EG-NEXT:     MOV T8.X, PV.W,
909; EG-NEXT:     MOV * T0.X, T9.X,
910; EG-NEXT:     LSHR * T0.W, T20.W, literal.x,
911; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
912; EG-NEXT:     BCNT_INT T0.W, PV.W,
913; EG-NEXT:     AND_INT * T1.W, T0.X, literal.x,
914; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
915; EG-NEXT:     LSHL * T0.W, PV.W, literal.x,
916; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
917; EG-NEXT:     OR_INT * T0.W, T1.W, PV.W,
918; EG-NEXT:     MOV * T9.X, PV.W,
919; EG-NEXT:     MOV T0.X, PV.X,
920; EG-NEXT:     AND_INT * T0.W, T20.W, literal.x,
921; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
922; EG-NEXT:     BCNT_INT T0.W, PV.W,
923; EG-NEXT:     AND_INT * T1.W, PV.X, literal.x,
924; EG-NEXT:    -65536(nan), 0(0.000000e+00)
925; EG-NEXT:     OR_INT * T0.W, PS, PV.W,
926; EG-NEXT:     MOV T9.X, PV.W,
927; EG-NEXT:     MOV * T0.X, T12.X,
928; EG-NEXT:     LSHR * T1.W, T21.X, literal.x,
929; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
930; EG-NEXT:     BCNT_INT T1.W, PV.W,
931; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
932; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
933; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
934; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
935; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
936; EG-NEXT:     MOV * T12.X, PV.W,
937; EG-NEXT:     MOV T0.X, PV.X,
938; EG-NEXT:     AND_INT * T1.W, T21.X, literal.x,
939; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
940; EG-NEXT:     BCNT_INT T1.W, PV.W,
941; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
942; EG-NEXT:    -65536(nan), 0(0.000000e+00)
943; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
944; EG-NEXT:     MOV T12.X, PV.W,
945; EG-NEXT:     MOV * T0.X, T13.X,
946; EG-NEXT:     LSHR * T1.W, T21.Y, literal.x,
947; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
948; EG-NEXT:     BCNT_INT T1.W, PV.W,
949; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
950; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
951; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
952; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
953; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
954; EG-NEXT:     MOV * T13.X, PV.W,
955; EG-NEXT:     MOV T0.X, PV.X,
956; EG-NEXT:     AND_INT * T1.W, T21.Y, literal.x,
957; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
958; EG-NEXT:     BCNT_INT T1.W, PV.W,
959; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
960; EG-NEXT:    -65536(nan), 0(0.000000e+00)
961; EG-NEXT:     OR_INT * T20.Y, PS, PV.W,
962; EG-NEXT:     MOV T13.X, PV.Y,
963; EG-NEXT:     MOV * T0.X, T16.X,
964; EG-NEXT:     LSHR * T1.W, T21.Z, literal.x,
965; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
966; EG-NEXT:     BCNT_INT T1.W, PV.W,
967; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
968; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
969; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
970; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
971; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
972; EG-NEXT:    ALU clause starting at 131:
973; EG-NEXT:     MOV * T16.X, T1.W,
974; EG-NEXT:     MOV T0.X, PV.X,
975; EG-NEXT:     AND_INT * T1.W, T21.Z, literal.x,
976; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
977; EG-NEXT:     BCNT_INT T1.W, PV.W,
978; EG-NEXT:     AND_INT * T2.W, PV.X, literal.x,
979; EG-NEXT:    -65536(nan), 0(0.000000e+00)
980; EG-NEXT:     OR_INT * T1.W, PS, PV.W,
981; EG-NEXT:     MOV T16.X, PV.W,
982; EG-NEXT:     MOV * T0.X, T17.X,
983; EG-NEXT:     LSHR * T1.W, T21.W, literal.x,
984; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
985; EG-NEXT:     BCNT_INT T1.W, PV.W,
986; EG-NEXT:     AND_INT * T2.W, T0.X, literal.x,
987; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
988; EG-NEXT:     LSHL * T1.W, PV.W, literal.x,
989; EG-NEXT:    16(2.242078e-44), 0(0.000000e+00)
990; EG-NEXT:     OR_INT * T1.W, T2.W, PV.W,
991; EG-NEXT:     MOV * T17.X, PV.W,
992; EG-NEXT:     MOV T0.X, PV.X,
993; EG-NEXT:     AND_INT T1.W, T21.W, literal.x,
994; EG-NEXT:     LSHR * T21.X, KC0[2].Y, literal.y,
995; EG-NEXT:    65535(9.183409e-41), 2(2.802597e-45)
996; EG-NEXT:     AND_INT T0.Z, PV.X, literal.x,
997; EG-NEXT:     BCNT_INT T1.W, PV.W,
998; EG-NEXT:     ADD_INT * T2.W, KC0[2].Y, literal.y,
999; EG-NEXT:    -65536(nan), 16(2.242078e-44)
1000; EG-NEXT:     LSHR T22.X, PS, literal.x,
1001; EG-NEXT:     OR_INT * T20.W, PV.Z, PV.W,
1002; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1003; EG-NEXT:     MOV T17.X, PV.W,
1004; EG-NEXT:     MOV * T0.X, T4.X,
1005; EG-NEXT:     MOV * T0.Z, T8.X,
1006; EG-NEXT:     MOV T20.X, T12.X,
1007; EG-NEXT:     MOV * T20.Z, T16.X, BS:VEC_120/SCL_212
1008  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1009  %in.gep = getelementptr <16 x i16>, ptr addrspace(1) %in, i32 %tid
1010  %val = load <16 x i16>, ptr addrspace(1) %in.gep, align 32
1011  %ctpop = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %val) nounwind readnone
1012  store <16 x i16> %ctpop, ptr addrspace(1) %out, align 32
1013  ret void
1014}
1015
; Loads an i16 from %in indexed by the workitem id, takes its ctpop, adds the
; constant 4 and stores the i16 result to %out.
; The SI/VI checks expect the 4 to appear directly as the second source
; operand of v_bcnt_u32_b32, with no separate add instruction. The EG checks
; expect BCNT_INT followed by an ADD_INT with the literal 4.
1016define amdgpu_kernel void @v_ctpop_i16_add_inline_constant(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1017; SI-LABEL: v_ctpop_i16_add_inline_constant:
1018; SI:       ; %bb.0:
1019; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1020; SI-NEXT:    s_mov_b32 s7, 0xf000
1021; SI-NEXT:    s_mov_b32 s10, 0
1022; SI-NEXT:    s_mov_b32 s11, s7
1023; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1024; SI-NEXT:    s_waitcnt lgkmcnt(0)
1025; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1026; SI-NEXT:    v_mov_b32_e32 v1, 0
1027; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
1028; SI-NEXT:    s_mov_b32 s6, -1
1029; SI-NEXT:    s_mov_b32 s4, s0
1030; SI-NEXT:    s_mov_b32 s5, s1
1031; SI-NEXT:    s_waitcnt vmcnt(0)
1032; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 4
1033; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1034; SI-NEXT:    s_endpgm
1035;
1036; VI-LABEL: v_ctpop_i16_add_inline_constant:
1037; VI:       ; %bb.0:
1038; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1039; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1040; VI-NEXT:    s_waitcnt lgkmcnt(0)
1041; VI-NEXT:    v_mov_b32_e32 v1, s3
1042; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1043; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1044; VI-NEXT:    flat_load_ushort v0, v[0:1]
1045; VI-NEXT:    s_mov_b32 s3, 0xf000
1046; VI-NEXT:    s_mov_b32 s2, -1
1047; VI-NEXT:    s_waitcnt vmcnt(0)
1048; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
1049; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1050; VI-NEXT:    s_endpgm
1051;
1052; EG-LABEL: v_ctpop_i16_add_inline_constant:
1053; EG:       ; %bb.0:
1054; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
1055; EG-NEXT:    TEX 0 @6
1056; EG-NEXT:    ALU 12, @10, KC0[CB0:0-32], KC1[]
1057; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1058; EG-NEXT:    CF_END
1059; EG-NEXT:    PAD
1060; EG-NEXT:    Fetch clause starting at 6:
1061; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1062; EG-NEXT:    ALU clause starting at 8:
1063; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1064; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1065; EG-NEXT:    ALU clause starting at 10:
1066; EG-NEXT:     BCNT_INT T0.W, T0.X,
1067; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1068; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1069; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
1070; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1071; EG-NEXT:    4(5.605194e-45), 3(4.203895e-45)
1072; EG-NEXT:     LSHL T0.X, PV.W, PS,
1073; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1074; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1075; EG-NEXT:     MOV T0.Y, 0.0,
1076; EG-NEXT:     MOV * T0.Z, 0.0,
1077; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1078; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1079  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1080  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
1081  %val = load i16, ptr addrspace(1) %in.gep, align 4
1082  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1083  %add = add i16 %ctpop, 4
1084  store i16 %add, ptr addrspace(1) %out, align 4
1085  ret void
1086}
1087
; Commuted form of the ctpop-plus-4 pattern: the add is written as
; `add i16 4, %ctpop`, with the constant as the first operand.
; The SI/VI checks still expect a single v_bcnt_u32_b32 with 4 as its second
; source operand and no separate add. The EG checks expect BCNT_INT followed
; by an ADD_INT with the literal 4.
1088define amdgpu_kernel void @v_ctpop_i16_add_inline_constant_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1089; SI-LABEL: v_ctpop_i16_add_inline_constant_inv:
1090; SI:       ; %bb.0:
1091; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1092; SI-NEXT:    s_mov_b32 s7, 0xf000
1093; SI-NEXT:    s_mov_b32 s10, 0
1094; SI-NEXT:    s_mov_b32 s11, s7
1095; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1096; SI-NEXT:    s_waitcnt lgkmcnt(0)
1097; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1098; SI-NEXT:    v_mov_b32_e32 v1, 0
1099; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
1100; SI-NEXT:    s_mov_b32 s6, -1
1101; SI-NEXT:    s_mov_b32 s4, s0
1102; SI-NEXT:    s_mov_b32 s5, s1
1103; SI-NEXT:    s_waitcnt vmcnt(0)
1104; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 4
1105; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1106; SI-NEXT:    s_endpgm
1107;
1108; VI-LABEL: v_ctpop_i16_add_inline_constant_inv:
1109; VI:       ; %bb.0:
1110; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1111; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1112; VI-NEXT:    s_waitcnt lgkmcnt(0)
1113; VI-NEXT:    v_mov_b32_e32 v1, s3
1114; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1115; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1116; VI-NEXT:    flat_load_ushort v0, v[0:1]
1117; VI-NEXT:    s_mov_b32 s3, 0xf000
1118; VI-NEXT:    s_mov_b32 s2, -1
1119; VI-NEXT:    s_waitcnt vmcnt(0)
1120; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 4
1121; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1122; VI-NEXT:    s_endpgm
1123;
1124; EG-LABEL: v_ctpop_i16_add_inline_constant_inv:
1125; EG:       ; %bb.0:
1126; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
1127; EG-NEXT:    TEX 0 @6
1128; EG-NEXT:    ALU 12, @10, KC0[CB0:0-32], KC1[]
1129; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1130; EG-NEXT:    CF_END
1131; EG-NEXT:    PAD
1132; EG-NEXT:    Fetch clause starting at 6:
1133; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1134; EG-NEXT:    ALU clause starting at 8:
1135; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1136; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1137; EG-NEXT:    ALU clause starting at 10:
1138; EG-NEXT:     BCNT_INT T0.W, T0.X,
1139; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1140; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1141; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
1142; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1143; EG-NEXT:    4(5.605194e-45), 3(4.203895e-45)
1144; EG-NEXT:     LSHL T0.X, PV.W, PS,
1145; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1146; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1147; EG-NEXT:     MOV T0.Y, 0.0,
1148; EG-NEXT:     MOV * T0.Z, 0.0,
1149; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1150; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1151  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1152  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
1153  %val = load i16, ptr addrspace(1) %in.gep, align 4
1154  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1155  %add = add i16 4, %ctpop
1156  store i16 %add, ptr addrspace(1) %out, align 4
1157  ret void
1158}
1159
; Loads an i16 from %in indexed by the workitem id, takes its ctpop, adds the
; constant 999 (0x3e7) and stores the i16 result to %out.
; The SI/VI checks expect 0x3e7 to be materialized into an SGPR with
; s_movk_i32 and that SGPR to be the second source operand of
; v_bcnt_u32_b32, with no separate add instruction. The EG checks expect
; BCNT_INT followed by an ADD_INT with the literal 999.
1160define amdgpu_kernel void @v_ctpop_i16_add_literal(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
1161; SI-LABEL: v_ctpop_i16_add_literal:
1162; SI:       ; %bb.0:
1163; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1164; SI-NEXT:    s_mov_b32 s7, 0xf000
1165; SI-NEXT:    s_mov_b32 s10, 0
1166; SI-NEXT:    s_mov_b32 s11, s7
1167; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1168; SI-NEXT:    s_waitcnt lgkmcnt(0)
1169; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1170; SI-NEXT:    v_mov_b32_e32 v1, 0
1171; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
1172; SI-NEXT:    s_mov_b32 s4, s0
1173; SI-NEXT:    s_movk_i32 s0, 0x3e7
1174; SI-NEXT:    s_mov_b32 s6, -1
1175; SI-NEXT:    s_mov_b32 s5, s1
1176; SI-NEXT:    s_waitcnt vmcnt(0)
1177; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s0
1178; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1179; SI-NEXT:    s_endpgm
1180;
1181; VI-LABEL: v_ctpop_i16_add_literal:
1182; VI:       ; %bb.0:
1183; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1184; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1185; VI-NEXT:    s_movk_i32 s4, 0x3e7
1186; VI-NEXT:    s_waitcnt lgkmcnt(0)
1187; VI-NEXT:    v_mov_b32_e32 v1, s3
1188; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1189; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1190; VI-NEXT:    flat_load_ushort v0, v[0:1]
1191; VI-NEXT:    s_mov_b32 s3, 0xf000
1192; VI-NEXT:    s_mov_b32 s2, -1
1193; VI-NEXT:    s_waitcnt vmcnt(0)
1194; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
1195; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1196; VI-NEXT:    s_endpgm
1197;
1198; EG-LABEL: v_ctpop_i16_add_literal:
1199; EG:       ; %bb.0:
1200; EG-NEXT:    ALU 1, @8, KC0[CB0:0-32], KC1[]
1201; EG-NEXT:    TEX 0 @6
1202; EG-NEXT:    ALU 12, @10, KC0[CB0:0-32], KC1[]
1203; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1204; EG-NEXT:    CF_END
1205; EG-NEXT:    PAD
1206; EG-NEXT:    Fetch clause starting at 6:
1207; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1208; EG-NEXT:    ALU clause starting at 8:
1209; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1210; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1211; EG-NEXT:    ALU clause starting at 10:
1212; EG-NEXT:     BCNT_INT T0.W, T0.X,
1213; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1214; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1215; EG-NEXT:     ADD_INT T0.W, PV.W, literal.x,
1216; EG-NEXT:     LSHL * T1.W, PS, literal.y,
1217; EG-NEXT:    999(1.399897e-42), 3(4.203895e-45)
1218; EG-NEXT:     LSHL T0.X, PV.W, PS,
1219; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1220; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1221; EG-NEXT:     MOV T0.Y, 0.0,
1222; EG-NEXT:     MOV * T0.Z, 0.0,
1223; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1224; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1225  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1226  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
1227  %val = load i16, ptr addrspace(1) %in.gep, align 4
1228  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1229  %add = add i16 %ctpop, 999
1230  store i16 %add, ptr addrspace(1) %out, align 4
1231  ret void
1232}
1233
; Loads an i16 from %in indexed by the workitem id, takes its ctpop, adds the
; i16 kernel argument %const and stores the i16 result to %out.
; The SI/VI checks expect the argument to be fetched with s_load_dword into an
; SGPR that is then the second source operand of v_bcnt_u32_b32, with no
; separate add instruction. The EG checks expect the argument to be read with
; VTX_READ_16, then BCNT_INT, an ADD_INT and an AND_INT with 65535.
1234define amdgpu_kernel void @v_ctpop_i16_add_var(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind {
1235; SI-LABEL: v_ctpop_i16_add_var:
1236; SI:       ; %bb.0:
1237; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1238; SI-NEXT:    s_load_dword s12, s[4:5], 0xd
1239; SI-NEXT:    s_mov_b32 s7, 0xf000
1240; SI-NEXT:    s_mov_b32 s10, 0
1241; SI-NEXT:    s_mov_b32 s11, s7
1242; SI-NEXT:    s_waitcnt lgkmcnt(0)
1243; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1244; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1245; SI-NEXT:    v_mov_b32_e32 v1, 0
1246; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
1247; SI-NEXT:    s_mov_b32 s6, -1
1248; SI-NEXT:    s_mov_b32 s4, s0
1249; SI-NEXT:    s_mov_b32 s5, s1
1250; SI-NEXT:    s_waitcnt vmcnt(0)
1251; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s12
1252; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1253; SI-NEXT:    s_endpgm
1254;
1255; VI-LABEL: v_ctpop_i16_add_var:
1256; VI:       ; %bb.0:
1257; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1258; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
1259; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1260; VI-NEXT:    s_waitcnt lgkmcnt(0)
1261; VI-NEXT:    v_mov_b32_e32 v1, s3
1262; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1263; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1264; VI-NEXT:    flat_load_ushort v0, v[0:1]
1265; VI-NEXT:    s_mov_b32 s3, 0xf000
1266; VI-NEXT:    s_mov_b32 s2, -1
1267; VI-NEXT:    s_waitcnt vmcnt(0)
1268; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
1269; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1270; VI-NEXT:    s_endpgm
1271;
1272; EG-LABEL: v_ctpop_i16_add_var:
1273; EG:       ; %bb.0:
1274; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1275; EG-NEXT:    TEX 0 @8
1276; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
1277; EG-NEXT:    TEX 0 @10
1278; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
1279; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1280; EG-NEXT:    CF_END
1281; EG-NEXT:    PAD
1282; EG-NEXT:    Fetch clause starting at 8:
1283; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1284; EG-NEXT:    Fetch clause starting at 10:
1285; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 44, #3
1286; EG-NEXT:    ALU clause starting at 12:
1287; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1288; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1289; EG-NEXT:    ALU clause starting at 14:
1290; EG-NEXT:     MOV * T1.X, 0.0,
1291; EG-NEXT:    ALU clause starting at 15:
1292; EG-NEXT:     BCNT_INT T0.W, T0.X,
1293; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1294; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1295; EG-NEXT:     ADD_INT * T0.W, PV.W, T1.X,
1296; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1297; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
1298; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1299; EG-NEXT:     LSHL T0.X, PV.W, PS,
1300; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1301; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1302; EG-NEXT:     MOV T0.Y, 0.0,
1303; EG-NEXT:     MOV * T0.Z, 0.0,
1304; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1305; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1306  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1307  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
1308  %val = load i16, ptr addrspace(1) %in.gep, align 4
1309  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1310  %add = add i16 %ctpop, %const
1311  store i16 %add, ptr addrspace(1) %out, align 4
1312  ret void
1313}
1314
; Commuted form of the ctpop-plus-kernel-argument pattern: the add is written
; as `add i16 %const, %ctpop`, with the argument as the first operand.
; The SI/VI checks still expect the argument in an SGPR (loaded with
; s_load_dword) as the second source operand of v_bcnt_u32_b32, with no
; separate add instruction. The EG checks expect BCNT_INT, an ADD_INT whose
; first source is the VTX_READ_16 result, and an AND_INT with 65535.
1315define amdgpu_kernel void @v_ctpop_i16_add_var_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i16 %const) nounwind {
1316; SI-LABEL: v_ctpop_i16_add_var_inv:
1317; SI:       ; %bb.0:
1318; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1319; SI-NEXT:    s_load_dword s12, s[4:5], 0xd
1320; SI-NEXT:    s_mov_b32 s7, 0xf000
1321; SI-NEXT:    s_mov_b32 s10, 0
1322; SI-NEXT:    s_mov_b32 s11, s7
1323; SI-NEXT:    s_waitcnt lgkmcnt(0)
1324; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
1325; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1326; SI-NEXT:    v_mov_b32_e32 v1, 0
1327; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[8:11], 0 addr64
1328; SI-NEXT:    s_mov_b32 s6, -1
1329; SI-NEXT:    s_mov_b32 s4, s0
1330; SI-NEXT:    s_mov_b32 s5, s1
1331; SI-NEXT:    s_waitcnt vmcnt(0)
1332; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, s12
1333; SI-NEXT:    buffer_store_short v0, off, s[4:7], 0
1334; SI-NEXT:    s_endpgm
1335;
1336; VI-LABEL: v_ctpop_i16_add_var_inv:
1337; VI:       ; %bb.0:
1338; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1339; VI-NEXT:    s_load_dword s4, s[4:5], 0x34
1340; VI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1341; VI-NEXT:    s_waitcnt lgkmcnt(0)
1342; VI-NEXT:    v_mov_b32_e32 v1, s3
1343; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
1344; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1345; VI-NEXT:    flat_load_ushort v0, v[0:1]
1346; VI-NEXT:    s_mov_b32 s3, 0xf000
1347; VI-NEXT:    s_mov_b32 s2, -1
1348; VI-NEXT:    s_waitcnt vmcnt(0)
1349; VI-NEXT:    v_bcnt_u32_b32 v0, v0, s4
1350; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1351; VI-NEXT:    s_endpgm
1352;
1353; EG-LABEL: v_ctpop_i16_add_var_inv:
1354; EG:       ; %bb.0:
1355; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1356; EG-NEXT:    TEX 0 @8
1357; EG-NEXT:    ALU 0, @14, KC0[], KC1[]
1358; EG-NEXT:    TEX 0 @10
1359; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
1360; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1361; EG-NEXT:    CF_END
1362; EG-NEXT:    PAD
1363; EG-NEXT:    Fetch clause starting at 8:
1364; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1365; EG-NEXT:    Fetch clause starting at 10:
1366; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 44, #3
1367; EG-NEXT:    ALU clause starting at 12:
1368; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1369; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1370; EG-NEXT:    ALU clause starting at 14:
1371; EG-NEXT:     MOV * T1.X, 0.0,
1372; EG-NEXT:    ALU clause starting at 15:
1373; EG-NEXT:     BCNT_INT T0.W, T0.X,
1374; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1375; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1376; EG-NEXT:     ADD_INT * T0.W, T1.X, PV.W,
1377; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1378; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
1379; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1380; EG-NEXT:     LSHL T0.X, PV.W, PS,
1381; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1382; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1383; EG-NEXT:     MOV T0.Y, 0.0,
1384; EG-NEXT:     MOV * T0.Z, 0.0,
1385; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1386; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1387  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1388  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
1389  %val = load i16, ptr addrspace(1) %in.gep, align 4
1390  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1391  %add = add i16 %const, %ctpop
1392  store i16 %add, ptr addrspace(1) %out, align 4
1393  ret void
1394}
1395
; The value added to the ctpop is itself loaded from memory: both %in and
; %constptr are indexed by the workitem id, and the add is written as
; `add i16 %const, %ctpop`. The i16 result is stored to %out.
; The SI/VI checks expect two ushort loads into VGPRs feeding a single
; v_bcnt_u32_b32 (the loaded constant as its second source operand), with no
; separate add instruction. The EG checks expect two VTX_READ_16 fetches,
; BCNT_INT, an ADD_INT and an AND_INT with 65535.
1396define amdgpu_kernel void @v_ctpop_i16_add_vvar_inv(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, ptr addrspace(1) noalias %constptr) nounwind {
1397; SI-LABEL: v_ctpop_i16_add_vvar_inv:
1398; SI:       ; %bb.0:
1399; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1400; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0xd
1401; SI-NEXT:    s_mov_b32 s11, 0xf000
1402; SI-NEXT:    s_mov_b32 s14, 0
1403; SI-NEXT:    s_mov_b32 s15, s11
1404; SI-NEXT:    s_waitcnt lgkmcnt(0)
1405; SI-NEXT:    s_mov_b64 s[12:13], s[2:3]
1406; SI-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
1407; SI-NEXT:    v_mov_b32_e32 v1, 0
1408; SI-NEXT:    s_mov_b64 s[6:7], s[14:15]
1409; SI-NEXT:    buffer_load_ushort v2, v[0:1], s[12:15], 0 addr64
1410; SI-NEXT:    buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64
1411; SI-NEXT:    s_mov_b32 s10, -1
1412; SI-NEXT:    s_mov_b32 s8, s0
1413; SI-NEXT:    s_mov_b32 s9, s1
1414; SI-NEXT:    s_waitcnt vmcnt(0)
1415; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v2, v0
1416; SI-NEXT:    buffer_store_short v0, off, s[8:11], 0
1417; SI-NEXT:    s_endpgm
1418;
1419; VI-LABEL: v_ctpop_i16_add_vvar_inv:
1420; VI:       ; %bb.0:
1421; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1422; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
1423; VI-NEXT:    v_lshlrev_b32_e32 v2, 1, v0
1424; VI-NEXT:    s_waitcnt lgkmcnt(0)
1425; VI-NEXT:    v_mov_b32_e32 v1, s3
1426; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v2
1427; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1428; VI-NEXT:    flat_load_ushort v3, v[0:1]
1429; VI-NEXT:    v_mov_b32_e32 v1, s5
1430; VI-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
1431; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
1432; VI-NEXT:    flat_load_ushort v0, v[0:1]
1433; VI-NEXT:    s_mov_b32 s3, 0xf000
1434; VI-NEXT:    s_mov_b32 s2, -1
1435; VI-NEXT:    s_waitcnt vmcnt(0)
1436; VI-NEXT:    v_bcnt_u32_b32 v0, v3, v0
1437; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1438; VI-NEXT:    s_endpgm
1439;
1440; EG-LABEL: v_ctpop_i16_add_vvar_inv:
1441; EG:       ; %bb.0:
1442; EG-NEXT:    ALU 1, @12, KC0[CB0:0-32], KC1[]
1443; EG-NEXT:    TEX 0 @8
1444; EG-NEXT:    ALU 0, @14, KC0[CB0:0-32], KC1[]
1445; EG-NEXT:    TEX 0 @10
1446; EG-NEXT:    ALU 13, @15, KC0[CB0:0-32], KC1[]
1447; EG-NEXT:    MEM_RAT MSKOR T0.XW, T1.X
1448; EG-NEXT:    CF_END
1449; EG-NEXT:    PAD
1450; EG-NEXT:    Fetch clause starting at 8:
1451; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 0, #1
1452; EG-NEXT:    Fetch clause starting at 10:
1453; EG-NEXT:     VTX_READ_16 T1.X, T1.X, 0, #1
1454; EG-NEXT:    ALU clause starting at 12:
1455; EG-NEXT:     LSHL * T0.W, T0.X, 1,
1456; EG-NEXT:     ADD_INT * T0.X, KC0[2].Z, PV.W,
1457; EG-NEXT:    ALU clause starting at 14:
1458; EG-NEXT:     ADD_INT * T1.X, KC0[2].W, T0.W,
1459; EG-NEXT:    ALU clause starting at 15:
1460; EG-NEXT:     BCNT_INT T0.W, T0.X,
1461; EG-NEXT:     AND_INT * T1.W, KC0[2].Y, literal.x,
1462; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1463; EG-NEXT:     ADD_INT * T0.W, T1.X, PV.W,
1464; EG-NEXT:     AND_INT T0.W, PV.W, literal.x,
1465; EG-NEXT:     LSHL * T1.W, T1.W, literal.y,
1466; EG-NEXT:    65535(9.183409e-41), 3(4.203895e-45)
1467; EG-NEXT:     LSHL T0.X, PV.W, PS,
1468; EG-NEXT:     LSHL * T0.W, literal.x, PS,
1469; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1470; EG-NEXT:     MOV T0.Y, 0.0,
1471; EG-NEXT:     MOV * T0.Z, 0.0,
1472; EG-NEXT:     LSHR * T1.X, KC0[2].Y, literal.x,
1473; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1474  %tid = call i32 @llvm.amdgcn.workitem.id.x()
1475  %in.gep = getelementptr i16, ptr addrspace(1) %in, i32 %tid
1476  %val = load i16, ptr addrspace(1) %in.gep, align 4
1477  %ctpop = call i16 @llvm.ctpop.i16(i16 %val) nounwind readnone
1478  %gep = getelementptr i16, ptr addrspace(1) %constptr, i32 %tid
1479  %const = load i16, ptr addrspace(1) %gep, align 4
1480  %add = add i16 %const, %ctpop
1481  store i16 %add, ptr addrspace(1) %out, align 4
1482  ret void
1483}
1484
1485; FIXME: We currently disallow SALU instructions in all branches,
1486; but there are some cases when they should be allowed.
; Kernel under test: when %cond is zero, the %if block computes the population
; count of the %ctpop_arg kernel argument; otherwise the %else block loads the
; i16 at element index 1 of %in. The phi in %endif picks whichever value was
; produced and that value is stored to %out.
1487define amdgpu_kernel void @ctpop_i16_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i16 %ctpop_arg, i16 %cond) {
1488; SI-LABEL: ctpop_i16_in_br:
1489; SI:       ; %bb.0: ; %entry
1490; SI-NEXT:    s_load_dword s6, s[4:5], 0xd
1491; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
1492; SI-NEXT:    s_waitcnt lgkmcnt(0)
1493; SI-NEXT:    s_lshr_b32 s4, s6, 16
1494; SI-NEXT:    s_cmp_lg_u32 s4, 0
1495; SI-NEXT:    s_cbranch_scc0 .LBB14_4
1496; SI-NEXT:  ; %bb.1: ; %else
1497; SI-NEXT:    s_mov_b32 s11, 0xf000
1498; SI-NEXT:    s_mov_b32 s10, -1
1499; SI-NEXT:    s_mov_b32 s8, s2
1500; SI-NEXT:    s_mov_b32 s9, s3
1501; SI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1502; SI-NEXT:    s_cbranch_execnz .LBB14_3
1503; SI-NEXT:  .LBB14_2: ; %if
1504; SI-NEXT:    s_and_b32 s2, s6, 0xffff
1505; SI-NEXT:    s_bcnt1_i32_b32 s2, s2
1506; SI-NEXT:    s_waitcnt vmcnt(0)
1507; SI-NEXT:    v_mov_b32_e32 v0, s2
1508; SI-NEXT:  .LBB14_3: ; %endif
1509; SI-NEXT:    s_mov_b32 s3, 0xf000
1510; SI-NEXT:    s_mov_b32 s2, -1
1511; SI-NEXT:    s_waitcnt vmcnt(0)
1512; SI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1513; SI-NEXT:    s_endpgm
1514; SI-NEXT:  .LBB14_4:
1515; SI-NEXT:    v_mov_b32_e32 v0, 0
1516; SI-NEXT:    s_branch .LBB14_2
1517;
1518; VI-LABEL: ctpop_i16_in_br:
1519; VI:       ; %bb.0: ; %entry
1520; VI-NEXT:    s_load_dword s6, s[4:5], 0x34
1521; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
1522; VI-NEXT:    s_waitcnt lgkmcnt(0)
1523; VI-NEXT:    s_lshr_b32 s4, s6, 16
1524; VI-NEXT:    s_cmp_lg_u32 s4, 0
1525; VI-NEXT:    s_cbranch_scc0 .LBB14_4
1526; VI-NEXT:  ; %bb.1: ; %else
1527; VI-NEXT:    s_mov_b32 s11, 0xf000
1528; VI-NEXT:    s_mov_b32 s10, -1
1529; VI-NEXT:    s_mov_b32 s8, s2
1530; VI-NEXT:    s_mov_b32 s9, s3
1531; VI-NEXT:    buffer_load_ushort v0, off, s[8:11], 0 offset:2
1532; VI-NEXT:    s_cbranch_execnz .LBB14_3
1533; VI-NEXT:  .LBB14_2: ; %if
1534; VI-NEXT:    s_and_b32 s2, s6, 0xffff
1535; VI-NEXT:    s_bcnt1_i32_b32 s2, s2
1536; VI-NEXT:    s_waitcnt vmcnt(0)
1537; VI-NEXT:    v_mov_b32_e32 v0, s2
1538; VI-NEXT:  .LBB14_3: ; %endif
1539; VI-NEXT:    s_mov_b32 s3, 0xf000
1540; VI-NEXT:    s_mov_b32 s2, -1
1541; VI-NEXT:    s_waitcnt vmcnt(0)
1542; VI-NEXT:    buffer_store_short v0, off, s[0:3], 0
1543; VI-NEXT:    s_endpgm
1544; VI-NEXT:  .LBB14_4:
1545; VI-NEXT:    ; implicit-def: $vgpr0
1546; VI-NEXT:    s_branch .LBB14_2
1547;
1548; EG-LABEL: ctpop_i16_in_br:
1549; EG:       ; %bb.0: ; %entry
1550; EG-NEXT:    ALU 0, @20, KC0[], KC1[]
1551; EG-NEXT:    TEX 0 @14
1552; EG-NEXT:    ALU_PUSH_BEFORE 4, @21, KC0[], KC1[]
1553; EG-NEXT:    JUMP @7 POP:1
1554; EG-NEXT:    ALU 0, @26, KC0[CB0:0-32], KC1[]
1555; EG-NEXT:    TEX 0 @16
1556; EG-NEXT:    ALU_POP_AFTER 1, @27, KC0[], KC1[]
1557; EG-NEXT:    ALU_PUSH_BEFORE 2, @29, KC0[CB0:0-32], KC1[]
1558; EG-NEXT:    JUMP @11 POP:1
1559; EG-NEXT:    TEX 0 @18
1560; EG-NEXT:    ALU_POP_AFTER 0, @32, KC0[], KC1[]
1561; EG-NEXT:    ALU 11, @33, KC0[], KC1[]
1562; EG-NEXT:    MEM_RAT MSKOR T1.XW, T0.X
1563; EG-NEXT:    CF_END
1564; EG-NEXT:    Fetch clause starting at 14:
1565; EG-NEXT:     VTX_READ_16 T2.X, T1.X, 46, #3
1566; EG-NEXT:    Fetch clause starting at 16:
1567; EG-NEXT:     VTX_READ_16 T0.X, T0.X, 2, #1
1568; EG-NEXT:    Fetch clause starting at 18:
1569; EG-NEXT:     VTX_READ_16 T0.X, T1.X, 44, #3
1570; EG-NEXT:    ALU clause starting at 20:
1571; EG-NEXT:     MOV * T1.X, 0.0,
1572; EG-NEXT:    ALU clause starting at 21:
1573; EG-NEXT:     MOV T0.X, literal.x,
1574; EG-NEXT:     MOV T1.W, literal.y,
1575; EG-NEXT:     SETNE_INT * T0.W, T2.X, 0.0,
1576; EG-NEXT:    0(0.000000e+00), 1(1.401298e-45)
1577; EG-NEXT:     PRED_SETNE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1578; EG-NEXT:    ALU clause starting at 26:
1579; EG-NEXT:     MOV * T0.X, KC0[2].Z,
1580; EG-NEXT:    ALU clause starting at 27:
1581; EG-NEXT:     MOV * T1.W, literal.x,
1582; EG-NEXT:    0(0.000000e+00), 0(0.000000e+00)
1583; EG-NEXT:    ALU clause starting at 29:
1584; EG-NEXT:     MOV T0.W, KC0[2].Y,
1585; EG-NEXT:     SETE_INT * T1.W, T1.W, 0.0,
1586; EG-NEXT:     PRED_SETE_INT * ExecMask,PredicateBit (MASKED), PS, 0.0,
1587; EG-NEXT:    ALU clause starting at 32:
1588; EG-NEXT:     BCNT_INT * T0.X, T0.X,
1589; EG-NEXT:    ALU clause starting at 33:
1590; EG-NEXT:     LSHL * T1.W, T0.W, literal.x,
1591; EG-NEXT:    3(4.203895e-45), 0(0.000000e+00)
1592; EG-NEXT:     AND_INT T1.W, PV.W, literal.x,
1593; EG-NEXT:     AND_INT * T2.W, T0.X, literal.y,
1594; EG-NEXT:    24(3.363116e-44), 65535(9.183409e-41)
1595; EG-NEXT:     LSHL T1.X, PS, PV.W,
1596; EG-NEXT:     LSHL * T1.W, literal.x, PV.W,
1597; EG-NEXT:    65535(9.183409e-41), 0(0.000000e+00)
1598; EG-NEXT:     MOV T1.Y, 0.0,
1599; EG-NEXT:     MOV * T1.Z, 0.0,
1600; EG-NEXT:     LSHR * T0.X, T0.W, literal.x,
1601; EG-NEXT:    2(2.802597e-45), 0(0.000000e+00)
1602entry:
  ; Take the %if path when %cond is zero, the %else path otherwise.
1603  %tmp0 = icmp eq i16 %cond, 0
1604  br i1 %tmp0, label %if, label %else
1605
1606if:
  ; Population count of the kernel argument itself; this path performs no load.
1607  %tmp2 = call i16 @llvm.ctpop.i16(i16 %ctpop_arg)
1608  br label %endif
1609
1610else:
  ; Load the i16 at element index 1 of %in; no ctpop is performed on this path.
1611  %tmp3 = getelementptr i16, ptr addrspace(1) %in, i16 1
1612  %tmp4 = load i16, ptr addrspace(1) %tmp3
1613  br label %endif
1614
1615endif:
  ; Merge the value produced by whichever predecessor ran and store it to %out.
1616  %tmp5 = phi i16 [%tmp2, %if], [%tmp4, %else]
1617  store i16 %tmp5, ptr addrspace(1) %out
1618  ret void
1619}
1620