xref: /llvm-project/llvm/test/CodeGen/AMDGPU/ctpop64.ll (revision 6548b6354d1d990e1c98736f5e7c3de876bedc8e)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
3; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
4
5declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
6
7declare i64 @llvm.ctpop.i64(i64) nounwind readnone
8declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>) nounwind readnone
9declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>) nounwind readnone
10declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>) nounwind readnone
11declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>) nounwind readnone
12
13declare i65 @llvm.ctpop.i65(i65) nounwind readnone
14declare i128 @llvm.ctpop.i128(i128) nounwind readnone
15
; ctpop of a uniform (SGPR) i64: both SI and VI select a single
; s_bcnt1_i32_b64 on the 64-bit register pair, then store the low 32 bits.
16define amdgpu_kernel void @s_ctpop_i64(ptr addrspace(1) noalias %out, [8 x i32], i64 %val) nounwind {
17; SI-LABEL: s_ctpop_i64:
18; SI:       ; %bb.0:
19; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x13
20; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
21; SI-NEXT:    s_mov_b32 s3, 0xf000
22; SI-NEXT:    s_mov_b32 s2, -1
23; SI-NEXT:    s_waitcnt lgkmcnt(0)
24; SI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
25; SI-NEXT:    v_mov_b32_e32 v0, s4
26; SI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
27; SI-NEXT:    s_endpgm
28;
29; VI-LABEL: s_ctpop_i64:
30; VI:       ; %bb.0:
31; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
32; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
33; VI-NEXT:    s_mov_b32 s3, 0xf000
34; VI-NEXT:    s_mov_b32 s2, -1
35; VI-NEXT:    s_waitcnt lgkmcnt(0)
36; VI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
37; VI-NEXT:    v_mov_b32_e32 v0, s4
38; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
39; VI-NEXT:    s_endpgm
40  %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
41  %truncctpop = trunc i64 %ctpop to i32
42  store i32 %truncctpop, ptr addrspace(1) %out, align 4
43  ret void
44}
45
; ctpop of a divergent (VGPR) i64 loaded per-lane: lowers to two
; v_bcnt_u32_b32 instructions, the second accumulating the high dword's
; count onto the low dword's count.
46define amdgpu_kernel void @v_ctpop_i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
47; SI-LABEL: v_ctpop_i64:
48; SI:       ; %bb.0:
49; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
50; SI-NEXT:    s_mov_b32 s7, 0xf000
51; SI-NEXT:    s_mov_b32 s10, 0
52; SI-NEXT:    s_mov_b32 s11, s7
53; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
54; SI-NEXT:    s_waitcnt lgkmcnt(0)
55; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
56; SI-NEXT:    v_mov_b32_e32 v1, 0
57; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
58; SI-NEXT:    s_mov_b32 s6, -1
59; SI-NEXT:    s_mov_b32 s4, s0
60; SI-NEXT:    s_mov_b32 s5, s1
61; SI-NEXT:    s_waitcnt vmcnt(0)
62; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
63; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
64; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
65; SI-NEXT:    s_endpgm
66;
67; VI-LABEL: v_ctpop_i64:
68; VI:       ; %bb.0:
69; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
70; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
71; VI-NEXT:    s_waitcnt lgkmcnt(0)
72; VI-NEXT:    v_mov_b32_e32 v1, s3
73; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
74; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
75; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
76; VI-NEXT:    s_mov_b32 s3, 0xf000
77; VI-NEXT:    s_mov_b32 s2, -1
78; VI-NEXT:    s_waitcnt vmcnt(0)
79; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
80; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
81; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
82; VI-NEXT:    s_endpgm
83  %tid = call i32 @llvm.amdgcn.workitem.id.x()
84  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
85  %val = load i64, ptr addrspace(1) %in.gep, align 8
86  %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
87  %truncctpop = trunc i64 %ctpop to i32
88  store i32 %truncctpop, ptr addrspace(1) %out, align 4
89  ret void
90}
91
; Same VGPR i64 ctpop lowering as above, but the full i64 result is kept
; alive by an 'or' with a scalar operand: the low half is v_or'd with the
; counted value and the high half comes straight from the scalar argument
; (the count's high 32 bits are known zero).
92define amdgpu_kernel void @v_ctpop_i64_user(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in, i64 %s.val) nounwind {
93; SI-LABEL: v_ctpop_i64_user:
94; SI:       ; %bb.0:
95; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
96; SI-NEXT:    s_load_dwordx2 s[12:13], s[4:5], 0xd
97; SI-NEXT:    s_mov_b32 s7, 0xf000
98; SI-NEXT:    s_mov_b32 s10, 0
99; SI-NEXT:    s_mov_b32 s11, s7
100; SI-NEXT:    s_waitcnt lgkmcnt(0)
101; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
102; SI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
103; SI-NEXT:    v_mov_b32_e32 v1, 0
104; SI-NEXT:    buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64
105; SI-NEXT:    s_mov_b32 s6, -1
106; SI-NEXT:    s_mov_b32 s4, s0
107; SI-NEXT:    s_mov_b32 s5, s1
108; SI-NEXT:    s_waitcnt vmcnt(0)
109; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
110; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
111; SI-NEXT:    v_mov_b32_e32 v1, s13
112; SI-NEXT:    v_or_b32_e32 v0, s12, v0
113; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
114; SI-NEXT:    s_endpgm
115;
116; VI-LABEL: v_ctpop_i64_user:
117; VI:       ; %bb.0:
118; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
119; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x34
120; VI-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
121; VI-NEXT:    s_waitcnt lgkmcnt(0)
122; VI-NEXT:    v_mov_b32_e32 v1, s3
123; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
124; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
125; VI-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
126; VI-NEXT:    s_mov_b32 s3, 0xf000
127; VI-NEXT:    s_mov_b32 s2, -1
128; VI-NEXT:    s_waitcnt vmcnt(0)
129; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
130; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
131; VI-NEXT:    v_mov_b32_e32 v1, s5
132; VI-NEXT:    v_or_b32_e32 v0, s4, v0
133; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
134; VI-NEXT:    s_endpgm
135  %tid = call i32 @llvm.amdgcn.workitem.id.x()
136  %in.gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
137  %val = load i64, ptr addrspace(1) %in.gep, align 8
138  %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
139  %or = or i64 %ctpop, %s.val
140  store i64 %or, ptr addrspace(1) %out
141  ret void
142}
143
; Uniform <2 x i64> ctpop: one s_bcnt1_i32_b64 per element, results stored
; as a truncated <2 x i32> with a single dwordx2 store.
144define amdgpu_kernel void @s_ctpop_v2i64(ptr addrspace(1) noalias %out, <2 x i64> %val) nounwind {
145; SI-LABEL: s_ctpop_v2i64:
146; SI:       ; %bb.0:
147; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xd
148; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
149; SI-NEXT:    s_mov_b32 s7, 0xf000
150; SI-NEXT:    s_mov_b32 s6, -1
151; SI-NEXT:    s_waitcnt lgkmcnt(0)
152; SI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
153; SI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
154; SI-NEXT:    v_mov_b32_e32 v0, s0
155; SI-NEXT:    v_mov_b32_e32 v1, s1
156; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
157; SI-NEXT:    s_endpgm
158;
159; VI-LABEL: s_ctpop_v2i64:
160; VI:       ; %bb.0:
161; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
162; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
163; VI-NEXT:    s_mov_b32 s7, 0xf000
164; VI-NEXT:    s_mov_b32 s6, -1
165; VI-NEXT:    s_waitcnt lgkmcnt(0)
166; VI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
167; VI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
168; VI-NEXT:    v_mov_b32_e32 v0, s0
169; VI-NEXT:    v_mov_b32_e32 v1, s1
170; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
171; VI-NEXT:    s_endpgm
172  %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
173  %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
174  store <2 x i32> %truncctpop, ptr addrspace(1) %out, align 8
175  ret void
176}
177
; Uniform <4 x i64> ctpop: four independent s_bcnt1_i32_b64, truncated and
; written with a single dwordx4 store.
178define amdgpu_kernel void @s_ctpop_v4i64(ptr addrspace(1) noalias %out, <4 x i64> %val) nounwind {
179; SI-LABEL: s_ctpop_v4i64:
180; SI:       ; %bb.0:
181; SI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x11
182; SI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x9
183; SI-NEXT:    s_mov_b32 s3, 0xf000
184; SI-NEXT:    s_mov_b32 s2, -1
185; SI-NEXT:    s_waitcnt lgkmcnt(0)
186; SI-NEXT:    s_bcnt1_i32_b64 s4, s[8:9]
187; SI-NEXT:    s_bcnt1_i32_b64 s5, s[10:11]
188; SI-NEXT:    s_bcnt1_i32_b64 s6, s[12:13]
189; SI-NEXT:    s_bcnt1_i32_b64 s7, s[14:15]
190; SI-NEXT:    v_mov_b32_e32 v0, s4
191; SI-NEXT:    v_mov_b32_e32 v1, s5
192; SI-NEXT:    v_mov_b32_e32 v2, s6
193; SI-NEXT:    v_mov_b32_e32 v3, s7
194; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
195; SI-NEXT:    s_endpgm
196;
197; VI-LABEL: s_ctpop_v4i64:
198; VI:       ; %bb.0:
199; VI-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
200; VI-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
201; VI-NEXT:    s_mov_b32 s3, 0xf000
202; VI-NEXT:    s_mov_b32 s2, -1
203; VI-NEXT:    s_waitcnt lgkmcnt(0)
204; VI-NEXT:    s_bcnt1_i32_b64 s4, s[8:9]
205; VI-NEXT:    s_bcnt1_i32_b64 s5, s[10:11]
206; VI-NEXT:    s_bcnt1_i32_b64 s6, s[12:13]
207; VI-NEXT:    s_bcnt1_i32_b64 s7, s[14:15]
208; VI-NEXT:    v_mov_b32_e32 v0, s4
209; VI-NEXT:    v_mov_b32_e32 v1, s5
210; VI-NEXT:    v_mov_b32_e32 v2, s6
211; VI-NEXT:    v_mov_b32_e32 v3, s7
212; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
213; VI-NEXT:    s_endpgm
214  %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
215  %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
216  store <4 x i32> %truncctpop, ptr addrspace(1) %out, align 16
217  ret void
218}
219
; Divergent <2 x i64> ctpop: one dwordx4 load, then a v_bcnt pair per
; element (low dword counted against 0, high dword accumulated onto it).
220define amdgpu_kernel void @v_ctpop_v2i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
221; SI-LABEL: v_ctpop_v2i64:
222; SI:       ; %bb.0:
223; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
224; SI-NEXT:    s_mov_b32 s7, 0xf000
225; SI-NEXT:    s_mov_b32 s10, 0
226; SI-NEXT:    s_mov_b32 s11, s7
227; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
228; SI-NEXT:    s_waitcnt lgkmcnt(0)
229; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
230; SI-NEXT:    v_mov_b32_e32 v1, 0
231; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
232; SI-NEXT:    s_mov_b32 s6, -1
233; SI-NEXT:    s_mov_b32 s4, s0
234; SI-NEXT:    s_mov_b32 s5, s1
235; SI-NEXT:    s_waitcnt vmcnt(0)
236; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
237; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
238; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
239; SI-NEXT:    v_bcnt_u32_b32_e32 v1, v3, v2
240; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
241; SI-NEXT:    s_endpgm
242;
243; VI-LABEL: v_ctpop_v2i64:
244; VI:       ; %bb.0:
245; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
246; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
247; VI-NEXT:    s_waitcnt lgkmcnt(0)
248; VI-NEXT:    v_mov_b32_e32 v1, s3
249; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
250; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
251; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
252; VI-NEXT:    s_mov_b32 s3, 0xf000
253; VI-NEXT:    s_mov_b32 s2, -1
254; VI-NEXT:    s_waitcnt vmcnt(0)
255; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
256; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
257; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
258; VI-NEXT:    v_bcnt_u32_b32 v1, v3, v2
259; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
260; VI-NEXT:    s_endpgm
261  %tid = call i32 @llvm.amdgcn.workitem.id.x()
262  %in.gep = getelementptr <2 x i64>, ptr addrspace(1) %in, i32 %tid
263  %val = load <2 x i64>, ptr addrspace(1) %in.gep, align 16
264  %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
265  %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
266  store <2 x i32> %truncctpop, ptr addrspace(1) %out, align 8
267  ret void
268}
269
; Divergent <4 x i64> ctpop: the 32-byte vector is fetched as two dwordx4
; loads (second at offset 16), with the v_bcnt work for the first half
; overlapped before the second load completes (vmcnt(1) vs vmcnt(0)).
270define amdgpu_kernel void @v_ctpop_v4i64(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
271; SI-LABEL: v_ctpop_v4i64:
272; SI:       ; %bb.0:
273; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
274; SI-NEXT:    s_mov_b32 s7, 0xf000
275; SI-NEXT:    s_mov_b32 s10, 0
276; SI-NEXT:    s_mov_b32 s11, s7
277; SI-NEXT:    v_lshlrev_b32_e32 v4, 5, v0
278; SI-NEXT:    s_waitcnt lgkmcnt(0)
279; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
280; SI-NEXT:    v_mov_b32_e32 v5, 0
281; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[4:5], s[8:11], 0 addr64
282; SI-NEXT:    buffer_load_dwordx4 v[4:7], v[4:5], s[8:11], 0 addr64 offset:16
283; SI-NEXT:    s_mov_b32 s6, -1
284; SI-NEXT:    s_mov_b32 s4, s0
285; SI-NEXT:    s_mov_b32 s5, s1
286; SI-NEXT:    s_waitcnt vmcnt(1)
287; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
288; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
289; SI-NEXT:    s_waitcnt vmcnt(0)
290; SI-NEXT:    v_bcnt_u32_b32_e64 v4, v4, 0
291; SI-NEXT:    v_bcnt_u32_b32_e64 v6, v6, 0
292; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
293; SI-NEXT:    v_bcnt_u32_b32_e32 v1, v3, v2
294; SI-NEXT:    v_bcnt_u32_b32_e32 v2, v5, v4
295; SI-NEXT:    v_bcnt_u32_b32_e32 v3, v7, v6
296; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
297; SI-NEXT:    s_endpgm
298;
299; VI-LABEL: v_ctpop_v4i64:
300; VI:       ; %bb.0:
301; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
302; VI-NEXT:    v_lshlrev_b32_e32 v0, 5, v0
303; VI-NEXT:    s_waitcnt lgkmcnt(0)
304; VI-NEXT:    v_mov_b32_e32 v1, s3
305; VI-NEXT:    v_add_u32_e32 v4, vcc, s2, v0
306; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v1, vcc
307; VI-NEXT:    flat_load_dwordx4 v[0:3], v[4:5]
308; VI-NEXT:    v_add_u32_e32 v4, vcc, 16, v4
309; VI-NEXT:    v_addc_u32_e32 v5, vcc, 0, v5, vcc
310; VI-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
311; VI-NEXT:    s_mov_b32 s3, 0xf000
312; VI-NEXT:    s_mov_b32 s2, -1
313; VI-NEXT:    s_waitcnt vmcnt(1)
314; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
315; VI-NEXT:    v_bcnt_u32_b32 v8, v2, 0
316; VI-NEXT:    v_bcnt_u32_b32 v2, v1, v0
317; VI-NEXT:    v_bcnt_u32_b32 v3, v3, v8
318; VI-NEXT:    s_waitcnt vmcnt(0)
319; VI-NEXT:    v_bcnt_u32_b32 v4, v4, 0
320; VI-NEXT:    v_bcnt_u32_b32 v6, v6, 0
321; VI-NEXT:    v_bcnt_u32_b32 v4, v5, v4
322; VI-NEXT:    v_bcnt_u32_b32 v5, v7, v6
323; VI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[0:3], 0
324; VI-NEXT:    s_endpgm
325  %tid = call i32 @llvm.amdgcn.workitem.id.x()
326  %in.gep = getelementptr <4 x i64>, ptr addrspace(1) %in, i32 %tid
327  %val = load <4 x i64>, ptr addrspace(1) %in.gep, align 32
328  %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
329  %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
330  store <4 x i32> %truncctpop, ptr addrspace(1) %out, align 16
331  ret void
332}
333
; ctpop on one side of a uniform branch: the %if block still selects
; s_bcnt1_i32_b64 and zeroes the high half of the i64 result with
; s_mov_b32 s5, 0; the %else path instead loads the i64 from memory.
334define amdgpu_kernel void @ctpop_i64_in_br(ptr addrspace(1) %out, ptr addrspace(1) %in, i64 %ctpop_arg, i32 %cond) {
335; SI-LABEL: ctpop_i64_in_br:
336; SI:       ; %bb.0: ; %entry
337; SI-NEXT:    s_load_dword s8, s[4:5], 0xf
338; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
339; SI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0xd
340; SI-NEXT:    s_waitcnt lgkmcnt(0)
341; SI-NEXT:    s_cmp_lg_u32 s8, 0
342; SI-NEXT:    s_cbranch_scc0 .LBB7_4
343; SI-NEXT:  ; %bb.1: ; %else
344; SI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x2
345; SI-NEXT:    s_mov_b64 s[2:3], 0
346; SI-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
347; SI-NEXT:    s_waitcnt lgkmcnt(0)
348; SI-NEXT:    s_mov_b64 vcc, vcc
349; SI-NEXT:    s_cbranch_vccnz .LBB7_3
350; SI-NEXT:  .LBB7_2: ; %if
351; SI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
352; SI-NEXT:    s_mov_b32 s5, 0
353; SI-NEXT:  .LBB7_3: ; %endif
354; SI-NEXT:    v_mov_b32_e32 v0, s4
355; SI-NEXT:    s_mov_b32 s3, 0xf000
356; SI-NEXT:    s_mov_b32 s2, -1
357; SI-NEXT:    v_mov_b32_e32 v1, s5
358; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
359; SI-NEXT:    s_endpgm
360; SI-NEXT:  .LBB7_4:
361; SI-NEXT:    ; implicit-def: $sgpr4_sgpr5
362; SI-NEXT:    s_branch .LBB7_2
363;
364; VI-LABEL: ctpop_i64_in_br:
365; VI:       ; %bb.0: ; %entry
366; VI-NEXT:    s_load_dword s8, s[4:5], 0x3c
367; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
368; VI-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
369; VI-NEXT:    s_waitcnt lgkmcnt(0)
370; VI-NEXT:    s_cmp_lg_u32 s8, 0
371; VI-NEXT:    s_cbranch_scc0 .LBB7_4
372; VI-NEXT:  ; %bb.1: ; %else
373; VI-NEXT:    s_load_dwordx2 s[4:5], s[2:3], 0x8
374; VI-NEXT:    s_cbranch_execnz .LBB7_3
375; VI-NEXT:  .LBB7_2: ; %if
376; VI-NEXT:    s_waitcnt lgkmcnt(0)
377; VI-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
378; VI-NEXT:    s_mov_b32 s5, 0
379; VI-NEXT:  .LBB7_3: ; %endif
380; VI-NEXT:    s_waitcnt lgkmcnt(0)
381; VI-NEXT:    v_mov_b32_e32 v0, s4
382; VI-NEXT:    s_mov_b32 s3, 0xf000
383; VI-NEXT:    s_mov_b32 s2, -1
384; VI-NEXT:    v_mov_b32_e32 v1, s5
385; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
386; VI-NEXT:    s_endpgm
387; VI-NEXT:  .LBB7_4:
388; VI-NEXT:    ; implicit-def: $sgpr4_sgpr5
389; VI-NEXT:    s_branch .LBB7_2
390entry:
391  %tmp0 = icmp eq i32 %cond, 0
392  br i1 %tmp0, label %if, label %else
393
394if:
395  %tmp2 = call i64 @llvm.ctpop.i64(i64 %ctpop_arg)
396  br label %endif
397
398else:
399  %tmp3 = getelementptr i64, ptr addrspace(1) %in, i32 1
400  %tmp4 = load i64, ptr addrspace(1) %tmp3
401  br label %endif
402
403endif:
404  %tmp5 = phi i64 [%tmp2, %if], [%tmp4, %else]
405  store i64 %tmp5, ptr addrspace(1) %out
406  ret void
407}
408
; Uniform i128 ctpop: split into two s_bcnt1_i32_b64 (one per 64-bit half)
; combined with s_add_i32; only the truncated i32 count is stored.
409define amdgpu_kernel void @s_ctpop_i128(ptr addrspace(1) noalias %out, i128 %val) nounwind {
410; SI-LABEL: s_ctpop_i128:
411; SI:       ; %bb.0:
412; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0xb
413; SI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x9
414; SI-NEXT:    s_mov_b32 s7, 0xf000
415; SI-NEXT:    s_mov_b32 s6, -1
416; SI-NEXT:    s_waitcnt lgkmcnt(0)
417; SI-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
418; SI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
419; SI-NEXT:    s_add_i32 s0, s0, s2
420; SI-NEXT:    v_mov_b32_e32 v0, s0
421; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
422; SI-NEXT:    s_endpgm
423;
424; VI-LABEL: s_ctpop_i128:
425; VI:       ; %bb.0:
426; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
427; VI-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x24
428; VI-NEXT:    s_mov_b32 s7, 0xf000
429; VI-NEXT:    s_mov_b32 s6, -1
430; VI-NEXT:    s_waitcnt lgkmcnt(0)
431; VI-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
432; VI-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
433; VI-NEXT:    s_add_i32 s0, s0, s2
434; VI-NEXT:    v_mov_b32_e32 v0, s0
435; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
436; VI-NEXT:    s_endpgm
437  %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
438  %truncctpop = trunc i128 %ctpop to i32
439  store i32 %truncctpop, ptr addrspace(1) %out, align 4
440  ret void
441}
442
; Non-pow2 width: i65 ctpop counts the low 64 bits with s_bcnt1_i32_b64 and
; the top (byte-padded) word with a masked s_bcnt1_i32_b32 (s_and with 0xff
; clears the undefined padding bits), then sums the two counts.
443define amdgpu_kernel void @s_ctpop_i65(ptr addrspace(1) noalias %out, i65 %val) nounwind {
444; SI-LABEL: s_ctpop_i65:
445; SI:       ; %bb.0:
446; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
447; SI-NEXT:    s_load_dword s8, s[4:5], 0xd
448; SI-NEXT:    s_mov_b32 s7, 0xf000
449; SI-NEXT:    s_mov_b32 s6, -1
450; SI-NEXT:    s_waitcnt lgkmcnt(0)
451; SI-NEXT:    s_mov_b32 s4, s0
452; SI-NEXT:    s_and_b32 s0, s8, 0xff
453; SI-NEXT:    s_mov_b32 s5, s1
454; SI-NEXT:    s_bcnt1_i32_b32 s0, s0
455; SI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
456; SI-NEXT:    s_add_i32 s0, s1, s0
457; SI-NEXT:    v_mov_b32_e32 v0, s0
458; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
459; SI-NEXT:    s_endpgm
460;
461; VI-LABEL: s_ctpop_i65:
462; VI:       ; %bb.0:
463; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
464; VI-NEXT:    s_load_dword s8, s[4:5], 0x34
465; VI-NEXT:    s_mov_b32 s7, 0xf000
466; VI-NEXT:    s_mov_b32 s6, -1
467; VI-NEXT:    s_waitcnt lgkmcnt(0)
468; VI-NEXT:    s_mov_b32 s4, s0
469; VI-NEXT:    s_and_b32 s0, s8, 0xff
470; VI-NEXT:    s_mov_b32 s5, s1
471; VI-NEXT:    s_bcnt1_i32_b32 s0, s0
472; VI-NEXT:    s_bcnt1_i32_b64 s1, s[2:3]
473; VI-NEXT:    s_add_i32 s0, s1, s0
474; VI-NEXT:    v_mov_b32_e32 v0, s0
475; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
476; VI-NEXT:    s_endpgm
477  %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
478  %truncctpop = trunc i65 %ctpop to i32
479  store i32 %truncctpop, ptr addrspace(1) %out, align 4
480  ret void
481}
482
483; FIXME: Should not have extra add
; Divergent i128 ctpop: four v_bcnt_u32_b32 accumulate into two partial
; sums (one per 64-bit half) which are then combined with a v_add — the
; FIXME above notes that final add is redundant and could be folded into
; the bcnt accumulation chain.
484define amdgpu_kernel void @v_ctpop_i128(ptr addrspace(1) noalias %out, ptr addrspace(1) noalias %in) nounwind {
485; SI-LABEL: v_ctpop_i128:
486; SI:       ; %bb.0:
487; SI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
488; SI-NEXT:    s_mov_b32 s7, 0xf000
489; SI-NEXT:    s_mov_b32 s10, 0
490; SI-NEXT:    s_mov_b32 s11, s7
491; SI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
492; SI-NEXT:    s_waitcnt lgkmcnt(0)
493; SI-NEXT:    s_mov_b64 s[8:9], s[2:3]
494; SI-NEXT:    v_mov_b32_e32 v1, 0
495; SI-NEXT:    buffer_load_dwordx4 v[0:3], v[0:1], s[8:11], 0 addr64
496; SI-NEXT:    s_mov_b32 s6, -1
497; SI-NEXT:    s_mov_b32 s4, s0
498; SI-NEXT:    s_mov_b32 s5, s1
499; SI-NEXT:    s_waitcnt vmcnt(0)
500; SI-NEXT:    v_bcnt_u32_b32_e64 v2, v2, 0
501; SI-NEXT:    v_bcnt_u32_b32_e64 v0, v0, 0
502; SI-NEXT:    v_bcnt_u32_b32_e32 v2, v3, v2
503; SI-NEXT:    v_bcnt_u32_b32_e32 v0, v1, v0
504; SI-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
505; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
506; SI-NEXT:    s_endpgm
507;
508; VI-LABEL: v_ctpop_i128:
509; VI:       ; %bb.0:
510; VI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
511; VI-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
512; VI-NEXT:    s_waitcnt lgkmcnt(0)
513; VI-NEXT:    v_mov_b32_e32 v1, s3
514; VI-NEXT:    v_add_u32_e32 v0, vcc, s2, v0
515; VI-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
516; VI-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
517; VI-NEXT:    s_mov_b32 s3, 0xf000
518; VI-NEXT:    s_mov_b32 s2, -1
519; VI-NEXT:    s_waitcnt vmcnt(0)
520; VI-NEXT:    v_bcnt_u32_b32 v2, v2, 0
521; VI-NEXT:    v_bcnt_u32_b32 v0, v0, 0
522; VI-NEXT:    v_bcnt_u32_b32 v2, v3, v2
523; VI-NEXT:    v_bcnt_u32_b32 v0, v1, v0
524; VI-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
525; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
526; VI-NEXT:    s_endpgm
527  %tid = call i32 @llvm.amdgcn.workitem.id.x()
528  %in.gep = getelementptr i128, ptr addrspace(1) %in, i32 %tid
529  %val = load i128, ptr addrspace(1) %in.gep, align 8
530  %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
531  %truncctpop = trunc i128 %ctpop to i32
532  store i32 %truncctpop, ptr addrspace(1) %out, align 4
533  ret void
534}
535