; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64
; RUN: llc -mtriple=riscv32 -mattr=+v,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvbb -verify-machineinstrs < %s | FileCheck %s --check-prefixes=ZVBB

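; Without Zvbb, ctpop is expanded to the usual SWAR bit-counting sequence
; (count bit pairs, then nibbles, then combine; SEW >= 16 sums the nibble
; counts with a multiply by 0x01...01 and a shift). With Zvbb, each case
; selects a single per-element vcpop.v.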
define void @ctpop_v16i8(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    li a1, 85
; CHECK-NEXT:    vsrl.vi v9, v8, 1
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    li a1, 51
; CHECK-NEXT:    vsub.vv v8, v8, v9
; CHECK-NEXT:    vand.vx v9, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    vand.vi v8, v8, 15
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v16i8:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; ZVBB-NEXT:    vle8.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vse8.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <16 x i8>, ptr %x
  %b = load <16 x i8>, ptr %y
  %c = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %a)
  store <16 x i8> %c, ptr %x
  ret void
}
declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)

define void @ctpop_v8i16(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    lui a1, 5
; CHECK-NEXT:    addi a1, a1, 1365
; CHECK-NEXT:    vsrl.vi v9, v8, 1
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    lui a1, 3
; CHECK-NEXT:    addi a1, a1, 819
; CHECK-NEXT:    vsub.vv v8, v8, v9
; CHECK-NEXT:    vand.vx v9, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    lui a1, 1
; CHECK-NEXT:    addi a1, a1, -241
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    li a1, 257
; CHECK-NEXT:    vmul.vx v8, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 8
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v8i16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; ZVBB-NEXT:    vle16.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vse16.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <8 x i16>, ptr %x
  %b = load <8 x i16>, ptr %y
  %c = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %a)
  store <8 x i16> %c, ptr %x
  ret void
}
declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)

define void @ctpop_v4i32(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a1, 349525
; CHECK-NEXT:    addi a1, a1, 1365
; CHECK-NEXT:    vsrl.vi v9, v8, 1
; CHECK-NEXT:    vand.vx v9, v9, a1
; CHECK-NEXT:    lui a1, 209715
; CHECK-NEXT:    addi a1, a1, 819
; CHECK-NEXT:    vsub.vv v8, v8, v9
; CHECK-NEXT:    vand.vx v9, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    lui a1, 61681
; CHECK-NEXT:    addi a1, a1, -241
; CHECK-NEXT:    vadd.vv v8, v9, v8
; CHECK-NEXT:    vsrl.vi v9, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    lui a1, 4112
; CHECK-NEXT:    addi a1, a1, 257
; CHECK-NEXT:    vmul.vx v8, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 24
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v4i32:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; ZVBB-NEXT:    vle32.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vse32.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <4 x i32>, ptr %x
  %b = load <4 x i32>, ptr %y
  %c = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %a)
  store <4 x i32> %c, ptr %x
  ret void
}
declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)

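; For i64 elements, RV32 cannot hold the 64-bit masks in a scalar register,
; so they are splatted through e32 vmv.v.x (note the vsetivli toggles between
; e64 and e32) and the .vv instruction forms are used. RV64 materializes the
; masks with lui/addiw/slli/add and keeps the .vx forms.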
define void @ctpop_v2i64(ptr %x, ptr %y) {
; RV32-LABEL: ctpop_v2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    lui a1, 349525
; RV32-NEXT:    addi a1, a1, 1365
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.x v9, a1
; RV32-NEXT:    lui a1, 209715
; RV32-NEXT:    addi a1, a1, 819
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vi v10, v8, 1
; RV32-NEXT:    vand.vv v9, v10, v9
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.x v10, a1
; RV32-NEXT:    lui a1, 61681
; RV32-NEXT:    addi a1, a1, -241
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vsub.vv v8, v8, v9
; RV32-NEXT:    vand.vv v9, v8, v10
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vv v8, v8, v10
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.x v10, a1
; RV32-NEXT:    lui a1, 4112
; RV32-NEXT:    addi a1, a1, 257
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vadd.vv v8, v9, v8
; RV32-NEXT:    vsrl.vi v9, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v9
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vmv.v.x v9, a1
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vand.vv v8, v8, v10
; RV32-NEXT:    vmul.vv v8, v8, v9
; RV32-NEXT:    li a1, 56
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vse64.v v8, (a0)
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_v2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    lui a1, 349525
; RV64-NEXT:    lui a2, 209715
; RV64-NEXT:    lui a3, 61681
; RV64-NEXT:    lui a4, 4112
; RV64-NEXT:    addiw a1, a1, 1365
; RV64-NEXT:    addiw a2, a2, 819
; RV64-NEXT:    addiw a3, a3, -241
; RV64-NEXT:    addiw a4, a4, 257
; RV64-NEXT:    slli a5, a1, 32
; RV64-NEXT:    add a1, a1, a5
; RV64-NEXT:    slli a5, a2, 32
; RV64-NEXT:    add a2, a2, a5
; RV64-NEXT:    slli a5, a3, 32
; RV64-NEXT:    add a3, a3, a5
; RV64-NEXT:    slli a5, a4, 32
; RV64-NEXT:    add a4, a4, a5
; RV64-NEXT:    vsrl.vi v9, v8, 1
; RV64-NEXT:    vand.vx v9, v9, a1
; RV64-NEXT:    vsub.vv v8, v8, v9
; RV64-NEXT:    vand.vx v9, v8, a2
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a2
; RV64-NEXT:    vadd.vv v8, v9, v8
; RV64-NEXT:    vsrl.vi v9, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v9
; RV64-NEXT:    vand.vx v8, v8, a3
; RV64-NEXT:    vmul.vx v8, v8, a4
; RV64-NEXT:    li a1, 56
; RV64-NEXT:    vsrl.vx v8, v8, a1
; RV64-NEXT:    vse64.v v8, (a0)
; RV64-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v2i64:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; ZVBB-NEXT:    vle64.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vse64.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <2 x i64>, ptr %x
  %b = load <2 x i64>, ptr %y
  %c = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %a)
  store <2 x i64> %c, ptr %x
  ret void
}
declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)

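; The same expansions at LMUL=2 (vector operands spanning two registers).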
define void @ctpop_v32i8(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    li a1, 85
; CHECK-NEXT:    vsrl.vi v10, v8, 1
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    li a1, 51
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vand.vx v10, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    vadd.vv v8, v10, v8
; CHECK-NEXT:    vsrl.vi v10, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v10
; CHECK-NEXT:    vand.vi v8, v8, 15
; CHECK-NEXT:    vse8.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v32i8:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    li a1, 32
; ZVBB-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
; ZVBB-NEXT:    vle8.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vse8.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <32 x i8>, ptr %x
  %b = load <32 x i8>, ptr %y
  %c = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %a)
  store <32 x i8> %c, ptr %x
  ret void
}
declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)

define void @ctpop_v16i16(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    lui a1, 5
; CHECK-NEXT:    addi a1, a1, 1365
; CHECK-NEXT:    vsrl.vi v10, v8, 1
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    lui a1, 3
; CHECK-NEXT:    addi a1, a1, 819
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vand.vx v10, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    lui a1, 1
; CHECK-NEXT:    addi a1, a1, -241
; CHECK-NEXT:    vadd.vv v8, v10, v8
; CHECK-NEXT:    vsrl.vi v10, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v10
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    li a1, 257
; CHECK-NEXT:    vmul.vx v8, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 8
; CHECK-NEXT:    vse16.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v16i16:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; ZVBB-NEXT:    vle16.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vse16.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <16 x i16>, ptr %x
  %b = load <16 x i16>, ptr %y
  %c = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %a)
  store <16 x i16> %c, ptr %x
  ret void
}
declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)

define void @ctpop_v8i32(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lui a1, 349525
; CHECK-NEXT:    addi a1, a1, 1365
; CHECK-NEXT:    vsrl.vi v10, v8, 1
; CHECK-NEXT:    vand.vx v10, v10, a1
; CHECK-NEXT:    lui a1, 209715
; CHECK-NEXT:    addi a1, a1, 819
; CHECK-NEXT:    vsub.vv v8, v8, v10
; CHECK-NEXT:    vand.vx v10, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 2
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    lui a1, 61681
; CHECK-NEXT:    addi a1, a1, -241
; CHECK-NEXT:    vadd.vv v8, v10, v8
; CHECK-NEXT:    vsrl.vi v10, v8, 4
; CHECK-NEXT:    vadd.vv v8, v8, v10
; CHECK-NEXT:    vand.vx v8, v8, a1
; CHECK-NEXT:    lui a1, 4112
; CHECK-NEXT:    addi a1, a1, 257
; CHECK-NEXT:    vmul.vx v8, v8, a1
; CHECK-NEXT:    vsrl.vi v8, v8, 24
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v8i32:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; ZVBB-NEXT:    vle32.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vse32.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <8 x i32>, ptr %x
  %b = load <8 x i32>, ptr %y
  %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
  store <8 x i32> %c, ptr %x
  ret void
}
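; Comparing ctpop against small constants is folded so the full popcount is
; never computed:
;   ctpop(x) u< 2  ->  (x & (x - 1)) == 0
;   ctpop(x) u> 1  ->  (x & (x - 1)) != 0
;   ctpop(x) == 1  ->  (x ^ (x - 1)) u> (x - 1)
;   ctpop(x) != 1  ->  (x ^ (x - 1)) u<= (x - 1)
; With Zvbb, the compare is instead done directly on the vcpop.v result.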
define <8 x i1> @ctpop_v8i32_ult_two(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v8i32_ult_two:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vadd.vi v10, v8, -1
; CHECK-NEXT:    vand.vv v8, v8, v10
; CHECK-NEXT:    vmseq.vi v0, v8, 0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v8i32_ult_two:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; ZVBB-NEXT:    vle32.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vmsleu.vi v0, v8, 1
; ZVBB-NEXT:    ret
  %a = load <8 x i32>, ptr %x
  %b = load <8 x i32>, ptr %y
  %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
  %cmp = icmp ult <8 x i32> %c, <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  ret <8 x i1> %cmp
}
define <8 x i1> @ctpop_v8i32_ugt_one(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v8i32_ugt_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vadd.vi v10, v8, -1
; CHECK-NEXT:    vand.vv v8, v8, v10
; CHECK-NEXT:    vmsne.vi v0, v8, 0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v8i32_ugt_one:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; ZVBB-NEXT:    vle32.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vmsgtu.vi v0, v8, 1
; ZVBB-NEXT:    ret
  %a = load <8 x i32>, ptr %x
  %b = load <8 x i32>, ptr %y
  %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
  %cmp = icmp ugt <8 x i32> %c, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i1> %cmp
}
define <8 x i1> @ctpop_v8i32_eq_one(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v8i32_eq_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vadd.vi v10, v8, -1
; CHECK-NEXT:    vxor.vv v8, v8, v10
; CHECK-NEXT:    vmsltu.vv v0, v10, v8
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v8i32_eq_one:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; ZVBB-NEXT:    vle32.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vmseq.vi v0, v8, 1
; ZVBB-NEXT:    ret
  %a = load <8 x i32>, ptr %x
  %b = load <8 x i32>, ptr %y
  %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
  %cmp = icmp eq <8 x i32> %c, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i1> %cmp
}
define <8 x i1> @ctpop_v8i32_ne_one(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v8i32_ne_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vadd.vi v10, v8, -1
; CHECK-NEXT:    vxor.vv v8, v8, v10
; CHECK-NEXT:    vmsleu.vv v0, v8, v10
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v8i32_ne_one:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; ZVBB-NEXT:    vle32.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vmsne.vi v0, v8, 1
; ZVBB-NEXT:    ret
  %a = load <8 x i32>, ptr %x
  %b = load <8 x i32>, ptr %y
  %c = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %a)
  %cmp = icmp ne <8 x i32> %c, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
  ret <8 x i1> %cmp
}
declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)

define void @ctpop_v4i64(ptr %x, ptr %y) {
; RV32-LABEL: ctpop_v4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    lui a1, 349525
; RV32-NEXT:    addi a1, a1, 1365
; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT:    vmv.v.x v10, a1
; RV32-NEXT:    lui a1, 209715
; RV32-NEXT:    addi a1, a1, 819
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vsrl.vi v12, v8, 1
; RV32-NEXT:    vand.vv v10, v12, v10
; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT:    vmv.v.x v12, a1
; RV32-NEXT:    lui a1, 61681
; RV32-NEXT:    addi a1, a1, -241
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vsub.vv v8, v8, v10
; RV32-NEXT:    vand.vv v10, v8, v12
; RV32-NEXT:    vsrl.vi v8, v8, 2
; RV32-NEXT:    vand.vv v8, v8, v12
; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT:    vmv.v.x v12, a1
; RV32-NEXT:    lui a1, 4112
; RV32-NEXT:    addi a1, a1, 257
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vadd.vv v8, v10, v8
; RV32-NEXT:    vsrl.vi v10, v8, 4
; RV32-NEXT:    vadd.vv v8, v8, v10
; RV32-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; RV32-NEXT:    vmv.v.x v10, a1
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vand.vv v8, v8, v12
; RV32-NEXT:    vmul.vv v8, v8, v10
; RV32-NEXT:    li a1, 56
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vse64.v v8, (a0)
; RV32-NEXT:    ret
;
; RV64-LABEL: ctpop_v4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    lui a1, 349525
; RV64-NEXT:    lui a2, 209715
; RV64-NEXT:    lui a3, 61681
; RV64-NEXT:    lui a4, 4112
; RV64-NEXT:    addiw a1, a1, 1365
; RV64-NEXT:    addiw a2, a2, 819
; RV64-NEXT:    addiw a3, a3, -241
; RV64-NEXT:    addiw a4, a4, 257
; RV64-NEXT:    slli a5, a1, 32
; RV64-NEXT:    add a1, a1, a5
; RV64-NEXT:    slli a5, a2, 32
; RV64-NEXT:    add a2, a2, a5
; RV64-NEXT:    slli a5, a3, 32
; RV64-NEXT:    add a3, a3, a5
; RV64-NEXT:    slli a5, a4, 32
; RV64-NEXT:    add a4, a4, a5
; RV64-NEXT:    vsrl.vi v10, v8, 1
; RV64-NEXT:    vand.vx v10, v10, a1
; RV64-NEXT:    vsub.vv v8, v8, v10
; RV64-NEXT:    vand.vx v10, v8, a2
; RV64-NEXT:    vsrl.vi v8, v8, 2
; RV64-NEXT:    vand.vx v8, v8, a2
; RV64-NEXT:    vadd.vv v8, v10, v8
; RV64-NEXT:    vsrl.vi v10, v8, 4
; RV64-NEXT:    vadd.vv v8, v8, v10
; RV64-NEXT:    vand.vx v8, v8, a3
; RV64-NEXT:    vmul.vx v8, v8, a4
; RV64-NEXT:    li a1, 56
; RV64-NEXT:    vsrl.vx v8, v8, a1
; RV64-NEXT:    vse64.v v8, (a0)
; RV64-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v4i64:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; ZVBB-NEXT:    vle64.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vse64.v v8, (a0)
; ZVBB-NEXT:    ret
  %a = load <4 x i64>, ptr %x
  %b = load <4 x i64>, ptr %y
  %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
  store <4 x i64> %c, ptr %x
  ret void
}
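; The same ctpop-compare folds, repeated for <4 x i64>.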
define <4 x i1> @ctpop_v4i64_ult_two(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v4i64_ult_two:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vadd.vi v10, v8, -1
; CHECK-NEXT:    vand.vv v8, v8, v10
; CHECK-NEXT:    vmseq.vi v0, v8, 0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v4i64_ult_two:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; ZVBB-NEXT:    vle64.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vmsleu.vi v0, v8, 1
; ZVBB-NEXT:    ret
  %a = load <4 x i64>, ptr %x
  %b = load <4 x i64>, ptr %y
  %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
  %cmp = icmp ult <4 x i64> %c, <i64 2, i64 2, i64 2, i64 2>
  ret <4 x i1> %cmp
}
define <4 x i1> @ctpop_v4i64_ugt_one(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v4i64_ugt_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vadd.vi v10, v8, -1
; CHECK-NEXT:    vand.vv v8, v8, v10
; CHECK-NEXT:    vmsne.vi v0, v8, 0
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v4i64_ugt_one:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; ZVBB-NEXT:    vle64.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vmsgtu.vi v0, v8, 1
; ZVBB-NEXT:    ret
  %a = load <4 x i64>, ptr %x
  %b = load <4 x i64>, ptr %y
  %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
  %cmp = icmp ugt <4 x i64> %c, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i1> %cmp
}
define <4 x i1> @ctpop_v4i64_eq_one(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v4i64_eq_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vadd.vi v10, v8, -1
; CHECK-NEXT:    vxor.vv v8, v8, v10
; CHECK-NEXT:    vmsltu.vv v0, v10, v8
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v4i64_eq_one:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; ZVBB-NEXT:    vle64.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vmseq.vi v0, v8, 1
; ZVBB-NEXT:    ret
  %a = load <4 x i64>, ptr %x
  %b = load <4 x i64>, ptr %y
  %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
  %cmp = icmp eq <4 x i64> %c, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i1> %cmp
}
define <4 x i1> @ctpop_v4i64_ne_one(ptr %x, ptr %y) {
; CHECK-LABEL: ctpop_v4i64_ne_one:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    vadd.vi v10, v8, -1
; CHECK-NEXT:    vxor.vv v8, v8, v10
; CHECK-NEXT:    vmsleu.vv v0, v8, v10
; CHECK-NEXT:    ret
;
; ZVBB-LABEL: ctpop_v4i64_ne_one:
; ZVBB:       # %bb.0:
; ZVBB-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; ZVBB-NEXT:    vle64.v v8, (a0)
; ZVBB-NEXT:    vcpop.v v8, v8
; ZVBB-NEXT:    vmsne.vi v0, v8, 1
; ZVBB-NEXT:    ret
  %a = load <4 x i64>, ptr %x
  %b = load <4 x i64>, ptr %y
  %c = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %a)
  %cmp = icmp ne <4 x i64> %c, <i64 1, i64 1, i64 1, i64 1>
  ret <4 x i1> %cmp
}
declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)