xref: /llvm-project/llvm/test/CodeGen/AArch64/sve-fixed-length-bit-counting.ll (revision db158c7c830807caeeb0691739c41f1d522029e9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -aarch64-sve-vector-bits-min=256  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
3; RUN: llc -aarch64-sve-vector-bits-min=512  < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
4; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
5
6target triple = "aarch64-unknown-linux-gnu"
7
8;
9; Count leading zeros (CLZ)
10;
11
12; Don't use SVE for 64-bit vectors.
; ctlz on a <8 x i8> value passed and returned in registers. The CHECK lines
; expect a single NEON `clz v0.8b` and no SVE instructions.
13define <8 x i8> @ctlz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
14; CHECK-LABEL: ctlz_v8i8:
15; CHECK:       // %bb.0:
16; CHECK-NEXT:    clz v0.8b, v0.8b
17; CHECK-NEXT:    ret
18  %res = call <8 x i8> @llvm.ctlz.v8i8(<8 x i8> %op)
19  ret <8 x i8> %res
20}
21
22; Don't use SVE for 128-bit vectors.
; ctlz on a <16 x i8> value passed and returned in registers. The CHECK lines
; expect a single NEON `clz v0.16b` and no SVE instructions.
23define <16 x i8> @ctlz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
24; CHECK-LABEL: ctlz_v16i8:
25; CHECK:       // %bb.0:
26; CHECK-NEXT:    clz v0.16b, v0.16b
27; CHECK-NEXT:    ret
28  %res = call <16 x i8> @llvm.ctlz.v16i8(<16 x i8> %op)
29  ret <16 x i8> %res
30}
31
; Loads <32 x i8> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.b, vl32` predicate.
32define void @ctlz_v32i8(ptr %a) vscale_range(2,0) #0 {
33; CHECK-LABEL: ctlz_v32i8:
34; CHECK:       // %bb.0:
35; CHECK-NEXT:    ptrue p0.b, vl32
36; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
37; CHECK-NEXT:    clz z0.b, p0/m, z0.b
38; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
39; CHECK-NEXT:    ret
40  %op = load <32 x i8>, ptr %a
41  %res = call <32 x i8> @llvm.ctlz.v32i8(<32 x i8> %op)
42  store <32 x i8> %res, ptr %a
43  ret void
44}
45
; Loads <64 x i8> from %a, applies ctlz and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl32 halves, VBITS_GE_512 one vl64 operation.
46define void @ctlz_v64i8(ptr %a) #0 {
47; VBITS_GE_256-LABEL: ctlz_v64i8:
48; VBITS_GE_256:       // %bb.0:
49; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
50; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
51; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
52; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
53; VBITS_GE_256-NEXT:    clz z0.b, p0/m, z0.b
54; VBITS_GE_256-NEXT:    clz z1.b, p0/m, z1.b
55; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
56; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
57; VBITS_GE_256-NEXT:    ret
58;
59; VBITS_GE_512-LABEL: ctlz_v64i8:
60; VBITS_GE_512:       // %bb.0:
61; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
62; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
63; VBITS_GE_512-NEXT:    clz z0.b, p0/m, z0.b
64; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
65; VBITS_GE_512-NEXT:    ret
66  %op = load <64 x i8>, ptr %a
67  %res = call <64 x i8> @llvm.ctlz.v64i8(<64 x i8> %op)
68  store <64 x i8> %res, ptr %a
69  ret void
70}
71
; Loads <128 x i8> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.b, vl128` predicate.
72define void @ctlz_v128i8(ptr %a) vscale_range(8,0) #0 {
73; CHECK-LABEL: ctlz_v128i8:
74; CHECK:       // %bb.0:
75; CHECK-NEXT:    ptrue p0.b, vl128
76; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
77; CHECK-NEXT:    clz z0.b, p0/m, z0.b
78; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
79; CHECK-NEXT:    ret
80  %op = load <128 x i8>, ptr %a
81  %res = call <128 x i8> @llvm.ctlz.v128i8(<128 x i8> %op)
82  store <128 x i8> %res, ptr %a
83  ret void
84}
85
; Loads <256 x i8> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.b, vl256` predicate.
86define void @ctlz_v256i8(ptr %a) vscale_range(16,0) #0 {
87; CHECK-LABEL: ctlz_v256i8:
88; CHECK:       // %bb.0:
89; CHECK-NEXT:    ptrue p0.b, vl256
90; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
91; CHECK-NEXT:    clz z0.b, p0/m, z0.b
92; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
93; CHECK-NEXT:    ret
94  %op = load <256 x i8>, ptr %a
95  %res = call <256 x i8> @llvm.ctlz.v256i8(<256 x i8> %op)
96  store <256 x i8> %res, ptr %a
97  ret void
98}
99
100; Don't use SVE for 64-bit vectors.
; ctlz on a <4 x i16> value passed and returned in registers. The CHECK lines
; expect a single NEON `clz v0.4h` and no SVE instructions.
101define <4 x i16> @ctlz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
102; CHECK-LABEL: ctlz_v4i16:
103; CHECK:       // %bb.0:
104; CHECK-NEXT:    clz v0.4h, v0.4h
105; CHECK-NEXT:    ret
106  %res = call <4 x i16> @llvm.ctlz.v4i16(<4 x i16> %op)
107  ret <4 x i16> %res
108}
109
110; Don't use SVE for 128-bit vectors.
; ctlz on a <8 x i16> value passed and returned in registers. The CHECK lines
; expect a single NEON `clz v0.8h` and no SVE instructions.
111define <8 x i16> @ctlz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
112; CHECK-LABEL: ctlz_v8i16:
113; CHECK:       // %bb.0:
114; CHECK-NEXT:    clz v0.8h, v0.8h
115; CHECK-NEXT:    ret
116  %res = call <8 x i16> @llvm.ctlz.v8i16(<8 x i16> %op)
117  ret <8 x i16> %res
118}
119
; Loads <16 x i16> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.h, vl16` predicate.
120define void @ctlz_v16i16(ptr %a) vscale_range(2,0) #0 {
121; CHECK-LABEL: ctlz_v16i16:
122; CHECK:       // %bb.0:
123; CHECK-NEXT:    ptrue p0.h, vl16
124; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
125; CHECK-NEXT:    clz z0.h, p0/m, z0.h
126; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
127; CHECK-NEXT:    ret
128  %op = load <16 x i16>, ptr %a
129  %res = call <16 x i16> @llvm.ctlz.v16i16(<16 x i16> %op)
130  store <16 x i16> %res, ptr %a
131  ret void
132}
133
; Loads <32 x i16> from %a, applies ctlz and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl16 halves, VBITS_GE_512 one vl32 operation.
134define void @ctlz_v32i16(ptr %a) #0 {
135; VBITS_GE_256-LABEL: ctlz_v32i16:
136; VBITS_GE_256:       // %bb.0:
137; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
138; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
139; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
140; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
141; VBITS_GE_256-NEXT:    clz z0.h, p0/m, z0.h
142; VBITS_GE_256-NEXT:    clz z1.h, p0/m, z1.h
143; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
144; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
145; VBITS_GE_256-NEXT:    ret
146;
147; VBITS_GE_512-LABEL: ctlz_v32i16:
148; VBITS_GE_512:       // %bb.0:
149; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
150; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
151; VBITS_GE_512-NEXT:    clz z0.h, p0/m, z0.h
152; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
153; VBITS_GE_512-NEXT:    ret
154  %op = load <32 x i16>, ptr %a
155  %res = call <32 x i16> @llvm.ctlz.v32i16(<32 x i16> %op)
156  store <32 x i16> %res, ptr %a
157  ret void
158}
159
; Loads <64 x i16> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.h, vl64` predicate.
160define void @ctlz_v64i16(ptr %a) vscale_range(8,0) #0 {
161; CHECK-LABEL: ctlz_v64i16:
162; CHECK:       // %bb.0:
163; CHECK-NEXT:    ptrue p0.h, vl64
164; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
165; CHECK-NEXT:    clz z0.h, p0/m, z0.h
166; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
167; CHECK-NEXT:    ret
168  %op = load <64 x i16>, ptr %a
169  %res = call <64 x i16> @llvm.ctlz.v64i16(<64 x i16> %op)
170  store <64 x i16> %res, ptr %a
171  ret void
172}
173
; Loads <128 x i16> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.h, vl128` predicate.
174define void @ctlz_v128i16(ptr %a) vscale_range(16,0) #0 {
175; CHECK-LABEL: ctlz_v128i16:
176; CHECK:       // %bb.0:
177; CHECK-NEXT:    ptrue p0.h, vl128
178; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
179; CHECK-NEXT:    clz z0.h, p0/m, z0.h
180; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
181; CHECK-NEXT:    ret
182  %op = load <128 x i16>, ptr %a
183  %res = call <128 x i16> @llvm.ctlz.v128i16(<128 x i16> %op)
184  store <128 x i16> %res, ptr %a
185  ret void
186}
187
188; Don't use SVE for 64-bit vectors.
; ctlz on a <2 x i32> value passed and returned in registers. The CHECK lines
; expect a single NEON `clz v0.2s` and no SVE instructions.
189define <2 x i32> @ctlz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
190; CHECK-LABEL: ctlz_v2i32:
191; CHECK:       // %bb.0:
192; CHECK-NEXT:    clz v0.2s, v0.2s
193; CHECK-NEXT:    ret
194  %res = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %op)
195  ret <2 x i32> %res
196}
197
198; Don't use SVE for 128-bit vectors.
; ctlz on a <4 x i32> value passed and returned in registers. The CHECK lines
; expect a single NEON `clz v0.4s` and no SVE instructions.
199define <4 x i32> @ctlz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
200; CHECK-LABEL: ctlz_v4i32:
201; CHECK:       // %bb.0:
202; CHECK-NEXT:    clz v0.4s, v0.4s
203; CHECK-NEXT:    ret
204  %res = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %op)
205  ret <4 x i32> %res
206}
207
; Loads <8 x i32> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.s, vl8` predicate.
208define void @ctlz_v8i32(ptr %a) vscale_range(2,0) #0 {
209; CHECK-LABEL: ctlz_v8i32:
210; CHECK:       // %bb.0:
211; CHECK-NEXT:    ptrue p0.s, vl8
212; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
213; CHECK-NEXT:    clz z0.s, p0/m, z0.s
214; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
215; CHECK-NEXT:    ret
216  %op = load <8 x i32>, ptr %a
217  %res = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %op)
218  store <8 x i32> %res, ptr %a
219  ret void
220}
221
; Loads <16 x i32> from %a, applies ctlz and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl8 halves, VBITS_GE_512 one vl16 operation.
222define void @ctlz_v16i32(ptr %a) #0 {
223; VBITS_GE_256-LABEL: ctlz_v16i32:
224; VBITS_GE_256:       // %bb.0:
225; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
226; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
227; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
228; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
229; VBITS_GE_256-NEXT:    clz z0.s, p0/m, z0.s
230; VBITS_GE_256-NEXT:    clz z1.s, p0/m, z1.s
231; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
232; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
233; VBITS_GE_256-NEXT:    ret
234;
235; VBITS_GE_512-LABEL: ctlz_v16i32:
236; VBITS_GE_512:       // %bb.0:
237; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
238; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
239; VBITS_GE_512-NEXT:    clz z0.s, p0/m, z0.s
240; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
241; VBITS_GE_512-NEXT:    ret
242  %op = load <16 x i32>, ptr %a
243  %res = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %op)
244  store <16 x i32> %res, ptr %a
245  ret void
246}
247
; Loads <32 x i32> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.s, vl32` predicate.
248define void @ctlz_v32i32(ptr %a) vscale_range(8,0) #0 {
249; CHECK-LABEL: ctlz_v32i32:
250; CHECK:       // %bb.0:
251; CHECK-NEXT:    ptrue p0.s, vl32
252; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
253; CHECK-NEXT:    clz z0.s, p0/m, z0.s
254; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
255; CHECK-NEXT:    ret
256  %op = load <32 x i32>, ptr %a
257  %res = call <32 x i32> @llvm.ctlz.v32i32(<32 x i32> %op)
258  store <32 x i32> %res, ptr %a
259  ret void
260}
261
; Loads <64 x i32> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.s, vl64` predicate.
262define void @ctlz_v64i32(ptr %a)  vscale_range(16,0) #0 {
263; CHECK-LABEL: ctlz_v64i32:
264; CHECK:       // %bb.0:
265; CHECK-NEXT:    ptrue p0.s, vl64
266; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
267; CHECK-NEXT:    clz z0.s, p0/m, z0.s
268; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
269; CHECK-NEXT:    ret
270  %op = load <64 x i32>, ptr %a
271  %res = call <64 x i32> @llvm.ctlz.v64i32(<64 x i32> %op)
272  store <64 x i32> %res, ptr %a
273  ret void
274}
275
; ctlz on a <1 x i64> value passed and returned in registers. Unlike the
; narrower-element 64-bit cases in this file, the CHECK lines expect the
; predicated SVE `clz z0.d` under `ptrue p0.d, vl1`.
; NOTE(review): presumably because NEON has no 64-bit-element CLZ - confirm.
276define <1 x i64> @ctlz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
277; CHECK-LABEL: ctlz_v1i64:
278; CHECK:       // %bb.0:
279; CHECK-NEXT:    ptrue p0.d, vl1
280; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
281; CHECK-NEXT:    clz z0.d, p0/m, z0.d
282; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
283; CHECK-NEXT:    ret
284  %res = call <1 x i64> @llvm.ctlz.v1i64(<1 x i64> %op)
285  ret <1 x i64> %res
286}
287
; ctlz on a <2 x i64> value passed and returned in registers. Unlike the
; narrower-element 128-bit cases in this file, the CHECK lines expect the
; predicated SVE `clz z0.d` under `ptrue p0.d, vl2`.
; NOTE(review): presumably because NEON has no 64-bit-element CLZ - confirm.
288define <2 x i64> @ctlz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
289; CHECK-LABEL: ctlz_v2i64:
290; CHECK:       // %bb.0:
291; CHECK-NEXT:    ptrue p0.d, vl2
292; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
293; CHECK-NEXT:    clz z0.d, p0/m, z0.d
294; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
295; CHECK-NEXT:    ret
296  %res = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %op)
297  ret <2 x i64> %res
298}
299
; Loads <4 x i64> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.d, vl4` predicate.
300define void @ctlz_v4i64(ptr %a) vscale_range(2,0) #0 {
301; CHECK-LABEL: ctlz_v4i64:
302; CHECK:       // %bb.0:
303; CHECK-NEXT:    ptrue p0.d, vl4
304; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
305; CHECK-NEXT:    clz z0.d, p0/m, z0.d
306; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
307; CHECK-NEXT:    ret
308  %op = load <4 x i64>, ptr %a
309  %res = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %op)
310  store <4 x i64> %res, ptr %a
311  ret void
312}
313
; Loads <8 x i64> from %a, applies ctlz and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl4 halves, VBITS_GE_512 one vl8 operation.
314define void @ctlz_v8i64(ptr %a) #0 {
315; VBITS_GE_256-LABEL: ctlz_v8i64:
316; VBITS_GE_256:       // %bb.0:
317; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
318; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
319; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
320; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
321; VBITS_GE_256-NEXT:    clz z0.d, p0/m, z0.d
322; VBITS_GE_256-NEXT:    clz z1.d, p0/m, z1.d
323; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
324; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
325; VBITS_GE_256-NEXT:    ret
326;
327; VBITS_GE_512-LABEL: ctlz_v8i64:
328; VBITS_GE_512:       // %bb.0:
329; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
330; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
331; VBITS_GE_512-NEXT:    clz z0.d, p0/m, z0.d
332; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
333; VBITS_GE_512-NEXT:    ret
334  %op = load <8 x i64>, ptr %a
335  %res = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %op)
336  store <8 x i64> %res, ptr %a
337  ret void
338}
339
; Loads <16 x i64> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.d, vl16` predicate.
340define void @ctlz_v16i64(ptr %a) vscale_range(8,0) #0 {
341; CHECK-LABEL: ctlz_v16i64:
342; CHECK:       // %bb.0:
343; CHECK-NEXT:    ptrue p0.d, vl16
344; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
345; CHECK-NEXT:    clz z0.d, p0/m, z0.d
346; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
347; CHECK-NEXT:    ret
348  %op = load <16 x i64>, ptr %a
349  %res = call <16 x i64> @llvm.ctlz.v16i64(<16 x i64> %op)
350  store <16 x i64> %res, ptr %a
351  ret void
352}
353
; Loads <32 x i64> from %a, applies ctlz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `clz` under
; a `ptrue p0.d, vl32` predicate.
354define void @ctlz_v32i64(ptr %a) vscale_range(16,0) #0 {
355; CHECK-LABEL: ctlz_v32i64:
356; CHECK:       // %bb.0:
357; CHECK-NEXT:    ptrue p0.d, vl32
358; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
359; CHECK-NEXT:    clz z0.d, p0/m, z0.d
360; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
361; CHECK-NEXT:    ret
362  %op = load <32 x i64>, ptr %a
363  %res = call <32 x i64> @llvm.ctlz.v32i64(<32 x i64> %op)
364  store <32 x i64> %res, ptr %a
365  ret void
366}
367
368;
369; Population count (CNT)
370;
371
372; Don't use SVE for 64-bit vectors.
; ctpop on a <8 x i8> value passed and returned in registers. The CHECK lines
; expect a single NEON `cnt v0.8b` and no SVE instructions.
373define <8 x i8> @ctpop_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
374; CHECK-LABEL: ctpop_v8i8:
375; CHECK:       // %bb.0:
376; CHECK-NEXT:    cnt v0.8b, v0.8b
377; CHECK-NEXT:    ret
378  %res = call <8 x i8> @llvm.ctpop.v8i8(<8 x i8> %op)
379  ret <8 x i8> %res
380}
381
382; Don't use SVE for 128-bit vectors.
; ctpop on a <16 x i8> value passed and returned in registers. The CHECK lines
; expect a single NEON `cnt v0.16b` and no SVE instructions.
383define <16 x i8> @ctpop_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
384; CHECK-LABEL: ctpop_v16i8:
385; CHECK:       // %bb.0:
386; CHECK-NEXT:    cnt v0.16b, v0.16b
387; CHECK-NEXT:    ret
388  %res = call <16 x i8> @llvm.ctpop.v16i8(<16 x i8> %op)
389  ret <16 x i8> %res
390}
391
; Loads <32 x i8> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.b, vl32` predicate.
392define void @ctpop_v32i8(ptr %a) vscale_range(2,0) #0 {
393; CHECK-LABEL: ctpop_v32i8:
394; CHECK:       // %bb.0:
395; CHECK-NEXT:    ptrue p0.b, vl32
396; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
397; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
398; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
399; CHECK-NEXT:    ret
400  %op = load <32 x i8>, ptr %a
401  %res = call <32 x i8> @llvm.ctpop.v32i8(<32 x i8> %op)
402  store <32 x i8> %res, ptr %a
403  ret void
404}
405
; Loads <64 x i8> from %a, applies ctpop and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl32 halves, VBITS_GE_512 one vl64 operation.
406define void @ctpop_v64i8(ptr %a) #0 {
407; VBITS_GE_256-LABEL: ctpop_v64i8:
408; VBITS_GE_256:       // %bb.0:
409; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
410; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
411; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
412; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
413; VBITS_GE_256-NEXT:    cnt z0.b, p0/m, z0.b
414; VBITS_GE_256-NEXT:    cnt z1.b, p0/m, z1.b
415; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
416; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
417; VBITS_GE_256-NEXT:    ret
418;
419; VBITS_GE_512-LABEL: ctpop_v64i8:
420; VBITS_GE_512:       // %bb.0:
421; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
422; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
423; VBITS_GE_512-NEXT:    cnt z0.b, p0/m, z0.b
424; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
425; VBITS_GE_512-NEXT:    ret
426  %op = load <64 x i8>, ptr %a
427  %res = call <64 x i8> @llvm.ctpop.v64i8(<64 x i8> %op)
428  store <64 x i8> %res, ptr %a
429  ret void
430}
431
; Loads <128 x i8> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.b, vl128` predicate.
432define void @ctpop_v128i8(ptr %a) vscale_range(8,0) #0 {
433; CHECK-LABEL: ctpop_v128i8:
434; CHECK:       // %bb.0:
435; CHECK-NEXT:    ptrue p0.b, vl128
436; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
437; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
438; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
439; CHECK-NEXT:    ret
440  %op = load <128 x i8>, ptr %a
441  %res = call <128 x i8> @llvm.ctpop.v128i8(<128 x i8> %op)
442  store <128 x i8> %res, ptr %a
443  ret void
444}
445
; Loads <256 x i8> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.b, vl256` predicate.
446define void @ctpop_v256i8(ptr %a) vscale_range(16,0) #0 {
447; CHECK-LABEL: ctpop_v256i8:
448; CHECK:       // %bb.0:
449; CHECK-NEXT:    ptrue p0.b, vl256
450; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
451; CHECK-NEXT:    cnt z0.b, p0/m, z0.b
452; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
453; CHECK-NEXT:    ret
454  %op = load <256 x i8>, ptr %a
455  %res = call <256 x i8> @llvm.ctpop.v256i8(<256 x i8> %op)
456  store <256 x i8> %res, ptr %a
457  ret void
458}
459
460; Don't use SVE for 64-bit vectors.
; ctpop on a <4 x i16> value passed and returned in registers. The CHECK lines
; expect NEON only: a byte-wise `cnt v0.8b` followed by one `uaddlp`
; (v0.8b -> v0.4h).
461define <4 x i16> @ctpop_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
462; CHECK-LABEL: ctpop_v4i16:
463; CHECK:       // %bb.0:
464; CHECK-NEXT:    cnt v0.8b, v0.8b
465; CHECK-NEXT:    uaddlp v0.4h, v0.8b
466; CHECK-NEXT:    ret
467  %res = call <4 x i16> @llvm.ctpop.v4i16(<4 x i16> %op)
468  ret <4 x i16> %res
469}
470
471; Don't use SVE for 128-bit vectors.
; ctpop on a <8 x i16> value passed and returned in registers. The CHECK lines
; expect NEON only: a byte-wise `cnt v0.16b` followed by one `uaddlp`
; (v0.16b -> v0.8h).
472define <8 x i16> @ctpop_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
473; CHECK-LABEL: ctpop_v8i16:
474; CHECK:       // %bb.0:
475; CHECK-NEXT:    cnt v0.16b, v0.16b
476; CHECK-NEXT:    uaddlp v0.8h, v0.16b
477; CHECK-NEXT:    ret
478  %res = call <8 x i16> @llvm.ctpop.v8i16(<8 x i16> %op)
479  ret <8 x i16> %res
480}
481
; Loads <16 x i16> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.h, vl16` predicate.
482define void @ctpop_v16i16(ptr %a) vscale_range(2,0) #0 {
483; CHECK-LABEL: ctpop_v16i16:
484; CHECK:       // %bb.0:
485; CHECK-NEXT:    ptrue p0.h, vl16
486; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
487; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
488; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
489; CHECK-NEXT:    ret
490  %op = load <16 x i16>, ptr %a
491  %res = call <16 x i16> @llvm.ctpop.v16i16(<16 x i16> %op)
492  store <16 x i16> %res, ptr %a
493  ret void
494}
495
; Loads <32 x i16> from %a, applies ctpop and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl16 halves, VBITS_GE_512 one vl32 operation.
496define void @ctpop_v32i16(ptr %a) #0 {
497; VBITS_GE_256-LABEL: ctpop_v32i16:
498; VBITS_GE_256:       // %bb.0:
499; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
500; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
501; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
502; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
503; VBITS_GE_256-NEXT:    cnt z0.h, p0/m, z0.h
504; VBITS_GE_256-NEXT:    cnt z1.h, p0/m, z1.h
505; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
506; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
507; VBITS_GE_256-NEXT:    ret
508;
509; VBITS_GE_512-LABEL: ctpop_v32i16:
510; VBITS_GE_512:       // %bb.0:
511; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
512; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
513; VBITS_GE_512-NEXT:    cnt z0.h, p0/m, z0.h
514; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
515; VBITS_GE_512-NEXT:    ret
516  %op = load <32 x i16>, ptr %a
517  %res = call <32 x i16> @llvm.ctpop.v32i16(<32 x i16> %op)
518  store <32 x i16> %res, ptr %a
519  ret void
520}
521
; Loads <64 x i16> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.h, vl64` predicate.
522define void @ctpop_v64i16(ptr %a) vscale_range(8,0) #0 {
523; CHECK-LABEL: ctpop_v64i16:
524; CHECK:       // %bb.0:
525; CHECK-NEXT:    ptrue p0.h, vl64
526; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
527; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
528; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
529; CHECK-NEXT:    ret
530  %op = load <64 x i16>, ptr %a
531  %res = call <64 x i16> @llvm.ctpop.v64i16(<64 x i16> %op)
532  store <64 x i16> %res, ptr %a
533  ret void
534}
535
; Loads <128 x i16> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.h, vl128` predicate.
536define void @ctpop_v128i16(ptr %a) vscale_range(16,0) #0 {
537; CHECK-LABEL: ctpop_v128i16:
538; CHECK:       // %bb.0:
539; CHECK-NEXT:    ptrue p0.h, vl128
540; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
541; CHECK-NEXT:    cnt z0.h, p0/m, z0.h
542; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
543; CHECK-NEXT:    ret
544  %op = load <128 x i16>, ptr %a
545  %res = call <128 x i16> @llvm.ctpop.v128i16(<128 x i16> %op)
546  store <128 x i16> %res, ptr %a
547  ret void
548}
549
550; Don't use SVE for 64-bit vectors.
; ctpop on a <2 x i32> value passed and returned in registers. The CHECK lines
; expect NEON only: a byte-wise `cnt v0.8b` followed by two `uaddlp` steps
; (v0.8b -> v0.4h -> v0.2s).
551define <2 x i32> @ctpop_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
552; CHECK-LABEL: ctpop_v2i32:
553; CHECK:       // %bb.0:
554; CHECK-NEXT:    cnt v0.8b, v0.8b
555; CHECK-NEXT:    uaddlp v0.4h, v0.8b
556; CHECK-NEXT:    uaddlp v0.2s, v0.4h
557; CHECK-NEXT:    ret
558  %res = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %op)
559  ret <2 x i32> %res
560}
561
562; Don't use SVE for 128-bit vectors.
; ctpop on a <4 x i32> value passed and returned in registers. The CHECK lines
; expect NEON only: a byte-wise `cnt v0.16b` followed by two `uaddlp` steps
; (v0.16b -> v0.8h -> v0.4s).
563define <4 x i32> @ctpop_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
564; CHECK-LABEL: ctpop_v4i32:
565; CHECK:       // %bb.0:
566; CHECK-NEXT:    cnt v0.16b, v0.16b
567; CHECK-NEXT:    uaddlp v0.8h, v0.16b
568; CHECK-NEXT:    uaddlp v0.4s, v0.8h
569; CHECK-NEXT:    ret
570  %res = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %op)
571  ret <4 x i32> %res
572}
573
; Loads <8 x i32> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.s, vl8` predicate.
574define void @ctpop_v8i32(ptr %a) vscale_range(2,0) #0 {
575; CHECK-LABEL: ctpop_v8i32:
576; CHECK:       // %bb.0:
577; CHECK-NEXT:    ptrue p0.s, vl8
578; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
579; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
580; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
581; CHECK-NEXT:    ret
582  %op = load <8 x i32>, ptr %a
583  %res = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %op)
584  store <8 x i32> %res, ptr %a
585  ret void
586}
587
; Loads <16 x i32> from %a, applies ctpop and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl8 halves, VBITS_GE_512 one vl16 operation.
588define void @ctpop_v16i32(ptr %a) #0 {
589; VBITS_GE_256-LABEL: ctpop_v16i32:
590; VBITS_GE_256:       // %bb.0:
591; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
592; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
593; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
594; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
595; VBITS_GE_256-NEXT:    cnt z0.s, p0/m, z0.s
596; VBITS_GE_256-NEXT:    cnt z1.s, p0/m, z1.s
597; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
598; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
599; VBITS_GE_256-NEXT:    ret
600;
601; VBITS_GE_512-LABEL: ctpop_v16i32:
602; VBITS_GE_512:       // %bb.0:
603; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
604; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
605; VBITS_GE_512-NEXT:    cnt z0.s, p0/m, z0.s
606; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
607; VBITS_GE_512-NEXT:    ret
608  %op = load <16 x i32>, ptr %a
609  %res = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %op)
610  store <16 x i32> %res, ptr %a
611  ret void
612}
613
; Loads <32 x i32> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.s, vl32` predicate.
614define void @ctpop_v32i32(ptr %a) vscale_range(8,0) #0 {
615; CHECK-LABEL: ctpop_v32i32:
616; CHECK:       // %bb.0:
617; CHECK-NEXT:    ptrue p0.s, vl32
618; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
619; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
620; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
621; CHECK-NEXT:    ret
622  %op = load <32 x i32>, ptr %a
623  %res = call <32 x i32> @llvm.ctpop.v32i32(<32 x i32> %op)
624  store <32 x i32> %res, ptr %a
625  ret void
626}
627
; Loads <64 x i32> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.s, vl64` predicate.
628define void @ctpop_v64i32(ptr %a) vscale_range(16,0) #0 {
629; CHECK-LABEL: ctpop_v64i32:
630; CHECK:       // %bb.0:
631; CHECK-NEXT:    ptrue p0.s, vl64
632; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
633; CHECK-NEXT:    cnt z0.s, p0/m, z0.s
634; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
635; CHECK-NEXT:    ret
636  %op = load <64 x i32>, ptr %a
637  %res = call <64 x i32> @llvm.ctpop.v64i32(<64 x i32> %op)
638  store <64 x i32> %res, ptr %a
639  ret void
640}
641
642; Don't use SVE for 64-bit vectors.
; ctpop on a <1 x i64> value passed and returned in registers. The CHECK lines
; expect NEON only: a byte-wise `cnt v0.8b` followed by three `uaddlp` steps
; (v0.8b -> v0.4h -> v0.2s -> v0.1d).
643define <1 x i64> @ctpop_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
644; CHECK-LABEL: ctpop_v1i64:
645; CHECK:       // %bb.0:
646; CHECK-NEXT:    cnt v0.8b, v0.8b
647; CHECK-NEXT:    uaddlp v0.4h, v0.8b
648; CHECK-NEXT:    uaddlp v0.2s, v0.4h
649; CHECK-NEXT:    uaddlp v0.1d, v0.2s
650; CHECK-NEXT:    ret
651  %res = call <1 x i64> @llvm.ctpop.v1i64(<1 x i64> %op)
652  ret <1 x i64> %res
653}
654
655; Don't use SVE for 128-bit vectors.
; ctpop on a <2 x i64> value passed and returned in registers. The CHECK lines
; expect NEON only: a byte-wise `cnt v0.16b` followed by three `uaddlp` steps
; (v0.16b -> v0.8h -> v0.4s -> v0.2d).
656define <2 x i64> @ctpop_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
657; CHECK-LABEL: ctpop_v2i64:
658; CHECK:       // %bb.0:
659; CHECK-NEXT:    cnt v0.16b, v0.16b
660; CHECK-NEXT:    uaddlp v0.8h, v0.16b
661; CHECK-NEXT:    uaddlp v0.4s, v0.8h
662; CHECK-NEXT:    uaddlp v0.2d, v0.4s
663; CHECK-NEXT:    ret
664  %res = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %op)
665  ret <2 x i64> %res
666}
667
; Loads <4 x i64> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.d, vl4` predicate.
668define void @ctpop_v4i64(ptr %a) vscale_range(2,0) #0 {
669; CHECK-LABEL: ctpop_v4i64:
670; CHECK:       // %bb.0:
671; CHECK-NEXT:    ptrue p0.d, vl4
672; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
673; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
674; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
675; CHECK-NEXT:    ret
676  %op = load <4 x i64>, ptr %a
677  %res = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %op)
678  store <4 x i64> %res, ptr %a
679  ret void
680}
681
; Loads <8 x i64> from %a, applies ctpop and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl4 halves, VBITS_GE_512 one vl8 operation.
682define void @ctpop_v8i64(ptr %a) #0 {
683; VBITS_GE_256-LABEL: ctpop_v8i64:
684; VBITS_GE_256:       // %bb.0:
685; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
686; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
687; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
688; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
689; VBITS_GE_256-NEXT:    cnt z0.d, p0/m, z0.d
690; VBITS_GE_256-NEXT:    cnt z1.d, p0/m, z1.d
691; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
692; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
693; VBITS_GE_256-NEXT:    ret
694;
695; VBITS_GE_512-LABEL: ctpop_v8i64:
696; VBITS_GE_512:       // %bb.0:
697; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
698; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
699; VBITS_GE_512-NEXT:    cnt z0.d, p0/m, z0.d
700; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
701; VBITS_GE_512-NEXT:    ret
702  %op = load <8 x i64>, ptr %a
703  %res = call <8 x i64> @llvm.ctpop.v8i64(<8 x i64> %op)
704  store <8 x i64> %res, ptr %a
705  ret void
706}
707
; Loads <16 x i64> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.d, vl16` predicate.
708define void @ctpop_v16i64(ptr %a) vscale_range(8,0) #0 {
709; CHECK-LABEL: ctpop_v16i64:
710; CHECK:       // %bb.0:
711; CHECK-NEXT:    ptrue p0.d, vl16
712; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
713; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
714; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
715; CHECK-NEXT:    ret
716  %op = load <16 x i64>, ptr %a
717  %res = call <16 x i64> @llvm.ctpop.v16i64(<16 x i64> %op)
718  store <16 x i64> %res, ptr %a
719  ret void
720}
721
; Loads <32 x i64> from %a, applies ctpop and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): one predicated SVE `cnt` under
; a `ptrue p0.d, vl32` predicate.
722define void @ctpop_v32i64(ptr %a) vscale_range(16,0) #0 {
723; CHECK-LABEL: ctpop_v32i64:
724; CHECK:       // %bb.0:
725; CHECK-NEXT:    ptrue p0.d, vl32
726; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
727; CHECK-NEXT:    cnt z0.d, p0/m, z0.d
728; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
729; CHECK-NEXT:    ret
730  %op = load <32 x i64>, ptr %a
731  %res = call <32 x i64> @llvm.ctpop.v32i64(<32 x i64> %op)
732  store <32 x i64> %res, ptr %a
733  ret void
734}
735
736;
737; Count trailing zeros
738;
739
; cttz on a <8 x i8> value passed and returned in registers. The CHECK lines
; expect a predicated SVE `rbit` (under `ptrue p0.b, vl8`) followed by a NEON
; `clz v0.8b`.
740define <8 x i8> @cttz_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
741; CHECK-LABEL: cttz_v8i8:
742; CHECK:       // %bb.0:
743; CHECK-NEXT:    ptrue p0.b, vl8
744; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
745; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
746; CHECK-NEXT:    clz v0.8b, v0.8b
747; CHECK-NEXT:    ret
748  %res = call <8 x i8> @llvm.cttz.v8i8(<8 x i8> %op)
749  ret <8 x i8> %res
750}
751
; cttz on a <16 x i8> value passed and returned in registers. The CHECK lines
; expect a predicated SVE `rbit` (under `ptrue p0.b, vl16`) followed by a NEON
; `clz v0.16b`.
752define <16 x i8> @cttz_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
753; CHECK-LABEL: cttz_v16i8:
754; CHECK:       // %bb.0:
755; CHECK-NEXT:    ptrue p0.b, vl16
756; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
757; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
758; CHECK-NEXT:    clz v0.16b, v0.16b
759; CHECK-NEXT:    ret
760  %res = call <16 x i8> @llvm.cttz.v16i8(<16 x i8> %op)
761  ret <16 x i8> %res
762}
763
; Loads <32 x i8> from %a, applies cttz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): predicated SVE `rbit` then `clz`
; under a `ptrue p0.b, vl32` predicate.
764define void @cttz_v32i8(ptr %a) vscale_range(2,0) #0 {
765; CHECK-LABEL: cttz_v32i8:
766; CHECK:       // %bb.0:
767; CHECK-NEXT:    ptrue p0.b, vl32
768; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
769; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
770; CHECK-NEXT:    clz z0.b, p0/m, z0.b
771; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
772; CHECK-NEXT:    ret
773  %op = load <32 x i8>, ptr %a
774  %res = call <32 x i8> @llvm.cttz.v32i8(<32 x i8> %op)
775  store <32 x i8> %res, ptr %a
776  ret void
777}
778
; Loads <64 x i8> from %a, applies cttz and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl32 halves, VBITS_GE_512 one vl64
; operation, each going through `rbit` then `clz`.
779define void @cttz_v64i8(ptr %a) #0 {
780; VBITS_GE_256-LABEL: cttz_v64i8:
781; VBITS_GE_256:       // %bb.0:
782; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
783; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
784; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
785; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
786; VBITS_GE_256-NEXT:    rbit z0.b, p0/m, z0.b
787; VBITS_GE_256-NEXT:    rbit z1.b, p0/m, z1.b
788; VBITS_GE_256-NEXT:    clz z0.b, p0/m, z0.b
789; VBITS_GE_256-NEXT:    clz z1.b, p0/m, z1.b
790; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x0, x8]
791; VBITS_GE_256-NEXT:    st1b { z1.b }, p0, [x0]
792; VBITS_GE_256-NEXT:    ret
793;
794; VBITS_GE_512-LABEL: cttz_v64i8:
795; VBITS_GE_512:       // %bb.0:
796; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
797; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
798; VBITS_GE_512-NEXT:    rbit z0.b, p0/m, z0.b
799; VBITS_GE_512-NEXT:    clz z0.b, p0/m, z0.b
800; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x0]
801; VBITS_GE_512-NEXT:    ret
802  %op = load <64 x i8>, ptr %a
803  %res = call <64 x i8> @llvm.cttz.v64i8(<64 x i8> %op)
804  store <64 x i8> %res, ptr %a
805  ret void
806}
807
; Loads <128 x i8> from %a, applies cttz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): predicated SVE `rbit` then `clz`
; under a `ptrue p0.b, vl128` predicate.
808define void @cttz_v128i8(ptr %a) vscale_range(8,0) #0 {
809; CHECK-LABEL: cttz_v128i8:
810; CHECK:       // %bb.0:
811; CHECK-NEXT:    ptrue p0.b, vl128
812; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
813; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
814; CHECK-NEXT:    clz z0.b, p0/m, z0.b
815; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
816; CHECK-NEXT:    ret
817  %op = load <128 x i8>, ptr %a
818  %res = call <128 x i8> @llvm.cttz.v128i8(<128 x i8> %op)
819  store <128 x i8> %res, ptr %a
820  ret void
821}
822
; Loads <256 x i8> from %a, applies cttz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): predicated SVE `rbit` then `clz`
; under a `ptrue p0.b, vl256` predicate.
823define void @cttz_v256i8(ptr %a) vscale_range(16,0) #0 {
824; CHECK-LABEL: cttz_v256i8:
825; CHECK:       // %bb.0:
826; CHECK-NEXT:    ptrue p0.b, vl256
827; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
828; CHECK-NEXT:    rbit z0.b, p0/m, z0.b
829; CHECK-NEXT:    clz z0.b, p0/m, z0.b
830; CHECK-NEXT:    st1b { z0.b }, p0, [x0]
831; CHECK-NEXT:    ret
832  %op = load <256 x i8>, ptr %a
833  %res = call <256 x i8> @llvm.cttz.v256i8(<256 x i8> %op)
834  store <256 x i8> %res, ptr %a
835  ret void
836}
837
; cttz on a <4 x i16> value passed and returned in registers. The CHECK lines
; expect a predicated SVE `rbit` (under `ptrue p0.h, vl4`) followed by a NEON
; `clz v0.4h`.
838define <4 x i16> @cttz_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
839; CHECK-LABEL: cttz_v4i16:
840; CHECK:       // %bb.0:
841; CHECK-NEXT:    ptrue p0.h, vl4
842; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
843; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
844; CHECK-NEXT:    clz v0.4h, v0.4h
845; CHECK-NEXT:    ret
846  %res = call <4 x i16> @llvm.cttz.v4i16(<4 x i16> %op)
847  ret <4 x i16> %res
848}
849
; cttz on a <8 x i16> value passed and returned in registers. The CHECK lines
; expect a predicated SVE `rbit` (under `ptrue p0.h, vl8`) followed by a NEON
; `clz v0.8h`.
850define <8 x i16> @cttz_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
851; CHECK-LABEL: cttz_v8i16:
852; CHECK:       // %bb.0:
853; CHECK-NEXT:    ptrue p0.h, vl8
854; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
855; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
856; CHECK-NEXT:    clz v0.8h, v0.8h
857; CHECK-NEXT:    ret
858  %res = call <8 x i16> @llvm.cttz.v8i16(<8 x i16> %op)
859  ret <8 x i16> %res
860}
861
; Loads <16 x i16> from %a, applies cttz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): predicated SVE `rbit` then `clz`
; under a `ptrue p0.h, vl16` predicate.
862define void @cttz_v16i16(ptr %a) vscale_range(2,0) #0 {
863; CHECK-LABEL: cttz_v16i16:
864; CHECK:       // %bb.0:
865; CHECK-NEXT:    ptrue p0.h, vl16
866; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
867; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
868; CHECK-NEXT:    clz z0.h, p0/m, z0.h
869; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
870; CHECK-NEXT:    ret
871  %op = load <16 x i16>, ptr %a
872  %res = call <16 x i16> @llvm.cttz.v16i16(<16 x i16> %op)
873  store <16 x i16> %res, ptr %a
874  ret void
875}
876
; Loads <32 x i16> from %a, applies cttz and stores the result back to %a.
; No vscale_range attribute here; the expected output differs per RUN-line
; prefix: VBITS_GE_256 expects two vl16 halves, VBITS_GE_512 one vl32
; operation, each going through `rbit` then `clz`.
877define void @cttz_v32i16(ptr %a) #0 {
878; VBITS_GE_256-LABEL: cttz_v32i16:
879; VBITS_GE_256:       // %bb.0:
880; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
881; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
882; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
883; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
884; VBITS_GE_256-NEXT:    rbit z0.h, p0/m, z0.h
885; VBITS_GE_256-NEXT:    rbit z1.h, p0/m, z1.h
886; VBITS_GE_256-NEXT:    clz z0.h, p0/m, z0.h
887; VBITS_GE_256-NEXT:    clz z1.h, p0/m, z1.h
888; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
889; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
890; VBITS_GE_256-NEXT:    ret
891;
892; VBITS_GE_512-LABEL: cttz_v32i16:
893; VBITS_GE_512:       // %bb.0:
894; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
895; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
896; VBITS_GE_512-NEXT:    rbit z0.h, p0/m, z0.h
897; VBITS_GE_512-NEXT:    clz z0.h, p0/m, z0.h
898; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
899; VBITS_GE_512-NEXT:    ret
900  %op = load <32 x i16>, ptr %a
901  %res = call <32 x i16> @llvm.cttz.v32i16(<32 x i16> %op)
902  store <32 x i16> %res, ptr %a
903  ret void
904}
905
; Loads <64 x i16> from %a, applies cttz and stores the result back to %a.
; Expected for every RUN line (CHECK prefix): predicated SVE `rbit` then `clz`
; under a `ptrue p0.h, vl64` predicate.
906define void @cttz_v64i16(ptr %a) vscale_range(8,0) #0 {
907; CHECK-LABEL: cttz_v64i16:
908; CHECK:       // %bb.0:
909; CHECK-NEXT:    ptrue p0.h, vl64
910; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
911; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
912; CHECK-NEXT:    clz z0.h, p0/m, z0.h
913; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
914; CHECK-NEXT:    ret
915  %op = load <64 x i16>, ptr %a
916  %res = call <64 x i16> @llvm.cttz.v64i16(<64 x i16> %op)
917  store <64 x i16> %res, ptr %a
918  ret void
919}
920
; cttz on <128 x i16> with vscale_range(16,0): load from %a, one predicated
; RBIT + CLZ pair over 128 halfword lanes (ptrue p0.h, vl128), store back.
; The same output is expected for every RUN line (common CHECK prefix).
921define void @cttz_v128i16(ptr %a) vscale_range(16,0) #0 {
922; CHECK-LABEL: cttz_v128i16:
923; CHECK:       // %bb.0:
924; CHECK-NEXT:    ptrue p0.h, vl128
925; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
926; CHECK-NEXT:    rbit z0.h, p0/m, z0.h
927; CHECK-NEXT:    clz z0.h, p0/m, z0.h
928; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
929; CHECK-NEXT:    ret
930  %op = load <128 x i16>, ptr %a
931  %res = call <128 x i16> @llvm.cttz.v128i16(<128 x i16> %op)
932  store <128 x i16> %res, ptr %a
933  ret void
934}
935
936; Don't use SVE for 64-bit vectors.
; NOTE(review): the expected output below still performs the bit reversal with
; an SVE RBIT (z0.s under ptrue p0.s, vl2); only the CLZ is the NEON form
; (v0.2s). Confirm whether the heading above is stale for cttz.
; The <2 x i32> operand and result are passed by value, so no loads or stores
; are expected.
937define <2 x i32> @cttz_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
938; CHECK-LABEL: cttz_v2i32:
939; CHECK:       // %bb.0:
940; CHECK-NEXT:    ptrue p0.s, vl2
941; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
942; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
943; CHECK-NEXT:    clz v0.2s, v0.2s
944; CHECK-NEXT:    ret
945  %res = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %op)
946  ret <2 x i32> %res
947}
948
949; Don't use SVE for 128-bit vectors.
; NOTE(review): the expected output below still performs the bit reversal with
; an SVE RBIT (z0.s under ptrue p0.s, vl4); only the CLZ is the NEON form
; (v0.4s). Confirm whether the heading above is stale for cttz.
; The <4 x i32> operand and result are passed by value, so no loads or stores
; are expected.
950define <4 x i32> @cttz_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
951; CHECK-LABEL: cttz_v4i32:
952; CHECK:       // %bb.0:
953; CHECK-NEXT:    ptrue p0.s, vl4
954; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
955; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
956; CHECK-NEXT:    clz v0.4s, v0.4s
957; CHECK-NEXT:    ret
958  %res = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %op)
959  ret <4 x i32> %res
960}
961
; cttz on <8 x i32> with vscale_range(2,0): load from %a, one predicated
; RBIT + CLZ pair over 8 word lanes (ptrue p0.s, vl8), store back to %a.
; The same output is expected for every RUN line (common CHECK prefix).
962define void @cttz_v8i32(ptr %a) vscale_range(2,0) #0 {
963; CHECK-LABEL: cttz_v8i32:
964; CHECK:       // %bb.0:
965; CHECK-NEXT:    ptrue p0.s, vl8
966; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
967; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
968; CHECK-NEXT:    clz z0.s, p0/m, z0.s
969; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
970; CHECK-NEXT:    ret
971  %op = load <8 x i32>, ptr %a
972  %res = call <8 x i32> @llvm.cttz.v8i32(<8 x i32> %op)
973  store <8 x i32> %res, ptr %a
974  ret void
975}
976
; cttz on <16 x i32> with no vscale_range attribute, so the expected code
; depends on the -aarch64-sve-vector-bits-min value of each RUN line:
;  * VBITS_GE_256 prefix: the vector is split into two vl8 halves; the upper
;    half is addressed at element offset 8 (x8, scaled by lsl #2) and each
;    half gets its own RBIT + CLZ pair.
;  * VBITS_GE_512 prefix: a single vl16 RBIT + CLZ covers the whole vector.
977define void @cttz_v16i32(ptr %a) #0 {
978; VBITS_GE_256-LABEL: cttz_v16i32:
979; VBITS_GE_256:       // %bb.0:
980; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
981; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
982; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
983; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
984; VBITS_GE_256-NEXT:    rbit z0.s, p0/m, z0.s
985; VBITS_GE_256-NEXT:    rbit z1.s, p0/m, z1.s
986; VBITS_GE_256-NEXT:    clz z0.s, p0/m, z0.s
987; VBITS_GE_256-NEXT:    clz z1.s, p0/m, z1.s
988; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
989; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
990; VBITS_GE_256-NEXT:    ret
991;
992; VBITS_GE_512-LABEL: cttz_v16i32:
993; VBITS_GE_512:       // %bb.0:
994; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
995; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
996; VBITS_GE_512-NEXT:    rbit z0.s, p0/m, z0.s
997; VBITS_GE_512-NEXT:    clz z0.s, p0/m, z0.s
998; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
999; VBITS_GE_512-NEXT:    ret
1000  %op = load <16 x i32>, ptr %a
1001  %res = call <16 x i32> @llvm.cttz.v16i32(<16 x i32> %op)
1002  store <16 x i32> %res, ptr %a
1003  ret void
1004}
1005
; cttz on <32 x i32> with vscale_range(8,0): load from %a, one predicated
; RBIT + CLZ pair over 32 word lanes (ptrue p0.s, vl32), store back to %a.
; The same output is expected for every RUN line (common CHECK prefix).
1006define void @cttz_v32i32(ptr %a) vscale_range(8,0) #0 {
1007; CHECK-LABEL: cttz_v32i32:
1008; CHECK:       // %bb.0:
1009; CHECK-NEXT:    ptrue p0.s, vl32
1010; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1011; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
1012; CHECK-NEXT:    clz z0.s, p0/m, z0.s
1013; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
1014; CHECK-NEXT:    ret
1015  %op = load <32 x i32>, ptr %a
1016  %res = call <32 x i32> @llvm.cttz.v32i32(<32 x i32> %op)
1017  store <32 x i32> %res, ptr %a
1018  ret void
1019}
1020
; cttz on <64 x i32> with vscale_range(16,0): load from %a, one predicated
; RBIT + CLZ pair over 64 word lanes (ptrue p0.s, vl64), store back to %a.
; The same output is expected for every RUN line (common CHECK prefix).
1021define void @cttz_v64i32(ptr %a) vscale_range(16,0) #0 {
1022; CHECK-LABEL: cttz_v64i32:
1023; CHECK:       // %bb.0:
1024; CHECK-NEXT:    ptrue p0.s, vl64
1025; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
1026; CHECK-NEXT:    rbit z0.s, p0/m, z0.s
1027; CHECK-NEXT:    clz z0.s, p0/m, z0.s
1028; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
1029; CHECK-NEXT:    ret
1030  %op = load <64 x i32>, ptr %a
1031  %res = call <64 x i32> @llvm.cttz.v64i32(<64 x i32> %op)
1032  store <64 x i32> %res, ptr %a
1033  ret void
1034}
1035
; cttz on <1 x i64> passed and returned by value. Unlike the 32-bit element
; cases above, both steps are expected in SVE form here: RBIT and CLZ operate
; on z0.d under a one-lane predicate (ptrue p0.d, vl1). The "kill" lines are
; register annotations printed around the d0 <-> z0 views of the value.
1036define <1 x i64> @cttz_v1i64(<1 x i64> %op) vscale_range(2,0) #0 {
1037; CHECK-LABEL: cttz_v1i64:
1038; CHECK:       // %bb.0:
1039; CHECK-NEXT:    ptrue p0.d, vl1
1040; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
1041; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
1042; CHECK-NEXT:    clz z0.d, p0/m, z0.d
1043; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
1044; CHECK-NEXT:    ret
1045  %res = call <1 x i64> @llvm.cttz.v1i64(<1 x i64> %op)
1046  ret <1 x i64> %res
1047}
1048
; cttz on <2 x i64> passed and returned by value. Both steps are expected in
; SVE form: RBIT and CLZ operate on z0.d under a two-lane predicate
; (ptrue p0.d, vl2). The "kill" lines are register annotations printed around
; the q0 <-> z0 views of the value.
1049define <2 x i64> @cttz_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
1050; CHECK-LABEL: cttz_v2i64:
1051; CHECK:       // %bb.0:
1052; CHECK-NEXT:    ptrue p0.d, vl2
1053; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
1054; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
1055; CHECK-NEXT:    clz z0.d, p0/m, z0.d
1056; CHECK-NEXT:    // kill: def $q0 killed $q0 killed $z0
1057; CHECK-NEXT:    ret
1058  %res = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> %op)
1059  ret <2 x i64> %res
1060}
1061
; cttz on <4 x i64> with vscale_range(2,0): load from %a, one predicated
; RBIT + CLZ pair over 4 doubleword lanes (ptrue p0.d, vl4), store back to %a.
; The same output is expected for every RUN line (common CHECK prefix).
1062define void @cttz_v4i64(ptr %a) vscale_range(2,0) #0 {
1063; CHECK-LABEL: cttz_v4i64:
1064; CHECK:       // %bb.0:
1065; CHECK-NEXT:    ptrue p0.d, vl4
1066; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1067; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
1068; CHECK-NEXT:    clz z0.d, p0/m, z0.d
1069; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
1070; CHECK-NEXT:    ret
1071  %op = load <4 x i64>, ptr %a
1072  %res = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> %op)
1073  store <4 x i64> %res, ptr %a
1074  ret void
1075}
1076
; cttz on <8 x i64> with no vscale_range attribute, so the expected code
; depends on the -aarch64-sve-vector-bits-min value of each RUN line:
;  * VBITS_GE_256 prefix: the vector is split into two vl4 halves; the upper
;    half is addressed at element offset 4 (x8, scaled by lsl #3) and each
;    half gets its own RBIT + CLZ pair.
;  * VBITS_GE_512 prefix: a single vl8 RBIT + CLZ covers the whole vector.
1077define void @cttz_v8i64(ptr %a) #0 {
1078; VBITS_GE_256-LABEL: cttz_v8i64:
1079; VBITS_GE_256:       // %bb.0:
1080; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
1081; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
1082; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
1083; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
1084; VBITS_GE_256-NEXT:    rbit z0.d, p0/m, z0.d
1085; VBITS_GE_256-NEXT:    rbit z1.d, p0/m, z1.d
1086; VBITS_GE_256-NEXT:    clz z0.d, p0/m, z0.d
1087; VBITS_GE_256-NEXT:    clz z1.d, p0/m, z1.d
1088; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
1089; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
1090; VBITS_GE_256-NEXT:    ret
1091;
1092; VBITS_GE_512-LABEL: cttz_v8i64:
1093; VBITS_GE_512:       // %bb.0:
1094; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
1095; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
1096; VBITS_GE_512-NEXT:    rbit z0.d, p0/m, z0.d
1097; VBITS_GE_512-NEXT:    clz z0.d, p0/m, z0.d
1098; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
1099; VBITS_GE_512-NEXT:    ret
1100  %op = load <8 x i64>, ptr %a
1101  %res = call <8 x i64> @llvm.cttz.v8i64(<8 x i64> %op)
1102  store <8 x i64> %res, ptr %a
1103  ret void
1104}
1105
; cttz on <16 x i64> with vscale_range(8,0): load from %a, one predicated
; RBIT + CLZ pair over 16 doubleword lanes (ptrue p0.d, vl16), store back.
; The same output is expected for every RUN line (common CHECK prefix).
1106define void @cttz_v16i64(ptr %a) vscale_range(8,0) #0 {
1107; CHECK-LABEL: cttz_v16i64:
1108; CHECK:       // %bb.0:
1109; CHECK-NEXT:    ptrue p0.d, vl16
1110; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1111; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
1112; CHECK-NEXT:    clz z0.d, p0/m, z0.d
1113; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
1114; CHECK-NEXT:    ret
1115  %op = load <16 x i64>, ptr %a
1116  %res = call <16 x i64> @llvm.cttz.v16i64(<16 x i64> %op)
1117  store <16 x i64> %res, ptr %a
1118  ret void
1119}
1120
; cttz on <32 x i64> with vscale_range(16,0): load from %a, one predicated
; RBIT + CLZ pair over 32 doubleword lanes (ptrue p0.d, vl32), store back.
; The same output is expected for every RUN line (common CHECK prefix).
1121define void @cttz_v32i64(ptr %a) vscale_range(16,0) #0 {
1122; CHECK-LABEL: cttz_v32i64:
1123; CHECK:       // %bb.0:
1124; CHECK-NEXT:    ptrue p0.d, vl32
1125; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
1126; CHECK-NEXT:    rbit z0.d, p0/m, z0.d
1127; CHECK-NEXT:    clz z0.d, p0/m, z0.d
1128; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
1129; CHECK-NEXT:    ret
1130  %op = load <32 x i64>, ptr %a
1131  %res = call <32 x i64> @llvm.cttz.v32i64(<32 x i64> %op)
1132  store <32 x i64> %res, ptr %a
1133  ret void
1134}
1135
; Attribute group #0, attached to the test functions in this file: enables the
; SVE target feature so that the SVE lowerings checked above are available.
1136attributes #0 = { "target-features"="+sve" }
1137
; Intrinsic declarations used by the tests, grouped by operation (ctlz, ctpop,
; cttz) and ordered by element width (i8, i16, i32, i64) and then lane count.
; NOTE(review): ctlz and cttz are declared with a single vector operand here,
; whereas the LangRef form also takes an i1 flag; presumably these rely on the
; IR auto-upgrade of the legacy one-operand form. Confirm before changing the
; signatures, since the call sites must change with them.

; Count leading zeros.
1138declare <8 x i8> @llvm.ctlz.v8i8(<8 x i8>)
1139declare <16 x i8> @llvm.ctlz.v16i8(<16 x i8>)
1140declare <32 x i8> @llvm.ctlz.v32i8(<32 x i8>)
1141declare <64 x i8> @llvm.ctlz.v64i8(<64 x i8>)
1142declare <128 x i8> @llvm.ctlz.v128i8(<128 x i8>)
1143declare <256 x i8> @llvm.ctlz.v256i8(<256 x i8>)
1144declare <4 x i16> @llvm.ctlz.v4i16(<4 x i16>)
1145declare <8 x i16> @llvm.ctlz.v8i16(<8 x i16>)
1146declare <16 x i16> @llvm.ctlz.v16i16(<16 x i16>)
1147declare <32 x i16> @llvm.ctlz.v32i16(<32 x i16>)
1148declare <64 x i16> @llvm.ctlz.v64i16(<64 x i16>)
1149declare <128 x i16> @llvm.ctlz.v128i16(<128 x i16>)
1150declare <2 x i32> @llvm.ctlz.v2i32(<2 x i32>)
1151declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>)
1152declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>)
1153declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>)
1154declare <32 x i32> @llvm.ctlz.v32i32(<32 x i32>)
1155declare <64 x i32> @llvm.ctlz.v64i32(<64 x i32>)
1156declare <1 x i64> @llvm.ctlz.v1i64(<1 x i64>)
1157declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>)
1158declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>)
1159declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>)
1160declare <16 x i64> @llvm.ctlz.v16i64(<16 x i64>)
1161declare <32 x i64> @llvm.ctlz.v32i64(<32 x i64>)
1162
; Population count.
1163declare <8 x i8> @llvm.ctpop.v8i8(<8 x i8>)
1164declare <16 x i8> @llvm.ctpop.v16i8(<16 x i8>)
1165declare <32 x i8> @llvm.ctpop.v32i8(<32 x i8>)
1166declare <64 x i8> @llvm.ctpop.v64i8(<64 x i8>)
1167declare <128 x i8> @llvm.ctpop.v128i8(<128 x i8>)
1168declare <256 x i8> @llvm.ctpop.v256i8(<256 x i8>)
1169declare <4 x i16> @llvm.ctpop.v4i16(<4 x i16>)
1170declare <8 x i16> @llvm.ctpop.v8i16(<8 x i16>)
1171declare <16 x i16> @llvm.ctpop.v16i16(<16 x i16>)
1172declare <32 x i16> @llvm.ctpop.v32i16(<32 x i16>)
1173declare <64 x i16> @llvm.ctpop.v64i16(<64 x i16>)
1174declare <128 x i16> @llvm.ctpop.v128i16(<128 x i16>)
1175declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
1176declare <4 x i32> @llvm.ctpop.v4i32(<4 x i32>)
1177declare <8 x i32> @llvm.ctpop.v8i32(<8 x i32>)
1178declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>)
1179declare <32 x i32> @llvm.ctpop.v32i32(<32 x i32>)
1180declare <64 x i32> @llvm.ctpop.v64i32(<64 x i32>)
1181declare <1 x i64> @llvm.ctpop.v1i64(<1 x i64>)
1182declare <2 x i64> @llvm.ctpop.v2i64(<2 x i64>)
1183declare <4 x i64> @llvm.ctpop.v4i64(<4 x i64>)
1184declare <8 x i64> @llvm.ctpop.v8i64(<8 x i64>)
1185declare <16 x i64> @llvm.ctpop.v16i64(<16 x i64>)
1186declare <32 x i64> @llvm.ctpop.v32i64(<32 x i64>)
1187
; Count trailing zeros.
1188declare <8 x i8> @llvm.cttz.v8i8(<8 x i8>)
1189declare <16 x i8> @llvm.cttz.v16i8(<16 x i8>)
1190declare <32 x i8> @llvm.cttz.v32i8(<32 x i8>)
1191declare <64 x i8> @llvm.cttz.v64i8(<64 x i8>)
1192declare <128 x i8> @llvm.cttz.v128i8(<128 x i8>)
1193declare <256 x i8> @llvm.cttz.v256i8(<256 x i8>)
1194declare <4 x i16> @llvm.cttz.v4i16(<4 x i16>)
1195declare <8 x i16> @llvm.cttz.v8i16(<8 x i16>)
1196declare <16 x i16> @llvm.cttz.v16i16(<16 x i16>)
1197declare <32 x i16> @llvm.cttz.v32i16(<32 x i16>)
1198declare <64 x i16> @llvm.cttz.v64i16(<64 x i16>)
1199declare <128 x i16> @llvm.cttz.v128i16(<128 x i16>)
1200declare <2 x i32> @llvm.cttz.v2i32(<2 x i32>)
1201declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>)
1202declare <8 x i32> @llvm.cttz.v8i32(<8 x i32>)
1203declare <16 x i32> @llvm.cttz.v16i32(<16 x i32>)
1204declare <32 x i32> @llvm.cttz.v32i32(<32 x i32>)
1205declare <64 x i32> @llvm.cttz.v64i32(<64 x i32>)
1206declare <1 x i64> @llvm.cttz.v1i64(<1 x i64>)
1207declare <2 x i64> @llvm.cttz.v2i64(<2 x i64>)
1208declare <4 x i64> @llvm.cttz.v4i64(<4 x i64>)
1209declare <8 x i64> @llvm.cttz.v8i64(<8 x i64>)
1210declare <16 x i64> @llvm.cttz.v16i64(<16 x i64>)
1211declare <32 x i64> @llvm.cttz.v32i64(<32 x i64>)
1212