xref: /llvm-project/llvm/test/CodeGen/AArch64/sve2-histcnt.ll (revision 61510b51c33464a6bc15e4cf5b1ee07e2e0ec1c9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
2; RUN: llc -mtriple=aarch64 -verify-machineinstrs < %s -o - | FileCheck %s
3
;; Baseline case: histogram add over a <vscale x 2 x ptr> vector of bucket
;; pointers with a runtime i64 increment and a runtime mask. The expected
;; assembly counts matching addresses with histcnt, gathers the buckets with
;; ld1d, multiply-adds the splatted increment with mad, and scatters the
;; result back with st1d.
4define void @histogram_i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0 {
5; CHECK-LABEL: histogram_i64:
6; CHECK:       // %bb.0:
7; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
8; CHECK-NEXT:    mov z3.d, x0
9; CHECK-NEXT:    ld1d { z2.d }, p0/z, [z0.d]
10; CHECK-NEXT:    ptrue p1.d
11; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
12; CHECK-NEXT:    st1d { z1.d }, p0, [z0.d]
13; CHECK-NEXT:    ret
14  call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask)
15  ret void
16}
17
18;; FIXME: We may need some DAG combines here. The output of the histcnt is being
19;;        multiplied by 1, so we should be able to remove the multiply and add the
20;;        histcnt result directly to the current bucket data.
;; i32 buckets addressed as %base plus <vscale x 4 x i32> indices (GEP over
;; i32), with a literal increment of 1. The expected gather and scatter use the
;; [x0, z0.s, sxtw #2] addressing mode, and the mad against the splatted
;; constant 1 is still present in the expected output.
21define void @histogram_i32_literal(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
22; CHECK-LABEL: histogram_i32_literal:
23; CHECK:       // %bb.0:
24; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
25; CHECK-NEXT:    mov z3.s, #1 // =0x1
26; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
27; CHECK-NEXT:    ptrue p1.s
28; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
29; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
30; CHECK-NEXT:    ret
31
32  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %indices
33  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
34  ret void
35}
36
;; i32 histogram add with a literal increment of 1, but the bucket addresses
;; come from a GEP over i8, so the indices are byte offsets. The expected
;; gather and scatter therefore use the unscaled [x0, z0.s, sxtw] form.
37define void @histogram_i32_literal_noscale(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
38; CHECK-LABEL: histogram_i32_literal_noscale:
39; CHECK:       // %bb.0:
40; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
41; CHECK-NEXT:    mov z3.s, #1 // =0x1
42; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw]
43; CHECK-NEXT:    ptrue p1.s
44; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
45; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw]
46; CHECK-NEXT:    ret
47
48  %buckets = getelementptr i8, ptr %base, <vscale x 4 x i32> %indices
49  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
50  ret void
51}
52
;; i32 buckets indexed by <vscale x 2 x i64> with a runtime i32 increment.
;; The expected code operates on .d elements (histcnt and mad on z.d, the
;; increment splatted from x1) while the memory accesses stay word-sized
;; (ld1w/st1w on { z.d } with lsl #2).
53define void @histogram_i32_promote(ptr %base, <vscale x 2 x i64> %indices, <vscale x 2 x i1> %mask, i32 %inc) #0 {
54; CHECK-LABEL: histogram_i32_promote:
55; CHECK:       // %bb.0:
56; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
57; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
58; CHECK-NEXT:    mov z3.d, x1
59; CHECK-NEXT:    ld1w { z2.d }, p0/z, [x0, z0.d, lsl #2]
60; CHECK-NEXT:    ptrue p1.d
61; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
62; CHECK-NEXT:    st1w { z1.d }, p0, [x0, z0.d, lsl #2]
63; CHECK-NEXT:    ret
64  %buckets = getelementptr i32, ptr %base, <vscale x 2 x i64> %indices
65  call void @llvm.experimental.vector.histogram.add.nxv2p0.i32(<vscale x 2 x ptr> %buckets, i32 %inc, <vscale x 2 x i1> %mask)
66  ret void
67}
68
;; i16 buckets indexed by <vscale x 4 x i32> with a runtime i16 increment.
;; The expected code keeps .s elements for histcnt and mad and uses halfword
;; gather/scatter (ld1h/st1h) with the [x0, z0.s, sxtw #1] addressing mode.
69define void @histogram_i16(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i16 %inc) #0 {
70; CHECK-LABEL: histogram_i16:
71; CHECK:       // %bb.0:
72; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
73; CHECK-NEXT:    mov z3.s, w1
74; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
75; CHECK-NEXT:    ptrue p1.s
76; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
77; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
78; CHECK-NEXT:    ret
79  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices
80  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 %inc, <vscale x 4 x i1> %mask)
81  ret void
82}
83
;; i8 buckets indexed by <vscale x 4 x i32> with a runtime i8 increment.
;; The expected code uses byte gather/scatter (ld1b/st1b on { z.s }) with the
;; unscaled [x0, z0.s, sxtw] addressing mode.
84define void @histogram_i8(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i8 %inc) #0 {
85; CHECK-LABEL: histogram_i8:
86; CHECK:       // %bb.0:
87; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
88; CHECK-NEXT:    mov z3.s, w1
89; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x0, z0.s, sxtw]
90; CHECK-NEXT:    ptrue p1.s
91; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
92; CHECK-NEXT:    st1b { z1.s }, p0, [x0, z0.s, sxtw]
93; CHECK-NEXT:    ret
94  %buckets = getelementptr i8, ptr %base, <vscale x 4 x i32> %indices
95  call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> %buckets, i8 %inc, <vscale x 4 x i1> %mask)
96  ret void
97}
98
;; i16 buckets indexed by <vscale x 2 x i64> with a runtime i16 increment.
;; The expected code works on .d elements and uses halfword gather/scatter
;; (ld1h/st1h on { z.d }) with the [x0, z0.d, lsl #1] addressing mode.
99define void @histogram_i16_2_lane(ptr %base, <vscale x 2 x i64> %indices, <vscale x 2 x i1> %mask, i16 %inc) #0 {
100; CHECK-LABEL: histogram_i16_2_lane:
101; CHECK:       // %bb.0:
102; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
103; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
104; CHECK-NEXT:    mov z3.d, x1
105; CHECK-NEXT:    ld1h { z2.d }, p0/z, [x0, z0.d, lsl #1]
106; CHECK-NEXT:    ptrue p1.d
107; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
108; CHECK-NEXT:    st1h { z1.d }, p0, [x0, z0.d, lsl #1]
109; CHECK-NEXT:    ret
110  %buckets = getelementptr i16, ptr %base, <vscale x 2 x i64> %indices
111  call void @llvm.experimental.vector.histogram.add.nxv2p0.i16(<vscale x 2 x ptr> %buckets, i16 %inc, <vscale x 2 x i1> %mask)
112  ret void
113}
114
;; i8 buckets indexed by <vscale x 2 x i64> with a runtime i8 increment.
;; The expected code works on .d elements and uses byte gather/scatter
;; (ld1b/st1b on { z.d }) with the plain [x0, z0.d] addressing mode.
115define void @histogram_i8_2_lane(ptr %base, <vscale x 2 x i64> %indices, <vscale x 2 x i1> %mask, i8 %inc) #0 {
116; CHECK-LABEL: histogram_i8_2_lane:
117; CHECK:       // %bb.0:
118; CHECK-NEXT:    histcnt z1.d, p0/z, z0.d, z0.d
119; CHECK-NEXT:    // kill: def $w1 killed $w1 def $x1
120; CHECK-NEXT:    mov z3.d, x1
121; CHECK-NEXT:    ld1b { z2.d }, p0/z, [x0, z0.d]
122; CHECK-NEXT:    ptrue p1.d
123; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
124; CHECK-NEXT:    st1b { z1.d }, p0, [x0, z0.d]
125; CHECK-NEXT:    ret
126  %buckets = getelementptr i8, ptr %base, <vscale x 2 x i64> %indices
127  call void @llvm.experimental.vector.histogram.add.nxv2p0.i8(<vscale x 2 x ptr> %buckets, i8 %inc, <vscale x 2 x i1> %mask)
128  ret void
129}
130
;; i16 buckets indexed by <vscale x 4 x i32> with a literal increment of 1.
;; The expected code splats the constant with `mov z3.s, #1` and still
;; performs the mad before the halfword scatter.
131define void @histogram_i16_literal_1(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
132; CHECK-LABEL: histogram_i16_literal_1:
133; CHECK:       // %bb.0:
134; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
135; CHECK-NEXT:    mov z3.s, #1 // =0x1
136; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
137; CHECK-NEXT:    ptrue p1.s
138; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
139; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
140; CHECK-NEXT:    ret
141  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices
142  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 1, <vscale x 4 x i1> %mask)
143  ret void
144}
145
;; i16 buckets indexed by <vscale x 4 x i32> with a literal increment of 2.
;; The expected code splats the constant with `mov z3.s, #2` and feeds it to
;; the mad ahead of the halfword scatter.
146define void @histogram_i16_literal_2(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
147; CHECK-LABEL: histogram_i16_literal_2:
148; CHECK:       // %bb.0:
149; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
150; CHECK-NEXT:    mov z3.s, #2 // =0x2
151; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
152; CHECK-NEXT:    ptrue p1.s
153; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
154; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
155; CHECK-NEXT:    ret
156  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices
157  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 2, <vscale x 4 x i1> %mask)
158  ret void
159}
160
;; i16 buckets indexed by <vscale x 4 x i32> with a literal increment of 3.
;; The expected code splats the constant with `mov z3.s, #3` and feeds it to
;; the mad ahead of the halfword scatter.
161define void @histogram_i16_literal_3(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
162; CHECK-LABEL: histogram_i16_literal_3:
163; CHECK:       // %bb.0:
164; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
165; CHECK-NEXT:    mov z3.s, #3 // =0x3
166; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, sxtw #1]
167; CHECK-NEXT:    ptrue p1.s
168; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
169; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, sxtw #1]
170; CHECK-NEXT:    ret
171  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i32> %indices
172  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 3, <vscale x 4 x i1> %mask)
173  ret void
174}
175
;; i64 histogram add over a <vscale x 4 x ptr> bucket vector. The expected
;; code unpacks the mask with punpklo/punpkhi and emits two
;; histcnt/ld1d/mad/st1d sequences, one on z0 and one on z1, sharing a single
;; splat of the increment (z4) and a single ptrue (p2).
176define void @histogram_i64_4_lane(<vscale x 4 x ptr> %buckets, i64 %inc, <vscale x 4 x i1> %mask) #0 {
177; CHECK-LABEL: histogram_i64_4_lane:
178; CHECK:       // %bb.0:
179; CHECK-NEXT:    punpklo p1.h, p0.b
180; CHECK-NEXT:    mov z4.d, x0
181; CHECK-NEXT:    ptrue p2.d
182; CHECK-NEXT:    histcnt z2.d, p1/z, z0.d, z0.d
183; CHECK-NEXT:    ld1d { z3.d }, p1/z, [z0.d]
184; CHECK-NEXT:    punpkhi p0.h, p0.b
185; CHECK-NEXT:    mad z2.d, p2/m, z4.d, z3.d
186; CHECK-NEXT:    st1d { z2.d }, p1, [z0.d]
187; CHECK-NEXT:    histcnt z0.d, p0/z, z1.d, z1.d
188; CHECK-NEXT:    ld1d { z2.d }, p0/z, [z1.d]
189; CHECK-NEXT:    mad z0.d, p2/m, z4.d, z2.d
190; CHECK-NEXT:    st1d { z0.d }, p0, [z1.d]
191; CHECK-NEXT:    ret
192  call void @llvm.experimental.vector.histogram.add.nxv4p0.i64(<vscale x 4 x ptr> %buckets, i64 %inc, <vscale x 4 x i1> %mask)
193  ret void
194}
195
;; i64 histogram add over a <vscale x 8 x ptr> bucket vector. The expected
;; code unpacks the mask in two levels of punpklo/punpkhi and emits four
;; histcnt/ld1d/mad/st1d sequences (on z0, z1, z2 and z3), sharing a single
;; splat of the increment (z6) and a single ptrue (p1).
196define void @histogram_i64_8_lane(<vscale x 8 x ptr> %buckets, i64 %inc, <vscale x 8 x i1> %mask) #0 {
197; CHECK-LABEL: histogram_i64_8_lane:
198; CHECK:       // %bb.0:
199; CHECK-NEXT:    punpklo p2.h, p0.b
200; CHECK-NEXT:    mov z6.d, x0
201; CHECK-NEXT:    ptrue p1.d
202; CHECK-NEXT:    punpklo p3.h, p2.b
203; CHECK-NEXT:    punpkhi p2.h, p2.b
204; CHECK-NEXT:    histcnt z4.d, p3/z, z0.d, z0.d
205; CHECK-NEXT:    ld1d { z5.d }, p3/z, [z0.d]
206; CHECK-NEXT:    punpkhi p0.h, p0.b
207; CHECK-NEXT:    mad z4.d, p1/m, z6.d, z5.d
208; CHECK-NEXT:    st1d { z4.d }, p3, [z0.d]
209; CHECK-NEXT:    histcnt z0.d, p2/z, z1.d, z1.d
210; CHECK-NEXT:    ld1d { z4.d }, p2/z, [z1.d]
211; CHECK-NEXT:    mad z0.d, p1/m, z6.d, z4.d
212; CHECK-NEXT:    st1d { z0.d }, p2, [z1.d]
213; CHECK-NEXT:    punpklo p2.h, p0.b
214; CHECK-NEXT:    punpkhi p0.h, p0.b
215; CHECK-NEXT:    histcnt z0.d, p2/z, z2.d, z2.d
216; CHECK-NEXT:    ld1d { z1.d }, p2/z, [z2.d]
217; CHECK-NEXT:    mad z0.d, p1/m, z6.d, z1.d
218; CHECK-NEXT:    st1d { z0.d }, p2, [z2.d]
219; CHECK-NEXT:    histcnt z0.d, p0/z, z3.d, z3.d
220; CHECK-NEXT:    ld1d { z1.d }, p0/z, [z3.d]
221; CHECK-NEXT:    mad z0.d, p1/m, z6.d, z1.d
222; CHECK-NEXT:    st1d { z0.d }, p0, [z3.d]
223; CHECK-NEXT:    ret
224  call void @llvm.experimental.vector.histogram.add.nxv8p0.i64(<vscale x 8 x ptr> %buckets, i64 %inc, <vscale x 8 x i1> %mask)
225  ret void
226}
227
;; i32 buckets indexed by <vscale x 8 x i32> with a runtime i32 increment.
;; The expected code unpacks the mask with punpklo/punpkhi and emits two
;; histcnt/ld1w/mad/st1w sequences on .s elements (indices in z0 and z1),
;; both using the sxtw #2 addressing mode.
228define void @histogram_i32_8_lane(ptr %base, <vscale x 8 x i32> %indices, i32 %inc, <vscale x 8 x i1> %mask) #0 {
229; CHECK-LABEL: histogram_i32_8_lane:
230; CHECK:       // %bb.0:
231; CHECK-NEXT:    punpklo p1.h, p0.b
232; CHECK-NEXT:    mov z4.s, w1
233; CHECK-NEXT:    ptrue p2.s
234; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
235; CHECK-NEXT:    ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
236; CHECK-NEXT:    punpkhi p0.h, p0.b
237; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
238; CHECK-NEXT:    st1w { z2.s }, p1, [x0, z0.s, sxtw #2]
239; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
240; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z1.s, sxtw #2]
241; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
242; CHECK-NEXT:    st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
243; CHECK-NEXT:    ret
244  %buckets = getelementptr i32, ptr %base, <vscale x 8 x i32> %indices
245  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 %inc, <vscale x 8 x i1> %mask)
246  ret void
247}
248
;; i16 buckets indexed by <vscale x 8 x i32> with a runtime i16 increment.
;; The expected code unpacks the mask with punpklo/punpkhi and emits two
;; histcnt/ld1h/mad/st1h sequences on .s elements (indices in z0 and z1),
;; both using the sxtw #1 addressing mode.
249define void @histogram_i16_8_lane(ptr %base, <vscale x 8 x i32> %indices, i16 %inc, <vscale x 8 x i1> %mask) #0 {
250; CHECK-LABEL: histogram_i16_8_lane:
251; CHECK:       // %bb.0:
252; CHECK-NEXT:    punpklo p1.h, p0.b
253; CHECK-NEXT:    mov z4.s, w1
254; CHECK-NEXT:    ptrue p2.s
255; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
256; CHECK-NEXT:    ld1h { z3.s }, p1/z, [x0, z0.s, sxtw #1]
257; CHECK-NEXT:    punpkhi p0.h, p0.b
258; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
259; CHECK-NEXT:    st1h { z2.s }, p1, [x0, z0.s, sxtw #1]
260; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
261; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z1.s, sxtw #1]
262; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
263; CHECK-NEXT:    st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
264; CHECK-NEXT:    ret
265  %buckets = getelementptr i16, ptr %base, <vscale x 8 x i32> %indices
266  call void @llvm.experimental.vector.histogram.add.nxv8p0.i16(<vscale x 8 x ptr> %buckets, i16 %inc, <vscale x 8 x i1> %mask)
267  ret void
268}
269
;; i8 buckets where the <vscale x 4 x i32> indices are zero-extended to i64
;; before the GEP. The expected code keeps the 32-bit offsets in z0.s and uses
;; the unscaled [x0, z0.s, uxtw] addressing mode for ld1b/st1b.
270define void @histogram_i8_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i8 %inc) #0{
271; CHECK-LABEL: histogram_i8_zext:
272; CHECK:       // %bb.0:
273; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
274; CHECK-NEXT:    mov z3.s, w1
275; CHECK-NEXT:    ld1b { z2.s }, p0/z, [x0, z0.s, uxtw]
276; CHECK-NEXT:    ptrue p1.s
277; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
278; CHECK-NEXT:    st1b { z1.s }, p0, [x0, z0.s, uxtw]
279; CHECK-NEXT:    ret
280  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
281  %buckets = getelementptr i8, ptr %base, <vscale x 4 x i64> %extended
282  call void @llvm.experimental.vector.histogram.add.nxv4p0.i8(<vscale x 4 x ptr> %buckets, i8 %inc, <vscale x 4 x i1> %mask)
283  ret void
284}
285
;; i16 buckets where the <vscale x 4 x i32> indices are zero-extended to i64
;; before the GEP. The expected code keeps the 32-bit offsets in z0.s and uses
;; the [x0, z0.s, uxtw #1] addressing mode for ld1h/st1h.
286define void @histogram_i16_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask, i16 %inc) #0{
287; CHECK-LABEL: histogram_i16_zext:
288; CHECK:       // %bb.0:
289; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
290; CHECK-NEXT:    mov z3.s, w1
291; CHECK-NEXT:    ld1h { z2.s }, p0/z, [x0, z0.s, uxtw #1]
292; CHECK-NEXT:    ptrue p1.s
293; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
294; CHECK-NEXT:    st1h { z1.s }, p0, [x0, z0.s, uxtw #1]
295; CHECK-NEXT:    ret
296  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
297  %buckets = getelementptr i16, ptr %base, <vscale x 4 x i64> %extended
298  call void @llvm.experimental.vector.histogram.add.nxv4p0.i16(<vscale x 4 x ptr> %buckets, i16 %inc, <vscale x 4 x i1> %mask)
299  ret void
300}
301
;; i32 buckets with a literal increment of 1, where the <vscale x 4 x i32>
;; indices are zero-extended to i64 before the GEP. The expected code uses the
;; [x0, z0.s, uxtw #2] addressing mode for ld1w/st1w.
302define void @histogram_i32_zext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
303; CHECK-LABEL: histogram_i32_zext:
304; CHECK:       // %bb.0:
305; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
306; CHECK-NEXT:    mov z3.s, #1 // =0x1
307; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
308; CHECK-NEXT:    ptrue p1.s
309; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
310; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
311; CHECK-NEXT:    ret
312  %extended = zext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
313  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
314  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
315  ret void
316}
317
;; i32 buckets with a literal increment of 1, where the <vscale x 4 x i32>
;; indices are sign-extended to i64 before the GEP. The expected code uses the
;; [x0, z0.s, sxtw #2] addressing mode for ld1w/st1w.
318define void @histogram_i32_sext(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0 {
319; CHECK-LABEL: histogram_i32_sext:
320; CHECK:       // %bb.0:
321; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
322; CHECK-NEXT:    mov z3.s, #1 // =0x1
323; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
324; CHECK-NEXT:    ptrue p1.s
325; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
326; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
327; CHECK-NEXT:    ret
328  %extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
329  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
330  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
331  ret void
332}
333
;; i32 buckets with a literal increment of 1, where <vscale x 4 x i8> indices
;; are zero-extended to i64 before the GEP. The expected code masks the
;; indices with `and z0.s, z0.s, #0xff` ahead of histcnt and then uses the
;; [x0, z0.s, uxtw #2] addressing mode.
334define void @histogram_zext_from_i8_to_i64(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0{
335; CHECK-LABEL: histogram_zext_from_i8_to_i64:
336; CHECK:       // %bb.0:
337; CHECK-NEXT:    and z0.s, z0.s, #0xff
338; CHECK-NEXT:    mov z3.s, #1 // =0x1
339; CHECK-NEXT:    ptrue p1.s
340; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
341; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
342; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
343; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
344; CHECK-NEXT:    ret
345  %extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i64>
346  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
347  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
348  ret void
349}
350
;; i32 buckets with a literal increment of 1, where <vscale x 4 x i16> indices
;; are zero-extended to i64 before the GEP. The expected code masks the
;; indices with `and z0.s, z0.s, #0xffff` ahead of histcnt and then uses the
;; [x0, z0.s, uxtw #2] addressing mode.
351define void @histogram_zext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0{
352; CHECK-LABEL: histogram_zext_from_i16_to_i64:
353; CHECK:       // %bb.0:
354; CHECK-NEXT:    and z0.s, z0.s, #0xffff
355; CHECK-NEXT:    mov z3.s, #1 // =0x1
356; CHECK-NEXT:    ptrue p1.s
357; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
358; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
359; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
360; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
361; CHECK-NEXT:    ret
362  %extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
363  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
364  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
365  ret void
366}
367
;; i32 buckets with a literal increment of 1, where <vscale x 4 x i16> indices
;; are sign-extended to i64 before the GEP. The expected code sign-extends the
;; indices in place with sxth (under the all-true p1) ahead of histcnt and
;; then uses the [x0, z0.s, sxtw #2] addressing mode.
368define void @histogram_sext_from_i16_to_i64(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0{
369; CHECK-LABEL: histogram_sext_from_i16_to_i64:
370; CHECK:       // %bb.0:
371; CHECK-NEXT:    ptrue p1.s
372; CHECK-NEXT:    mov z3.s, #1 // =0x1
373; CHECK-NEXT:    sxth z0.s, p1/m, z0.s
374; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
375; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, sxtw #2]
376; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
377; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, sxtw #2]
378; CHECK-NEXT:    ret
379  %extended = sext <vscale x 4 x i16> %indices to <vscale x 4 x i64>
380  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
381  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
382  ret void
383}
384
;; i32 buckets with a literal increment of 1, where <vscale x 4 x i8> indices
;; are zero-extended only to i32 before the GEP. The expected code masks the
;; indices with `and z0.s, z0.s, #0xff` ahead of histcnt and uses the
;; [x0, z0.s, uxtw #2] addressing mode.
385define void @histogram_zext_from_i8_to_i32(ptr %base, <vscale x 4 x i8> %indices, <vscale x 4 x i1> %mask) #0{
386; CHECK-LABEL: histogram_zext_from_i8_to_i32:
387; CHECK:       // %bb.0:
388; CHECK-NEXT:    and z0.s, z0.s, #0xff
389; CHECK-NEXT:    mov z3.s, #1 // =0x1
390; CHECK-NEXT:    ptrue p1.s
391; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
392; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
393; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
394; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
395; CHECK-NEXT:    ret
396  %extended = zext <vscale x 4 x i8> %indices to <vscale x 4 x i32>
397  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
398  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
399  ret void
400}
401
;; i32 buckets with a literal increment of 1, where <vscale x 4 x i16> indices
;; are zero-extended only to i32 before the GEP. The expected code masks the
;; indices with `and z0.s, z0.s, #0xffff` ahead of histcnt and uses the
;; [x0, z0.s, uxtw #2] addressing mode.
402define void @histogram_zext_from_i16_to_i32(ptr %base, <vscale x 4 x i16> %indices, <vscale x 4 x i1> %mask) #0 {
403; CHECK-LABEL: histogram_zext_from_i16_to_i32:
404; CHECK:       // %bb.0:
405; CHECK-NEXT:    and z0.s, z0.s, #0xffff
406; CHECK-NEXT:    mov z3.s, #1 // =0x1
407; CHECK-NEXT:    ptrue p1.s
408; CHECK-NEXT:    histcnt z1.s, p0/z, z0.s, z0.s
409; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z0.s, uxtw #2]
410; CHECK-NEXT:    mad z1.s, p1/m, z3.s, z2.s
411; CHECK-NEXT:    st1w { z1.s }, p0, [x0, z0.s, uxtw #2]
412; CHECK-NEXT:    ret
413  %extended = zext <vscale x 4 x i16> %indices to <vscale x 4 x i32>
414  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i32> %extended
415  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> %mask)
416  ret void
417}
418
;; i32 buckets with a literal increment of 1, where <vscale x 2 x i32> indices
;; are zero-extended to i64 before the GEP. In the expected code the histcnt
;; operand is a copy of the indices masked with #0xffffffff (z1), while the
;; gather and scatter address through the original z0.d with uxtw #2.
419define void @histogram_2_lane_zext(ptr %base, <vscale x 2 x i32> %indices, <vscale x 2 x i1> %mask) #0 {
420; CHECK-LABEL: histogram_2_lane_zext:
421; CHECK:       // %bb.0:
422; CHECK-NEXT:    mov z1.d, z0.d
423; CHECK-NEXT:    mov z3.d, #1 // =0x1
424; CHECK-NEXT:    ptrue p1.d
425; CHECK-NEXT:    ld1w { z2.d }, p0/z, [x0, z0.d, uxtw #2]
426; CHECK-NEXT:    and z1.d, z1.d, #0xffffffff
427; CHECK-NEXT:    histcnt z1.d, p0/z, z1.d, z1.d
428; CHECK-NEXT:    mad z1.d, p1/m, z3.d, z2.d
429; CHECK-NEXT:    st1w { z1.d }, p0, [x0, z0.d, uxtw #2]
430; CHECK-NEXT:    ret
431  %extended = zext <vscale x 2 x i32> %indices to <vscale x 2 x i64>
432  %buckets = getelementptr i32, ptr %base, <vscale x 2 x i64> %extended
433  call void @llvm.experimental.vector.histogram.add.nxv2p0.i32(<vscale x 2 x ptr> %buckets, i32 1, <vscale x 2 x i1> %mask)
434  ret void
435}
436
;; i32 buckets with a literal increment of 1, where <vscale x 8 x i32> indices
;; are zero-extended to i64 before the GEP. The expected code unpacks the mask
;; with punpklo/punpkhi and emits two histcnt/ld1w/mad/st1w sequences on .s
;; elements (indices in z0 and z1), both using the uxtw #2 addressing mode.
437define void @histogram_8_lane_zext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0{
438; CHECK-LABEL: histogram_8_lane_zext:
439; CHECK:       // %bb.0:
440; CHECK-NEXT:    punpklo p1.h, p0.b
441; CHECK-NEXT:    mov z4.s, #1 // =0x1
442; CHECK-NEXT:    ptrue p2.s
443; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
444; CHECK-NEXT:    ld1w { z3.s }, p1/z, [x0, z0.s, uxtw #2]
445; CHECK-NEXT:    punpkhi p0.h, p0.b
446; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
447; CHECK-NEXT:    st1w { z2.s }, p1, [x0, z0.s, uxtw #2]
448; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
449; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z1.s, uxtw #2]
450; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
451; CHECK-NEXT:    st1w { z0.s }, p0, [x0, z1.s, uxtw #2]
452; CHECK-NEXT:    ret
453  %extended = zext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
454  %buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
455  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
456  ret void
457}
458
;; i32 buckets with a literal increment of 1, where <vscale x 8 x i32> indices
;; are sign-extended to i64 before the GEP. The expected code unpacks the mask
;; with punpklo/punpkhi and emits two histcnt/ld1w/mad/st1w sequences on .s
;; elements (indices in z0 and z1), both using the sxtw #2 addressing mode.
459define void @histogram_8_lane_sext(ptr %base, <vscale x 8 x i32> %indices, <vscale x 8 x i1> %mask) #0{
460; CHECK-LABEL: histogram_8_lane_sext:
461; CHECK:       // %bb.0:
462; CHECK-NEXT:    punpklo p1.h, p0.b
463; CHECK-NEXT:    mov z4.s, #1 // =0x1
464; CHECK-NEXT:    ptrue p2.s
465; CHECK-NEXT:    histcnt z2.s, p1/z, z0.s, z0.s
466; CHECK-NEXT:    ld1w { z3.s }, p1/z, [x0, z0.s, sxtw #2]
467; CHECK-NEXT:    punpkhi p0.h, p0.b
468; CHECK-NEXT:    mad z2.s, p2/m, z4.s, z3.s
469; CHECK-NEXT:    st1w { z2.s }, p1, [x0, z0.s, sxtw #2]
470; CHECK-NEXT:    histcnt z0.s, p0/z, z1.s, z1.s
471; CHECK-NEXT:    ld1w { z2.s }, p0/z, [x0, z1.s, sxtw #2]
472; CHECK-NEXT:    mad z0.s, p2/m, z4.s, z2.s
473; CHECK-NEXT:    st1w { z0.s }, p0, [x0, z1.s, sxtw #2]
474; CHECK-NEXT:    ret
475  %extended = sext <vscale x 8 x i32> %indices to <vscale x 8 x i64>
476  %buckets = getelementptr i32, ptr %base, <vscale x 8 x i64> %extended
477  call void @llvm.experimental.vector.histogram.add.nxv8p0.i32(<vscale x 8 x ptr> %buckets, i32 1, <vscale x 8 x i1> %mask)
478  ret void
479}
480
;; The intrinsic is called with a zeroinitializer mask (the %mask argument is
;; not used), so no lane is active. The expected function body is a bare ret
;; with no histcnt, load or store.
481define void @histogram_zero_mask(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> %mask) #0{
482; CHECK-LABEL: histogram_zero_mask:
483; CHECK:       // %bb.0:
484; CHECK-NEXT:    ret
485  call void @llvm.experimental.vector.histogram.add.nxv2p0.i64(<vscale x 2 x ptr> %buckets, i64 %inc, <vscale x 2 x i1> zeroinitializer)
486  ret void
487}
488
;; Zero-mask variant where the bucket addresses come from a GEP over
;; sign-extended <vscale x 4 x i32> indices. The intrinsic is called with a
;; zeroinitializer mask (the %mask argument is not used), and the expected
;; function body is a bare ret.
489define void @histogram_sext_zero_mask(ptr %base, <vscale x 4 x i32> %indices, <vscale x 4 x i1> %mask) #0{
490; CHECK-LABEL: histogram_sext_zero_mask:
491; CHECK:       // %bb.0:
492; CHECK-NEXT:    ret
493  %extended = sext <vscale x 4 x i32> %indices to <vscale x 4 x i64>
494  %buckets = getelementptr i32, ptr %base, <vscale x 4 x i64> %extended
495  call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %buckets, i32 1, <vscale x 4 x i1> zeroinitializer)
496  ret void
497}
498
;; Attribute group referenced as #0 by the functions above: enables the sve2
;; target feature and declares vscale_range(1, 16).
499attributes #0 = { "target-features"="+sve2" vscale_range(1, 16) }
500