; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s
; RUN: llc < %s -mtriple=arm64-eabi -global-isel | FileCheck %s
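; This file checks that the llvm.aarch64.neon.{s,u}addv and faddv reduction
; intrinsics (the vaddv/vaddvq builtins) lower to ADDV, ADDP, and FADDP, both
; when the scalar result is used directly and when it only feeds a lane insert.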

define signext i8 @test_vaddv_s8(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    smov w0, v0.b[0]
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

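; When the reduction result is consumed only by an insertelement, the value
; should stay in a SIMD register and be moved with INS ("mov v0.b[3], v1.b[0]")
; instead of round-tripping through a GPR. The "kill" lines are machine-IR
; annotations from the register allocator describing d0's relationship to q0.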
define <8 x i8> @test_vaddv_s8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: test_vaddv_s8_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b1, v1.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.b[3], v1.b[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <8 x i8> %a1, i8 %1, i32 3
  ret <8 x i8> %2
}

define signext i16 @test_vaddv_s16(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    smov w0, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <4 x i16> @test_vaddv_s16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: test_vaddv_s16_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h1, v1.4h
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.h[3], v1.h[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <4 x i16> %a1, i16 %1, i32 3
  ret <4 x i16> %2
}

define i32 @test_vaddv_s32(<2 x i32> %a1) {
; CHECK-LABEL: test_vaddv_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
; There is no ADDV for 2 x i32 (no 2S arrangement), so this case is lowered to
; a pairwise ADDP instead.
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a1)
  ret i32 %vaddv.i
}

define <2 x i32> @test_vaddv_s32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
; CHECK-LABEL: test_vaddv_s32_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v1.2s, v1.2s, v1.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.s[1], v1.s[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a2)
  %1 = insertelement <2 x i32> %a1, i32 %0, i32 1
  ret <2 x i32> %1
}

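; ADDV has no 2D arrangement either, so 2 x i64 reductions use the scalar
; pairwise ADDP (addp d0, v0.2d).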
define i64 @test_vaddv_s64(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_s64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a1)
  ret i64 %vaddv.i
}

define <2 x i64> @test_vaddv_s64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_vaddv_s64_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d1, v1.2d
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64> %a2)
  %1 = insertelement <2 x i64> %a1, i64 %0, i64 1
  ret <2 x i64> %1
}

define zeroext i8 @test_vaddv_u8(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <8 x i8> @test_vaddv_u8_used_by_laneop(<8 x i8> %a1, <8 x i8> %a2) {
; CHECK-LABEL: test_vaddv_u8_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b1, v1.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.b[3], v1.b[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <8 x i8> %a1, i8 %1, i32 3
  ret <8 x i8> %2
}

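; The masked tests check that the AND is folded away: ADDV into a B (resp. H)
; destination leaves at most 8 (resp. 16) significant bits, all of which are
; covered by the mask.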
define i32 @test_vaddv_u8_masked(<8 x i8> %a1) {
; CHECK-LABEL: test_vaddv_u8_masked:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8> %a1)
  %0 = and i32 %vaddv.i, 511 ; 0x1ff
  ret i32 %0
}

define zeroext i16 @test_vaddv_u16(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <4 x i16> @test_vaddv_u16_used_by_laneop(<4 x i16> %a1, <4 x i16> %a2) {
; CHECK-LABEL: test_vaddv_u16_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h1, v1.4h
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.h[3], v1.h[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <4 x i16> %a1, i16 %1, i32 3
  ret <4 x i16> %2
}

define i32 @test_vaddv_u16_masked(<4 x i16> %a1) {
; CHECK-LABEL: test_vaddv_u16_masked:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16> %a1)
  %0 = and i32 %vaddv.i, 3276799 ; 0x31ffff
  ret i32 %0
}

define i32 @test_vaddv_u32(<2 x i32> %a1) {
; CHECK-LABEL: test_vaddv_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
; There is no ADDV for 2 x i32 (no 2S arrangement), so this case is lowered to
; a pairwise ADDP instead.
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a1)
  ret i32 %vaddv.i
}

define <2 x i32> @test_vaddv_u32_used_by_laneop(<2 x i32> %a1, <2 x i32> %a2) {
; CHECK-LABEL: test_vaddv_u32_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v1.2s, v1.2s, v1.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    mov v0.s[1], v1.s[0]
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32> %a2)
  %1 = insertelement <2 x i32> %a1, i32 %0, i32 1
  ret <2 x i32> %1
}

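; Floating-point reductions use pairwise FADDP; the 4 x float case takes a
; vector FADDP followed by a scalar one.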
define float @test_vaddv_f32(<2 x float> %a1) {
; CHECK-LABEL: test_vaddv_f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
  ret float %vaddv.i
}

define float @test_vaddv_v4f32(<4 x float> %a1) {
; CHECK-LABEL: test_vaddv_v4f32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    faddp v0.4s, v0.4s, v0.4s
; CHECK-NEXT:    faddp s0, v0.2s
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
  ret float %vaddv.i
}

define double @test_vaddv_f64(<2 x double> %a1) {
; CHECK-LABEL: test_vaddv_f64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    faddp d0, v0.2d
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)
  ret double %vaddv.i
}

define i64 @test_vaddv_u64(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_u64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
  ret i64 %vaddv.i
}

define <2 x i64> @test_vaddv_u64_used_by_laneop(<2 x i64> %a1, <2 x i64> %a2) {
; CHECK-LABEL: test_vaddv_u64_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d1, v1.2d
; CHECK-NEXT:    mov v0.d[1], v1.d[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a2)
  %1 = insertelement <2 x i64> %a1, i64 %0, i64 1
  ret <2 x i64> %1
}

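; When the i64 result is immediately inserted into a <1 x i64>, it should stay
; in d0 rather than being moved to a GPR and back.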
define <1 x i64> @test_vaddv_u64_to_vec(<2 x i64> %a1) {
; CHECK-LABEL: test_vaddv_u64_to_vec:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64> %a1)
  %vec = insertelement <1 x i64> undef, i64 %vaddv.i, i32 0
  ret <1 x i64> %vec
}

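; The vaddvq variants below reduce full 128-bit vectors; their laneop tests
; need no "kill" annotations because the operands already live in Q registers.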
define signext i8 @test_vaddvq_s8(<16 x i8> %a1) {
; CHECK-LABEL: test_vaddvq_s8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.16b
; CHECK-NEXT:    smov w0, v0.b[0]
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <16 x i8> @test_vaddvq_s8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_vaddvq_s8_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b1, v1.16b
; CHECK-NEXT:    mov v0.b[3], v1.b[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <16 x i8> %a1, i8 %1, i32 3
  ret <16 x i8> %2
}

define signext i16 @test_vaddvq_s16(<8 x i16> %a1) {
; CHECK-LABEL: test_vaddvq_s16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.8h
; CHECK-NEXT:    smov w0, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <8 x i16> @test_vaddvq_s16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_vaddvq_s16_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h1, v1.8h
; CHECK-NEXT:    mov v0.h[3], v1.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <8 x i16> %a1, i16 %1, i32 3
  ret <8 x i16> %2
}

define i32 @test_vaddvq_s32(<4 x i32> %a1) {
; CHECK-LABEL: test_vaddvq_s32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a1)
  ret i32 %vaddv.i
}

define <4 x i32> @test_vaddvq_s32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_vaddvq_s32_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s1, v1.4s
; CHECK-NEXT:    mov v0.s[3], v1.s[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32> %a2)
  %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
  ret <4 x i32> %1
}

define zeroext i8 @test_vaddvq_u8(<16 x i8> %a1) {
; CHECK-LABEL: test_vaddvq_u8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.16b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a1)
  %0 = trunc i32 %vaddv.i to i8
  ret i8 %0
}

define <16 x i8> @test_vaddvq_u8_used_by_laneop(<16 x i8> %a1, <16 x i8> %a2) {
; CHECK-LABEL: test_vaddvq_u8_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b1, v1.16b
; CHECK-NEXT:    mov v0.b[3], v1.b[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8> %a2)
  %1 = trunc i32 %0 to i8
  %2 = insertelement <16 x i8> %a1, i8 %1, i32 3
  ret <16 x i8> %2
}

define zeroext i16 @test_vaddvq_u16(<8 x i16> %a1) {
; CHECK-LABEL: test_vaddvq_u16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a1)
  %0 = trunc i32 %vaddv.i to i16
  ret i16 %0
}

define <8 x i16> @test_vaddvq_u16_used_by_laneop(<8 x i16> %a1, <8 x i16> %a2) {
; CHECK-LABEL: test_vaddvq_u16_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h1, v1.8h
; CHECK-NEXT:    mov v0.h[3], v1.h[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16> %a2)
  %1 = trunc i32 %0 to i16
  %2 = insertelement <8 x i16> %a1, i16 %1, i32 3
  ret <8 x i16> %2
}

define i32 @test_vaddvq_u32(<4 x i32> %a1) {
; CHECK-LABEL: test_vaddvq_u32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %vaddv.i = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a1)
  ret i32 %vaddv.i
}

define <4 x i32> @test_vaddvq_u32_used_by_laneop(<4 x i32> %a1, <4 x i32> %a2) {
; CHECK-LABEL: test_vaddvq_u32_used_by_laneop:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s1, v1.4s
; CHECK-NEXT:    mov v0.s[3], v1.s[0]
; CHECK-NEXT:    ret
entry:
  %0 = tail call i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32> %a2)
  %1 = insertelement <4 x i32> %a1, i32 %0, i32 3
  ret <4 x i32> %1
}

declare i32 @llvm.aarch64.neon.uaddv.i32.v4i32(<4 x i32>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v8i16(<8 x i16>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v16i8(<16 x i8>)

declare i32 @llvm.aarch64.neon.saddv.i32.v4i32(<4 x i32>)

declare i32 @llvm.aarch64.neon.saddv.i32.v8i16(<8 x i16>)

declare i32 @llvm.aarch64.neon.saddv.i32.v16i8(<16 x i8>)

declare i64 @llvm.aarch64.neon.uaddv.i64.v2i64(<2 x i64>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v2i32(<2 x i32>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v4i16(<4 x i16>)

declare i32 @llvm.aarch64.neon.uaddv.i32.v8i8(<8 x i8>)

declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)

declare i64 @llvm.aarch64.neon.saddv.i64.v2i64(<2 x i64>)

declare i32 @llvm.aarch64.neon.saddv.i32.v4i16(<4 x i16>)

declare i32 @llvm.aarch64.neon.saddv.i32.v8i8(<8 x i8>)

declare float @llvm.aarch64.neon.faddv.f32.v2f32(<2 x float> %a1)
declare float @llvm.aarch64.neon.faddv.f32.v4f32(<4 x float> %a1)
declare double @llvm.aarch64.neon.faddv.f64.v2f64(<2 x double> %a1)