; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
; RUN: llc -mtriple=arm64-none-linux-gnu -mattr=+neon -global-isel < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI

declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>)

; Pairwise add of two 64-bit i8 vectors should select a single ADDP.8B.
define <8 x i8> @test_addp_v8i8(<8 x i8> %lhs, <8 x i8> %rhs) {
; Using registers other than v0, v1 are possible, but would be odd.
; CHECK-LABEL: test_addp_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %tmp1 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %lhs, <8 x i8> %rhs)
  ret <8 x i8> %tmp1
}
declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>)

; Pairwise add of two 128-bit i8 vectors should select a single ADDP.16B.
define <16 x i8> @test_addp_v16i8(<16 x i8> %lhs, <16 x i8> %rhs) {
; CHECK-LABEL: test_addp_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %tmp1 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %lhs, <16 x i8> %rhs)
  ret <16 x i8> %tmp1
}
declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>)

; Pairwise add of two 64-bit i16 vectors should select a single ADDP.4H.
define <4 x i16> @test_addp_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
; CHECK-LABEL: test_addp_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %tmp1 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %lhs, <4 x i16> %rhs)
  ret <4 x i16> %tmp1
}
declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>)

; Pairwise add of two 128-bit i16 vectors should select a single ADDP.8H.
define <8 x i16> @test_addp_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
; CHECK-LABEL: test_addp_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %tmp1 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i16> %tmp1
}
declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>)

; Pairwise add of two 64-bit i32 vectors should select a single ADDP.2S.
define <2 x i32> @test_addp_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
; CHECK-LABEL: test_addp_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %tmp1 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %lhs, <2 x i32> %rhs)
  ret <2 x i32> %tmp1
}
declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>)

; Pairwise add of two 128-bit i32 vectors should select a single ADDP.4S.
define <4 x i32> @test_addp_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: test_addp_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %tmp1 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i32> %tmp1
}

declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>)

; Pairwise add of two 128-bit i64 vectors should select a single ADDP.2D.
define <2 x i64> @test_addp_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; CHECK-LABEL: test_addp_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
        %val = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %lhs, <2 x i64> %rhs)
        ret <2 x i64> %val
}
declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>)
declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>)
declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>)

; Floating-point pairwise add of two 64-bit f32 vectors selects FADDP.2S.
define <2 x float> @test_faddp_v2f32(<2 x float> %lhs, <2 x float> %rhs) {
; CHECK-LABEL: test_faddp_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
        %val = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %lhs, <2 x float> %rhs)
        ret <2 x float> %val
}
; Floating-point pairwise add of two 128-bit f32 vectors selects FADDP.4S.
define <4 x float> @test_faddp_v4f32(<4 x float> %lhs, <4 x float> %rhs) {
; CHECK-LABEL: test_faddp_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
        %val = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %lhs, <4 x float> %rhs)
        ret <4 x float> %val
}

; Floating-point pairwise add of two 128-bit f64 vectors selects FADDP.2D.
define <2 x double> @test_faddp_v2f64(<2 x double> %lhs, <2 x double> %rhs) {
; CHECK-LABEL: test_faddp_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    faddp v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
        %val = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %lhs, <2 x double> %rhs)
        ret <2 x double> %val
}

; Across-vector signed add of a 2-element i32 vector lowers to a self ADDP
; followed by a scalar move out of the vector register.
define i32 @test_vaddv.v2i32(<2 x i32> %a) {
; CHECK-LABEL: test_vaddv.v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %1 = tail call i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32> %a)
  ret i32 %1
}

declare i32 @llvm.aarch64.neon.saddv.i32.v2i32(<2 x i32>)

; Split-halves + ADDP + final lane add reduction; SDAG and GlobalISel lower
; the shuffles differently, hence the separate check prefixes.
define i32 @addp_v4i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-SD-LABEL: addp_v4i32:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-SD-NEXT:    addp v0.4s, v0.4s, v0.4s
; CHECK-SD-NEXT:    dup v1.2s, v0.s[1]
; CHECK-SD-NEXT:    add v0.2s, v0.2s, v1.2s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addp_v4i32:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    addp v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    rev64 v1.2s, v0.2s
; CHECK-GI-NEXT:    add v0.2s, v0.2s, v1.2s
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
  %1 = add <4 x i32> %a, %b
  %2 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
  %3 = shufflevector <4 x i32> %1, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %4 = tail call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %2, <2 x i32> %3)
  %5 = shufflevector <2 x i32> %4, <2 x i32> poison, <2 x i32> <i32 1, i32 poison>
  %6 = add <2 x i32> %4, %5
  %7 = extractelement <2 x i32> %6, i64 0
  ret i32 %7
}

; ADDP of the low/high halves of a v8i16: SDAG folds it into a full-width
; ADDP.8H, while GlobalISel extracts the high half and uses ADDP.4H.
define <4 x i16> @addp_v8i16(<8 x i16> %a, <8 x i16> %b) {
; CHECK-SD-LABEL: addp_v8i16:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    add v0.8h, v0.8h, v1.8h
; CHECK-SD-NEXT:    addp v0.8h, v0.8h, v0.8h
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addp_v8i16:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    add v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    addp v0.4h, v0.4h, v1.4h
; CHECK-GI-NEXT:    ret
  %1 = add <8 x i16> %a, %b
  %2 = shufflevector <8 x i16> %1, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %3 = shufflevector <8 x i16> %1, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %4 = tail call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %2, <4 x i16> %3)
  ret <4 x i16> %4
}

; ADDP of the low/high halves of a v16i8: SDAG folds it into a full-width
; ADDP.16B, while GlobalISel extracts the high half and uses ADDP.8B.
define <8 x i8> @addp_v16i8(<16 x i8> %a, <16 x i8> %b) {
; CHECK-SD-LABEL: addp_v16i8:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    add v0.16b, v0.16b, v1.16b
; CHECK-SD-NEXT:    addp v0.16b, v0.16b, v0.16b
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addp_v16i8:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    add v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    mov d1, v0.d[1]
; CHECK-GI-NEXT:    addp v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT:    ret
  %1 = add <16 x i8> %a, %b
  %2 = shufflevector <16 x i8> %1, <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %3 = shufflevector <16 x i8> %1, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %4 = tail call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %2, <8 x i8> %3)
  ret <8 x i8> %4
}