xref: /llvm-project/llvm/test/CodeGen/AArch64/arm64-vadd.ll (revision 50df08cd43ec02c58067797df33ec67c128431bb)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
2; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4
5; CHECK-GI:         warning: Instruction selection used fallback path for saddlp1d
6; CHECK-GI-NEXT:    warning: Instruction selection used fallback path for uaddlp1d
7
; addhn8b: load two <8 x i16> operands and pass them to the ADDHN intrinsic.
; Both instruction selectors must emit a single addhn producing v0.8b.
define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addhn8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    addhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %lhs = load <8 x i16>, ptr %A
  %rhs = load <8 x i16>, ptr %B
  %narrow = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i8> %narrow
}
20
; addhn4h: load two <4 x i32> operands and pass them to the ADDHN intrinsic.
; Both instruction selectors must emit a single addhn producing v0.4h.
define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addhn4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    addhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %lhs = load <4 x i32>, ptr %A
  %rhs = load <4 x i32>, ptr %B
  %narrow = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i16> %narrow
}
33
; addhn2s: load two <2 x i64> operands and pass them to the ADDHN intrinsic.
; Both instruction selectors must emit a single addhn producing v0.2s.
define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addhn2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    addhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %lhs = load <2 x i64>, ptr %A
  %rhs = load <2 x i64>, ptr %B
  %narrow = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %lhs, <2 x i64> %rhs)
  ret <2 x i32> %narrow
}
46
; addhn2_16b: concatenate two ADDHN results of the same operands into one
; <16 x i8>. The second narrowing is expected to become addhn2, writing the
; upper half of the register that the first addhn filled.
define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: addhn2_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addhn v2.8b, v0.8h, v1.8h
; CHECK-NEXT:    addhn2 v2.16b, v0.8h, v1.8h
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %lo = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %hi = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %cat = shufflevector <8 x i8> %lo, <8 x i8> %hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %cat
}
59
; addhn2_8h: concatenate two ADDHN results of the same operands into one
; <8 x i16>. The second narrowing is expected to become addhn2, writing the
; upper half of the register that the first addhn filled.
define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: addhn2_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addhn v2.4h, v0.4s, v1.4s
; CHECK-NEXT:    addhn2 v2.8h, v0.4s, v1.4s
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %lo = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %hi = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %cat = shufflevector <4 x i16> %lo, <4 x i16> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %cat
}
72
; addhn2_4s: concatenate two ADDHN results of the same operands into one
; <4 x i32>. The second narrowing is expected to become addhn2, writing the
; upper half of the register that the first addhn filled.
define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
; CHECK-LABEL: addhn2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    addhn v2.2s, v0.2d, v1.2d
; CHECK-NEXT:    addhn2 v2.4s, v0.2d, v1.2d
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %lo = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %hi = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %cat = shufflevector <2 x i32> %lo, <2 x i32> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %cat
}
85
; ADDHN intrinsic overloads used by the addhn* tests, narrowest element first.
declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
89
90
; raddhn8b: load two <8 x i16> operands and pass them to the RADDHN intrinsic.
; Both instruction selectors must emit a single raddhn producing v0.8b.
define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: raddhn8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    raddhn v0.8b, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %lhs = load <8 x i16>, ptr %A
  %rhs = load <8 x i16>, ptr %B
  %narrow = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %lhs, <8 x i16> %rhs)
  ret <8 x i8> %narrow
}
103
; raddhn4h: load two <4 x i32> operands and pass them to the RADDHN intrinsic.
; Both instruction selectors must emit a single raddhn producing v0.4h.
define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: raddhn4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    raddhn v0.4h, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %lhs = load <4 x i32>, ptr %A
  %rhs = load <4 x i32>, ptr %B
  %narrow = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %lhs, <4 x i32> %rhs)
  ret <4 x i16> %narrow
}
116
; raddhn2s: load two <2 x i64> operands and pass them to the RADDHN intrinsic.
; Both instruction selectors must emit a single raddhn producing v0.2s.
define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: raddhn2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    raddhn v0.2s, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %lhs = load <2 x i64>, ptr %A
  %rhs = load <2 x i64>, ptr %B
  %narrow = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %lhs, <2 x i64> %rhs)
  ret <2 x i32> %narrow
}
129
; raddhn2_16b: concatenate two RADDHN results of the same operands into one
; <16 x i8>. The second narrowing is expected to become raddhn2, writing the
; upper half of the register that the first raddhn filled.
define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: raddhn2_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    raddhn v2.8b, v0.8h, v1.8h
; CHECK-NEXT:    raddhn2 v2.16b, v0.8h, v1.8h
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %lo = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %hi = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind
  %cat = shufflevector <8 x i8> %lo, <8 x i8> %hi, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %cat
}
142
; raddhn2_8h: concatenate two RADDHN results of the same operands into one
; <8 x i16>. The second narrowing is expected to become raddhn2, writing the
; upper half of the register that the first raddhn filled.
define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: raddhn2_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    raddhn v2.4h, v0.4s, v1.4s
; CHECK-NEXT:    raddhn2 v2.8h, v0.4s, v1.4s
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %lo = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %hi = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind
  %cat = shufflevector <4 x i16> %lo, <4 x i16> %hi, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %cat
}
155
; raddhn2_4s: concatenate two RADDHN results of the same operands into one
; <4 x i32>. The second narrowing is expected to become raddhn2, writing the
; upper half of the register that the first raddhn filled.
define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind {
; CHECK-LABEL: raddhn2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    raddhn v2.2s, v0.2d, v1.2d
; CHECK-NEXT:    raddhn2 v2.4s, v0.2d, v1.2d
; CHECK-NEXT:    mov v0.16b, v2.16b
; CHECK-NEXT:    ret
  %lo = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %hi = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind
  %cat = shufflevector <2 x i32> %lo, <2 x i32> %hi, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %cat
}
168
; RADDHN intrinsic overloads used by the raddhn* tests, narrowest element first.
declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64>, <2 x i64>) nounwind readnone
172
; saddl8h: sign-extend two loaded <8 x i8> vectors to <8 x i16> and add them.
; The sext + sext + add pattern must be selected as one saddl.
define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: saddl8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    saddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %lhs = load <8 x i8>, ptr %A
  %rhs = load <8 x i8>, ptr %B
  %lhs.ext = sext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = sext <8 x i8> %rhs to <8 x i16>
  %sum = add <8 x i16> %lhs.ext, %rhs.ext
  ret <8 x i16> %sum
}
187
; saddl4s: sign-extend two loaded <4 x i16> vectors to <4 x i32> and add them.
; The sext + sext + add pattern must be selected as one saddl.
define <4 x i32> @saddl4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: saddl4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    saddl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %lhs.ext = sext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = sext <4 x i16> %rhs to <4 x i32>
  %sum = add <4 x i32> %lhs.ext, %rhs.ext
  ret <4 x i32> %sum
}
202
; saddl2d: sign-extend two loaded <2 x i32> vectors to <2 x i64> and add them.
; The sext + sext + add pattern must be selected as one saddl.
define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: saddl2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    saddl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %lhs.ext = sext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = sext <2 x i32> %rhs to <2 x i64>
  %sum = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %sum
}
217
; saddl2_8h: take the upper 64 bits of each <16 x i8> argument (through a
; <2 x i64> view), sign-extend them to <8 x i16> and add. The whole sequence
; must be selected as one saddl2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: saddl2_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    saddl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %a.i64 = bitcast <16 x i8> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <8 x i8>
  %a.ext = sext <8 x i8> %a.hi to <8 x i16>
  %b.i64 = bitcast <16 x i8> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
  %b.ext = sext <8 x i8> %b.hi to <8 x i16>
  %sum = add <8 x i16> %a.ext, %b.ext
  ret <8 x i16> %sum
}
234
; saddl2_4s: take the upper 64 bits of each <8 x i16> argument (through a
; <2 x i64> view), sign-extend them to <4 x i32> and add. The whole sequence
; must be selected as one saddl2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: saddl2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    saddl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %a.i64 = bitcast <8 x i16> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <4 x i16>
  %a.ext = sext <4 x i16> %a.hi to <4 x i32>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
  %b.ext = sext <4 x i16> %b.hi to <4 x i32>
  %sum = add <4 x i32> %a.ext, %b.ext
  ret <4 x i32> %sum
}
251
; saddl2_2d: take the upper 64 bits of each <4 x i32> argument (through a
; <2 x i64> view), sign-extend them to <2 x i64> and add. The whole sequence
; must be selected as one saddl2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: saddl2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    saddl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %a.i64 = bitcast <4 x i32> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <2 x i32>
  %a.ext = sext <2 x i32> %a.hi to <2 x i64>
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
  %b.ext = sext <2 x i32> %b.hi to <2 x i64>
  %sum = add <2 x i64> %a.ext, %b.ext
  ret <2 x i64> %sum
}
268
; uaddl8h: zero-extend two loaded <8 x i8> vectors to <8 x i16> and add them.
; The zext + zext + add pattern must be selected as one uaddl.
define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uaddl8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uaddl v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %lhs = load <8 x i8>, ptr %A
  %rhs = load <8 x i8>, ptr %B
  %lhs.ext = zext <8 x i8> %lhs to <8 x i16>
  %rhs.ext = zext <8 x i8> %rhs to <8 x i16>
  %sum = add <8 x i16> %lhs.ext, %rhs.ext
  ret <8 x i16> %sum
}
283
; uaddl4s: zero-extend two loaded <4 x i16> vectors to <4 x i32> and add them.
; The zext + zext + add pattern must be selected as one uaddl.
define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uaddl4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uaddl v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %lhs = load <4 x i16>, ptr %A
  %rhs = load <4 x i16>, ptr %B
  %lhs.ext = zext <4 x i16> %lhs to <4 x i32>
  %rhs.ext = zext <4 x i16> %rhs to <4 x i32>
  %sum = add <4 x i32> %lhs.ext, %rhs.ext
  ret <4 x i32> %sum
}
298
; uaddl2d: zero-extend two loaded <2 x i32> vectors to <2 x i64> and add them.
; The zext + zext + add pattern must be selected as one uaddl.
define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uaddl2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %lhs = load <2 x i32>, ptr %A
  %rhs = load <2 x i32>, ptr %B
  %lhs.ext = zext <2 x i32> %lhs to <2 x i64>
  %rhs.ext = zext <2 x i32> %rhs to <2 x i64>
  %sum = add <2 x i64> %lhs.ext, %rhs.ext
  ret <2 x i64> %sum
}
313
314
; uaddl2_8h: take the upper 64 bits of each <16 x i8> argument (through a
; <2 x i64> view), zero-extend them to <8 x i16> and add. The whole sequence
; must be selected as one uaddl2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind {
; CHECK-LABEL: uaddl2_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uaddl2 v0.8h, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %a.i64 = bitcast <16 x i8> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <8 x i8>
  %a.ext = zext <8 x i8> %a.hi to <8 x i16>
  %b.i64 = bitcast <16 x i8> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <8 x i8>
  %b.ext = zext <8 x i8> %b.hi to <8 x i16>
  %sum = add <8 x i16> %a.ext, %b.ext
  ret <8 x i16> %sum
}
331
; uaddl2_4s: take the upper 64 bits of each <8 x i16> argument (through a
; <2 x i64> view), zero-extend them to <4 x i32> and add. The whole sequence
; must be selected as one uaddl2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind {
; CHECK-LABEL: uaddl2_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uaddl2 v0.4s, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %a.i64 = bitcast <8 x i16> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <4 x i16>
  %a.ext = zext <4 x i16> %a.hi to <4 x i32>
  %b.i64 = bitcast <8 x i16> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <4 x i16>
  %b.ext = zext <4 x i16> %b.hi to <4 x i32>
  %sum = add <4 x i32> %a.ext, %b.ext
  ret <4 x i32> %sum
}
348
; uaddl2_2d: take the upper 64 bits of each <4 x i32> argument (through a
; <2 x i64> view), zero-extend them to <2 x i64> and add. The whole sequence
; must be selected as one uaddl2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind {
; CHECK-LABEL: uaddl2_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    uaddl2 v0.2d, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %a.i64 = bitcast <4 x i32> %a to <2 x i64>
  %a.hi64 = shufflevector <2 x i64> %a.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %a.hi = bitcast <1 x i64> %a.hi64 to <2 x i32>
  %a.ext = zext <2 x i32> %a.hi to <2 x i64>
  %b.i64 = bitcast <4 x i32> %b to <2 x i64>
  %b.hi64 = shufflevector <2 x i64> %b.i64, <2 x i64> poison, <1 x i32> <i32 1>
  %b.hi = bitcast <1 x i64> %b.hi64 to <2 x i32>
  %b.ext = zext <2 x i32> %b.hi to <2 x i64>
  %sum = add <2 x i64> %a.ext, %b.ext
  ret <2 x i64> %sum
}
365
; uaddw8h: add a zero-extended <8 x i8> (from %B) to an <8 x i16> accumulator
; (from %A). The zext + add pattern must be selected as one uaddw.
define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uaddw8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uaddw v0.8h, v0.8h, v1.8b
; CHECK-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %src = load <8 x i8>, ptr %B
  %src.ext = zext <8 x i8> %src to <8 x i16>
  %sum = add <8 x i16> %acc, %src.ext
  ret <8 x i16> %sum
}
379
; uaddw4s: add a zero-extended <4 x i16> (from %B) to a <4 x i32> accumulator
; (from %A). The zext + add pattern must be selected as one uaddw.
define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uaddw4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %src = load <4 x i16>, ptr %B
  %src.ext = zext <4 x i16> %src to <4 x i32>
  %sum = add <4 x i32> %acc, %src.ext
  ret <4 x i32> %sum
}
393
; uaddw2d: add a zero-extended <2 x i32> (from %B) to a <2 x i64> accumulator
; (from %A). The zext + add pattern must be selected as one uaddw.
define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uaddw2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %src = load <2 x i32>, ptr %B
  %src.ext = zext <2 x i32> %src to <2 x i64>
  %sum = add <2 x i64> %acc, %src.ext
  ret <2 x i64> %sum
}
407
; uaddw2_8h: zero-extend the upper eight bytes of the <16 x i8> loaded from %B
; and add them to the <8 x i16> loaded from %A.
; SelectionDAG narrows the second load to the high half ([x1, #8]) and uses
; uaddw; GlobalISel loads the full vector and uses uaddw2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <8 x i16> @uaddw2_8h(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: uaddw2_8h:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr d1, [x1, #8]
; CHECK-SD-NEXT:    uaddw v0.8h, v0.8h, v1.8b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uaddw2_8h:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    uaddw2 v0.8h, v0.8h, v1.16b
; CHECK-GI-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %src = load <16 x i8>, ptr %B
  %src.hi = shufflevector <16 x i8> %src, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %src.ext = zext <8 x i8> %src.hi to <8 x i16>
  %sum = add <8 x i16> %acc, %src.ext
  ret <8 x i16> %sum
}
431
; uaddw2_4s: zero-extend the upper four halfwords of the <8 x i16> loaded from
; %B and add them to the <4 x i32> loaded from %A.
; SelectionDAG narrows the second load to the high half ([x1, #8]) and uses
; uaddw; GlobalISel loads the full vector and uses uaddw2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <4 x i32> @uaddw2_4s(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: uaddw2_4s:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr d1, [x1, #8]
; CHECK-SD-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uaddw2_4s:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-GI-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %src = load <8 x i16>, ptr %B
  %src.hi = shufflevector <8 x i16> %src, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %src.ext = zext <4 x i16> %src.hi to <4 x i32>
  %sum = add <4 x i32> %acc, %src.ext
  ret <4 x i32> %sum
}
455
; uaddw2_2d: zero-extend the upper two words of the <4 x i32> loaded from %B
; and add them to the <2 x i64> loaded from %A.
; SelectionDAG narrows the second load to the high half ([x1, #8]) and uses
; uaddw; GlobalISel loads the full vector and uses uaddw2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <2 x i64> @uaddw2_2d(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: uaddw2_2d:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr d1, [x1, #8]
; CHECK-SD-NEXT:    uaddw v0.2d, v0.2d, v1.2s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uaddw2_2d:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    uaddw2 v0.2d, v0.2d, v1.4s
; CHECK-GI-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %src = load <4 x i32>, ptr %B
  %src.hi = shufflevector <4 x i32> %src, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %src.ext = zext <2 x i32> %src.hi to <2 x i64>
  %sum = add <2 x i64> %acc, %src.ext
  ret <2 x i64> %sum
}
479
; saddw8h: add a sign-extended <8 x i8> (from %B) to an <8 x i16> accumulator
; (from %A). The sext + add pattern must be selected as one saddw.
define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: saddw8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    saddw v0.8h, v0.8h, v1.8b
; CHECK-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %src = load <8 x i8>, ptr %B
  %src.ext = sext <8 x i8> %src to <8 x i16>
  %sum = add <8 x i16> %acc, %src.ext
  ret <8 x i16> %sum
}
493
; saddw4s: add a sign-extended <4 x i16> (from %B) to a <4 x i32> accumulator
; (from %A). The sext + add pattern must be selected as one saddw.
define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: saddw4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    saddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %src = load <4 x i16>, ptr %B
  %src.ext = sext <4 x i16> %src to <4 x i32>
  %sum = add <4 x i32> %acc, %src.ext
  ret <4 x i32> %sum
}
507
; saddw2d: add a sign-extended <2 x i32> (from %B) to a <2 x i64> accumulator
; (from %A). The sext + add pattern must be selected as one saddw.
define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: saddw2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    saddw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %src = load <2 x i32>, ptr %B
  %src.ext = sext <2 x i32> %src to <2 x i64>
  %sum = add <2 x i64> %acc, %src.ext
  ret <2 x i64> %sum
}
521
; saddw2_8h: sign-extend the upper eight bytes of the <16 x i8> loaded from %B
; and add them to the <8 x i16> loaded from %A.
; SelectionDAG narrows the second load to the high half ([x1, #8]) and uses
; saddw; GlobalISel loads the full vector and uses saddw2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <8 x i16> @saddw2_8h(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: saddw2_8h:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr d1, [x1, #8]
; CHECK-SD-NEXT:    saddw v0.8h, v0.8h, v1.8b
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: saddw2_8h:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    saddw2 v0.8h, v0.8h, v1.16b
; CHECK-GI-NEXT:    ret
  %acc = load <8 x i16>, ptr %A
  %src = load <16 x i8>, ptr %B
  %src.hi = shufflevector <16 x i8> %src, <16 x i8> poison, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %src.ext = sext <8 x i8> %src.hi to <8 x i16>
  %sum = add <8 x i16> %acc, %src.ext
  ret <8 x i16> %sum
}
545
; saddw2_4s: sign-extend the upper four halfwords of the <8 x i16> loaded from
; %B and add them to the <4 x i32> loaded from %A.
; SelectionDAG narrows the second load to the high half ([x1, #8]) and uses
; saddw; GlobalISel loads the full vector and uses saddw2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <4 x i32> @saddw2_4s(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: saddw2_4s:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr d1, [x1, #8]
; CHECK-SD-NEXT:    saddw v0.4s, v0.4s, v1.4h
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: saddw2_4s:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
; CHECK-GI-NEXT:    ret
  %acc = load <4 x i32>, ptr %A
  %src = load <8 x i16>, ptr %B
  %src.hi = shufflevector <8 x i16> %src, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
  %src.ext = sext <4 x i16> %src.hi to <4 x i32>
  %sum = add <4 x i32> %acc, %src.ext
  ret <4 x i32> %sum
}
569
; saddw2_2d: sign-extend the upper two words of the <4 x i32> loaded from %B
; and add them to the <2 x i64> loaded from %A.
; SelectionDAG narrows the second load to the high half ([x1, #8]) and uses
; saddw; GlobalISel loads the full vector and uses saddw2.
; The shuffle mask only reads the first operand, so the unused second operand
; is poison rather than the deprecated undef.
define <2 x i64> @saddw2_2d(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: saddw2_2d:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr d1, [x1, #8]
; CHECK-SD-NEXT:    saddw v0.2d, v0.2d, v1.2s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: saddw2_2d:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    saddw2 v0.2d, v0.2d, v1.4s
; CHECK-GI-NEXT:    ret
  %acc = load <2 x i64>, ptr %A
  %src = load <4 x i32>, ptr %B
  %src.hi = shufflevector <4 x i32> %src, <4 x i32> poison, <2 x i32> <i32 2, i32 3>
  %src.ext = sext <2 x i32> %src.hi to <2 x i64>
  %sum = add <2 x i64> %acc, %src.ext
  ret <2 x i64> %sum
}
593
; saddlp4h: load an <8 x i8> and pass it to the SADDLP intrinsic, yielding
; <4 x i16>. A single saddlp on the 64-bit register is expected.
define <4 x i16> @saddlp4h(ptr %A) nounwind {
; CHECK-LABEL: saddlp4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    saddlp v0.4h, v0.8b
; CHECK-NEXT:    ret
  %src = load <8 x i8>, ptr %A
  %pairs = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %src)
  ret <4 x i16> %pairs
}
604
; saddlp2s: load a <4 x i16> and pass it to the SADDLP intrinsic, yielding
; <2 x i32>. A single saddlp on the 64-bit register is expected.
define <2 x i32> @saddlp2s(ptr %A) nounwind {
; CHECK-LABEL: saddlp2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    saddlp v0.2s, v0.4h
; CHECK-NEXT:    ret
  %src = load <4 x i16>, ptr %A
  %pairs = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %src)
  ret <2 x i32> %pairs
}
615
; saddlp1d: load a <2 x i32> and pass it to the SADDLP intrinsic, yielding
; <1 x i64>. A single saddlp on the 64-bit register is expected. The warning
; checks at the top of the file record that GlobalISel falls back for this
; function.
define <1 x i64> @saddlp1d(ptr %A) nounwind {
; CHECK-LABEL: saddlp1d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    saddlp v0.1d, v0.2s
; CHECK-NEXT:    ret
  %src = load <2 x i32>, ptr %A
  %pairs = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %src)
  ret <1 x i64> %pairs
}
626
; saddlp8h: load a <16 x i8> and pass it to the SADDLP intrinsic, yielding
; <8 x i16>. A single saddlp on the 128-bit register is expected.
define <8 x i16> @saddlp8h(ptr %A) nounwind {
; CHECK-LABEL: saddlp8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    saddlp v0.8h, v0.16b
; CHECK-NEXT:    ret
  %src = load <16 x i8>, ptr %A
  %pairs = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %src)
  ret <8 x i16> %pairs
}
637
; saddlp4s: load an <8 x i16> and pass it to the SADDLP intrinsic, yielding
; <4 x i32>. A single saddlp on the 128-bit register is expected.
define <4 x i32> @saddlp4s(ptr %A) nounwind {
; CHECK-LABEL: saddlp4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    saddlp v0.4s, v0.8h
; CHECK-NEXT:    ret
  %src = load <8 x i16>, ptr %A
  %pairs = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %src)
  ret <4 x i32> %pairs
}
648
; saddlp2d: load a <4 x i32> and pass it to the SADDLP intrinsic, yielding
; <2 x i64>. A single saddlp on the 128-bit register is expected.
define <2 x i64> @saddlp2d(ptr %A) nounwind {
; CHECK-LABEL: saddlp2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    saddlp v0.2d, v0.4s
; CHECK-NEXT:    ret
  %src = load <4 x i32>, ptr %A
  %pairs = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %src)
  ret <2 x i64> %pairs
}
659
; SADDLP intrinsic overloads taking 64-bit source vectors.
declare <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

; SADDLP intrinsic overloads taking 128-bit source vectors.
declare <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
667
; uaddlp4h: load an <8 x i8> and pass it to the UADDLP intrinsic, yielding
; <4 x i16>. A single uaddlp on the 64-bit register is expected.
define <4 x i16> @uaddlp4h(ptr %A) nounwind {
; CHECK-LABEL: uaddlp4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    uaddlp v0.4h, v0.8b
; CHECK-NEXT:    ret
  %src = load <8 x i8>, ptr %A
  %pairs = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %src)
  ret <4 x i16> %pairs
}
678
; uaddlp2s: load a <4 x i16> and pass it to the UADDLP intrinsic, yielding
; <2 x i32>. A single uaddlp on the 64-bit register is expected.
define <2 x i32> @uaddlp2s(ptr %A) nounwind {
; CHECK-LABEL: uaddlp2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    uaddlp v0.2s, v0.4h
; CHECK-NEXT:    ret
  %src = load <4 x i16>, ptr %A
  %pairs = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %src)
  ret <2 x i32> %pairs
}
689
; uaddlp1d: load a <2 x i32> and pass it to the UADDLP intrinsic, yielding
; <1 x i64>. A single uaddlp on the 64-bit register is expected. The warning
; checks at the top of the file record that GlobalISel falls back for this
; function.
define <1 x i64> @uaddlp1d(ptr %A) nounwind {
; CHECK-LABEL: uaddlp1d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    uaddlp v0.1d, v0.2s
; CHECK-NEXT:    ret
  %src = load <2 x i32>, ptr %A
  %pairs = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %src)
  ret <1 x i64> %pairs
}
700
; uaddlp8h: load a <16 x i8> and pass it to the UADDLP intrinsic, yielding
; <8 x i16>. A single uaddlp on the 128-bit register is expected.
define <8 x i16> @uaddlp8h(ptr %A) nounwind {
; CHECK-LABEL: uaddlp8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    uaddlp v0.8h, v0.16b
; CHECK-NEXT:    ret
  %src = load <16 x i8>, ptr %A
  %pairs = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %src)
  ret <8 x i16> %pairs
}
711
; uaddlp4s: load an <8 x i16> and pass it to the UADDLP intrinsic, yielding
; <4 x i32>. A single uaddlp on the 128-bit register is expected.
define <4 x i32> @uaddlp4s(ptr %A) nounwind {
; CHECK-LABEL: uaddlp4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    uaddlp v0.4s, v0.8h
; CHECK-NEXT:    ret
  %src = load <8 x i16>, ptr %A
  %pairs = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %src)
  ret <4 x i32> %pairs
}
722
; uaddlp2d: load a <4 x i32> and pass it to the UADDLP intrinsic, yielding
; <2 x i64>. A single uaddlp on the 128-bit register is expected.
define <2 x i64> @uaddlp2d(ptr %A) nounwind {
; CHECK-LABEL: uaddlp2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    uaddlp v0.2d, v0.4s
; CHECK-NEXT:    ret
  %src = load <4 x i32>, ptr %A
  %pairs = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %src)
  ret <2 x i64> %pairs
}
733
; UADDLP intrinsic overloads taking 64-bit source vectors.
declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16>) nounwind readnone
declare <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32>) nounwind readnone

; UADDLP intrinsic overloads taking 128-bit source vectors.
declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone
741
; sadalp4h: SADDLP of the <8 x i8> loaded from %A, added to the <4 x i16>
; accumulator loaded from %B. The intrinsic + add pair must fold into sadalp.
define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sadalp4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d0, [x1]
; CHECK-NEXT:    sadalp v0.4h, v1.8b
; CHECK-NEXT:    ret
  %src = load <8 x i8>, ptr %A
  %pairs = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %src)
  %acc = load <4 x i16>, ptr %B
  %sum = add <4 x i16> %pairs, %acc
  ret <4 x i16> %sum
}
755
; sadalp2s: SADDLP of the <4 x i16> loaded from %A, added to the <2 x i32>
; accumulator loaded from %B. The intrinsic + add pair must fold into sadalp.
define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sadalp2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d0, [x1]
; CHECK-NEXT:    sadalp v0.2s, v1.4h
; CHECK-NEXT:    ret
  %src = load <4 x i16>, ptr %A
  %pairs = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %src)
  %acc = load <2 x i32>, ptr %B
  %sum = add <2 x i32> %pairs, %acc
  ret <2 x i32> %sum
}
769
; sadalp8h: SADDLP of the <16 x i8> loaded from %A, added to the <8 x i16>
; accumulator loaded from %B. The intrinsic + add pair must fold into sadalp.
define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sadalp8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    sadalp v0.8h, v1.16b
; CHECK-NEXT:    ret
  %src = load <16 x i8>, ptr %A
  %pairs = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %src)
  %acc = load <8 x i16>, ptr %B
  %sum = add <8 x i16> %pairs, %acc
  ret <8 x i16> %sum
}
783
; sadalp4s: SADDLP of the <8 x i16> loaded from %A, added to the <4 x i32>
; accumulator loaded from %B. The intrinsic + add pair must fold into sadalp.
define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sadalp4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    sadalp v0.4s, v1.8h
; CHECK-NEXT:    ret
  %src = load <8 x i16>, ptr %A
  %pairs = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %src)
  %acc = load <4 x i32>, ptr %B
  %sum = add <4 x i32> %pairs, %acc
  ret <4 x i32> %sum
}
797
; sadalp2d: SADDLP of the <4 x i32> loaded from %A, added to the <2 x i64>
; accumulator loaded from %B. The intrinsic + add pair must fold into sadalp.
define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: sadalp2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    sadalp v0.2d, v1.4s
; CHECK-NEXT:    ret
  %src = load <4 x i32>, ptr %A
  %pairs = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %src)
  %acc = load <2 x i64>, ptr %B
  %sum = add <2 x i64> %pairs, %acc
  ret <2 x i64> %sum
}
811
; uadalp4h: UADDLP of the <8 x i8> loaded from %A, added to the <4 x i16>
; accumulator loaded from %B. The intrinsic + add pair must fold into uadalp.
define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uadalp4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d0, [x1]
; CHECK-NEXT:    uadalp v0.4h, v1.8b
; CHECK-NEXT:    ret
  %src = load <8 x i8>, ptr %A
  %pairs = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %src)
  %acc = load <4 x i16>, ptr %B
  %sum = add <4 x i16> %pairs, %acc
  ret <4 x i16> %sum
}
825
; uadalp2s: uaddlp of a <4 x i16> load, added to a <2 x i32> load. The
; assertions expect a single 64-bit uadalp.
define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uadalp2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d1, [x0]
; CHECK-NEXT:    ldr d0, [x1]
; CHECK-NEXT:    uadalp v0.2s, v1.4h
; CHECK-NEXT:    ret
  %src = load <4 x i16>, ptr %A
  %pairsum = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %src)
  %acc = load <2 x i32>, ptr %B
  %total = add <2 x i32> %pairsum, %acc
  ret <2 x i32> %total
}
839
; uadalp8h: uaddlp of a <16 x i8> load, added to a <8 x i16> load. The
; assertions expect a single 128-bit uadalp.
define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uadalp8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    uadalp v0.8h, v1.16b
; CHECK-NEXT:    ret
  %src = load <16 x i8>, ptr %A
  %pairsum = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %src)
  %acc = load <8 x i16>, ptr %B
  %total = add <8 x i16> %pairsum, %acc
  ret <8 x i16> %total
}
853
; uadalp4s: uaddlp of a <8 x i16> load, added to a <4 x i32> load. The
; assertions expect a single uadalp on .4s/.8h operands.
define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uadalp4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    uadalp v0.4s, v1.8h
; CHECK-NEXT:    ret
  %src = load <8 x i16>, ptr %A
  %pairsum = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %src)
  %acc = load <4 x i32>, ptr %B
  %total = add <4 x i32> %pairsum, %acc
  ret <4 x i32> %total
}
867
; uadalp2d: uaddlp of a <4 x i32> load, added to a <2 x i64> load. The
; assertions expect a single uadalp on .2d/.4s operands.
define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: uadalp2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q1, [x0]
; CHECK-NEXT:    ldr q0, [x1]
; CHECK-NEXT:    uadalp v0.2d, v1.4s
; CHECK-NEXT:    ret
  %src = load <4 x i32>, ptr %A
  %pairsum = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %src)
  %acc = load <2 x i64>, ptr %B
  %total = add <2 x i64> %pairsum, %acc
  ret <2 x i64> %total
}
881
; addp_8b: the addp intrinsic applied to two <8 x i8> loads.
define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addp_8b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    addp v0.8b, v0.8b, v1.8b
; CHECK-NEXT:    ret
  %first = load <8 x i8>, ptr %A
  %second = load <8 x i8>, ptr %B
  %pairs = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %first, <8 x i8> %second)
  ret <8 x i8> %pairs
}
894
; addp_16b: the addp intrinsic applied to two <16 x i8> loads.
define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addp_16b:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    addp v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
  %first = load <16 x i8>, ptr %A
  %second = load <16 x i8>, ptr %B
  %pairs = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %first, <16 x i8> %second)
  ret <16 x i8> %pairs
}
907
; addp_4h: the addp intrinsic applied to two <4 x i16> loads.
define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addp_4h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    addp v0.4h, v0.4h, v1.4h
; CHECK-NEXT:    ret
  %first = load <4 x i16>, ptr %A
  %second = load <4 x i16>, ptr %B
  %pairs = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %first, <4 x i16> %second)
  ret <4 x i16> %pairs
}
920
; addp_8h: the addp intrinsic applied to two <8 x i16> loads.
define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addp_8h:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    addp v0.8h, v0.8h, v1.8h
; CHECK-NEXT:    ret
  %first = load <8 x i16>, ptr %A
  %second = load <8 x i16>, ptr %B
  %pairs = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %first, <8 x i16> %second)
  ret <8 x i16> %pairs
}
933
; addp_2s: the addp intrinsic applied to two <2 x i32> loads.
define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addp_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    addp v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %first = load <2 x i32>, ptr %A
  %second = load <2 x i32>, ptr %B
  %pairs = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %first, <2 x i32> %second)
  ret <2 x i32> %pairs
}
946
; addp_4s: the addp intrinsic applied to two <4 x i32> loads.
define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addp_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    addp v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %first = load <4 x i32>, ptr %A
  %second = load <4 x i32>, ptr %B
  %pairs = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %first, <4 x i32> %second)
  ret <4 x i32> %pairs
}
959
; addp_2d: the addp intrinsic applied to two <2 x i64> loads.
define <2 x i64> @addp_2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: addp_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    addp v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %first = load <2 x i64>, ptr %A
  %second = load <2 x i64>, ptr %B
  %pairs = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %first, <2 x i64> %second)
  ret <2 x i64> %pairs
}
972
; Integer pairwise-add intrinsic declarations, one per vector type.
declare <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
declare <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8>, <16 x i8>) nounwind readnone
declare <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
declare <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
declare <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind readnone
declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone
980
; faddp_2s: the faddp intrinsic applied to two <2 x float> loads.
define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: faddp_2s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr d0, [x0]
; CHECK-NEXT:    ldr d1, [x1]
; CHECK-NEXT:    faddp v0.2s, v0.2s, v1.2s
; CHECK-NEXT:    ret
  %first = load <2 x float>, ptr %A
  %second = load <2 x float>, ptr %B
  %pairs = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %first, <2 x float> %second)
  ret <2 x float> %pairs
}
993
; faddp_4s: the faddp intrinsic applied to two <4 x float> loads.
define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: faddp_4s:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    faddp v0.4s, v0.4s, v1.4s
; CHECK-NEXT:    ret
  %first = load <4 x float>, ptr %A
  %second = load <4 x float>, ptr %B
  %pairs = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %first, <4 x float> %second)
  ret <4 x float> %pairs
}
1006
; faddp_2d: the faddp intrinsic applied to two <2 x double> loads.
define <2 x double> @faddp_2d(ptr %A, ptr %B) nounwind {
; CHECK-LABEL: faddp_2d:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    ldr q1, [x1]
; CHECK-NEXT:    faddp v0.2d, v0.2d, v1.2d
; CHECK-NEXT:    ret
  %first = load <2 x double>, ptr %A
  %second = load <2 x double>, ptr %B
  %pairs = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %first, <2 x double> %second)
  ret <2 x double> %pairs
}
1019
; Floating-point pairwise-add intrinsic declarations, one per vector type.
declare <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float>, <2 x float>) nounwind readnone
declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nounwind readnone
declare <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone
1023
; uaddl_duprhs: zero-extending add of lanes 0-1 of %lhs and a two-lane splat
; of %rhs. The assertions expect dup followed by uaddl.
define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: uaddl_duprhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v1.2s, w0
; CHECK-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  ; Build the splat one lane at a time.
  %splat.lane0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %splat = insertelement <2 x i32> %splat.lane0, i32 %rhs, i32 1
  ; Lanes 0 and 1 of the vector operand.
  %lhs.lo = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %lhs.wide = zext <2 x i32> %lhs.lo to <2 x i64>
  %splat.wide = zext <2 x i32> %splat to <2 x i64>
  %sum = add <2 x i64> %lhs.wide, %splat.wide
  ret <2 x i64> %sum
}
1041
; uaddl2_duprhs: zero-extending add of lanes 2-3 of %lhs and a two-lane splat
; of %rhs. SelectionDAG is expected to emit dup + uaddl2; GlobalISel is
; expected to emit dup + ushll + uaddw2.
define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-SD-LABEL: uaddl2_duprhs:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    dup v1.4s, w0
; CHECK-SD-NEXT:    uaddl2 v0.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: uaddl2_duprhs:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    dup v1.2s, w0
; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
; CHECK-GI-NEXT:    uaddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-NEXT:    ret
  ; Build the splat one lane at a time.
  %splat.lane0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %splat = insertelement <2 x i32> %splat.lane0, i32 %rhs, i32 1
  ; Lanes 2 and 3 of the vector operand.
  %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %lhs.wide = zext <2 x i32> %lhs.hi to <2 x i64>
  %splat.wide = zext <2 x i32> %splat to <2 x i64>
  %sum = add <2 x i64> %lhs.wide, %splat.wide
  ret <2 x i64> %sum
}
1066
; saddl_duplhs: sign-extending add of a two-lane splat of %lhs and lanes 0-1
; of %rhs. The assertions expect dup followed by saddl.
define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: saddl_duplhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v1.2s, w0
; CHECK-NEXT:    saddl v0.2d, v1.2s, v0.2s
; CHECK-NEXT:    ret
  ; Build the splat one lane at a time.
  %splat.lane0 = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %splat = insertelement <2 x i32> %splat.lane0, i32 %lhs, i32 1
  ; Lanes 0 and 1 of the vector operand.
  %rhs.lo = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %splat.wide = sext <2 x i32> %splat to <2 x i64>
  %rhs.wide = sext <2 x i32> %rhs.lo to <2 x i64>
  %sum = add <2 x i64> %splat.wide, %rhs.wide
  ret <2 x i64> %sum
}
1084
; saddl2_duplhs: sign-extending add of a two-lane splat of %lhs and lanes 2-3
; of %rhs. SelectionDAG is expected to emit dup + saddl2; GlobalISel is
; expected to emit dup + sshll + saddw2.
define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-SD-LABEL: saddl2_duplhs:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    dup v1.4s, w0
; CHECK-SD-NEXT:    saddl2 v0.2d, v1.4s, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: saddl2_duplhs:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    dup v1.2s, w0
; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-NEXT:    ret
  ; Build the splat one lane at a time.
  %splat.lane0 = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %splat = insertelement <2 x i32> %splat.lane0, i32 %lhs, i32 1
  ; Lanes 2 and 3 of the vector operand.
  %rhs.hi = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat.wide = sext <2 x i32> %splat to <2 x i64>
  %rhs.wide = sext <2 x i32> %rhs.hi to <2 x i64>
  %sum = add <2 x i64> %splat.wide, %rhs.wide
  ret <2 x i64> %sum
}
1109
; usubl_duprhs: zero-extending subtract of a two-lane splat of %rhs from
; lanes 0-1 of %lhs. The assertions expect dup followed by usubl.
define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-LABEL: usubl_duprhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v1.2s, w0
; CHECK-NEXT:    usubl v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    ret
  ; Build the splat one lane at a time.
  %splat.lane0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %splat = insertelement <2 x i32> %splat.lane0, i32 %rhs, i32 1
  ; Lanes 0 and 1 of the vector operand.
  %lhs.lo = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %lhs.wide = zext <2 x i32> %lhs.lo to <2 x i64>
  %splat.wide = zext <2 x i32> %splat to <2 x i64>
  %delta = sub <2 x i64> %lhs.wide, %splat.wide
  ret <2 x i64> %delta
}
1127
; usubl2_duprhs: zero-extending subtract of a two-lane splat of %rhs from
; lanes 2-3 of %lhs. SelectionDAG is expected to emit dup + usubl2; GlobalISel
; is expected to move the upper half down and use usubl.
define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) {
; CHECK-SD-LABEL: usubl2_duprhs:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    dup v1.4s, w0
; CHECK-SD-NEXT:    usubl2 v0.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: usubl2_duprhs:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    dup v1.2s, w0
; CHECK-GI-NEXT:    mov d0, v0.d[1]
; CHECK-GI-NEXT:    usubl v0.2d, v0.2s, v1.2s
; CHECK-GI-NEXT:    ret
  ; Build the splat one lane at a time.
  %splat.lane0 = insertelement <2 x i32> undef, i32 %rhs, i32 0
  %splat = insertelement <2 x i32> %splat.lane0, i32 %rhs, i32 1
  ; Lanes 2 and 3 of the vector operand.
  %lhs.hi = shufflevector <4 x i32> %lhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %lhs.wide = zext <2 x i32> %lhs.hi to <2 x i64>
  %splat.wide = zext <2 x i32> %splat to <2 x i64>
  %delta = sub <2 x i64> %lhs.wide, %splat.wide
  ret <2 x i64> %delta
}
1152
; ssubl_duplhs: sign-extending subtract of lanes 0-1 of %rhs from a two-lane
; splat of %lhs. The assertions expect dup followed by ssubl.
define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-LABEL: ssubl_duplhs:
; CHECK:       // %bb.0:
; CHECK-NEXT:    dup v1.2s, w0
; CHECK-NEXT:    ssubl v0.2d, v1.2s, v0.2s
; CHECK-NEXT:    ret
  ; Build the splat one lane at a time.
  %splat.lane0 = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %splat = insertelement <2 x i32> %splat.lane0, i32 %lhs, i32 1
  ; Lanes 0 and 1 of the vector operand.
  %rhs.lo = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
  %splat.wide = sext <2 x i32> %splat to <2 x i64>
  %rhs.wide = sext <2 x i32> %rhs.lo to <2 x i64>
  %delta = sub <2 x i64> %splat.wide, %rhs.wide
  ret <2 x i64> %delta
}
1170
; ssubl2_duplhs: sign-extending subtract of lanes 2-3 of %rhs from a two-lane
; splat of %lhs. SelectionDAG is expected to emit dup + ssubl2; GlobalISel is
; expected to emit dup + sshll + ssubw2.
define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) {
; CHECK-SD-LABEL: ssubl2_duplhs:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    dup v1.4s, w0
; CHECK-SD-NEXT:    ssubl2 v0.2d, v1.4s, v0.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: ssubl2_duplhs:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    dup v1.2s, w0
; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
; CHECK-GI-NEXT:    ssubw2 v0.2d, v1.2d, v0.4s
; CHECK-GI-NEXT:    ret
  ; Build the splat one lane at a time.
  %splat.lane0 = insertelement <2 x i32> undef, i32 %lhs, i32 0
  %splat = insertelement <2 x i32> %splat.lane0, i32 %lhs, i32 1
  ; Lanes 2 and 3 of the vector operand.
  %rhs.hi = shufflevector <4 x i32> %rhs, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
  %splat.wide = sext <2 x i32> %splat to <2 x i64>
  %rhs.wide = sext <2 x i32> %rhs.hi to <2 x i64>
  %delta = sub <2 x i64> %splat.wide, %rhs.wide
  ret <2 x i64> %delta
}
1195
; addhn8b_natural: add / lshr by 8 / trunc written in plain IR on <8 x i16>.
; SelectionDAG is expected to select addhn; GlobalISel is expected to emit
; add followed by shrn.
define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: addhn8b_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr q1, [x1]
; CHECK-SD-NEXT:    addhn v0.8b, v0.8h, v1.8h
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addhn8b_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    add v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT:    shrn v0.8b, v0.8h, #8
; CHECK-GI-NEXT:    ret
  %first = load <8 x i16>, ptr %A
  %second = load <8 x i16>, ptr %B
  %wide.sum = add <8 x i16> %first, %second
  %shifted = lshr <8 x i16> %wide.sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %hi.half = trunc <8 x i16> %shifted to <8 x i8>
  ret <8 x i8> %hi.half
}
1218
; addhn4h_natural: add / lshr by 16 / trunc written in plain IR on <4 x i32>.
; SelectionDAG is expected to select addhn; GlobalISel is expected to emit
; add followed by shrn.
define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: addhn4h_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr q1, [x1]
; CHECK-SD-NEXT:    addhn v0.4h, v0.4s, v1.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addhn4h_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    shrn v0.4h, v0.4s, #16
; CHECK-GI-NEXT:    ret
  %first = load <4 x i32>, ptr %A
  %second = load <4 x i32>, ptr %B
  %wide.sum = add <4 x i32> %first, %second
  %shifted = lshr <4 x i32> %wide.sum, <i32 16, i32 16, i32 16, i32 16>
  %hi.half = trunc <4 x i32> %shifted to <4 x i16>
  ret <4 x i16> %hi.half
}
1241
; addhn2s_natural: add / lshr by 32 / trunc written in plain IR on <2 x i64>.
; SelectionDAG is expected to select addhn; GlobalISel is expected to emit
; add followed by shrn.
define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: addhn2s_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr q1, [x1]
; CHECK-SD-NEXT:    addhn v0.2s, v0.2d, v1.2d
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addhn2s_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT:    shrn v0.2s, v0.2d, #32
; CHECK-GI-NEXT:    ret
  %first = load <2 x i64>, ptr %A
  %second = load <2 x i64>, ptr %B
  %wide.sum = add <2 x i64> %first, %second
  %shifted = lshr <2 x i64> %wide.sum, <i64 32, i64 32>
  %hi.half = trunc <2 x i64> %shifted to <2 x i32>
  ret <2 x i32> %hi.half
}
1264
; addhn2_16b_natural: the narrowed high halves of a <8 x i16> add are
; concatenated above %low. SelectionDAG is expected to select addhn2;
; GlobalISel is expected to emit add followed by shrn2.
define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: addhn2_16b_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    ldr q2, [x1]
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    addhn2 v0.16b, v1.8h, v2.8h
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addhn2_16b_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    ldr q2, [x1]
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    add v1.8h, v1.8h, v2.8h
; CHECK-GI-NEXT:    shrn2 v0.16b, v1.8h, #8
; CHECK-GI-NEXT:    ret
  %first = load <8 x i16>, ptr %A
  %second = load <8 x i16>, ptr %B
  %wide.sum = add <8 x i16> %first, %second
  %shifted = lshr <8 x i16> %wide.sum, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %hi.half = trunc <8 x i16> %shifted to <8 x i8>
  ; Identity-order mask: %low fills lanes 0-7, %hi.half lanes 8-15.
  %joined = shufflevector <8 x i8> %low, <8 x i8> %hi.half, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %joined
}
1290
; addhn2_8h_natural: the narrowed high halves of a <4 x i32> add are
; concatenated above %low. SelectionDAG is expected to select addhn2;
; GlobalISel is expected to emit add followed by shrn2.
define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: addhn2_8h_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    ldr q2, [x1]
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    addhn2 v0.8h, v1.4s, v2.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addhn2_8h_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    ldr q2, [x1]
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    add v1.4s, v1.4s, v2.4s
; CHECK-GI-NEXT:    shrn2 v0.8h, v1.4s, #16
; CHECK-GI-NEXT:    ret
  %first = load <4 x i32>, ptr %A
  %second = load <4 x i32>, ptr %B
  %wide.sum = add <4 x i32> %first, %second
  %shifted = lshr <4 x i32> %wide.sum, <i32 16, i32 16, i32 16, i32 16>
  %hi.half = trunc <4 x i32> %shifted to <4 x i16>
  ; Identity-order mask: %low fills lanes 0-3, %hi.half lanes 4-7.
  %joined = shufflevector <4 x i16> %low, <4 x i16> %hi.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %joined
}
1316
; addhn2_4s_natural: the narrowed high halves of a <2 x i64> add are
; concatenated above %low. SelectionDAG is expected to select addhn2;
; GlobalISel is expected to emit add followed by shrn2.
define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: addhn2_4s_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    ldr q2, [x1]
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    addhn2 v0.4s, v1.2d, v2.2d
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: addhn2_4s_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    ldr q2, [x1]
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    add v1.2d, v1.2d, v2.2d
; CHECK-GI-NEXT:    shrn2 v0.4s, v1.2d, #32
; CHECK-GI-NEXT:    ret
  %first = load <2 x i64>, ptr %A
  %second = load <2 x i64>, ptr %B
  %wide.sum = add <2 x i64> %first, %second
  %shifted = lshr <2 x i64> %wide.sum, <i64 32, i64 32>
  %hi.half = trunc <2 x i64> %shifted to <2 x i32>
  ; Identity-order mask: %low fills lanes 0-1, %hi.half lanes 2-3.
  %joined = shufflevector <2 x i32> %low, <2 x i32> %hi.half, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %joined
}
1342
; addhn_addhn2_4s: lanes 0-1 of the result are the narrowed high halves of
; A + B and lanes 2-3 are the narrowed high halves of C + D, so the two halves
; should be produced by a narrowing op followed by its "2" (upper-half) form.
;
; The previous body shifted %sum1 a second time for the upper half, which left
; %sum2 and both loads from %C / %D dead; the upper half now uses %sum2.
;
; NOTE: the assertions below were written by hand for the corrected IR and are
; deliberately register-agnostic. Rerun utils/update_llc_test_checks.py to
; replace them with exact autogenerated output.
define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind {
; CHECK-SD-LABEL: addhn_addhn2_4s:
; CHECK-SD-DAG:   ldr {{q[0-9]+}}, [x2]
; CHECK-SD-DAG:   ldr {{q[0-9]+}}, [x3]
; CHECK-SD-DAG:   addhn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
; CHECK-SD:       addhn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
; CHECK-SD:       ret
;
; CHECK-GI-LABEL: addhn_addhn2_4s:
; CHECK-GI-DAG:   ldr {{q[0-9]+}}, [x2]
; CHECK-GI-DAG:   ldr {{q[0-9]+}}, [x3]
; CHECK-GI-DAG:   add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
; CHECK-GI-DAG:   add {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d
; CHECK-GI-DAG:   shrn {{v[0-9]+}}.2s, {{v[0-9]+}}.2d, #32
; CHECK-GI:       shrn2 {{v[0-9]+}}.4s, {{v[0-9]+}}.2d, #32
; CHECK-GI:       ret
  ; Lower half of the result: high 32 bits of each lane of A + B.
  %tmp1 = load <2 x i64>, ptr %A
  %tmp2 = load <2 x i64>, ptr %B
  %sum1 = add <2 x i64> %tmp1, %tmp2
  %low_bits = lshr <2 x i64> %sum1, <i64 32, i64 32>
  %narrowed1 = trunc <2 x i64> %low_bits to <2 x i32>
  ; Upper half of the result: high 32 bits of each lane of C + D.
  %tmp3 = load <2 x i64>, ptr %C
  %tmp4 = load <2 x i64>, ptr %D
  %sum2 = add <2 x i64> %tmp3, %tmp4
  %high_bits = lshr <2 x i64> %sum2, <i64 32, i64 32>
  %narrowed2 = trunc <2 x i64> %high_bits to <2 x i32>
  %res = shufflevector <2 x i32> %narrowed1, <2 x i32> %narrowed2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %res
}
1373
; subhn8b_natural: sub / lshr by 8 / trunc written in plain IR on <8 x i16>.
; SelectionDAG is expected to select subhn; GlobalISel is expected to emit
; sub followed by shrn.
define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: subhn8b_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr q1, [x1]
; CHECK-SD-NEXT:    subhn v0.8b, v0.8h, v1.8h
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: subhn8b_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    sub v0.8h, v0.8h, v1.8h
; CHECK-GI-NEXT:    shrn v0.8b, v0.8h, #8
; CHECK-GI-NEXT:    ret
  %first = load <8 x i16>, ptr %A
  %second = load <8 x i16>, ptr %B
  %wide.diff = sub <8 x i16> %first, %second
  %shifted = lshr <8 x i16> %wide.diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %hi.half = trunc <8 x i16> %shifted to <8 x i8>
  ret <8 x i8> %hi.half
}
1396
; subhn4h_natural: sub / lshr by 16 / trunc written in plain IR on <4 x i32>.
; SelectionDAG is expected to select subhn; GlobalISel is expected to emit
; sub followed by shrn.
define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: subhn4h_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr q1, [x1]
; CHECK-SD-NEXT:    subhn v0.4h, v0.4s, v1.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: subhn4h_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    sub v0.4s, v0.4s, v1.4s
; CHECK-GI-NEXT:    shrn v0.4h, v0.4s, #16
; CHECK-GI-NEXT:    ret
  %first = load <4 x i32>, ptr %A
  %second = load <4 x i32>, ptr %B
  %wide.diff = sub <4 x i32> %first, %second
  %shifted = lshr <4 x i32> %wide.diff, <i32 16, i32 16, i32 16, i32 16>
  %hi.half = trunc <4 x i32> %shifted to <4 x i16>
  ret <4 x i16> %hi.half
}
1419
; subhn2s_natural: sub / lshr by 32 / trunc written in plain IR on <2 x i64>.
; SelectionDAG is expected to select subhn; GlobalISel is expected to emit
; sub followed by shrn.
define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: subhn2s_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q0, [x0]
; CHECK-SD-NEXT:    ldr q1, [x1]
; CHECK-SD-NEXT:    subhn v0.2s, v0.2d, v1.2d
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: subhn2s_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q0, [x0]
; CHECK-GI-NEXT:    ldr q1, [x1]
; CHECK-GI-NEXT:    sub v0.2d, v0.2d, v1.2d
; CHECK-GI-NEXT:    shrn v0.2s, v0.2d, #32
; CHECK-GI-NEXT:    ret
  %first = load <2 x i64>, ptr %A
  %second = load <2 x i64>, ptr %B
  %wide.diff = sub <2 x i64> %first, %second
  %shifted = lshr <2 x i64> %wide.diff, <i64 32, i64 32>
  %hi.half = trunc <2 x i64> %shifted to <2 x i32>
  ret <2 x i32> %hi.half
}
1442
; subhn2_16b_natural: the narrowed high halves of a <8 x i16> sub are
; concatenated above %low. SelectionDAG is expected to select subhn2;
; GlobalISel is expected to emit sub followed by shrn2.
define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: subhn2_16b_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    ldr q2, [x1]
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    subhn2 v0.16b, v1.8h, v2.8h
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: subhn2_16b_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    ldr q2, [x1]
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    sub v1.8h, v1.8h, v2.8h
; CHECK-GI-NEXT:    shrn2 v0.16b, v1.8h, #8
; CHECK-GI-NEXT:    ret
  %first = load <8 x i16>, ptr %A
  %second = load <8 x i16>, ptr %B
  %wide.diff = sub <8 x i16> %first, %second
  %shifted = lshr <8 x i16> %wide.diff, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
  %hi.half = trunc <8 x i16> %shifted to <8 x i8>
  ; Identity-order mask: %low fills lanes 0-7, %hi.half lanes 8-15.
  %joined = shufflevector <8 x i8> %low, <8 x i8> %hi.half, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %joined
}
1468
; subhn2_8h_natural: the narrowed high halves of a <4 x i32> sub are
; concatenated above %low. SelectionDAG is expected to select subhn2;
; GlobalISel is expected to emit sub followed by shrn2.
define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: subhn2_8h_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    ldr q2, [x1]
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    subhn2 v0.8h, v1.4s, v2.4s
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: subhn2_8h_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    ldr q2, [x1]
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    sub v1.4s, v1.4s, v2.4s
; CHECK-GI-NEXT:    shrn2 v0.8h, v1.4s, #16
; CHECK-GI-NEXT:    ret
  %first = load <4 x i32>, ptr %A
  %second = load <4 x i32>, ptr %B
  %wide.diff = sub <4 x i32> %first, %second
  %shifted = lshr <4 x i32> %wide.diff, <i32 16, i32 16, i32 16, i32 16>
  %hi.half = trunc <4 x i32> %shifted to <4 x i16>
  ; Identity-order mask: %low fills lanes 0-3, %hi.half lanes 4-7.
  %joined = shufflevector <4 x i16> %low, <4 x i16> %hi.half, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  ret <8 x i16> %joined
}
1494
; subhn2_4s_natural: the narrowed high halves of a <2 x i64> sub are
; concatenated above %low. SelectionDAG is expected to select subhn2;
; GlobalISel is expected to emit sub followed by shrn2.
define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind {
; CHECK-SD-LABEL: subhn2_4s_natural:
; CHECK-SD:       // %bb.0:
; CHECK-SD-NEXT:    ldr q1, [x0]
; CHECK-SD-NEXT:    ldr q2, [x1]
; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-SD-NEXT:    subhn2 v0.4s, v1.2d, v2.2d
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: subhn2_4s_natural:
; CHECK-GI:       // %bb.0:
; CHECK-GI-NEXT:    ldr q1, [x0]
; CHECK-GI-NEXT:    ldr q2, [x1]
; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-GI-NEXT:    sub v1.2d, v1.2d, v2.2d
; CHECK-GI-NEXT:    shrn2 v0.4s, v1.2d, #32
; CHECK-GI-NEXT:    ret
  %first = load <2 x i64>, ptr %A
  %second = load <2 x i64>, ptr %B
  %wide.diff = sub <2 x i64> %first, %second
  %shifted = lshr <2 x i64> %wide.diff, <i64 32, i64 32>
  %hi.half = trunc <2 x i64> %shifted to <2 x i32>
  ; Identity-order mask: %low fills lanes 0-1, %hi.half lanes 2-3.
  %joined = shufflevector <2 x i32> %low, <2 x i32> %hi.half, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i32> %joined
}
1520