xref: /llvm-project/llvm/test/CodeGen/ARM/vpadd.ll (revision bed1c7f061aa12417aa081e334afdba45767b938)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - -lower-interleaved-accesses=false | FileCheck %s
3
; Pairwise add of two <8 x i8> vectors loaded from %A and %B through the
; llvm.arm.neon.vpadd.v8i8 intrinsic; the expected assembly is one vpadd.i8.
4define <8 x i8> @vpaddi8(ptr %A, ptr %B) nounwind {
5; CHECK-LABEL: vpaddi8:
6; CHECK:       @ %bb.0:
7; CHECK-NEXT:    vldr d16, [r1]
8; CHECK-NEXT:    vldr d17, [r0]
9; CHECK-NEXT:    vpadd.i8 d16, d17, d16
10; CHECK-NEXT:    vmov r0, r1, d16
11; CHECK-NEXT:    mov pc, lr
12	%tmp1 = load <8 x i8>, ptr %A
13	%tmp2 = load <8 x i8>, ptr %B
14	%tmp3 = call <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2)
15	ret <8 x i8> %tmp3
16}
17
; Pairwise add of two <4 x i16> vectors loaded from %A and %B through the
; llvm.arm.neon.vpadd.v4i16 intrinsic; the expected assembly is one vpadd.i16.
18define <4 x i16> @vpaddi16(ptr %A, ptr %B) nounwind {
19; CHECK-LABEL: vpaddi16:
20; CHECK:       @ %bb.0:
21; CHECK-NEXT:    vldr d16, [r1]
22; CHECK-NEXT:    vldr d17, [r0]
23; CHECK-NEXT:    vpadd.i16 d16, d17, d16
24; CHECK-NEXT:    vmov r0, r1, d16
25; CHECK-NEXT:    mov pc, lr
26	%tmp1 = load <4 x i16>, ptr %A
27	%tmp2 = load <4 x i16>, ptr %B
28	%tmp3 = call <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2)
29	ret <4 x i16> %tmp3
30}
31
; Pairwise add of two <2 x i32> vectors loaded from %A and %B through the
; llvm.arm.neon.vpadd.v2i32 intrinsic; the expected assembly is one vpadd.i32.
32define <2 x i32> @vpaddi32(ptr %A, ptr %B) nounwind {
33; CHECK-LABEL: vpaddi32:
34; CHECK:       @ %bb.0:
35; CHECK-NEXT:    vldr d16, [r1]
36; CHECK-NEXT:    vldr d17, [r0]
37; CHECK-NEXT:    vpadd.i32 d16, d17, d16
38; CHECK-NEXT:    vmov r0, r1, d16
39; CHECK-NEXT:    mov pc, lr
40	%tmp1 = load <2 x i32>, ptr %A
41	%tmp2 = load <2 x i32>, ptr %B
42	%tmp3 = call <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2)
43	ret <2 x i32> %tmp3
44}
45
; Floating-point variant: pairwise add of two <2 x float> vectors through the
; llvm.arm.neon.vpadd.v2f32 intrinsic; the expected assembly is one vpadd.f32.
46define <2 x float> @vpaddf32(ptr %A, ptr %B) nounwind {
47; CHECK-LABEL: vpaddf32:
48; CHECK:       @ %bb.0:
49; CHECK-NEXT:    vldr d16, [r1]
50; CHECK-NEXT:    vldr d17, [r0]
51; CHECK-NEXT:    vpadd.f32 d16, d17, d16
52; CHECK-NEXT:    vmov r0, r1, d16
53; CHECK-NEXT:    mov pc, lr
54	%tmp1 = load <2 x float>, ptr %A
55	%tmp2 = load <2 x float>, ptr %B
56	%tmp3 = call <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float> %tmp1, <2 x float> %tmp2)
57	ret <2 x float> %tmp3
58}
59
; Declarations of the NEON pairwise-add intrinsics called by the vpaddi8,
; vpaddi16, vpaddi32 and vpaddf32 tests (integer variants, then float).
60declare <8 x i8>  @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>) nounwind readnone
61declare <4 x i16> @llvm.arm.neon.vpadd.v4i16(<4 x i16>, <4 x i16>) nounwind readnone
62declare <2 x i32> @llvm.arm.neon.vpadd.v2i32(<2 x i32>, <2 x i32>) nounwind readnone
63
64declare <2 x float> @llvm.arm.neon.vpadd.v2f32(<2 x float>, <2 x float>) nounwind readnone
65
; Signed pairwise add long on a 64-bit vector: the llvm.arm.neon.vpaddls
; intrinsic maps <8 x i8> to <4 x i16>; the expected assembly is vpaddl.s8
; on a D register.
66define <4 x i16> @vpaddls8(ptr %A) nounwind {
67; CHECK-LABEL: vpaddls8:
68; CHECK:       @ %bb.0:
69; CHECK-NEXT:    vldr d16, [r0]
70; CHECK-NEXT:    vpaddl.s8 d16, d16
71; CHECK-NEXT:    vmov r0, r1, d16
72; CHECK-NEXT:    mov pc, lr
73	%tmp1 = load <8 x i8>, ptr %A
74	%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8> %tmp1)
75	ret <4 x i16> %tmp2
76}
77
; Signed pairwise add long on a 64-bit vector: the llvm.arm.neon.vpaddls
; intrinsic maps <4 x i16> to <2 x i32>; the expected assembly is vpaddl.s16
; on a D register.
78define <2 x i32> @vpaddls16(ptr %A) nounwind {
79; CHECK-LABEL: vpaddls16:
80; CHECK:       @ %bb.0:
81; CHECK-NEXT:    vldr d16, [r0]
82; CHECK-NEXT:    vpaddl.s16 d16, d16
83; CHECK-NEXT:    vmov r0, r1, d16
84; CHECK-NEXT:    mov pc, lr
85	%tmp1 = load <4 x i16>, ptr %A
86	%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16> %tmp1)
87	ret <2 x i32> %tmp2
88}
89
; Signed pairwise add long on a 64-bit vector: the llvm.arm.neon.vpaddls
; intrinsic maps <2 x i32> to <1 x i64>; the expected assembly is vpaddl.s32
; on a D register.
90define <1 x i64> @vpaddls32(ptr %A) nounwind {
91; CHECK-LABEL: vpaddls32:
92; CHECK:       @ %bb.0:
93; CHECK-NEXT:    vldr d16, [r0]
94; CHECK-NEXT:    vpaddl.s32 d16, d16
95; CHECK-NEXT:    vmov r0, r1, d16
96; CHECK-NEXT:    mov pc, lr
97	%tmp1 = load <2 x i32>, ptr %A
98	%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32> %tmp1)
99	ret <1 x i64> %tmp2
100}
101
; Unsigned pairwise add long on a 64-bit vector: the llvm.arm.neon.vpaddlu
; intrinsic maps <8 x i8> to <4 x i16>; the expected assembly is vpaddl.u8
; on a D register.
102define <4 x i16> @vpaddlu8(ptr %A) nounwind {
103; CHECK-LABEL: vpaddlu8:
104; CHECK:       @ %bb.0:
105; CHECK-NEXT:    vldr d16, [r0]
106; CHECK-NEXT:    vpaddl.u8 d16, d16
107; CHECK-NEXT:    vmov r0, r1, d16
108; CHECK-NEXT:    mov pc, lr
109	%tmp1 = load <8 x i8>, ptr %A
110	%tmp2 = call <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8> %tmp1)
111	ret <4 x i16> %tmp2
112}
113
; Unsigned pairwise add long on a 64-bit vector: the llvm.arm.neon.vpaddlu
; intrinsic maps <4 x i16> to <2 x i32>; the expected assembly is vpaddl.u16
; on a D register.
114define <2 x i32> @vpaddlu16(ptr %A) nounwind {
115; CHECK-LABEL: vpaddlu16:
116; CHECK:       @ %bb.0:
117; CHECK-NEXT:    vldr d16, [r0]
118; CHECK-NEXT:    vpaddl.u16 d16, d16
119; CHECK-NEXT:    vmov r0, r1, d16
120; CHECK-NEXT:    mov pc, lr
121	%tmp1 = load <4 x i16>, ptr %A
122	%tmp2 = call <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16> %tmp1)
123	ret <2 x i32> %tmp2
124}
125
; Unsigned pairwise add long on a 64-bit vector: the llvm.arm.neon.vpaddlu
; intrinsic maps <2 x i32> to <1 x i64>; the expected assembly is vpaddl.u32
; on a D register.
126define <1 x i64> @vpaddlu32(ptr %A) nounwind {
127; CHECK-LABEL: vpaddlu32:
128; CHECK:       @ %bb.0:
129; CHECK-NEXT:    vldr d16, [r0]
130; CHECK-NEXT:    vpaddl.u32 d16, d16
131; CHECK-NEXT:    vmov r0, r1, d16
132; CHECK-NEXT:    mov pc, lr
133	%tmp1 = load <2 x i32>, ptr %A
134	%tmp2 = call <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32> %tmp1)
135	ret <1 x i64> %tmp2
136}
137
; 128-bit signed pairwise add long: the llvm.arm.neon.vpaddls intrinsic maps
; <16 x i8> to <8 x i16>; the expected assembly is vpaddl.s8 on a Q register,
; with the result returned in r0-r3.
138define <8 x i16> @vpaddlQs8(ptr %A) nounwind {
139; CHECK-LABEL: vpaddlQs8:
140; CHECK:       @ %bb.0:
141; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
142; CHECK-NEXT:    vpaddl.s8 q8, q8
143; CHECK-NEXT:    vmov r0, r1, d16
144; CHECK-NEXT:    vmov r2, r3, d17
145; CHECK-NEXT:    mov pc, lr
146	%tmp1 = load <16 x i8>, ptr %A
147	%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8> %tmp1)
148	ret <8 x i16> %tmp2
149}
150
; 128-bit signed pairwise add long: the llvm.arm.neon.vpaddls intrinsic maps
; <8 x i16> to <4 x i32>; the expected assembly is vpaddl.s16 on a Q register,
; with the result returned in r0-r3.
151define <4 x i32> @vpaddlQs16(ptr %A) nounwind {
152; CHECK-LABEL: vpaddlQs16:
153; CHECK:       @ %bb.0:
154; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
155; CHECK-NEXT:    vpaddl.s16 q8, q8
156; CHECK-NEXT:    vmov r0, r1, d16
157; CHECK-NEXT:    vmov r2, r3, d17
158; CHECK-NEXT:    mov pc, lr
159	%tmp1 = load <8 x i16>, ptr %A
160	%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16> %tmp1)
161	ret <4 x i32> %tmp2
162}
163
; 128-bit signed pairwise add long: the llvm.arm.neon.vpaddls intrinsic maps
; <4 x i32> to <2 x i64>; the expected assembly is vpaddl.s32 on a Q register,
; with the result returned in r0-r3.
164define <2 x i64> @vpaddlQs32(ptr %A) nounwind {
165; CHECK-LABEL: vpaddlQs32:
166; CHECK:       @ %bb.0:
167; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
168; CHECK-NEXT:    vpaddl.s32 q8, q8
169; CHECK-NEXT:    vmov r0, r1, d16
170; CHECK-NEXT:    vmov r2, r3, d17
171; CHECK-NEXT:    mov pc, lr
172	%tmp1 = load <4 x i32>, ptr %A
173	%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32> %tmp1)
174	ret <2 x i64> %tmp2
175}
176
; 128-bit unsigned pairwise add long: the llvm.arm.neon.vpaddlu intrinsic maps
; <16 x i8> to <8 x i16>; the expected assembly is vpaddl.u8 on a Q register,
; with the result returned in r0-r3.
177define <8 x i16> @vpaddlQu8(ptr %A) nounwind {
178; CHECK-LABEL: vpaddlQu8:
179; CHECK:       @ %bb.0:
180; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
181; CHECK-NEXT:    vpaddl.u8 q8, q8
182; CHECK-NEXT:    vmov r0, r1, d16
183; CHECK-NEXT:    vmov r2, r3, d17
184; CHECK-NEXT:    mov pc, lr
185	%tmp1 = load <16 x i8>, ptr %A
186	%tmp2 = call <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8> %tmp1)
187	ret <8 x i16> %tmp2
188}
189
; 128-bit unsigned pairwise add long: the llvm.arm.neon.vpaddlu intrinsic maps
; <8 x i16> to <4 x i32>; the expected assembly is vpaddl.u16 on a Q register,
; with the result returned in r0-r3.
190define <4 x i32> @vpaddlQu16(ptr %A) nounwind {
191; CHECK-LABEL: vpaddlQu16:
192; CHECK:       @ %bb.0:
193; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
194; CHECK-NEXT:    vpaddl.u16 q8, q8
195; CHECK-NEXT:    vmov r0, r1, d16
196; CHECK-NEXT:    vmov r2, r3, d17
197; CHECK-NEXT:    mov pc, lr
198	%tmp1 = load <8 x i16>, ptr %A
199	%tmp2 = call <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16> %tmp1)
200	ret <4 x i32> %tmp2
201}
202
; 128-bit unsigned pairwise add long: the llvm.arm.neon.vpaddlu intrinsic maps
; <4 x i32> to <2 x i64>; the expected assembly is vpaddl.u32 on a Q register,
; with the result returned in r0-r3.
203define <2 x i64> @vpaddlQu32(ptr %A) nounwind {
204; CHECK-LABEL: vpaddlQu32:
205; CHECK:       @ %bb.0:
206; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
207; CHECK-NEXT:    vpaddl.u32 q8, q8
208; CHECK-NEXT:    vmov r0, r1, d16
209; CHECK-NEXT:    vmov r2, r3, d17
210; CHECK-NEXT:    mov pc, lr
211	%tmp1 = load <4 x i32>, ptr %A
212	%tmp2 = call <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32> %tmp1)
213	ret <2 x i64> %tmp2
214}
215
216; Combine vuzp+vadd->vpadd.
; No intrinsic is used: the even and odd lanes of one <16 x i8> load are
; extracted with shuffles and added. The expected assembly is a single
; vpadd.i8 of the two D halves of the loaded Q register.
217define void @addCombineToVPADD_i8(ptr %cbcr, ptr %X) nounwind ssp {
218; CHECK-LABEL: addCombineToVPADD_i8:
219; CHECK:       @ %bb.0:
220; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
221; CHECK-NEXT:    vpadd.i8 d16, d16, d17
222; CHECK-NEXT:    vstr d16, [r1]
223; CHECK-NEXT:    mov pc, lr
224  %tmp = load <16 x i8>, ptr %cbcr
; %tmp1 = even-indexed lanes of %tmp, %tmp3 = odd-indexed lanes of %tmp.
225  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
226  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
227
228  %add = add <8 x i8> %tmp3, %tmp1
229  store <8 x i8> %add, ptr %X, align 8
230  ret void
231}
232
233; Combine vuzp+vadd->vpadd.
; Same pattern with 16-bit lanes: even and odd lanes of one <8 x i16> load are
; added, and the expected assembly is a single vpadd.i16 of the two D halves.
234define void @addCombineToVPADD_i16(ptr %cbcr, ptr %X) nounwind ssp {
235; CHECK-LABEL: addCombineToVPADD_i16:
236; CHECK:       @ %bb.0:
237; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
238; CHECK-NEXT:    vpadd.i16 d16, d16, d17
239; CHECK-NEXT:    vstr d16, [r1]
240; CHECK-NEXT:    mov pc, lr
241  %tmp = load <8 x i16>, ptr %cbcr
; %tmp1 = even-indexed lanes of %tmp, %tmp3 = odd-indexed lanes of %tmp.
242  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
243  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
244  %add = add <4 x i16> %tmp3, %tmp1
245  store <4 x i16> %add, ptr %X, align 8
246  ret void
247}
248
249; Combine vtrn+vadd->vpadd.
; Same pattern with 32-bit lanes: lanes {0,2} and {1,3} of one <4 x i32> load
; are added, and the expected assembly is a single vpadd.i32 of the two D halves.
250define void @addCombineToVPADD_i32(ptr %cbcr, ptr %X) nounwind ssp {
251; CHECK-LABEL: addCombineToVPADD_i32:
252; CHECK:       @ %bb.0:
253; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
254; CHECK-NEXT:    vpadd.i32 d16, d16, d17
255; CHECK-NEXT:    vstr d16, [r1]
256; CHECK-NEXT:    mov pc, lr
257  %tmp = load <4 x i32>, ptr %cbcr
; %tmp1 = even-indexed lanes of %tmp, %tmp3 = odd-indexed lanes of %tmp.
258  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
259  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
260  %add = add <2 x i32> %tmp3, %tmp1
261  store <2 x i32> %add, ptr %X, align 8
262  ret void
263}
264
265; Combine vuzp+vaddl->vpaddl
; Even and odd lanes of one <16 x i8> load are sign-extended to i16 and added.
; The expected assembly is a single vpaddl.s8 on the Q register.
266define void @addCombineToVPADDLq_s8(ptr %cbcr, ptr %X) nounwind ssp {
267; CHECK-LABEL: addCombineToVPADDLq_s8:
268; CHECK:       @ %bb.0:
269; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
270; CHECK-NEXT:    vpaddl.s8 q8, q8
271; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
272; CHECK-NEXT:    mov pc, lr
273  %tmp = load <16 x i8>, ptr %cbcr
; %tmp1 = even-indexed lanes of %tmp, %tmp3 = odd-indexed lanes of %tmp.
274  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
275  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; Widen both halves before the add so the sum is computed in i16.
276  %tmp4 = sext <8 x i8> %tmp3 to <8 x i16>
277  %tmp5 = sext <8 x i8> %tmp1 to <8 x i16>
278  %add = add <8 x i16> %tmp4, %tmp5
279  store <8 x i16> %add, ptr %X, align 8
280  ret void
281}
282
283; Combine vuzp+vaddl->vpaddl
284; FIXME: Legalization butchers the shuffles.
; 64-bit-result variant: only lanes 0..7 of the <16 x i8> load are used, giving
; a <4 x i16> sum. The expected assembly below is the current
; vext/vshl/vshr/vsra sequence, not a vpaddl.
285define void @addCombineToVPADDL_s8(ptr %cbcr, ptr %X) nounwind ssp {
286; CHECK-LABEL: addCombineToVPADDL_s8:
287; CHECK:       @ %bb.0:
288; CHECK-NEXT:    vldr d16, [r0]
289; CHECK-NEXT:    vext.8 d17, d16, d16, #1
290; CHECK-NEXT:    vshl.i16 d16, d16, #8
291; CHECK-NEXT:    vshl.i16 d17, d17, #8
292; CHECK-NEXT:    vshr.s16 d17, d17, #8
293; CHECK-NEXT:    vsra.s16 d17, d16, #8
294; CHECK-NEXT:    vstr d17, [r1]
295; CHECK-NEXT:    mov pc, lr
296  %tmp = load <16 x i8>, ptr %cbcr
; %tmp1 = lanes 0,2,4,6 and %tmp3 = lanes 1,3,5,7 of %tmp (illegal <4 x i8> type).
297  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
298  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
299  %tmp4 = sext <4 x i8> %tmp3 to <4 x i16>
300  %tmp5 = sext <4 x i8> %tmp1 to <4 x i16>
301  %add = add <4 x i16> %tmp4, %tmp5
302  store <4 x i16> %add, ptr %X, align 8
303  ret void
304}
305
306; Combine vuzp+vaddl->vpaddl
; Even and odd lanes of one <16 x i8> load are zero-extended to i16 and added.
; The expected assembly is a single vpaddl.u8 on the Q register.
307define void @addCombineToVPADDLq_u8(ptr %cbcr, ptr %X) nounwind ssp {
308; CHECK-LABEL: addCombineToVPADDLq_u8:
309; CHECK:       @ %bb.0:
310; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
311; CHECK-NEXT:    vpaddl.u8 q8, q8
312; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
313; CHECK-NEXT:    mov pc, lr
314  %tmp = load <16 x i8>, ptr %cbcr
; %tmp1 = even-indexed lanes of %tmp, %tmp3 = odd-indexed lanes of %tmp.
315  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
316  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; Widen both halves before the add so the sum is computed in i16.
317  %tmp4 = zext <8 x i8> %tmp3 to <8 x i16>
318  %tmp5 = zext <8 x i8> %tmp1 to <8 x i16>
319  %add = add <8 x i16> %tmp4, %tmp5
320  store <8 x i16> %add, ptr %X, align 8
321  ret void
322}
323
324; In theory, it's possible to match this to vpaddl, but rearranging the
325; shuffle is awkward, so this doesn't match at the moment.
; Here the zext is applied to the whole <16 x i8> vector before the even/odd
; shuffles, i.e. the pattern is add(shuffle(zext), shuffle(zext)). The expected
; assembly is the unfused vmovl/vuzp/vadd sequence.
326define void @addCombineToVPADDLq_u8_early_zext(ptr %cbcr, ptr %X) nounwind ssp {
327; CHECK-LABEL: addCombineToVPADDLq_u8_early_zext:
328; CHECK:       @ %bb.0:
329; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
330; CHECK-NEXT:    vmovl.u8 q9, d17
331; CHECK-NEXT:    vmovl.u8 q8, d16
332; CHECK-NEXT:    vuzp.16 q8, q9
333; CHECK-NEXT:    vadd.i16 q8, q8, q9
334; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
335; CHECK-NEXT:    mov pc, lr
336  %tmp = load <16 x i8>, ptr %cbcr
337  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
; %tmp2 = even-indexed lanes, %tmp3 = odd-indexed lanes of the widened vector.
338  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
339  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
340  %add = add <8 x i16> %tmp2, %tmp3
341  store <8 x i16> %add, ptr %X, align 8
342  ret void
343}
344
345; Combine vuzp+vaddl->vpaddl
346; FIXME: Legalization butchers the shuffle.
; 64-bit-result unsigned variant: only lanes 0..7 of the <16 x i8> load are
; used, giving a <4 x i16> sum. The expected assembly below is the current
; vext/vbic/vadd sequence, not a vpaddl.
347define void @addCombineToVPADDL_u8(ptr %cbcr, ptr %X) nounwind ssp {
348; CHECK-LABEL: addCombineToVPADDL_u8:
349; CHECK:       @ %bb.0:
350; CHECK-NEXT:    vldr d16, [r0]
351; CHECK-NEXT:    vext.8 d17, d16, d16, #1
352; CHECK-NEXT:    vbic.i16 d16, #0xff00
353; CHECK-NEXT:    vbic.i16 d17, #0xff00
354; CHECK-NEXT:    vadd.i16 d16, d17, d16
355; CHECK-NEXT:    vstr d16, [r1]
356; CHECK-NEXT:    mov pc, lr
357  %tmp = load <16 x i8>, ptr %cbcr
; %tmp1 = lanes 0,2,4,6 and %tmp3 = lanes 1,3,5,7 of %tmp (illegal <4 x i8> type).
358  %tmp1 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
359  %tmp3 = shufflevector <16 x i8> %tmp, <16 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
360  %tmp4 = zext <4 x i8> %tmp3 to <4 x i16>
361  %tmp5 = zext <4 x i8> %tmp1 to <4 x i16>
362  %add = add <4 x i16> %tmp4, %tmp5
363  store <4 x i16> %add, ptr %X, align 8
364  ret void
365}
366
367; Matching to vpaddl.8 requires matching shuffle(zext()).
; The zext precedes the shuffles and only widened lanes 0..7 are used. The
; expected assembly widens with vmovl.u8 and then uses vpadd.i16 on the two
; D halves, rather than a single vpaddl.u8.
368define void @addCombineToVPADDL_u8_early_zext(ptr %cbcr, ptr %X) nounwind ssp {
369; CHECK-LABEL: addCombineToVPADDL_u8_early_zext:
370; CHECK:       @ %bb.0:
371; CHECK-NEXT:    vldr d16, [r0]
372; CHECK-NEXT:    vmovl.u8 q8, d16
373; CHECK-NEXT:    vpadd.i16 d16, d16, d17
374; CHECK-NEXT:    vstr d16, [r1]
375; CHECK-NEXT:    mov pc, lr
376  %tmp = load <16 x i8>, ptr %cbcr
377  %tmp1 = zext <16 x i8> %tmp to <16 x i16>
; %tmp2 = lanes 0,2,4,6 and %tmp3 = lanes 1,3,5,7 of the widened vector.
378  %tmp2 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
379  %tmp3 = shufflevector <16 x i16> %tmp1, <16 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
380  %add = add <4 x i16> %tmp2, %tmp3
381  store <4 x i16> %add, ptr %X, align 8
382  ret void
383}
384
385; Combine vuzp+vaddl->vpaddl
; Even and odd lanes of one <8 x i16> load are sign-extended to i32 and added.
; The expected assembly is a single vpaddl.s16 on the Q register.
386define void @addCombineToVPADDLq_s16(ptr %cbcr, ptr %X) nounwind ssp {
387; CHECK-LABEL: addCombineToVPADDLq_s16:
388; CHECK:       @ %bb.0:
389; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
390; CHECK-NEXT:    vpaddl.s16 q8, q8
391; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
392; CHECK-NEXT:    mov pc, lr
393  %tmp = load <8 x i16>, ptr %cbcr
; %tmp1 = even-indexed lanes of %tmp, %tmp3 = odd-indexed lanes of %tmp.
394  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
395  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
396  %tmp4 = sext <4 x i16> %tmp3 to <4 x i32>
397  %tmp5 = sext <4 x i16> %tmp1 to <4 x i32>
398  %add = add <4 x i32> %tmp4, %tmp5
399  store <4 x i32> %add, ptr %X, align 8
400  ret void
401}
402
403; Combine vuzp+vaddl->vpaddl
; Even and odd lanes of one <8 x i16> load are zero-extended to i32 and added.
; The expected assembly is a single vpaddl.u16 on the Q register.
404define void @addCombineToVPADDLq_u16(ptr %cbcr, ptr %X) nounwind ssp {
405; CHECK-LABEL: addCombineToVPADDLq_u16:
406; CHECK:       @ %bb.0:
407; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
408; CHECK-NEXT:    vpaddl.u16 q8, q8
409; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
410; CHECK-NEXT:    mov pc, lr
411  %tmp = load <8 x i16>, ptr %cbcr
; %tmp1 = even-indexed lanes of %tmp, %tmp3 = odd-indexed lanes of %tmp.
412  %tmp1 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
413  %tmp3 = shufflevector <8 x i16> %tmp, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
414  %tmp4 = zext <4 x i16> %tmp3 to <4 x i32>
415  %tmp5 = zext <4 x i16> %tmp1 to <4 x i32>
416  %add = add <4 x i32> %tmp4, %tmp5
417  store <4 x i32> %add, ptr %X, align 8
418  ret void
419}
420
421; Combine vtrn+vaddl->vpaddl
; Lanes {0,2} and {1,3} of one <4 x i32> load are sign-extended to i64 and
; added. The expected assembly is a single vpaddl.s32 on the Q register.
422define void @addCombineToVPADDLq_s32(ptr %cbcr, ptr %X) nounwind ssp {
423; CHECK-LABEL: addCombineToVPADDLq_s32:
424; CHECK:       @ %bb.0:
425; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
426; CHECK-NEXT:    vpaddl.s32 q8, q8
427; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
428; CHECK-NEXT:    mov pc, lr
429  %tmp = load <4 x i32>, ptr %cbcr
; %tmp1 = even-indexed lanes of %tmp, %tmp3 = odd-indexed lanes of %tmp.
430  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
431  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
432  %tmp4 = sext <2 x i32> %tmp3 to <2 x i64>
433  %tmp5 = sext <2 x i32> %tmp1 to <2 x i64>
434  %add = add <2 x i64> %tmp4, %tmp5
435  store <2 x i64> %add, ptr %X, align 8
436  ret void
437}
438
439; Combine vtrn+vaddl->vpaddl
; Lanes {0,2} and {1,3} of one <4 x i32> load are zero-extended to i64 and
; added. The expected assembly is a single vpaddl.u32 on the Q register.
440define void @addCombineToVPADDLq_u32(ptr %cbcr, ptr %X) nounwind ssp {
441; CHECK-LABEL: addCombineToVPADDLq_u32:
442; CHECK:       @ %bb.0:
443; CHECK-NEXT:    vld1.64 {d16, d17}, [r0]
444; CHECK-NEXT:    vpaddl.u32 q8, q8
445; CHECK-NEXT:    vst1.64 {d16, d17}, [r1]
446; CHECK-NEXT:    mov pc, lr
447  %tmp = load <4 x i32>, ptr %cbcr
; %tmp1 = even-indexed lanes of %tmp, %tmp3 = odd-indexed lanes of %tmp.
448  %tmp1 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
449  %tmp3 = shufflevector <4 x i32> %tmp, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
450  %tmp4 = zext <2 x i32> %tmp3 to <2 x i64>
451  %tmp5 = zext <2 x i32> %tmp1 to <2 x i64>
452  %add = add <2 x i64> %tmp4, %tmp5
453  store <2 x i64> %add, ptr %X, align 8
454  ret void
455}
456
457; Legalization promotes the <4 x i8> to <4 x i16>.
; The <8 x i8> input is a function argument rather than a load. Its even and
; odd lanes are added as <4 x i8> with no explicit extension in the IR; the
; expected assembly is a single vpaddl.s8 on a D register.
458define <4 x i8> @fromExtendingExtractVectorElt_i8(<8 x i8> %in) {
459; CHECK-LABEL: fromExtendingExtractVectorElt_i8:
460; CHECK:       @ %bb.0:
461; CHECK-NEXT:    vmov d16, r0, r1
462; CHECK-NEXT:    vpaddl.s8 d16, d16
463; CHECK-NEXT:    vmov r0, r1, d16
464; CHECK-NEXT:    mov pc, lr
465  %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
466  %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
467  %x = add <4 x i8> %tmp2, %tmp1
468  ret <4 x i8> %x
469}
470
471; Legalization promotes the <2 x i16> to <2 x i32>.
; The <4 x i16> input is a function argument. Lanes {0,2} and {1,3} are added
; as <2 x i16> with no explicit extension in the IR; the expected assembly is
; a single vpaddl.s16 on a D register.
472define <2 x i16> @fromExtendingExtractVectorElt_i16(<4 x i16> %in) {
473; CHECK-LABEL: fromExtendingExtractVectorElt_i16:
474; CHECK:       @ %bb.0:
475; CHECK-NEXT:    vmov d16, r0, r1
476; CHECK-NEXT:    vpaddl.s16 d16, d16
477; CHECK-NEXT:    vmov r0, r1, d16
478; CHECK-NEXT:    mov pc, lr
479  %tmp1 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 0, i32 2>
480  %tmp2 = shufflevector <4 x i16> %in, <4 x i16> undef, <2 x i32> <i32 1, i32 3>
481  %x = add <2 x i16> %tmp2, %tmp1
482  ret <2 x i16> %x
483}
484
485; Legalization likewise promotes the <2 x i8> to <2 x i32>.
; Only lanes 0..3 of the <8 x i8> argument are used, producing <2 x i8>. The
; expected assembly is not a pairwise add: each byte is moved through a core
; register into a 32-bit lane and the sum is a vadd.i32.
486define <2 x i8> @fromExtendingExtractVectorElt_2i8(<8 x i8> %in) {
487; CHECK-LABEL: fromExtendingExtractVectorElt_2i8:
488; CHECK:       @ %bb.0:
489; CHECK-NEXT:    vmov d16, r0, r1
490; CHECK-NEXT:    vmov.u8 r1, d16[1]
491; CHECK-NEXT:    vmov.u8 r0, d16[0]
492; CHECK-NEXT:    vmov.u8 r2, d16[2]
493; CHECK-NEXT:    vmov.u8 r3, d16[3]
494; CHECK-NEXT:    vmov.32 d17[0], r1
495; CHECK-NEXT:    vmov.32 d16[0], r0
496; CHECK-NEXT:    vmov.32 d17[1], r3
497; CHECK-NEXT:    vmov.32 d16[1], r2
498; CHECK-NEXT:    vadd.i32 d16, d17, d16
499; CHECK-NEXT:    vmov r0, r1, d16
500; CHECK-NEXT:    mov pc, lr
501  %tmp1 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 0, i32 2>
502  %tmp2 = shufflevector <8 x i8> %in, <8 x i8> undef, <2 x i32> <i32 1, i32 3>
503  %x = add <2 x i8> %tmp2, %tmp1
504  ret <2 x i8> %x
505}
506
; Only lanes 0..3 of the <8 x i16> argument are used, producing <2 x i16>. The
; expected assembly is not a pairwise add: each halfword is moved through a
; core register into a 32-bit lane and the sum is a vadd.i32.
507define <2 x i16> @fromExtendingExtractVectorElt_2i16(<8 x i16> %in) {
508; CHECK-LABEL: fromExtendingExtractVectorElt_2i16:
509; CHECK:       @ %bb.0:
510; CHECK-NEXT:    vmov d16, r0, r1
511; CHECK-NEXT:    vmov.u16 r0, d16[0]
512; CHECK-NEXT:    vmov.u16 r1, d16[1]
513; CHECK-NEXT:    vmov.u16 r3, d16[3]
514; CHECK-NEXT:    vmov.u16 r2, d16[2]
515; CHECK-NEXT:    vmov.32 d16[0], r0
516; CHECK-NEXT:    vmov.32 d17[0], r1
517; CHECK-NEXT:    vmov.32 d16[1], r2
518; CHECK-NEXT:    vmov.32 d17[1], r3
519; CHECK-NEXT:    vadd.i32 d16, d17, d16
520; CHECK-NEXT:    vmov r0, r1, d16
521; CHECK-NEXT:    mov pc, lr
522 %tmp1 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 0, i32 2>
523 %tmp2 = shufflevector <8 x i16> %in, <8 x i16> undef, <2 x i32> <i32 1, i32 3>
524 %x = add <2 x i16> %tmp2, %tmp1
525 ret <2 x i16> %x
526}
527
528
; Declarations of the NEON pairwise-add-long intrinsics called by the vpaddl*
; tests, in four groups: 64-bit signed, 64-bit unsigned, 128-bit signed and
; 128-bit unsigned. Each is overloaded on result type and operand type.
529declare <4 x i16> @llvm.arm.neon.vpaddls.v4i16.v8i8(<8 x i8>) nounwind readnone
530declare <2 x i32> @llvm.arm.neon.vpaddls.v2i32.v4i16(<4 x i16>) nounwind readnone
531declare <1 x i64> @llvm.arm.neon.vpaddls.v1i64.v2i32(<2 x i32>) nounwind readnone
532
533declare <4 x i16> @llvm.arm.neon.vpaddlu.v4i16.v8i8(<8 x i8>) nounwind readnone
534declare <2 x i32> @llvm.arm.neon.vpaddlu.v2i32.v4i16(<4 x i16>) nounwind readnone
535declare <1 x i64> @llvm.arm.neon.vpaddlu.v1i64.v2i32(<2 x i32>) nounwind readnone
536
537declare <8 x i16> @llvm.arm.neon.vpaddls.v8i16.v16i8(<16 x i8>) nounwind readnone
538declare <4 x i32> @llvm.arm.neon.vpaddls.v4i32.v8i16(<8 x i16>) nounwind readnone
539declare <2 x i64> @llvm.arm.neon.vpaddls.v2i64.v4i32(<4 x i32>) nounwind readnone
540
541declare <8 x i16> @llvm.arm.neon.vpaddlu.v8i16.v16i8(<16 x i8>) nounwind readnone
542declare <4 x i32> @llvm.arm.neon.vpaddlu.v4i32.v8i16(<8 x i16>) nounwind readnone
543declare <2 x i64> @llvm.arm.neon.vpaddlu.v2i64.v4i32(<4 x i32>) nounwind readnone
544