; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK

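; A v4i32 -> i32 reduction maps directly onto MVE's VADDV, which sums all the
; lanes of a q register into a single GPR.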
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  ret i32 %z
}

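; Reductions that widen i32 lanes to an i64 result use VADDLV, which
; accumulates the lane sum into a GPR pair (r0:r1). The signedness of the
; extension selects between the .u32 and .s32 forms.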
define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlv.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

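; v2i64 is not a natively supported reduction type, so the two lanes are moved
; to GPRs and summed with scalar adds/adc. The zext case materializes the
; 64-bit lanes with a VAND mask; the sext case sign-extends in the scalar
; pipeline using asr #31 operands.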
define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

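; For full i16 vectors, the extension to i32 folds into the element size of
; the reduction itself: vaddv.u16 for zext and vaddv.s16 for sext.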
define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

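; A v4i16 input holds its data in the bottom half of each 32-bit lane, so it
; is widened in-register with VMOVLB before the 32-bit VADDV.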
define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

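; Reductions producing an i16 still use VADDV's 32-bit accumulator; the
; zeroext return is handled by truncating with uxth afterwards.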
define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i16:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
  ret i16 %z
}

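; VADDLV is only available for 32-bit elements, so an i16 -> i64 reduction of
; a full vector is scalarized: lane pairs are zero-extended with a VAND mask
; (or moved out with sign-extending vmov.s16 for the sext case) and
; accumulated in a GPR pair with adds/adc.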
define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r0, q0[1]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov.u16 r3, q0[2]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, r3, d5
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    adc.w r1, r2, r3
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.s16 r0, q0[0]
; CHECK-NEXT:    vmov.s16 r2, q0[1]
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[2]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[3]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[4]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[5]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[6]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s16 r2, q0[7]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

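; v4i16 -> v4i64 avoids the scalarization above by widening with VMOVLB and
; then using a single 32-bit VADDLV.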
define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vaddlv.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-LABEL: add_v4i16_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

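; Two-element i16 reductions are scalarized, extending either with a VAND
; mask (zext) or sxth (sext) before the 64-bit scalar add.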
define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r2, r1, d0
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

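; i8 sources follow the same pattern as i16: full vectors fold the extension
; into vaddv.u8/vaddv.s8, while narrower vectors widen through VMOVLB or a
; VAND mask first.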
define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vaddv.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i32_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i32_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddv.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}

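; i8 -> i16 reductions reuse the 8-bit VADDV and then truncate or sign-extend
; the 32-bit scalar result to the declared return width.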
define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  ret i16 %z
}

define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.s8 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  ret i16 %z
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i16_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  ret i16 %z
}

define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i16_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vaddv.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  ret i16 %z
}

define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i8:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddv.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
  ret i8 %z
}

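; As with i16, there is no VADDLV for 8-bit elements, so i8 -> i64 reductions
; of wide vectors are fully scalarized.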
define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u8 r0, q0[1]
; CHECK-NEXT:    vmov.u8 r1, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.i64 q1, #0xff
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov.u8 r3, q0[2]
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u8 r1, q0[3]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vmov.u8 r3, q0[4]
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u8 r1, q0[5]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, r3, d5
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    adc.w r1, r2, r3
; CHECK-NEXT:    vmov.u8 r2, q0[7]
; CHECK-NEXT:    vmov.u8 r3, q0[6]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[10]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q0[13]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[12]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    vmov.u8 r2, q0[15]
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov.u8 r3, q0[14]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-LABEL: add_v16i8_v16i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.s8 r0, q0[0]
; CHECK-NEXT:    vmov.s8 r2, q0[1]
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[2]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[3]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[4]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[5]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[6]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[7]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[8]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[9]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[10]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[11]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[12]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[13]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[14]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.s8 r2, q0[15]
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vmov.u16 r0, q0[1]
; CHECK-NEXT:    vmov.u16 r1, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r1, r0
; CHECK-NEXT:    vmov.u16 r3, q0[2]
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r0, s10
; CHECK-NEXT:    vmov r1, r2, d4
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u16 r1, q0[3]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, s10
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov.u16 r1, q0[5]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r1
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r1, s8
; CHECK-NEXT:    add r0, r1
; CHECK-NEXT:    vmov r1, r3, d5
; CHECK-NEXT:    adds r0, r0, r1
; CHECK-NEXT:    adc.w r1, r2, r3
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-LABEL: add_v8i8_v8i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.u16 r0, q0[0]
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[3]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[5]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[6]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vaddlv.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
; CHECK-LABEL: add_v4i8_v4i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddlv.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r0, s2
; CHECK-NEXT:    vmov r2, r1, d0
; CHECK-NEXT:    add r0, r2
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, s0
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    sxtb r0, r0
; CHECK-NEXT:    asrs r1, r0, #31
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r2, asr #31
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}

define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r0, r1, d1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
  ret i64 %z
}

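; The _acc variants below add a scalar accumulator to the reduction. Where a
; plain VADDV/VADDLV was generated above, the extra add folds into the
; accumulating VADDVA/VADDLVA forms; the scalarized cases keep a trailing
; scalar add instead.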
define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlva.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddlva.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.i64 q1, #0xffffffff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov lr, r12, d1
; CHECK-NEXT:    vmov r3, r2, d0
; CHECK-NEXT:    adds.w r3, r3, lr
; CHECK-NEXT:    adc.w r2, r2, r12
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    asr.w r12, r2, #31
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
; CHECK-LABEL: add_v8i16_v8i16_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    vmov r3, r12, d4
; CHECK-NEXT:    add.w lr, r3, r2
; CHECK-NEXT:    vmov.u16 r3, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    add r2, lr
; CHECK-NEXT:    add.w lr, r2, r3
; CHECK-NEXT:    vmov.u16 r3, q0[5]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    add lr, r2
; CHECK-NEXT:    vmov r3, r2, d5
; CHECK-NEXT:    adds.w lr, lr, r3
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds.w r2, r2, lr
; CHECK-NEXT:    adc.w r3, r3, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.s16 r2, q0[0]
; CHECK-NEXT:    vmov.s16 r3, q0[1]
; CHECK-NEXT:    asr.w r12, r2, #31
; CHECK-NEXT:    adds.w lr, r2, r3
; CHECK-NEXT:    vmov.s16 r2, q0[2]
; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
; CHECK-NEXT:    adds.w r12, lr, r2
; CHECK-NEXT:    adc.w r2, r3, r2, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[3]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[4]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[5]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[6]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w lr, r2, r3, asr #31
; CHECK-NEXT:    vmov.s16 r3, q0[7]
; CHECK-NEXT:    adds.w r2, r12, r3
; CHECK-NEXT:    adc.w r3, lr, r3, asr #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v4i16_v4i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u16 q0, q0
; CHECK-NEXT:    vaddlva.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v4i16_v4i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddlva.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, r12, d0
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r12
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxth r2, r2
; CHECK-NEXT:    asr.w r12, r2, #31
; CHECK-NEXT:    sxth r3, r3
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.s8 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v8i8_v8i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v8i8_v8i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vaddva.s16 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddva.u32 r0, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}

define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.s8 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    uxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vaddva.u16 r0, q0
; CHECK-NEXT:    sxth r0, r0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  %r = add i16 %z, %a
  ret i16 %r
}

define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
; CHECK-LABEL: add_v16i8_v16i8_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vaddva.u8 r0, q0
; CHECK-NEXT:    uxtb r0, r0
; CHECK-NEXT:    bx lr
entry:
  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
  %r = add i8 %z, %a
  ret i8 %r
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.u8 r2, q0[1]
; CHECK-NEXT:    vmov.u8 r3, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vmov.i64 q1, #0xff
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    vmov r3, r12, d4
; CHECK-NEXT:    add.w lr, r3, r2
; CHECK-NEXT:    vmov.u8 r3, q0[3]
; CHECK-NEXT:    vmov.u8 r2, q0[2]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    add r2, lr
; CHECK-NEXT:    add.w lr, r2, r3
; CHECK-NEXT:    vmov.u8 r3, q0[5]
; CHECK-NEXT:    vmov.u8 r2, q0[4]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    add lr, r2
; CHECK-NEXT:    vmov r3, r2, d5
; CHECK-NEXT:    adds.w lr, lr, r3
; CHECK-NEXT:    vmov.u8 r3, q0[6]
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov.u8 r2, q0[7]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    vmov.u8 r2, q0[9]
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov.u8 r3, q0[8]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    vmov.u8 r2, q0[11]
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov.u8 r3, q0[10]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    vmov.u8 r2, q0[13]
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov.u8 r3, q0[12]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, r3, d4
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov r2, r3, d5
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    vmov.u8 r2, q0[15]
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov.u8 r3, q0[14]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds.w r2, r2, lr
; CHECK-NEXT:    adc.w r3, r3, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.s8 r2, q0[0]
; CHECK-NEXT:    vmov.s8 r3, q0[1]
; CHECK-NEXT:    asr.w r12, r2, #31
; CHECK-NEXT:    adds.w lr, r2, r3
; CHECK-NEXT:    vmov.s8 r2, q0[2]
; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
; CHECK-NEXT:    adds.w r12, lr, r2
; CHECK-NEXT:    adc.w r2, r3, r2, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[3]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[4]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[5]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[6]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[7]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[8]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[9]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[10]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[11]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[12]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[13]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[14]
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w lr, r2, r3, asr #31
; CHECK-NEXT:    vmov.s8 r3, q0[15]
; CHECK-NEXT:    adds.w r2, r12, r3
; CHECK-NEXT:    adc.w r3, lr, r3, asr #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v8i8_v8i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmovlb.u8 q0, q0
; CHECK-NEXT:    vmov.i64 q1, #0xffff
; CHECK-NEXT:    vmov.u16 r2, q0[1]
; CHECK-NEXT:    vmov.u16 r3, q0[0]
; CHECK-NEXT:    vmov q2[2], q2[0], r3, r2
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s10
; CHECK-NEXT:    vmov r3, r12, d4
; CHECK-NEXT:    add.w lr, r3, r2
; CHECK-NEXT:    vmov.u16 r3, q0[3]
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    vmov r3, s10
; CHECK-NEXT:    add r2, lr
; CHECK-NEXT:    add.w lr, r2, r3
; CHECK-NEXT:    vmov.u16 r3, q0[5]
; CHECK-NEXT:    vmov.u16 r2, q0[4]
; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
; CHECK-NEXT:    vand q2, q2, q1
; CHECK-NEXT:    vmov r2, s8
; CHECK-NEXT:    add lr, r2
; CHECK-NEXT:    vmov r3, r2, d5
; CHECK-NEXT:    adds.w lr, lr, r3
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    adc.w r12, r12, r2
; CHECK-NEXT:    vmov.u16 r2, q0[7]
; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, r3, d0
; CHECK-NEXT:    adds.w lr, lr, r2
; CHECK-NEXT:    adc.w r12, r12, r3
; CHECK-NEXT:    vmov r2, r3, d1
; CHECK-NEXT:    adds.w r2, r2, lr
; CHECK-NEXT:    adc.w r3, r3, r12
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = zext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v8i8_v8i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov.u16 r2, q0[0]
; CHECK-NEXT:    vmov.u16 r3, q0[1]
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    asr.w r12, r2, #31
; CHECK-NEXT:    adds.w lr, r2, r3
; CHECK-NEXT:    vmov.u16 r2, q0[2]
; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    adds.w r12, lr, r2
; CHECK-NEXT:    adc.w r2, r3, r2, asr #31
; CHECK-NEXT:    vmov.u16 r3, q0[3]
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.u16 r3, q0[4]
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.u16 r3, q0[5]
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w r2, r2, r3, asr #31
; CHECK-NEXT:    vmov.u16 r3, q0[6]
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    adds.w r12, r12, r3
; CHECK-NEXT:    adc.w lr, r2, r3, asr #31
; CHECK-NEXT:    vmov.u16 r3, q0[7]
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    adds.w r2, r12, r3
; CHECK-NEXT:    adc.w r3, lr, r3, asr #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    pop {r7, pc}
entry:
  %xx = sext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v4i8_v4i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i32 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vaddlva.u32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v4i8_v4i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmovlb.s8 q0, q0
; CHECK-NEXT:    vmovlb.s16 q0, q0
; CHECK-NEXT:    vaddlva.s32 r0, r1, q0
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov.i64 q1, #0xff
; CHECK-NEXT:    vand q0, q0, q1
; CHECK-NEXT:    vmov r2, s2
; CHECK-NEXT:    vmov r3, r12, d0
; CHECK-NEXT:    add r2, r3
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adc.w r1, r1, r12
; CHECK-NEXT:    bx lr
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    vmov r2, s0
; CHECK-NEXT:    vmov r3, s2
; CHECK-NEXT:    sxtb r2, r2
; CHECK-NEXT:    asr.w r12, r2, #31
; CHECK-NEXT:    sxtb r3, r3
; CHECK-NEXT:    adds r2, r2, r3
; CHECK-NEXT:    adc.w r3, r12, r3, asr #31
; CHECK-NEXT:    adds r0, r0, r2
; CHECK-NEXT:    adcs r1, r3
; CHECK-NEXT:    bx lr
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}

define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
; CHECK-LABEL: add_v2i64_v2i64_acc:
; CHECK:       @ %bb.0: @ %entry
; CHECK-NEXT:    .save {r7, lr}
; CHECK-NEXT:    push {r7, lr}
; CHECK-NEXT:    vmov lr, r12, d1
; CHECK-NEXT:    vmov r3, r2, d0
; CHECK-NEXT:    adds.w r3, r3, lr
; CHECK-NEXT:    adc.w r2, r2, r12
; CHECK-NEXT:    adds r0, r0, r3
; CHECK-NEXT:    adcs r1, r2
; CHECK-NEXT:    pop {r7, pc}
entry:
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
  %r = add i64 %z, %a
  ret i64 %r
}

declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)