xref: /llvm-project/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll (revision 7b3bbd83c0c24087072ec5b22a76799ab31f87d5)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK
3
; Predicated v4i32 multiply-reduce: lanes of %x * %y are kept where %b == 0 (others become 0) and summed to i32. The assertions expect a vpt.i32 block predicating a single vmlavt.u32.
4define arm_aapcs_vfpcc i32 @add_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
5; CHECK-LABEL: add_v4i32_v4i32:
6; CHECK:       @ %bb.0: @ %entry
7; CHECK-NEXT:    vpt.i32 eq, q2, zr
8; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
9; CHECK-NEXT:    bx lr
10entry:
11  %c = icmp eq <4 x i32> %b, zeroinitializer
12  %m = mul <4 x i32> %x, %y
13  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
14  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
15  ret i32 %z
16}
17
; Predicated multiply-reduce with both v4i32 operands zero-extended to i64 before the multiply; result is an i64 sum. The assertions expect a vpt.i32 block predicating vmlalvt.u32 into r0/r1.
18define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
19; CHECK-LABEL: add_v4i32_v4i64_zext:
20; CHECK:       @ %bb.0: @ %entry
21; CHECK-NEXT:    vpt.i32 eq, q2, zr
22; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
23; CHECK-NEXT:    bx lr
24entry:
25  %c = icmp eq <4 x i32> %b, zeroinitializer
26  %xx = zext <4 x i32> %x to <4 x i64>
27  %yy = zext <4 x i32> %y to <4 x i64>
28  %m = mul <4 x i64> %xx, %yy
29  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
30  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
31  ret i64 %z
32}
33
; Predicated multiply-reduce with both v4i32 operands sign-extended to i64 before the multiply; result is an i64 sum. The assertions expect a vpt.i32 block predicating vmlalvt.s32 into r0/r1.
34define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b) {
35; CHECK-LABEL: add_v4i32_v4i64_sext:
36; CHECK:       @ %bb.0: @ %entry
37; CHECK-NEXT:    vpt.i32 eq, q2, zr
38; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
39; CHECK-NEXT:    bx lr
40entry:
41  %c = icmp eq <4 x i32> %b, zeroinitializer
42  %xx = sext <4 x i32> %x to <4 x i64>
43  %yy = sext <4 x i32> %y to <4 x i64>
44  %m = mul <4 x i64> %xx, %yy
45  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
46  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
47  ret i64 %z
48}
49
; Two-lane variant: v2i32 operands zero-extended to i64, multiplied, selected where %b == 0 and summed. The expected output is not a single predicated reduction: the product comes from vmullb.u32, the predicate is assembled in r1 with cmp/csetm/bfi and moved to p0, vpsel picks product or zero, and the two 64-bit lanes are added with adds/adcs.
50define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
51; CHECK-LABEL: add_v2i32_v2i64_zext:
52; CHECK:       @ %bb.0: @ %entry
53; CHECK-NEXT:    vmov r0, s8
54; CHECK-NEXT:    movs r1, #0
55; CHECK-NEXT:    vmullb.u32 q3, q0, q1
56; CHECK-NEXT:    vmov.i32 q0, #0x0
57; CHECK-NEXT:    cmp r0, #0
58; CHECK-NEXT:    csetm r0, eq
59; CHECK-NEXT:    bfi r1, r0, #0, #8
60; CHECK-NEXT:    vmov r0, s10
61; CHECK-NEXT:    cmp r0, #0
62; CHECK-NEXT:    csetm r0, eq
63; CHECK-NEXT:    bfi r1, r0, #8, #8
64; CHECK-NEXT:    vmsr p0, r1
65; CHECK-NEXT:    vpsel q0, q3, q0
66; CHECK-NEXT:    vmov r0, r1, d1
67; CHECK-NEXT:    vmov r2, r3, d0
68; CHECK-NEXT:    adds r0, r0, r2
69; CHECK-NEXT:    adcs r1, r3
70; CHECK-NEXT:    bx lr
71entry:
72  %c = icmp eq <2 x i32> %b, zeroinitializer
73  %xx = zext <2 x i32> %x to <2 x i64>
74  %yy = zext <2 x i32> %y to <2 x i64>
75  %m = mul <2 x i64> %xx, %yy
76  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
77  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
78  ret i64 %z
79}
80
; Two-lane variant with sign-extended v2i32 operands. Expected output mirrors the zero-extended two-lane test except that the product comes from vmullb.s32: predicate built via cmp/csetm/bfi + vmsr p0, vpsel against zero, then adds/adcs of the two 64-bit lanes.
81define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b) {
82; CHECK-LABEL: add_v2i32_v2i64_sext:
83; CHECK:       @ %bb.0: @ %entry
84; CHECK-NEXT:    vmov r0, s8
85; CHECK-NEXT:    movs r1, #0
86; CHECK-NEXT:    vmullb.s32 q3, q0, q1
87; CHECK-NEXT:    vmov.i32 q0, #0x0
88; CHECK-NEXT:    cmp r0, #0
89; CHECK-NEXT:    csetm r0, eq
90; CHECK-NEXT:    bfi r1, r0, #0, #8
91; CHECK-NEXT:    vmov r0, s10
92; CHECK-NEXT:    cmp r0, #0
93; CHECK-NEXT:    csetm r0, eq
94; CHECK-NEXT:    bfi r1, r0, #8, #8
95; CHECK-NEXT:    vmsr p0, r1
96; CHECK-NEXT:    vpsel q0, q3, q0
97; CHECK-NEXT:    vmov r0, r1, d1
98; CHECK-NEXT:    vmov r2, r3, d0
99; CHECK-NEXT:    adds r0, r0, r2
100; CHECK-NEXT:    adcs r1, r3
101; CHECK-NEXT:    bx lr
102entry:
103  %c = icmp eq <2 x i32> %b, zeroinitializer
104  %xx = sext <2 x i32> %x to <2 x i64>
105  %yy = sext <2 x i32> %y to <2 x i64>
106  %m = mul <2 x i64> %xx, %yy
107  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
108  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
109  ret i64 %z
110}
111
; Predicated multiply-reduce of v8i16 operands zero-extended to i32, summed to i32. The assertions expect a vpt.i16 block predicating vmlavt.u16 with no separate extend instructions.
112define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
113; CHECK-LABEL: add_v8i16_v8i32_zext:
114; CHECK:       @ %bb.0: @ %entry
115; CHECK-NEXT:    vpt.i16 eq, q2, zr
116; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
117; CHECK-NEXT:    bx lr
118entry:
119  %c = icmp eq <8 x i16> %b, zeroinitializer
120  %xx = zext <8 x i16> %x to <8 x i32>
121  %yy = zext <8 x i16> %y to <8 x i32>
122  %m = mul <8 x i32> %xx, %yy
123  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
124  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
125  ret i32 %z
126}
127
; Predicated multiply-reduce of v8i16 operands sign-extended to i32, summed to i32. The assertions expect a vpt.i16 block predicating vmlavt.s16 with no separate extend instructions.
128define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
129; CHECK-LABEL: add_v8i16_v8i32_sext:
130; CHECK:       @ %bb.0: @ %entry
131; CHECK-NEXT:    vpt.i16 eq, q2, zr
132; CHECK-NEXT:    vmlavt.s16 r0, q0, q1
133; CHECK-NEXT:    bx lr
134entry:
135  %c = icmp eq <8 x i16> %b, zeroinitializer
136  %xx = sext <8 x i16> %x to <8 x i32>
137  %yy = sext <8 x i16> %y to <8 x i32>
138  %m = mul <8 x i32> %xx, %yy
139  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
140  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
141  ret i32 %z
142}
143
; Predicated multiply-reduce of <4 x i16> operands zero-extended to i32. The assertions expect all three inputs (mask and both operands) to be widened with vmovlb.u16 first, then a vpt.i32 block predicating vmlavt.u32.
144define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
145; CHECK-LABEL: add_v4i16_v4i32_zext:
146; CHECK:       @ %bb.0: @ %entry
147; CHECK-NEXT:    vmovlb.u16 q2, q2
148; CHECK-NEXT:    vmovlb.u16 q1, q1
149; CHECK-NEXT:    vmovlb.u16 q0, q0
150; CHECK-NEXT:    vpt.i32 eq, q2, zr
151; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
152; CHECK-NEXT:    bx lr
153entry:
154  %c = icmp eq <4 x i16> %b, zeroinitializer
155  %xx = zext <4 x i16> %x to <4 x i32>
156  %yy = zext <4 x i16> %y to <4 x i32>
157  %m = mul <4 x i32> %xx, %yy
158  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
159  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
160  ret i32 %z
161}
162
; Predicated multiply-reduce of <4 x i16> operands sign-extended to i32. The assertions expect the mask widened with vmovlb.u16 and the operands with vmovlb.s16, then a vpt.i32 block predicating vmlavt.u32 (the multiply itself is a plain 32-bit one in the IR).
163define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
164; CHECK-LABEL: add_v4i16_v4i32_sext:
165; CHECK:       @ %bb.0: @ %entry
166; CHECK-NEXT:    vmovlb.u16 q2, q2
167; CHECK-NEXT:    vmovlb.s16 q1, q1
168; CHECK-NEXT:    vmovlb.s16 q0, q0
169; CHECK-NEXT:    vpt.i32 eq, q2, zr
170; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
171; CHECK-NEXT:    bx lr
172entry:
173  %c = icmp eq <4 x i16> %b, zeroinitializer
174  %xx = sext <4 x i16> %x to <4 x i32>
175  %yy = sext <4 x i16> %y to <4 x i32>
176  %m = mul <4 x i32> %xx, %yy
177  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
178  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
179  ret i32 %z
180}
181
; Predicated v8i16 multiply-reduce with no widening; the i16 result is returned zeroext. The assertions expect a vpt.i16 block predicating vmlavt.u16 followed by uxth on the result.
182define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
183; CHECK-LABEL: add_v8i16_v8i16:
184; CHECK:       @ %bb.0: @ %entry
185; CHECK-NEXT:    vpt.i16 eq, q2, zr
186; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
187; CHECK-NEXT:    uxth r0, r0
188; CHECK-NEXT:    bx lr
189entry:
190  %c = icmp eq <8 x i16> %b, zeroinitializer
191  %m = mul <8 x i16> %x, %y
192  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
193  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
194  ret i16 %z
195}
196
; Predicated multiply-reduce of v8i16 operands zero-extended directly to i64, summed to i64. The assertions expect a vpt.i16 block predicating vmlalvt.u16 into r0/r1.
197define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
198; CHECK-LABEL: add_v8i16_v8i64_zext:
199; CHECK:       @ %bb.0: @ %entry
200; CHECK-NEXT:    vpt.i16 eq, q2, zr
201; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
202; CHECK-NEXT:    bx lr
203entry:
204  %c = icmp eq <8 x i16> %b, zeroinitializer
205  %xx = zext <8 x i16> %x to <8 x i64>
206  %yy = zext <8 x i16> %y to <8 x i64>
207  %m = mul <8 x i64> %xx, %yy
208  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
209  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
210  ret i64 %z
211}
212
; Predicated multiply-reduce of v8i16 operands sign-extended directly to i64, summed to i64. The assertions expect a vpt.i16 block predicating vmlalvt.s16 into r0/r1.
213define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
214; CHECK-LABEL: add_v8i16_v8i64_sext:
215; CHECK:       @ %bb.0: @ %entry
216; CHECK-NEXT:    vpt.i16 eq, q2, zr
217; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
218; CHECK-NEXT:    bx lr
219entry:
220  %c = icmp eq <8 x i16> %b, zeroinitializer
221  %xx = sext <8 x i16> %x to <8 x i64>
222  %yy = sext <8 x i16> %y to <8 x i64>
223  %m = mul <8 x i64> %xx, %yy
224  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
225  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
226  ret i64 %z
227}
228
; Mixed-width operands: %x is <8 x i16> and %y is <8 x i8>, both zero-extended to i64 before the predicated multiply-reduce. The assertions expect only %y to be widened (vmovlb.u8 q1) ahead of a vpt.i16 block predicating vmlalvt.u16.
229define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_zext(<8 x i16> %x, <8 x i8> %y, <8 x i16> %b) {
230; CHECK-LABEL: add_v8i8i16_v8i64_zext:
231; CHECK:       @ %bb.0: @ %entry
232; CHECK-NEXT:    vmovlb.u8 q1, q1
233; CHECK-NEXT:    vpt.i16 eq, q2, zr
234; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
235; CHECK-NEXT:    bx lr
236entry:
237  %c = icmp eq <8 x i16> %b, zeroinitializer
238  %xx = zext <8 x i16> %x to <8 x i64>
239  %yy = zext <8 x i8> %y to <8 x i64>
240  %m = mul <8 x i64> %xx, %yy
241  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
242  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
243  ret i64 %z
244}
245
; Mixed-width operands: %x is <8 x i16> and %y is <8 x i8>, both sign-extended to i64 before the predicated multiply-reduce. The assertions expect only %y to be widened (vmovlb.s8 q1) ahead of a vpt.i16 block predicating vmlalvt.s16.
246define arm_aapcs_vfpcc i64 @add_v8i8i16_v8i64_sext(<8 x i16> %x, <8 x i8> %y, <8 x i16> %b) {
247; CHECK-LABEL: add_v8i8i16_v8i64_sext:
248; CHECK:       @ %bb.0: @ %entry
249; CHECK-NEXT:    vmovlb.s8 q1, q1
250; CHECK-NEXT:    vpt.i16 eq, q2, zr
251; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
252; CHECK-NEXT:    bx lr
253entry:
254  %c = icmp eq <8 x i16> %b, zeroinitializer
255  %xx = sext <8 x i16> %x to <8 x i64>
256  %yy = sext <8 x i8> %y to <8 x i64>
257  %m = mul <8 x i64> %xx, %yy
258  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
259  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
260  ret i64 %z
261}
262
; Two-step widening: v8i16 operands are zero-extended to i32, multiplied, and the product is zero-extended to i64 before the predicated reduce. The assertions expect this to fold to a vpt.i16 block predicating vmlalvt.u16.
263define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
264; CHECK-LABEL: add_v8i16_v8i32_v8i64_zext:
265; CHECK:       @ %bb.0: @ %entry
266; CHECK-NEXT:    vpt.i16 eq, q2, zr
267; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
268; CHECK-NEXT:    bx lr
269entry:
270  %c = icmp eq <8 x i16> %b, zeroinitializer
271  %xx = zext <8 x i16> %x to <8 x i32>
272  %yy = zext <8 x i16> %y to <8 x i32>
273  %m = mul <8 x i32> %xx, %yy
274  %ma = zext <8 x i32> %m to <8 x i64>
275  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
276  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
277  ret i64 %z
278}
279
; Two-step widening: v8i16 operands are sign-extended to i32, multiplied, and the product is sign-extended to i64 before the predicated reduce. The assertions expect this to fold to a vpt.i16 block predicating vmlalvt.s16.
280define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
281; CHECK-LABEL: add_v8i16_v8i32_v8i64_sext:
282; CHECK:       @ %bb.0: @ %entry
283; CHECK-NEXT:    vpt.i16 eq, q2, zr
284; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
285; CHECK-NEXT:    bx lr
286entry:
287  %c = icmp eq <8 x i16> %b, zeroinitializer
288  %xx = sext <8 x i16> %x to <8 x i32>
289  %yy = sext <8 x i16> %y to <8 x i32>
290  %m = mul <8 x i32> %xx, %yy
291  %ma = sext <8 x i32> %m to <8 x i64>
292  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
293  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
294  ret i64 %z
295}
296
; Square of sign-extended %x (i16 -> i32, multiplied by itself), with the product zero-extended to i64 before the predicated reduce; %y is unused. The assertions expect a vpt.i16 block predicating vmlalvt.s16 with q0 as both multiplicands.
297define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b) {
298; CHECK-LABEL: add_v8i16_v8i32_v8i64_sextzext:
299; CHECK:       @ %bb.0: @ %entry
300; CHECK-NEXT:    vpt.i16 eq, q2, zr
301; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q0
302; CHECK-NEXT:    bx lr
303entry:
304  %c = icmp eq <8 x i16> %b, zeroinitializer
305  %xx = sext <8 x i16> %x to <8 x i32>
306  %m = mul <8 x i32> %xx, %xx
307  %ma = zext <8 x i32> %m to <8 x i64>
308  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
309  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
310  ret i64 %z
311}
312
; Predicated multiply-reduce of <4 x i16> operands zero-extended to i64. The assertions expect mask and operands widened with vmovlb.u16, then a vpt.i32 block predicating vmlalvt.u32 into r0/r1.
313define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
314; CHECK-LABEL: add_v4i16_v4i64_zext:
315; CHECK:       @ %bb.0: @ %entry
316; CHECK-NEXT:    vmovlb.u16 q2, q2
317; CHECK-NEXT:    vmovlb.u16 q1, q1
318; CHECK-NEXT:    vmovlb.u16 q0, q0
319; CHECK-NEXT:    vpt.i32 eq, q2, zr
320; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
321; CHECK-NEXT:    bx lr
322entry:
323  %c = icmp eq <4 x i16> %b, zeroinitializer
324  %xx = zext <4 x i16> %x to <4 x i64>
325  %yy = zext <4 x i16> %y to <4 x i64>
326  %m = mul <4 x i64> %xx, %yy
327  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
328  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
329  ret i64 %z
330}
331
; Predicated multiply-reduce of <4 x i16> operands sign-extended to i64. The assertions expect the mask widened with vmovlb.u16 and the operands with vmovlb.s16, then a vpt.i32 block predicating vmlalvt.s32 into r0/r1.
332define arm_aapcs_vfpcc i64 @add_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b) {
333; CHECK-LABEL: add_v4i16_v4i64_sext:
334; CHECK:       @ %bb.0: @ %entry
335; CHECK-NEXT:    vmovlb.u16 q2, q2
336; CHECK-NEXT:    vmovlb.s16 q1, q1
337; CHECK-NEXT:    vmovlb.s16 q0, q0
338; CHECK-NEXT:    vpt.i32 eq, q2, zr
339; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
340; CHECK-NEXT:    bx lr
341entry:
342  %c = icmp eq <4 x i16> %b, zeroinitializer
343  %xx = sext <4 x i16> %x to <4 x i64>
344  %yy = sext <4 x i16> %y to <4 x i64>
345  %m = mul <4 x i64> %xx, %yy
346  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
347  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
348  ret i64 %z
349}
350
; Two-lane variant with <2 x i16> operands zero-extended to i64. The expected output is scalarised: inputs are masked with #0xffff via vand, each lane product is formed with umull in core registers and reinserted into q0, the predicate is built with cmp/csetm/bfi and written to p0, vpsel selects product or zero, and the two 64-bit lanes are summed with adds/adcs.
351define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
352; CHECK-LABEL: add_v2i16_v2i64_zext:
353; CHECK:       @ %bb.0: @ %entry
354; CHECK-NEXT:    vmov.i64 q3, #0xffff
355; CHECK-NEXT:    vand q1, q1, q3
356; CHECK-NEXT:    vand q0, q0, q3
357; CHECK-NEXT:    vmov r0, s6
358; CHECK-NEXT:    vmov r1, s2
359; CHECK-NEXT:    vmov r2, s4
360; CHECK-NEXT:    vand q1, q2, q3
361; CHECK-NEXT:    vmov r3, s0
362; CHECK-NEXT:    umull r0, r1, r1, r0
363; CHECK-NEXT:    umull r2, r3, r3, r2
364; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
365; CHECK-NEXT:    vmov r0, s4
366; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
367; CHECK-NEXT:    movs r1, #0
368; CHECK-NEXT:    cmp r0, #0
369; CHECK-NEXT:    csetm r0, eq
370; CHECK-NEXT:    bfi r1, r0, #0, #8
371; CHECK-NEXT:    vmov r0, s6
372; CHECK-NEXT:    vmov.i32 q1, #0x0
373; CHECK-NEXT:    cmp r0, #0
374; CHECK-NEXT:    csetm r0, eq
375; CHECK-NEXT:    bfi r1, r0, #8, #8
376; CHECK-NEXT:    vmsr p0, r1
377; CHECK-NEXT:    vpsel q0, q0, q1
378; CHECK-NEXT:    vmov r0, r1, d1
379; CHECK-NEXT:    vmov r2, r3, d0
380; CHECK-NEXT:    adds r0, r0, r2
381; CHECK-NEXT:    adcs r1, r3
382; CHECK-NEXT:    bx lr
383entry:
384  %c = icmp eq <2 x i16> %b, zeroinitializer
385  %xx = zext <2 x i16> %x to <2 x i64>
386  %yy = zext <2 x i16> %y to <2 x i64>
387  %m = mul <2 x i64> %xx, %yy
388  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
389  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
390  ret i64 %z
391}
392
; Two-lane variant with <2 x i16> operands sign-extended to i64. The expected output is scalarised: the mask is and-ed with #0xffff, each operand lane is moved to a core register and sign-extended with sxth, products are formed with smull and reinserted into q0, the predicate is built with cmp/csetm/bfi and written to p0, vpsel selects product or zero, and the two 64-bit lanes are summed with adds/adcs.
393define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b) {
394; CHECK-LABEL: add_v2i16_v2i64_sext:
395; CHECK:       @ %bb.0: @ %entry
396; CHECK-NEXT:    vmov.i32 q3, #0xffff
397; CHECK-NEXT:    movs r1, #0
398; CHECK-NEXT:    vand q2, q2, q3
399; CHECK-NEXT:    vmov r2, s4
400; CHECK-NEXT:    vmov r0, s8
401; CHECK-NEXT:    vmov r3, s0
402; CHECK-NEXT:    cmp r0, #0
403; CHECK-NEXT:    sxth r2, r2
404; CHECK-NEXT:    csetm r0, eq
405; CHECK-NEXT:    bfi r1, r0, #0, #8
406; CHECK-NEXT:    vmov r0, s10
407; CHECK-NEXT:    sxth r3, r3
408; CHECK-NEXT:    smull r2, r3, r3, r2
409; CHECK-NEXT:    cmp r0, #0
410; CHECK-NEXT:    csetm r0, eq
411; CHECK-NEXT:    bfi r1, r0, #8, #8
412; CHECK-NEXT:    vmov r0, s6
413; CHECK-NEXT:    vmsr p0, r1
414; CHECK-NEXT:    vmov r1, s2
415; CHECK-NEXT:    vmov.i32 q1, #0x0
416; CHECK-NEXT:    sxth r0, r0
417; CHECK-NEXT:    sxth r1, r1
418; CHECK-NEXT:    smull r0, r1, r1, r0
419; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
420; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
421; CHECK-NEXT:    vpsel q0, q0, q1
422; CHECK-NEXT:    vmov r0, r1, d1
423; CHECK-NEXT:    vmov r2, r3, d0
424; CHECK-NEXT:    adds r0, r0, r2
425; CHECK-NEXT:    adcs r1, r3
426; CHECK-NEXT:    bx lr
427entry:
428  %c = icmp eq <2 x i16> %b, zeroinitializer
429  %xx = sext <2 x i16> %x to <2 x i64>
430  %yy = sext <2 x i16> %y to <2 x i64>
431  %m = mul <2 x i64> %xx, %yy
432  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
433  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
434  ret i64 %z
435}
436
; Predicated multiply-reduce of v16i8 operands zero-extended to i32, summed to i32. The assertions expect a vpt.i8 block predicating vmlavt.u8.
437define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
438; CHECK-LABEL: add_v16i8_v16i32_zext:
439; CHECK:       @ %bb.0: @ %entry
440; CHECK-NEXT:    vpt.i8 eq, q2, zr
441; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
442; CHECK-NEXT:    bx lr
443entry:
444  %c = icmp eq <16 x i8> %b, zeroinitializer
445  %xx = zext <16 x i8> %x to <16 x i32>
446  %yy = zext <16 x i8> %y to <16 x i32>
447  %m = mul <16 x i32> %xx, %yy
448  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
449  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
450  ret i32 %z
451}
452
; Predicated multiply-reduce of v16i8 operands sign-extended to i32, summed to i32. The assertions expect a vpt.i8 block predicating vmlavt.s8.
453define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
454; CHECK-LABEL: add_v16i8_v16i32_sext:
455; CHECK:       @ %bb.0: @ %entry
456; CHECK-NEXT:    vpt.i8 eq, q2, zr
457; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
458; CHECK-NEXT:    bx lr
459entry:
460  %c = icmp eq <16 x i8> %b, zeroinitializer
461  %xx = sext <16 x i8> %x to <16 x i32>
462  %yy = sext <16 x i8> %y to <16 x i32>
463  %m = mul <16 x i32> %xx, %yy
464  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
465  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
466  ret i32 %z
467}
468
; Two-step widening: v16i8 operands are zero-extended to i16, multiplied, and the product is zero-extended to i32 before the predicated reduce. The assertions expect this to fold to a vpt.i8 block predicating vmlavt.u8.
469define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
470; CHECK-LABEL: add_v16i8_v16i16_v16i32_zext:
471; CHECK:       @ %bb.0: @ %entry
472; CHECK-NEXT:    vpt.i8 eq, q2, zr
473; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
474; CHECK-NEXT:    bx lr
475entry:
476  %c = icmp eq <16 x i8> %b, zeroinitializer
477  %xx = zext <16 x i8> %x to <16 x i16>
478  %yy = zext <16 x i8> %y to <16 x i16>
479  %m = mul <16 x i16> %xx, %yy
480  %ma = zext <16 x i16> %m to <16 x i32>
481  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
482  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
483  ret i32 %z
484}
485
; Two-step widening: v16i8 operands are sign-extended to i16, multiplied, and the product is sign-extended to i32 before the predicated reduce. The assertions expect this to fold to a vpt.i8 block predicating vmlavt.s8.
486define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
487; CHECK-LABEL: add_v16i8_v16i16_v16i32_sext:
488; CHECK:       @ %bb.0: @ %entry
489; CHECK-NEXT:    vpt.i8 eq, q2, zr
490; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
491; CHECK-NEXT:    bx lr
492entry:
493  %c = icmp eq <16 x i8> %b, zeroinitializer
494  %xx = sext <16 x i8> %x to <16 x i16>
495  %yy = sext <16 x i8> %y to <16 x i16>
496  %m = mul <16 x i16> %xx, %yy
497  %ma = sext <16 x i16> %m to <16 x i32>
498  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
499  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
500  ret i32 %z
501}
502
; Square of sign-extended %x (i8 -> i16, multiplied by itself), with the product zero-extended to i32 before the predicated reduce; %y is unused. The assertions expect a vpt.i8 block predicating vmlavt.s8 with q0 as both multiplicands.
503define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
504; CHECK-LABEL: add_v16i8_v16i16_v16i32_sextzext:
505; CHECK:       @ %bb.0: @ %entry
506; CHECK-NEXT:    vpt.i8 eq, q2, zr
507; CHECK-NEXT:    vmlavt.s8 r0, q0, q0
508; CHECK-NEXT:    bx lr
509entry:
510  %c = icmp eq <16 x i8> %b, zeroinitializer
511  %xx = sext <16 x i8> %x to <16 x i16>
512  %m = mul <16 x i16> %xx, %xx
513  %ma = zext <16 x i16> %m to <16 x i32>
514  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
515  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
516  ret i32 %z
517}
518
; Predicated multiply-reduce of <8 x i8> operands zero-extended to i32. The assertions expect mask and operands widened with vmovlb.u8, then a vpt.i16 block predicating vmlavt.u16.
519define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
520; CHECK-LABEL: add_v8i8_v8i32_zext:
521; CHECK:       @ %bb.0: @ %entry
522; CHECK-NEXT:    vmovlb.u8 q2, q2
523; CHECK-NEXT:    vmovlb.u8 q1, q1
524; CHECK-NEXT:    vmovlb.u8 q0, q0
525; CHECK-NEXT:    vpt.i16 eq, q2, zr
526; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
527; CHECK-NEXT:    bx lr
528entry:
529  %c = icmp eq <8 x i8> %b, zeroinitializer
530  %xx = zext <8 x i8> %x to <8 x i32>
531  %yy = zext <8 x i8> %y to <8 x i32>
532  %m = mul <8 x i32> %xx, %yy
533  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
534  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
535  ret i32 %z
536}
537
; Predicated multiply-reduce of <8 x i8> operands sign-extended to i32. The assertions expect the mask widened with vmovlb.u8 and the operands with vmovlb.s8, then a vpt.i16 block predicating vmlavt.s16.
538define arm_aapcs_vfpcc i32 @add_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
539; CHECK-LABEL: add_v8i8_v8i32_sext:
540; CHECK:       @ %bb.0: @ %entry
541; CHECK-NEXT:    vmovlb.u8 q2, q2
542; CHECK-NEXT:    vmovlb.s8 q1, q1
543; CHECK-NEXT:    vmovlb.s8 q0, q0
544; CHECK-NEXT:    vpt.i16 eq, q2, zr
545; CHECK-NEXT:    vmlavt.s16 r0, q0, q1
546; CHECK-NEXT:    bx lr
547entry:
548  %c = icmp eq <8 x i8> %b, zeroinitializer
549  %xx = sext <8 x i8> %x to <8 x i32>
550  %yy = sext <8 x i8> %y to <8 x i32>
551  %m = mul <8 x i32> %xx, %yy
552  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
553  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
554  ret i32 %z
555}
556
; Mixed-width operands: %x is <8 x i8> and %y is <8 x i16>, both zero-extended to i32 before the predicated multiply-reduce. The assertions expect the mask and %x widened with vmovlb.u8 (%y left as is), then a vpt.i16 block predicating vmlavt.u16.
557define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_zext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
558; CHECK-LABEL: add_v8i8i16_v8i32_zext:
559; CHECK:       @ %bb.0: @ %entry
560; CHECK-NEXT:    vmovlb.u8 q2, q2
561; CHECK-NEXT:    vmovlb.u8 q0, q0
562; CHECK-NEXT:    vpt.i16 eq, q2, zr
563; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
564; CHECK-NEXT:    bx lr
565entry:
566  %c = icmp eq <8 x i8> %b, zeroinitializer
567  %xx = zext <8 x i8> %x to <8 x i32>
568  %yy = zext <8 x i16> %y to <8 x i32>
569  %m = mul <8 x i32> %xx, %yy
570  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
571  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
572  ret i32 %z
573}
574
; Mixed-width operands: %x is <8 x i8> and %y is <8 x i16>, both sign-extended to i32 before the predicated multiply-reduce. The assertions expect the mask widened with vmovlb.u8 and %x with vmovlb.s8 (%y left as is), then a vpt.i16 block predicating vmlavt.s16.
575define arm_aapcs_vfpcc i32 @add_v8i8i16_v8i32_sext(<8 x i8> %x, <8 x i16> %y, <8 x i8> %b) {
576; CHECK-LABEL: add_v8i8i16_v8i32_sext:
577; CHECK:       @ %bb.0: @ %entry
578; CHECK-NEXT:    vmovlb.u8 q2, q2
579; CHECK-NEXT:    vmovlb.s8 q0, q0
580; CHECK-NEXT:    vpt.i16 eq, q2, zr
581; CHECK-NEXT:    vmlavt.s16 r0, q0, q1
582; CHECK-NEXT:    bx lr
583entry:
584  %c = icmp eq <8 x i8> %b, zeroinitializer
585  %xx = sext <8 x i8> %x to <8 x i32>
586  %yy = sext <8 x i16> %y to <8 x i32>
587  %m = mul <8 x i32> %xx, %yy
588  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
589  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
590  ret i32 %z
591}
592
; Predicated multiply-reduce of <4 x i8> operands zero-extended to i32. The assertions expect mask and operands to be and-ed with a #0xff splat, then a vpt.i32 block predicating vmlavt.u32.
593define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
594; CHECK-LABEL: add_v4i8_v4i32_zext:
595; CHECK:       @ %bb.0: @ %entry
596; CHECK-NEXT:    vmov.i32 q3, #0xff
597; CHECK-NEXT:    vand q2, q2, q3
598; CHECK-NEXT:    vand q1, q1, q3
599; CHECK-NEXT:    vand q0, q0, q3
600; CHECK-NEXT:    vpt.i32 eq, q2, zr
601; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
602; CHECK-NEXT:    bx lr
603entry:
604  %c = icmp eq <4 x i8> %b, zeroinitializer
605  %xx = zext <4 x i8> %x to <4 x i32>
606  %yy = zext <4 x i8> %y to <4 x i32>
607  %m = mul <4 x i32> %xx, %yy
608  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
609  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
610  ret i32 %z
611}
612
; Predicated multiply-reduce of <4 x i8> operands sign-extended to i32. The assertions expect the mask and-ed with a #0xff splat, each operand sign-extended in two steps (vmovlb.s8 then vmovlb.s16), then a vpt.i32 block predicating vmlavt.u32.
613define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
614; CHECK-LABEL: add_v4i8_v4i32_sext:
615; CHECK:       @ %bb.0: @ %entry
616; CHECK-NEXT:    vmov.i32 q3, #0xff
617; CHECK-NEXT:    vmovlb.s8 q1, q1
618; CHECK-NEXT:    vmovlb.s8 q0, q0
619; CHECK-NEXT:    vand q2, q2, q3
620; CHECK-NEXT:    vmovlb.s16 q1, q1
621; CHECK-NEXT:    vmovlb.s16 q0, q0
622; CHECK-NEXT:    vpt.i32 eq, q2, zr
623; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
624; CHECK-NEXT:    bx lr
625entry:
626  %c = icmp eq <4 x i8> %b, zeroinitializer
627  %xx = sext <4 x i8> %x to <4 x i32>
628  %yy = sext <4 x i8> %y to <4 x i32>
629  %m = mul <4 x i32> %xx, %yy
630  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
631  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
632  ret i32 %z
633}
634
; Mixed-signedness <4 x i8> case: %x is sign-extended and %y is zero-extended to i32 before the predicated multiply-reduce. The assertions expect %x widened with vmovlb.s8 + vmovlb.s16, %y and the mask and-ed with a #0xff splat, then a vpt.i32 block predicating vmlavt.u32.
635define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_szext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
636; CHECK-LABEL: add_v4i8_v4i32_szext:
637; CHECK:       @ %bb.0: @ %entry
638; CHECK-NEXT:    vmov.i32 q3, #0xff
639; CHECK-NEXT:    vmovlb.s8 q0, q0
640; CHECK-NEXT:    vand q2, q2, q3
641; CHECK-NEXT:    vand q1, q1, q3
642; CHECK-NEXT:    vmovlb.s16 q0, q0
643; CHECK-NEXT:    vpt.i32 eq, q2, zr
644; CHECK-NEXT:    vmlavt.u32 r0, q0, q1
645; CHECK-NEXT:    bx lr
646entry:
647  %c = icmp eq <4 x i8> %b, zeroinitializer
648  %xx = sext <4 x i8> %x to <4 x i32>
649  %yy = zext <4 x i8> %y to <4 x i32>
650  %m = mul <4 x i32> %xx, %yy
651  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
652  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
653  ret i32 %z
654}
655
; Predicated multiply-reduce of v16i8 operands zero-extended to i16, with the i16 sum returned zeroext. The assertions expect a vpt.i8 block predicating vmlavt.u8 followed by uxth on the result.
656define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
657; CHECK-LABEL: add_v16i8_v16i16_zext:
658; CHECK:       @ %bb.0: @ %entry
659; CHECK-NEXT:    vpt.i8 eq, q2, zr
660; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
661; CHECK-NEXT:    uxth r0, r0
662; CHECK-NEXT:    bx lr
663entry:
664  %c = icmp eq <16 x i8> %b, zeroinitializer
665  %xx = zext <16 x i8> %x to <16 x i16>
666  %yy = zext <16 x i8> %y to <16 x i16>
667  %m = mul <16 x i16> %xx, %yy
668  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
669  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
670  ret i16 %z
671}
672
; Predicated multiply-reduce of v16i8 operands sign-extended to i16, with the i16 sum returned signext. The assertions expect a vpt.i8 block predicating vmlavt.s8 followed by sxth on the result.
673define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
674; CHECK-LABEL: add_v16i8_v16i16_sext:
675; CHECK:       @ %bb.0: @ %entry
676; CHECK-NEXT:    vpt.i8 eq, q2, zr
677; CHECK-NEXT:    vmlavt.s8 r0, q0, q1
678; CHECK-NEXT:    sxth r0, r0
679; CHECK-NEXT:    bx lr
680entry:
681  %c = icmp eq <16 x i8> %b, zeroinitializer
682  %xx = sext <16 x i8> %x to <16 x i16>
683  %yy = sext <16 x i8> %y to <16 x i16>
684  %m = mul <16 x i16> %xx, %yy
685  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
686  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
687  ret i16 %z
688}
689
; Mixed-signedness v16i8 case: %x is sign-extended and %y is zero-extended to i16 before the predicated multiply-reduce, and the i16 sum is returned signext. The expected output does not use a single 8-bit reduction: both operands are stored to a 32-byte stack area and reloaded as two 8-lane halves with extending loads (vldrb.s16 for %x, vldrb.u16 for %y), the i8 predicate is materialised as 0x00/0xff bytes via vpsel and rebuilt lane by lane into two 16-bit masks, and the halves are reduced with vmlavt.u16 and then accumulated with vmlavat.u16, followed by sxth.
690define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_szext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
691; CHECK-LABEL: add_v16i8_v16i16_szext:
692; CHECK:       @ %bb.0: @ %entry
693; CHECK-NEXT:    .pad #32
694; CHECK-NEXT:    sub sp, #32
695; CHECK-NEXT:    add r0, sp, #16
696; CHECK-NEXT:    mov r1, sp
697; CHECK-NEXT:    vstrw.32 q1, [r0]
698; CHECK-NEXT:    vstrw.32 q0, [r1]
699; CHECK-NEXT:    vcmp.i8 eq, q2, zr
700; CHECK-NEXT:    vmov.i8 q0, #0x0
701; CHECK-NEXT:    vmov.i8 q1, #0xff
702; CHECK-NEXT:    vldrb.s16 q2, [r1, #8]
703; CHECK-NEXT:    vpsel q0, q1, q0
704; CHECK-NEXT:    vmov.u8 r2, q0[8]
705; CHECK-NEXT:    vmov.u8 r3, q0[0]
706; CHECK-NEXT:    vmov.16 q1[0], r2
707; CHECK-NEXT:    vmov.u8 r2, q0[9]
708; CHECK-NEXT:    vmov.16 q1[1], r2
709; CHECK-NEXT:    vmov.u8 r2, q0[10]
710; CHECK-NEXT:    vmov.16 q1[2], r2
711; CHECK-NEXT:    vmov.u8 r2, q0[11]
712; CHECK-NEXT:    vmov.16 q1[3], r2
713; CHECK-NEXT:    vmov.u8 r2, q0[12]
714; CHECK-NEXT:    vmov.16 q1[4], r2
715; CHECK-NEXT:    vmov.u8 r2, q0[13]
716; CHECK-NEXT:    vmov.16 q1[5], r2
717; CHECK-NEXT:    vmov.u8 r2, q0[14]
718; CHECK-NEXT:    vmov.16 q1[6], r2
719; CHECK-NEXT:    vmov.u8 r2, q0[15]
720; CHECK-NEXT:    vmov.16 q1[7], r2
721; CHECK-NEXT:    vcmp.i16 ne, q1, zr
722; CHECK-NEXT:    vldrb.u16 q1, [r0, #8]
723; CHECK-NEXT:    vpst
724; CHECK-NEXT:    vmlavt.u16 r2, q2, q1
725; CHECK-NEXT:    vmov.16 q1[0], r3
726; CHECK-NEXT:    vmov.u8 r3, q0[1]
727; CHECK-NEXT:    vmov.16 q1[1], r3
728; CHECK-NEXT:    vmov.u8 r3, q0[2]
729; CHECK-NEXT:    vmov.16 q1[2], r3
730; CHECK-NEXT:    vmov.u8 r3, q0[3]
731; CHECK-NEXT:    vmov.16 q1[3], r3
732; CHECK-NEXT:    vmov.u8 r3, q0[4]
733; CHECK-NEXT:    vmov.16 q1[4], r3
734; CHECK-NEXT:    vmov.u8 r3, q0[5]
735; CHECK-NEXT:    vmov.16 q1[5], r3
736; CHECK-NEXT:    vmov.u8 r3, q0[6]
737; CHECK-NEXT:    vmov.16 q1[6], r3
738; CHECK-NEXT:    vmov.u8 r3, q0[7]
739; CHECK-NEXT:    vmov.16 q1[7], r3
740; CHECK-NEXT:    vldrb.u16 q0, [r0]
741; CHECK-NEXT:    vcmp.i16 ne, q1, zr
742; CHECK-NEXT:    vldrb.s16 q1, [r1]
743; CHECK-NEXT:    vpst
744; CHECK-NEXT:    vmlavat.u16 r2, q1, q0
745; CHECK-NEXT:    sxth r0, r2
746; CHECK-NEXT:    add sp, #32
747; CHECK-NEXT:    bx lr
748entry:
749  %c = icmp eq <16 x i8> %b, zeroinitializer
750  %xx = sext <16 x i8> %x to <16 x i16>
751  %yy = zext <16 x i8> %y to <16 x i16>
752  %m = mul <16 x i16> %xx, %yy
753  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
754  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
755  ret i16 %z
756}
757
; Predicated multiply-reduce of <8 x i8> operands zero-extended to i16, with the i16 sum returned zeroext. The assertions expect mask and operands widened with vmovlb.u8, a vpt.i16 block predicating vmlavt.u16, then uxth on the result.
758define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
759; CHECK-LABEL: add_v8i8_v8i16_zext:
760; CHECK:       @ %bb.0: @ %entry
761; CHECK-NEXT:    vmovlb.u8 q2, q2
762; CHECK-NEXT:    vmovlb.u8 q1, q1
763; CHECK-NEXT:    vmovlb.u8 q0, q0
764; CHECK-NEXT:    vpt.i16 eq, q2, zr
765; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
766; CHECK-NEXT:    uxth r0, r0
767; CHECK-NEXT:    bx lr
768entry:
769  %c = icmp eq <8 x i8> %b, zeroinitializer
770  %xx = zext <8 x i8> %x to <8 x i16>
771  %yy = zext <8 x i8> %y to <8 x i16>
772  %m = mul <8 x i16> %xx, %yy
773  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
774  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
775  ret i16 %z
776}
777
; Predicated multiply-reduce of <8 x i8> operands sign-extended to i16, with the i16 sum returned signext. The assertions expect the mask widened with vmovlb.u8 and the operands with vmovlb.s8, a vpt.i16 block predicating vmlavt.u16, then sxth on the result.
778define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
779; CHECK-LABEL: add_v8i8_v8i16_sext:
780; CHECK:       @ %bb.0: @ %entry
781; CHECK-NEXT:    vmovlb.u8 q2, q2
782; CHECK-NEXT:    vmovlb.s8 q1, q1
783; CHECK-NEXT:    vmovlb.s8 q0, q0
784; CHECK-NEXT:    vpt.i16 eq, q2, zr
785; CHECK-NEXT:    vmlavt.u16 r0, q0, q1
786; CHECK-NEXT:    sxth r0, r0
787; CHECK-NEXT:    bx lr
788entry:
789  %c = icmp eq <8 x i8> %b, zeroinitializer
790  %xx = sext <8 x i8> %x to <8 x i16>
791  %yy = sext <8 x i8> %y to <8 x i16>
792  %m = mul <8 x i16> %xx, %yy
793  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
794  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
795  ret i16 %z
796}
797
; Predicated v16i8 multiply-reduce with no widening; the i8 result is returned zeroext. The assertions expect a vpt.i8 block predicating vmlavt.u8 followed by uxtb on the result.
798define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
799; CHECK-LABEL: add_v16i8_v16i8:
800; CHECK:       @ %bb.0: @ %entry
801; CHECK-NEXT:    vpt.i8 eq, q2, zr
802; CHECK-NEXT:    vmlavt.u8 r0, q0, q1
803; CHECK-NEXT:    uxtb r0, r0
804; CHECK-NEXT:    bx lr
805entry:
806  %c = icmp eq <16 x i8> %b, zeroinitializer
807  %m = mul <16 x i8> %x, %y
808  %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
809  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
810  ret i8 %z
811}
812
; Predicated multiply-add reduction with both i8 inputs zero-extended to i64
; before the multiply; 16 lanes are reduced to one i64 under the %b == 0 mask.
; Unlike e.g. add_v8i8_v8i64_zext, the expected output is not a single
; predicated reduction instruction. The CHECK lines show a scalarised sequence:
;   - the mask is materialised with vpsel (0xff / 0x0 bytes) and re-widened
;     through vcmp.i16 and vcmp.i32 steps,
;   - each pair of lanes is extracted with vmov.u8, masked with vand #0xff,
;     multiplied with umull and selected against zero with vpsel,
;   - the running 64-bit sum is carried in r12:lr via adds/adc.
; The two 16-byte stack slots (.pad #32) hold the spilled all-zeros and
; all-ones byte vectors that are reloaded when rebuilding the mask.
813define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
814; CHECK-LABEL: add_v16i8_v16i64_zext:
815; CHECK:       @ %bb.0: @ %entry
816; CHECK-NEXT:    .save {r7, lr}
817; CHECK-NEXT:    push {r7, lr}
818; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
819; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
820; CHECK-NEXT:    .pad #32
821; CHECK-NEXT:    sub sp, #32
822; CHECK-NEXT:    vmov q3, q0
823; CHECK-NEXT:    vmov.i8 q0, #0x0
824; CHECK-NEXT:    vcmp.i8 eq, q2, zr
825; CHECK-NEXT:    vmov.i8 q2, #0xff
826; CHECK-NEXT:    vpsel q6, q2, q0
827; CHECK-NEXT:    vmov q4, q0
828; CHECK-NEXT:    vmov.u8 r0, q6[0]
829; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
830; CHECK-NEXT:    vmov.16 q0[0], r0
831; CHECK-NEXT:    vmov.u8 r0, q6[1]
832; CHECK-NEXT:    vmov.16 q0[1], r0
833; CHECK-NEXT:    vmov.u8 r0, q6[2]
834; CHECK-NEXT:    vmov.16 q0[2], r0
835; CHECK-NEXT:    vmov.u8 r0, q6[3]
836; CHECK-NEXT:    vmov.16 q0[3], r0
837; CHECK-NEXT:    vmov.u8 r0, q6[4]
838; CHECK-NEXT:    vmov.16 q0[4], r0
839; CHECK-NEXT:    vmov.u8 r0, q6[5]
840; CHECK-NEXT:    vmov.16 q0[5], r0
841; CHECK-NEXT:    vmov.u8 r0, q6[6]
842; CHECK-NEXT:    vmov.16 q0[6], r0
843; CHECK-NEXT:    vmov.u8 r0, q6[7]
844; CHECK-NEXT:    vmov.16 q0[7], r0
845; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
846; CHECK-NEXT:    vcmp.i16 ne, q0, zr
847; CHECK-NEXT:    vmov.u8 r2, q3[0]
848; CHECK-NEXT:    vpsel q7, q2, q4
849; CHECK-NEXT:    vmov.u16 r0, q7[2]
850; CHECK-NEXT:    vmov.u16 r1, q7[0]
851; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
852; CHECK-NEXT:    vmov.u16 r0, q7[3]
853; CHECK-NEXT:    vmov.u16 r1, q7[1]
854; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
855; CHECK-NEXT:    vcmp.i32 ne, q0, zr
856; CHECK-NEXT:    vpsel q0, q2, q4
857; CHECK-NEXT:    vmov r0, r1, d0
858; CHECK-NEXT:    vmov q2[2], q2[0], r0, r1
859; CHECK-NEXT:    vmov q2[3], q2[1], r0, r1
860; CHECK-NEXT:    vmov.u8 r0, q1[1]
861; CHECK-NEXT:    vmov.u8 r1, q1[0]
862; CHECK-NEXT:    vcmp.i32 ne, q2, zr
863; CHECK-NEXT:    vmov q5[2], q5[0], r1, r0
864; CHECK-NEXT:    vmov.u8 r1, q3[1]
865; CHECK-NEXT:    vmov.i64 q2, #0xff
866; CHECK-NEXT:    vmov q4[2], q4[0], r2, r1
867; CHECK-NEXT:    vand q5, q5, q2
868; CHECK-NEXT:    vand q4, q4, q2
869; CHECK-NEXT:    vmov r0, s22
870; CHECK-NEXT:    vmov r1, s18
871; CHECK-NEXT:    vmov r2, s20
872; CHECK-NEXT:    vmov.i32 q5, #0x0
873; CHECK-NEXT:    vmov r3, s16
874; CHECK-NEXT:    umull r0, r1, r1, r0
875; CHECK-NEXT:    umull r2, r3, r3, r2
876; CHECK-NEXT:    vmov q4[2], q4[0], r2, r0
877; CHECK-NEXT:    vmov q4[3], q4[1], r3, r1
878; CHECK-NEXT:    vpsel q4, q4, q5
879; CHECK-NEXT:    vmov r0, r1, d9
880; CHECK-NEXT:    vmov r2, r3, d8
881; CHECK-NEXT:    adds.w r12, r2, r0
882; CHECK-NEXT:    vmov.u8 r0, q3[2]
883; CHECK-NEXT:    adc.w lr, r3, r1
884; CHECK-NEXT:    vmov r2, r3, d1
885; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
886; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
887; CHECK-NEXT:    vmov.u8 r2, q1[3]
888; CHECK-NEXT:    vmov.u8 r3, q1[2]
889; CHECK-NEXT:    vcmp.i32 ne, q0, zr
890; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
891; CHECK-NEXT:    vmov.u8 r3, q3[3]
892; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
893; CHECK-NEXT:    vand q0, q0, q2
894; CHECK-NEXT:    vand q4, q4, q2
895; CHECK-NEXT:    vmov r2, s2
896; CHECK-NEXT:    vmov r0, s18
897; CHECK-NEXT:    vmov r1, s16
898; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
899; CHECK-NEXT:    vmov r3, s0
900; CHECK-NEXT:    umull r0, r2, r0, r2
901; CHECK-NEXT:    umull r1, r3, r1, r3
902; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
903; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
904; CHECK-NEXT:    vpsel q0, q0, q5
905; CHECK-NEXT:    vmov r0, r1, d0
906; CHECK-NEXT:    vmov r2, r3, d1
907; CHECK-NEXT:    adds.w r0, r0, r12
908; CHECK-NEXT:    adc.w r1, r1, lr
909; CHECK-NEXT:    adds.w r12, r0, r2
910; CHECK-NEXT:    adc.w lr, r1, r3
911; CHECK-NEXT:    vmov.u16 r2, q7[6]
912; CHECK-NEXT:    vmov.u16 r3, q7[4]
913; CHECK-NEXT:    vmov.u8 r0, q3[4]
914; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
915; CHECK-NEXT:    vmov.u16 r2, q7[7]
916; CHECK-NEXT:    vmov.u16 r3, q7[5]
917; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
918; CHECK-NEXT:    vcmp.i32 ne, q0, zr
919; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
920; CHECK-NEXT:    vpsel q0, q0, q4
921; CHECK-NEXT:    vmov r2, r3, d0
922; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
923; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
924; CHECK-NEXT:    vmov.u8 r2, q1[5]
925; CHECK-NEXT:    vmov.u8 r3, q1[4]
926; CHECK-NEXT:    vcmp.i32 ne, q4, zr
927; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
928; CHECK-NEXT:    vmov.u8 r3, q3[5]
929; CHECK-NEXT:    vmov q7[2], q7[0], r0, r3
930; CHECK-NEXT:    vand q4, q4, q2
931; CHECK-NEXT:    vand q7, q7, q2
932; CHECK-NEXT:    vmov r2, s18
933; CHECK-NEXT:    vmov r0, s30
934; CHECK-NEXT:    vmov r1, s28
935; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
936; CHECK-NEXT:    vmov r3, s16
937; CHECK-NEXT:    umull r0, r2, r0, r2
938; CHECK-NEXT:    umull r1, r3, r1, r3
939; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
940; CHECK-NEXT:    vmov q4[3], q4[1], r3, r2
941; CHECK-NEXT:    vpsel q4, q4, q5
942; CHECK-NEXT:    vmov r0, r1, d8
943; CHECK-NEXT:    vmov r2, r3, d9
944; CHECK-NEXT:    adds.w r0, r0, r12
945; CHECK-NEXT:    adc.w r1, r1, lr
946; CHECK-NEXT:    adds.w r12, r0, r2
947; CHECK-NEXT:    adc.w lr, r1, r3
948; CHECK-NEXT:    vmov r2, r3, d1
949; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
950; CHECK-NEXT:    vmov.u8 r0, q3[6]
951; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
952; CHECK-NEXT:    vmov.u8 r2, q1[7]
953; CHECK-NEXT:    vmov.u8 r3, q1[6]
954; CHECK-NEXT:    vcmp.i32 ne, q0, zr
955; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
956; CHECK-NEXT:    vmov.u8 r3, q3[7]
957; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
958; CHECK-NEXT:    vand q0, q0, q2
959; CHECK-NEXT:    vand q4, q4, q2
960; CHECK-NEXT:    vmov r2, s2
961; CHECK-NEXT:    vmov r0, s18
962; CHECK-NEXT:    vmov r1, s16
963; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
964; CHECK-NEXT:    vmov r3, s0
965; CHECK-NEXT:    umull r0, r2, r0, r2
966; CHECK-NEXT:    umull r1, r3, r1, r3
967; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
968; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
969; CHECK-NEXT:    vpsel q0, q0, q5
970; CHECK-NEXT:    vmov r0, r1, d0
971; CHECK-NEXT:    vmov r2, r3, d1
972; CHECK-NEXT:    adds.w r0, r0, r12
973; CHECK-NEXT:    adc.w r1, r1, lr
974; CHECK-NEXT:    adds.w r12, r0, r2
975; CHECK-NEXT:    vmov.u8 r2, q6[8]
976; CHECK-NEXT:    adc.w lr, r1, r3
977; CHECK-NEXT:    vmov.16 q0[0], r2
978; CHECK-NEXT:    vmov.u8 r2, q6[9]
979; CHECK-NEXT:    vmov.16 q0[1], r2
980; CHECK-NEXT:    vmov.u8 r2, q6[10]
981; CHECK-NEXT:    vmov.16 q0[2], r2
982; CHECK-NEXT:    vmov.u8 r2, q6[11]
983; CHECK-NEXT:    vmov.16 q0[3], r2
984; CHECK-NEXT:    vmov.u8 r2, q6[12]
985; CHECK-NEXT:    vmov.16 q0[4], r2
986; CHECK-NEXT:    vmov.u8 r2, q6[13]
987; CHECK-NEXT:    vmov.16 q0[5], r2
988; CHECK-NEXT:    vmov.u8 r2, q6[14]
989; CHECK-NEXT:    vmov.16 q0[6], r2
990; CHECK-NEXT:    vmov.u8 r2, q6[15]
991; CHECK-NEXT:    vmov.16 q0[7], r2
992; CHECK-NEXT:    vmov.u8 r0, q3[8]
993; CHECK-NEXT:    vcmp.i16 ne, q0, zr
994; CHECK-NEXT:    vpsel q6, q7, q4
995; CHECK-NEXT:    vmov.u16 r2, q6[2]
996; CHECK-NEXT:    vmov.u16 r3, q6[0]
997; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
998; CHECK-NEXT:    vmov.u16 r2, q6[3]
999; CHECK-NEXT:    vmov.u16 r3, q6[1]
1000; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
1001; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1002; CHECK-NEXT:    vpsel q0, q7, q4
1003; CHECK-NEXT:    vmov r2, r3, d0
1004; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
1005; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
1006; CHECK-NEXT:    vmov.u8 r2, q1[9]
1007; CHECK-NEXT:    vmov.u8 r3, q1[8]
1008; CHECK-NEXT:    vcmp.i32 ne, q4, zr
1009; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
1010; CHECK-NEXT:    vmov.u8 r3, q3[9]
1011; CHECK-NEXT:    vmov q7[2], q7[0], r0, r3
1012; CHECK-NEXT:    vand q4, q4, q2
1013; CHECK-NEXT:    vand q7, q7, q2
1014; CHECK-NEXT:    vmov r2, s18
1015; CHECK-NEXT:    vmov r0, s30
1016; CHECK-NEXT:    vmov r3, s16
1017; CHECK-NEXT:    vmov r1, s28
1018; CHECK-NEXT:    umull r0, r2, r0, r2
1019; CHECK-NEXT:    umull r1, r3, r1, r3
1020; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
1021; CHECK-NEXT:    vmov q4[3], q4[1], r3, r2
1022; CHECK-NEXT:    vpsel q4, q4, q5
1023; CHECK-NEXT:    vmov r0, r1, d8
1024; CHECK-NEXT:    vmov r2, r3, d9
1025; CHECK-NEXT:    adds.w r0, r0, r12
1026; CHECK-NEXT:    adc.w r1, r1, lr
1027; CHECK-NEXT:    adds.w r12, r0, r2
1028; CHECK-NEXT:    adc.w lr, r1, r3
1029; CHECK-NEXT:    vmov r2, r3, d1
1030; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
1031; CHECK-NEXT:    vmov.u8 r0, q3[10]
1032; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
1033; CHECK-NEXT:    vmov.u8 r2, q1[11]
1034; CHECK-NEXT:    vmov.u8 r3, q1[10]
1035; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1036; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
1037; CHECK-NEXT:    vmov.u8 r3, q3[11]
1038; CHECK-NEXT:    vmov q4[2], q4[0], r0, r3
1039; CHECK-NEXT:    vand q0, q0, q2
1040; CHECK-NEXT:    vand q4, q4, q2
1041; CHECK-NEXT:    vmov r2, s2
1042; CHECK-NEXT:    vmov r0, s18
1043; CHECK-NEXT:    vmov r1, s16
1044; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
1045; CHECK-NEXT:    vmov r3, s0
1046; CHECK-NEXT:    umull r0, r2, r0, r2
1047; CHECK-NEXT:    umull r1, r3, r1, r3
1048; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
1049; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
1050; CHECK-NEXT:    vpsel q0, q0, q5
1051; CHECK-NEXT:    vmov r0, r1, d0
1052; CHECK-NEXT:    vmov r2, r3, d1
1053; CHECK-NEXT:    adds.w r0, r0, r12
1054; CHECK-NEXT:    adc.w r1, r1, lr
1055; CHECK-NEXT:    adds.w r12, r0, r2
1056; CHECK-NEXT:    adc.w lr, r1, r3
1057; CHECK-NEXT:    vmov.u16 r2, q6[6]
1058; CHECK-NEXT:    vmov.u16 r3, q6[4]
1059; CHECK-NEXT:    vmov.u8 r0, q3[12]
1060; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
1061; CHECK-NEXT:    vmov.u16 r2, q6[7]
1062; CHECK-NEXT:    vmov.u16 r3, q6[5]
1063; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
1064; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1065; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
1066; CHECK-NEXT:    vpsel q0, q0, q4
1067; CHECK-NEXT:    vmov r2, r3, d0
1068; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
1069; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
1070; CHECK-NEXT:    vmov.u8 r2, q1[13]
1071; CHECK-NEXT:    vmov.u8 r3, q1[12]
1072; CHECK-NEXT:    vcmp.i32 ne, q4, zr
1073; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
1074; CHECK-NEXT:    vmov.u8 r3, q3[13]
1075; CHECK-NEXT:    vmov q6[2], q6[0], r0, r3
1076; CHECK-NEXT:    vand q4, q4, q2
1077; CHECK-NEXT:    vand q6, q6, q2
1078; CHECK-NEXT:    vmov r2, s18
1079; CHECK-NEXT:    vmov r0, s26
1080; CHECK-NEXT:    vmov r3, s16
1081; CHECK-NEXT:    vmov r1, s24
1082; CHECK-NEXT:    umull r0, r2, r0, r2
1083; CHECK-NEXT:    umull r1, r3, r1, r3
1084; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
1085; CHECK-NEXT:    vmov q4[3], q4[1], r3, r2
1086; CHECK-NEXT:    vpsel q4, q4, q5
1087; CHECK-NEXT:    vmov r0, r1, d8
1088; CHECK-NEXT:    vmov r2, r3, d9
1089; CHECK-NEXT:    adds.w r0, r0, r12
1090; CHECK-NEXT:    adc.w r1, r1, lr
1091; CHECK-NEXT:    adds.w r12, r0, r2
1092; CHECK-NEXT:    adc.w lr, r1, r3
1093; CHECK-NEXT:    vmov r2, r3, d1
1094; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
1095; CHECK-NEXT:    vmov.u8 r0, q3[14]
1096; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
1097; CHECK-NEXT:    vmov.u8 r2, q1[15]
1098; CHECK-NEXT:    vmov.u8 r3, q1[14]
1099; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1100; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
1101; CHECK-NEXT:    vmov.u8 r3, q3[15]
1102; CHECK-NEXT:    vmov q1[2], q1[0], r0, r3
1103; CHECK-NEXT:    vand q0, q0, q2
1104; CHECK-NEXT:    vand q1, q1, q2
1105; CHECK-NEXT:    vmov r2, s2
1106; CHECK-NEXT:    vmov r0, s6
1107; CHECK-NEXT:    vmov r3, s0
1108; CHECK-NEXT:    vmov r1, s4
1109; CHECK-NEXT:    umull r0, r2, r0, r2
1110; CHECK-NEXT:    umull r1, r3, r1, r3
1111; CHECK-NEXT:    vmov q0[2], q0[0], r1, r0
1112; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
1113; CHECK-NEXT:    vpsel q0, q0, q5
1114; CHECK-NEXT:    vmov r0, r1, d0
1115; CHECK-NEXT:    vmov r2, r3, d1
1116; CHECK-NEXT:    adds.w r0, r0, r12
1117; CHECK-NEXT:    adc.w r1, r1, lr
1118; CHECK-NEXT:    adds r0, r0, r2
1119; CHECK-NEXT:    adcs r1, r3
1120; CHECK-NEXT:    add sp, #32
1121; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1122; CHECK-NEXT:    pop {r7, pc}
1123entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1124  %c = icmp eq <16 x i8> %b, zeroinitializer
; Both operands are widened i8 -> i64 (unsigned) before the multiply.
1125  %xx = zext <16 x i8> %x to <16 x i64>
1126  %yy = zext <16 x i8> %y to <16 x i64>
1127  %m = mul <16 x i64> %xx, %yy
1128  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
1129  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
1130  ret i64 %z
1131}
1132
; Signed counterpart of the v16i8 -> i64 predicated multiply-add reduction:
; both i8 inputs are sign-extended to i64 before the multiply and 16 lanes are
; reduced to one i64 under the %b == 0 mask.
; The expected output is again a scalarised sequence rather than a single
; predicated reduction instruction:
;   - the mask is materialised with vpsel and re-widened through vcmp.i16 and
;     vcmp.i32 steps,
;   - lanes are extracted already sign-extended with vmov.s8 and multiplied
;     with smull, then selected against zero with vpsel,
;   - the running 64-bit sum is carried in r12:lr via adds/adc.
; One 16-byte stack slot (.pad #16) holds the spilled all-zeros vector.
1133define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b) {
1134; CHECK-LABEL: add_v16i8_v16i64_sext:
1135; CHECK:       @ %bb.0: @ %entry
1136; CHECK-NEXT:    .save {r7, lr}
1137; CHECK-NEXT:    push {r7, lr}
1138; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
1139; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
1140; CHECK-NEXT:    .pad #16
1141; CHECK-NEXT:    sub sp, #16
1142; CHECK-NEXT:    vmov q3, q0
1143; CHECK-NEXT:    vcmp.i8 eq, q2, zr
1144; CHECK-NEXT:    vmov.i8 q0, #0x0
1145; CHECK-NEXT:    vmov.i8 q2, #0xff
1146; CHECK-NEXT:    vpsel q5, q2, q0
1147; CHECK-NEXT:    vmov.s8 r2, q1[0]
1148; CHECK-NEXT:    vmov.u8 r0, q5[0]
1149; CHECK-NEXT:    vmov.s8 r3, q3[0]
1150; CHECK-NEXT:    vmov.16 q4[0], r0
1151; CHECK-NEXT:    vmov.u8 r0, q5[1]
1152; CHECK-NEXT:    vmov.16 q4[1], r0
1153; CHECK-NEXT:    vmov.u8 r0, q5[2]
1154; CHECK-NEXT:    vmov.16 q4[2], r0
1155; CHECK-NEXT:    vmov.u8 r0, q5[3]
1156; CHECK-NEXT:    vmov.16 q4[3], r0
1157; CHECK-NEXT:    vmov.u8 r0, q5[4]
1158; CHECK-NEXT:    vmov.16 q4[4], r0
1159; CHECK-NEXT:    vmov.u8 r0, q5[5]
1160; CHECK-NEXT:    vmov.16 q4[5], r0
1161; CHECK-NEXT:    vmov.u8 r0, q5[6]
1162; CHECK-NEXT:    vmov.16 q4[6], r0
1163; CHECK-NEXT:    vmov.u8 r0, q5[7]
1164; CHECK-NEXT:    vmov.16 q4[7], r0
1165; CHECK-NEXT:    smull r2, r3, r3, r2
1166; CHECK-NEXT:    vcmp.i16 ne, q4, zr
1167; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
1168; CHECK-NEXT:    vpsel q6, q2, q0
1169; CHECK-NEXT:    vmov.u16 r0, q6[2]
1170; CHECK-NEXT:    vmov.u16 r1, q6[0]
1171; CHECK-NEXT:    vmov q4[2], q4[0], r1, r0
1172; CHECK-NEXT:    vmov.u16 r0, q6[3]
1173; CHECK-NEXT:    vmov.u16 r1, q6[1]
1174; CHECK-NEXT:    vmov q4[3], q4[1], r1, r0
1175; CHECK-NEXT:    vcmp.i32 ne, q4, zr
1176; CHECK-NEXT:    vpsel q7, q2, q0
1177; CHECK-NEXT:    vmov r0, r1, d14
1178; CHECK-NEXT:    vmov q4[2], q4[0], r0, r1
1179; CHECK-NEXT:    vmov q4[3], q4[1], r0, r1
1180; CHECK-NEXT:    vmov.s8 r0, q1[1]
1181; CHECK-NEXT:    vmov.s8 r1, q3[1]
1182; CHECK-NEXT:    vcmp.i32 ne, q4, zr
1183; CHECK-NEXT:    smull r0, r1, r1, r0
1184; CHECK-NEXT:    vmov.i32 q4, #0x0
1185; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
1186; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
1187; CHECK-NEXT:    vpsel q0, q0, q4
1188; CHECK-NEXT:    vmov r0, r1, d1
1189; CHECK-NEXT:    vmov r2, r3, d0
1190; CHECK-NEXT:    adds.w r12, r2, r0
1191; CHECK-NEXT:    vmov.s8 r0, q1[2]
1192; CHECK-NEXT:    adc.w lr, r3, r1
1193; CHECK-NEXT:    vmov r2, r3, d15
1194; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
1195; CHECK-NEXT:    vmov.s8 r1, q3[2]
1196; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
1197; CHECK-NEXT:    vmov.s8 r2, q1[3]
1198; CHECK-NEXT:    vmov.s8 r3, q3[3]
1199; CHECK-NEXT:    smull r0, r1, r1, r0
1200; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1201; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
1202; CHECK-NEXT:    smull r2, r3, r3, r2
1203; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
1204; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
1205; CHECK-NEXT:    vpsel q0, q0, q4
1206; CHECK-NEXT:    vmov r0, r1, d0
1207; CHECK-NEXT:    vmov r2, r3, d1
1208; CHECK-NEXT:    adds.w r0, r0, r12
1209; CHECK-NEXT:    adc.w r1, r1, lr
1210; CHECK-NEXT:    adds.w r12, r0, r2
1211; CHECK-NEXT:    adc.w lr, r1, r3
1212; CHECK-NEXT:    vmov.u16 r2, q6[6]
1213; CHECK-NEXT:    vmov.u16 r3, q6[4]
1214; CHECK-NEXT:    vmov.s8 r0, q1[4]
1215; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
1216; CHECK-NEXT:    vmov.u16 r2, q6[7]
1217; CHECK-NEXT:    vmov.u16 r3, q6[5]
1218; CHECK-NEXT:    vmov.s8 r1, q3[4]
1219; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
1220; CHECK-NEXT:    smull r0, r1, r1, r0
1221; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1222; CHECK-NEXT:    vpsel q6, q2, q7
1223; CHECK-NEXT:    vmov r2, r3, d12
1224; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
1225; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
1226; CHECK-NEXT:    vmov.s8 r2, q1[5]
1227; CHECK-NEXT:    vmov.s8 r3, q3[5]
1228; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1229; CHECK-NEXT:    smull r2, r3, r3, r2
1230; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
1231; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
1232; CHECK-NEXT:    vpsel q0, q0, q4
1233; CHECK-NEXT:    vmov r0, r1, d0
1234; CHECK-NEXT:    vmov r2, r3, d1
1235; CHECK-NEXT:    adds.w r0, r0, r12
1236; CHECK-NEXT:    adc.w r1, r1, lr
1237; CHECK-NEXT:    adds.w r12, r0, r2
1238; CHECK-NEXT:    adc.w lr, r1, r3
1239; CHECK-NEXT:    vmov r2, r3, d13
1240; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
1241; CHECK-NEXT:    vmov.s8 r0, q1[6]
1242; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
1243; CHECK-NEXT:    vmov.s8 r1, q3[6]
1244; CHECK-NEXT:    vmov.s8 r2, q1[7]
1245; CHECK-NEXT:    vmov.s8 r3, q3[7]
1246; CHECK-NEXT:    smull r2, r3, r3, r2
1247; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1248; CHECK-NEXT:    smull r0, r1, r1, r0
1249; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
1250; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
1251; CHECK-NEXT:    vpsel q0, q0, q4
1252; CHECK-NEXT:    vmov r0, r1, d0
1253; CHECK-NEXT:    vmov r2, r3, d1
1254; CHECK-NEXT:    adds.w r0, r0, r12
1255; CHECK-NEXT:    adc.w r1, r1, lr
1256; CHECK-NEXT:    adds.w r12, r0, r2
1257; CHECK-NEXT:    vmov.u8 r2, q5[8]
1258; CHECK-NEXT:    adc.w lr, r1, r3
1259; CHECK-NEXT:    vmov.16 q6[0], r2
1260; CHECK-NEXT:    vmov.u8 r2, q5[9]
1261; CHECK-NEXT:    vmov.16 q6[1], r2
1262; CHECK-NEXT:    vmov.u8 r2, q5[10]
1263; CHECK-NEXT:    vmov.16 q6[2], r2
1264; CHECK-NEXT:    vmov.u8 r2, q5[11]
1265; CHECK-NEXT:    vmov.16 q6[3], r2
1266; CHECK-NEXT:    vmov.u8 r2, q5[12]
1267; CHECK-NEXT:    vmov.16 q6[4], r2
1268; CHECK-NEXT:    vmov.u8 r2, q5[13]
1269; CHECK-NEXT:    vmov.16 q6[5], r2
1270; CHECK-NEXT:    vmov.u8 r2, q5[14]
1271; CHECK-NEXT:    vmov.16 q6[6], r2
1272; CHECK-NEXT:    vmov.u8 r2, q5[15]
1273; CHECK-NEXT:    vmov.16 q6[7], r2
1274; CHECK-NEXT:    vmov.s8 r0, q1[8]
1275; CHECK-NEXT:    vcmp.i16 ne, q6, zr
1276; CHECK-NEXT:    vmov.s8 r1, q3[8]
1277; CHECK-NEXT:    vpsel q5, q2, q7
1278; CHECK-NEXT:    smull r0, r1, r1, r0
1279; CHECK-NEXT:    vmov.u16 r2, q5[2]
1280; CHECK-NEXT:    vmov.u16 r3, q5[0]
1281; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
1282; CHECK-NEXT:    vmov.u16 r2, q5[3]
1283; CHECK-NEXT:    vmov.u16 r3, q5[1]
1284; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
1285; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1286; CHECK-NEXT:    vpsel q6, q2, q7
1287; CHECK-NEXT:    vmov r2, r3, d12
1288; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
1289; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
1290; CHECK-NEXT:    vmov.s8 r2, q1[9]
1291; CHECK-NEXT:    vmov.s8 r3, q3[9]
1292; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1293; CHECK-NEXT:    smull r2, r3, r3, r2
1294; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
1295; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
1296; CHECK-NEXT:    vpsel q0, q0, q4
1297; CHECK-NEXT:    vmov r0, r1, d0
1298; CHECK-NEXT:    vmov r2, r3, d1
1299; CHECK-NEXT:    adds.w r0, r0, r12
1300; CHECK-NEXT:    adc.w r1, r1, lr
1301; CHECK-NEXT:    adds.w r12, r0, r2
1302; CHECK-NEXT:    adc.w lr, r1, r3
1303; CHECK-NEXT:    vmov r2, r3, d13
1304; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
1305; CHECK-NEXT:    vmov.s8 r0, q1[10]
1306; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
1307; CHECK-NEXT:    vmov.s8 r1, q3[10]
1308; CHECK-NEXT:    vmov.s8 r2, q1[11]
1309; CHECK-NEXT:    vmov.s8 r3, q3[11]
1310; CHECK-NEXT:    smull r2, r3, r3, r2
1311; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1312; CHECK-NEXT:    smull r0, r1, r1, r0
1313; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
1314; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
1315; CHECK-NEXT:    vpsel q0, q0, q4
1316; CHECK-NEXT:    vmov r0, r1, d0
1317; CHECK-NEXT:    vmov r2, r3, d1
1318; CHECK-NEXT:    adds.w r0, r0, r12
1319; CHECK-NEXT:    adc.w r1, r1, lr
1320; CHECK-NEXT:    adds.w r12, r0, r2
1321; CHECK-NEXT:    adc.w lr, r1, r3
1322; CHECK-NEXT:    vmov.u16 r2, q5[6]
1323; CHECK-NEXT:    vmov.u16 r3, q5[4]
1324; CHECK-NEXT:    vmov.s8 r0, q1[12]
1325; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
1326; CHECK-NEXT:    vmov.u16 r2, q5[7]
1327; CHECK-NEXT:    vmov.u16 r3, q5[5]
1328; CHECK-NEXT:    vmov.s8 r1, q3[12]
1329; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
1330; CHECK-NEXT:    smull r0, r1, r1, r0
1331; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1332; CHECK-NEXT:    vpsel q2, q2, q7
1333; CHECK-NEXT:    vmov r2, r3, d4
1334; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
1335; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
1336; CHECK-NEXT:    vmov.s8 r2, q1[13]
1337; CHECK-NEXT:    vmov.s8 r3, q3[13]
1338; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1339; CHECK-NEXT:    smull r2, r3, r3, r2
1340; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
1341; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
1342; CHECK-NEXT:    vpsel q0, q0, q4
1343; CHECK-NEXT:    vmov r0, r1, d0
1344; CHECK-NEXT:    vmov r2, r3, d1
1345; CHECK-NEXT:    adds.w r0, r0, r12
1346; CHECK-NEXT:    adc.w r1, r1, lr
1347; CHECK-NEXT:    adds.w r12, r0, r2
1348; CHECK-NEXT:    adc.w lr, r1, r3
1349; CHECK-NEXT:    vmov r2, r3, d5
1350; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
1351; CHECK-NEXT:    vmov.s8 r0, q1[14]
1352; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
1353; CHECK-NEXT:    vmov.s8 r1, q3[14]
1354; CHECK-NEXT:    vmov.s8 r2, q1[15]
1355; CHECK-NEXT:    vmov.s8 r3, q3[15]
1356; CHECK-NEXT:    smull r2, r3, r3, r2
1357; CHECK-NEXT:    vcmp.i32 ne, q0, zr
1358; CHECK-NEXT:    smull r0, r1, r1, r0
1359; CHECK-NEXT:    vmov q0[2], q0[0], r0, r2
1360; CHECK-NEXT:    vmov q0[3], q0[1], r1, r3
1361; CHECK-NEXT:    vpsel q0, q0, q4
1362; CHECK-NEXT:    vmov r0, r1, d0
1363; CHECK-NEXT:    vmov r2, r3, d1
1364; CHECK-NEXT:    adds.w r0, r0, r12
1365; CHECK-NEXT:    adc.w r1, r1, lr
1366; CHECK-NEXT:    adds r0, r0, r2
1367; CHECK-NEXT:    adcs r1, r3
1368; CHECK-NEXT:    add sp, #16
1369; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
1370; CHECK-NEXT:    pop {r7, pc}
1371entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1372  %c = icmp eq <16 x i8> %b, zeroinitializer
; Both operands are widened i8 -> i64 (signed) before the multiply.
1373  %xx = sext <16 x i8> %x to <16 x i64>
1374  %yy = sext <16 x i8> %y to <16 x i64>
1375  %m = mul <16 x i64> %xx, %yy
1376  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
1377  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
1378  ret i64 %z
1379}
1380
; <8 x i8> inputs zero-extended to i64, multiplied, masked by %b == 0 and
; reduced to one i64.
; Expected code widens all three operands to 16-bit lanes with vmovlb.u8 and
; then uses a single VPT-predicated vmlalvt.u16 producing the result in r0:r1.
1381define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
1382; CHECK-LABEL: add_v8i8_v8i64_zext:
1383; CHECK:       @ %bb.0: @ %entry
1384; CHECK-NEXT:    vmovlb.u8 q2, q2
1385; CHECK-NEXT:    vmovlb.u8 q1, q1
1386; CHECK-NEXT:    vmovlb.u8 q0, q0
1387; CHECK-NEXT:    vpt.i16 eq, q2, zr
1388; CHECK-NEXT:    vmlalvt.u16 r0, r1, q0, q1
1389; CHECK-NEXT:    bx lr
1390entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1391  %c = icmp eq <8 x i8> %b, zeroinitializer
1392  %xx = zext <8 x i8> %x to <8 x i64>
1393  %yy = zext <8 x i8> %y to <8 x i64>
1394  %m = mul <8 x i64> %xx, %yy
1395  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1396  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1397  ret i64 %z
1398}
1399
; <8 x i8> inputs sign-extended to i64, multiplied, masked by %b == 0 and
; reduced to one i64.
; Expected code widens the data operands with vmovlb.s8 (the mask operand is
; still widened with vmovlb.u8) and uses a single VPT-predicated vmlalvt.s16
; producing the result in r0:r1.
1400define arm_aapcs_vfpcc i64 @add_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b) {
1401; CHECK-LABEL: add_v8i8_v8i64_sext:
1402; CHECK:       @ %bb.0: @ %entry
1403; CHECK-NEXT:    vmovlb.u8 q2, q2
1404; CHECK-NEXT:    vmovlb.s8 q1, q1
1405; CHECK-NEXT:    vmovlb.s8 q0, q0
1406; CHECK-NEXT:    vpt.i16 eq, q2, zr
1407; CHECK-NEXT:    vmlalvt.s16 r0, r1, q0, q1
1408; CHECK-NEXT:    bx lr
1409entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1410  %c = icmp eq <8 x i8> %b, zeroinitializer
1411  %xx = sext <8 x i8> %x to <8 x i64>
1412  %yy = sext <8 x i8> %y to <8 x i64>
1413  %m = mul <8 x i64> %xx, %yy
1414  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1415  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1416  ret i64 %z
1417}
1418
; <4 x i8> inputs zero-extended to i64, multiplied, masked by %b == 0 and
; reduced to one i64.
; Expected code clears the upper bits of each 32-bit lane of all three
; operands with vand #0xff, then uses a single VPT-predicated vmlalvt.u32
; producing the result in r0:r1.
1419define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
1420; CHECK-LABEL: add_v4i8_v4i64_zext:
1421; CHECK:       @ %bb.0: @ %entry
1422; CHECK-NEXT:    vmov.i32 q3, #0xff
1423; CHECK-NEXT:    vand q2, q2, q3
1424; CHECK-NEXT:    vand q1, q1, q3
1425; CHECK-NEXT:    vand q0, q0, q3
1426; CHECK-NEXT:    vpt.i32 eq, q2, zr
1427; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
1428; CHECK-NEXT:    bx lr
1429entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1430  %c = icmp eq <4 x i8> %b, zeroinitializer
1431  %xx = zext <4 x i8> %x to <4 x i64>
1432  %yy = zext <4 x i8> %y to <4 x i64>
1433  %m = mul <4 x i64> %xx, %yy
1434  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1435  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1436  ret i64 %z
1437}
1438
; <4 x i8> inputs sign-extended to i64, multiplied, masked by %b == 0 and
; reduced to one i64.
; Expected code sign-extends the data operands in two steps (vmovlb.s8 then
; vmovlb.s16), masks the predicate operand with vand #0xff, and uses a single
; VPT-predicated vmlalvt.s32 producing the result in r0:r1.
1439define arm_aapcs_vfpcc i64 @add_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b) {
1440; CHECK-LABEL: add_v4i8_v4i64_sext:
1441; CHECK:       @ %bb.0: @ %entry
1442; CHECK-NEXT:    vmov.i32 q3, #0xff
1443; CHECK-NEXT:    vmovlb.s8 q1, q1
1444; CHECK-NEXT:    vmovlb.s8 q0, q0
1445; CHECK-NEXT:    vand q2, q2, q3
1446; CHECK-NEXT:    vmovlb.s16 q1, q1
1447; CHECK-NEXT:    vmovlb.s16 q0, q0
1448; CHECK-NEXT:    vpt.i32 eq, q2, zr
1449; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
1450; CHECK-NEXT:    bx lr
1451entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1452  %c = icmp eq <4 x i8> %b, zeroinitializer
1453  %xx = sext <4 x i8> %x to <4 x i64>
1454  %yy = sext <4 x i8> %y to <4 x i64>
1455  %m = mul <4 x i64> %xx, %yy
1456  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1457  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1458  ret i64 %z
1459}
1460
; Mixed-width operands: %x is <4 x i8> and %y is <4 x i16>, both zero-extended
; to i64, multiplied, masked by %b == 0 and reduced to one i64.
; Expected code widens each operand separately (vand #0xff for %x and %b,
; vmovlb.u16 for %y) and uses a single VPT-predicated vmlalvt.u32.
1461define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_zext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1462; CHECK-LABEL: add_v4i8i16_v4i64_zext:
1463; CHECK:       @ %bb.0: @ %entry
1464; CHECK-NEXT:    vmov.i32 q3, #0xff
1465; CHECK-NEXT:    vmovlb.u16 q1, q1
1466; CHECK-NEXT:    vand q2, q2, q3
1467; CHECK-NEXT:    vand q0, q0, q3
1468; CHECK-NEXT:    vpt.i32 eq, q2, zr
1469; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
1470; CHECK-NEXT:    bx lr
1471entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1472  %c = icmp eq <4 x i8> %b, zeroinitializer
; The two multiplicands start at different element widths (i8 vs i16).
1473  %xx = zext <4 x i8> %x to <4 x i64>
1474  %yy = zext <4 x i16> %y to <4 x i64>
1475  %m = mul <4 x i64> %xx, %yy
1476  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1477  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1478  ret i64 %z
1479}
1480
; Mixed-width operands: %x is <4 x i8> and %y is <4 x i16>, both sign-extended
; to i64, multiplied, masked by %b == 0 and reduced to one i64.
; Expected code sign-extends %x in two steps (vmovlb.s8, vmovlb.s16) and %y in
; one (vmovlb.s16), masks %b with vand #0xff, and uses a single VPT-predicated
; vmlalvt.s32.
1481define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1482; CHECK-LABEL: add_v4i8i16_v4i64_sext:
1483; CHECK:       @ %bb.0: @ %entry
1484; CHECK-NEXT:    vmov.i32 q3, #0xff
1485; CHECK-NEXT:    vmovlb.s8 q0, q0
1486; CHECK-NEXT:    vand q2, q2, q3
1487; CHECK-NEXT:    vmovlb.s16 q1, q1
1488; CHECK-NEXT:    vmovlb.s16 q0, q0
1489; CHECK-NEXT:    vpt.i32 eq, q2, zr
1490; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
1491; CHECK-NEXT:    bx lr
1492entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1493  %c = icmp eq <4 x i8> %b, zeroinitializer
; The two multiplicands start at different element widths (i8 vs i16).
1494  %xx = sext <4 x i8> %x to <4 x i64>
1495  %yy = sext <4 x i16> %y to <4 x i64>
1496  %m = mul <4 x i64> %xx, %yy
1497  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1498  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1499  ret i64 %z
1500}
1501
; Two-stage extension: %x (<4 x i8>) and %y (<4 x i16>) are zero-extended to
; i32 and multiplied at i32; the product is then zero-extended to i64 before
; the %b == 0 select and the i64 add reduction.
; The CHECK lines are the same sequence as for add_v4i8i16_v4i64_zext, i.e. a
; single VPT-predicated vmlalvt.u32.
1502define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_zext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1503; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_zext:
1504; CHECK:       @ %bb.0: @ %entry
1505; CHECK-NEXT:    vmov.i32 q3, #0xff
1506; CHECK-NEXT:    vmovlb.u16 q1, q1
1507; CHECK-NEXT:    vand q2, q2, q3
1508; CHECK-NEXT:    vand q0, q0, q3
1509; CHECK-NEXT:    vpt.i32 eq, q2, zr
1510; CHECK-NEXT:    vmlalvt.u32 r0, r1, q0, q1
1511; CHECK-NEXT:    bx lr
1512entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1513  %c = icmp eq <4 x i8> %b, zeroinitializer
1514  %xx = zext <4 x i8> %x to <4 x i32>
1515  %yy = zext <4 x i16> %y to <4 x i32>
; Multiply happens at i32; only the product is widened to i64.
1516  %mm = mul <4 x i32> %xx, %yy
1517  %m = zext <4 x i32> %mm to <4 x i64>
1518  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1519  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1520  ret i64 %z
1521}
1522
; Two-stage extension: %x (<4 x i8>) and %y (<4 x i16>) are sign-extended to
; i32 and multiplied at i32; the product is then sign-extended to i64 before
; the %b == 0 select and the i64 add reduction.
; The CHECK lines are the same sequence as for add_v4i8i16_v4i64_sext, i.e. a
; single VPT-predicated vmlalvt.s32.
1523define arm_aapcs_vfpcc i64 @add_v4i8i16_v4i32_v4i64_sext(<4 x i8> %x, <4 x i16> %y, <4 x i8> %b) {
1524; CHECK-LABEL: add_v4i8i16_v4i32_v4i64_sext:
1525; CHECK:       @ %bb.0: @ %entry
1526; CHECK-NEXT:    vmov.i32 q3, #0xff
1527; CHECK-NEXT:    vmovlb.s8 q0, q0
1528; CHECK-NEXT:    vand q2, q2, q3
1529; CHECK-NEXT:    vmovlb.s16 q1, q1
1530; CHECK-NEXT:    vmovlb.s16 q0, q0
1531; CHECK-NEXT:    vpt.i32 eq, q2, zr
1532; CHECK-NEXT:    vmlalvt.s32 r0, r1, q0, q1
1533; CHECK-NEXT:    bx lr
1534entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1535  %c = icmp eq <4 x i8> %b, zeroinitializer
1536  %xx = sext <4 x i8> %x to <4 x i32>
1537  %yy = sext <4 x i16> %y to <4 x i32>
; Multiply happens at i32; only the product is widened to i64.
1538  %mm = mul <4 x i32> %xx, %yy
1539  %m = sext <4 x i32> %mm to <4 x i64>
1540  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1541  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1542  ret i64 %z
1543}
1544
; Two-lane case: <2 x i8> inputs zero-extended to i64, multiplied, masked by
; %b == 0 and reduced to one i64.
; Expected code is scalarised: operands are masked with vand (vmov.i64 #0xff),
; the two products are formed with umull, and the predicate is assembled in r1
; from two csetm results inserted with bfi (8 bits each, at bit 0 and bit 8)
; and written to p0 with vmsr. A vpsel against zero applies the mask and the
; two 64-bit lanes are summed with adds/adcs.
1545define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
1546; CHECK-LABEL: add_v2i8_v2i64_zext:
1547; CHECK:       @ %bb.0: @ %entry
1548; CHECK-NEXT:    vmov.i64 q3, #0xff
1549; CHECK-NEXT:    vand q1, q1, q3
1550; CHECK-NEXT:    vand q0, q0, q3
1551; CHECK-NEXT:    vmov r0, s6
1552; CHECK-NEXT:    vmov r1, s2
1553; CHECK-NEXT:    vmov r2, s4
1554; CHECK-NEXT:    vand q1, q2, q3
1555; CHECK-NEXT:    vmov r3, s0
1556; CHECK-NEXT:    umull r0, r1, r1, r0
1557; CHECK-NEXT:    umull r2, r3, r3, r2
1558; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
1559; CHECK-NEXT:    vmov r0, s4
1560; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
1561; CHECK-NEXT:    movs r1, #0
1562; CHECK-NEXT:    cmp r0, #0
1563; CHECK-NEXT:    csetm r0, eq
1564; CHECK-NEXT:    bfi r1, r0, #0, #8
1565; CHECK-NEXT:    vmov r0, s6
1566; CHECK-NEXT:    vmov.i32 q1, #0x0
1567; CHECK-NEXT:    cmp r0, #0
1568; CHECK-NEXT:    csetm r0, eq
1569; CHECK-NEXT:    bfi r1, r0, #8, #8
1570; CHECK-NEXT:    vmsr p0, r1
1571; CHECK-NEXT:    vpsel q0, q0, q1
1572; CHECK-NEXT:    vmov r0, r1, d1
1573; CHECK-NEXT:    vmov r2, r3, d0
1574; CHECK-NEXT:    adds r0, r0, r2
1575; CHECK-NEXT:    adcs r1, r3
1576; CHECK-NEXT:    bx lr
1577entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1578  %c = icmp eq <2 x i8> %b, zeroinitializer
1579  %xx = zext <2 x i8> %x to <2 x i64>
1580  %yy = zext <2 x i8> %y to <2 x i64>
1581  %m = mul <2 x i64> %xx, %yy
1582  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1583  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1584  ret i64 %z
1585}
1586
; Two-lane case: <2 x i8> inputs sign-extended to i64, multiplied, masked by
; %b == 0 and reduced to one i64.
; Expected code is scalarised: each lane is moved to a core register,
; sign-extended with sxtb and multiplied with smull. The predicate is
; assembled in r1 from two csetm results inserted with bfi (8 bits each, at
; bit 0 and bit 8) and written to p0 with vmsr. A vpsel against zero applies
; the mask and the two 64-bit lanes are summed with adds/adcs.
1587define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b) {
1588; CHECK-LABEL: add_v2i8_v2i64_sext:
1589; CHECK:       @ %bb.0: @ %entry
1590; CHECK-NEXT:    vmov.i32 q3, #0xff
1591; CHECK-NEXT:    movs r1, #0
1592; CHECK-NEXT:    vand q2, q2, q3
1593; CHECK-NEXT:    vmov r2, s4
1594; CHECK-NEXT:    vmov r0, s8
1595; CHECK-NEXT:    vmov r3, s0
1596; CHECK-NEXT:    cmp r0, #0
1597; CHECK-NEXT:    sxtb r2, r2
1598; CHECK-NEXT:    csetm r0, eq
1599; CHECK-NEXT:    bfi r1, r0, #0, #8
1600; CHECK-NEXT:    vmov r0, s10
1601; CHECK-NEXT:    sxtb r3, r3
1602; CHECK-NEXT:    smull r2, r3, r3, r2
1603; CHECK-NEXT:    cmp r0, #0
1604; CHECK-NEXT:    csetm r0, eq
1605; CHECK-NEXT:    bfi r1, r0, #8, #8
1606; CHECK-NEXT:    vmov r0, s6
1607; CHECK-NEXT:    vmsr p0, r1
1608; CHECK-NEXT:    vmov r1, s2
1609; CHECK-NEXT:    vmov.i32 q1, #0x0
1610; CHECK-NEXT:    sxtb r0, r0
1611; CHECK-NEXT:    sxtb r1, r1
1612; CHECK-NEXT:    smull r0, r1, r1, r0
1613; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
1614; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
1615; CHECK-NEXT:    vpsel q0, q0, q1
1616; CHECK-NEXT:    vmov r0, r1, d1
1617; CHECK-NEXT:    vmov r2, r3, d0
1618; CHECK-NEXT:    adds r0, r0, r2
1619; CHECK-NEXT:    adcs r1, r3
1620; CHECK-NEXT:    bx lr
1621entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1622  %c = icmp eq <2 x i8> %b, zeroinitializer
1623  %xx = sext <2 x i8> %x to <2 x i64>
1624  %yy = sext <2 x i8> %y to <2 x i64>
1625  %m = mul <2 x i64> %xx, %yy
1626  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1627  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1628  ret i64 %z
1629}
1630
; Native <2 x i64> case with no extension: %x * %y masked by %b == 0 and
; reduced to one i64.
; Expected code is scalarised: each 64-bit product is built from one umull
; plus two mla cross terms. The per-lane zero test ORs the two 32-bit halves
; of %b (orrs) before csetm; the predicate is assembled in r1 with bfi (8 bits
; per lane) and written to p0 with vmsr. A vpsel against zero applies the mask
; and the two lanes are summed with adds/adcs. r4-r9 and lr are saved and
; restored around the body.
1631define arm_aapcs_vfpcc i64 @add_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b) {
1632; CHECK-LABEL: add_v2i64_v2i64:
1633; CHECK:       @ %bb.0: @ %entry
1634; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, lr}
1635; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, lr}
1636; CHECK-NEXT:    vmov r0, r12, d3
1637; CHECK-NEXT:    vmov r2, lr, d1
1638; CHECK-NEXT:    vmov r4, r9, d2
1639; CHECK-NEXT:    vmov.i32 q1, #0x0
1640; CHECK-NEXT:    vmov r6, r7, d0
1641; CHECK-NEXT:    umull r1, r8, r2, r0
1642; CHECK-NEXT:    umull r3, r5, r6, r4
1643; CHECK-NEXT:    vmov q0[2], q0[0], r3, r1
1644; CHECK-NEXT:    mla r1, r2, r12, r8
1645; CHECK-NEXT:    mla r0, lr, r0, r1
1646; CHECK-NEXT:    mla r1, r6, r9, r5
1647; CHECK-NEXT:    mla r1, r7, r4, r1
1648; CHECK-NEXT:    vmov q0[3], q0[1], r1, r0
1649; CHECK-NEXT:    vmov r0, r1, d4
1650; CHECK-NEXT:    orrs r0, r1
1651; CHECK-NEXT:    mov.w r1, #0
1652; CHECK-NEXT:    csetm r0, eq
1653; CHECK-NEXT:    bfi r1, r0, #0, #8
1654; CHECK-NEXT:    vmov r0, r2, d5
1655; CHECK-NEXT:    orrs r0, r2
1656; CHECK-NEXT:    csetm r0, eq
1657; CHECK-NEXT:    bfi r1, r0, #8, #8
1658; CHECK-NEXT:    vmsr p0, r1
1659; CHECK-NEXT:    vpsel q0, q0, q1
1660; CHECK-NEXT:    vmov r0, r1, d1
1661; CHECK-NEXT:    vmov r2, r3, d0
1662; CHECK-NEXT:    adds r0, r0, r2
1663; CHECK-NEXT:    adcs r1, r3
1664; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, pc}
1665entry:
; %c is the lane mask (true where %b is zero); masked-off lanes contribute 0.
1666  %c = icmp eq <2 x i64> %b, zeroinitializer
1667  %m = mul <2 x i64> %x, %y
1668  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1669  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1670  ret i64 %z
1671}
1672
; Predicated <4 x i32> multiply-add reduction with an i32 accumulator: the
; product %x * %y is kept in lanes where %b == 0 (zero elsewhere), the lanes
; are summed, and %a is added to the sum.
; The expected assembly folds the select, the reduction and the scalar add
; into one predicated accumulating instruction: vpt.i32 eq + vmlavat.u32 on r0.
1673define arm_aapcs_vfpcc i32 @add_v4i32_v4i32_acc(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i32 %a) {
1674; CHECK-LABEL: add_v4i32_v4i32_acc:
1675; CHECK:       @ %bb.0: @ %entry
1676; CHECK-NEXT:    vpt.i32 eq, q2, zr
1677; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
1678; CHECK-NEXT:    bx lr
1679entry:
1680  %c = icmp eq <4 x i32> %b, zeroinitializer
1681  %m = mul <4 x i32> %x, %y
1682  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1683  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1684  %r = add i32 %z, %a
1685  ret i32 %r
1686}
1687
; Predicated multiply-add reduction where both <4 x i32> inputs are
; zero-extended to i64 before the multiply; lanes with %b != 0 contribute
; zero, and the i64 accumulator %a is added to the reduced sum.
; The expected assembly is a single unsigned long accumulating reduction:
; vpt.i32 eq + vmlalvat.u32 on the r0/r1 pair.
1688define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
1689; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
1690; CHECK:       @ %bb.0: @ %entry
1691; CHECK-NEXT:    vpt.i32 eq, q2, zr
1692; CHECK-NEXT:    vmlalvat.u32 r0, r1, q0, q1
1693; CHECK-NEXT:    bx lr
1694entry:
1695  %c = icmp eq <4 x i32> %b, zeroinitializer
1696  %xx = zext <4 x i32> %x to <4 x i64>
1697  %yy = zext <4 x i32> %y to <4 x i64>
1698  %m = mul <4 x i64> %xx, %yy
1699  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1700  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1701  %r = add i64 %z, %a
1702  ret i64 %r
1703}
1704
; Signed counterpart of the v4i32 -> v4i64 accumulating test: both inputs are
; sign-extended to i64 before the multiply, lanes with %b != 0 contribute
; zero, and the i64 accumulator %a is added to the reduced sum.
; The expected assembly is vpt.i32 eq + vmlalvat.s32 on the r0/r1 pair.
1705define arm_aapcs_vfpcc i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, <4 x i32> %y, <4 x i32> %b, i64 %a) {
1706; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
1707; CHECK:       @ %bb.0: @ %entry
1708; CHECK-NEXT:    vpt.i32 eq, q2, zr
1709; CHECK-NEXT:    vmlalvat.s32 r0, r1, q0, q1
1710; CHECK-NEXT:    bx lr
1711entry:
1712  %c = icmp eq <4 x i32> %b, zeroinitializer
1713  %xx = sext <4 x i32> %x to <4 x i64>
1714  %yy = sext <4 x i32> %y to <4 x i64>
1715  %m = mul <4 x i64> %xx, %yy
1716  %s = select <4 x i1> %c, <4 x i64> %m, <4 x i64> zeroinitializer
1717  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %s)
1718  %r = add i64 %z, %a
1719  ret i64 %r
1720}
1721
; Two-lane variant: <2 x i32> inputs are zero-extended to i64, multiplied,
; selected on %b == 0, reduced, and added to the i64 accumulator %a.
; The expected assembly does not use a single accumulating reduction here:
; vmullb.u32 forms the widened products, the predicate is built in r3 with
; cmp/csetm/bfi and written to p0 with vmsr, vpsel applies the select, and
; two adds/adc pairs sum the lanes and then the accumulator in r0/r1.
1722define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
1723; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
1724; CHECK:       @ %bb.0: @ %entry
1725; CHECK-NEXT:    .save {r7, lr}
1726; CHECK-NEXT:    push {r7, lr}
1727; CHECK-NEXT:    vmov r2, s8
1728; CHECK-NEXT:    movs r3, #0
1729; CHECK-NEXT:    vmullb.u32 q3, q0, q1
1730; CHECK-NEXT:    vmov.i32 q0, #0x0
1731; CHECK-NEXT:    cmp r2, #0
1732; CHECK-NEXT:    csetm r2, eq
1733; CHECK-NEXT:    bfi r3, r2, #0, #8
1734; CHECK-NEXT:    vmov r2, s10
1735; CHECK-NEXT:    cmp r2, #0
1736; CHECK-NEXT:    csetm r2, eq
1737; CHECK-NEXT:    bfi r3, r2, #8, #8
1738; CHECK-NEXT:    vmsr p0, r3
1739; CHECK-NEXT:    vpsel q0, q3, q0
1740; CHECK-NEXT:    vmov lr, r12, d1
1741; CHECK-NEXT:    vmov r3, r2, d0
1742; CHECK-NEXT:    adds.w r3, r3, lr
1743; CHECK-NEXT:    adc.w r2, r2, r12
1744; CHECK-NEXT:    adds r0, r0, r3
1745; CHECK-NEXT:    adcs r1, r2
1746; CHECK-NEXT:    pop {r7, pc}
1747entry:
1748  %c = icmp eq <2 x i32> %b, zeroinitializer
1749  %xx = zext <2 x i32> %x to <2 x i64>
1750  %yy = zext <2 x i32> %y to <2 x i64>
1751  %m = mul <2 x i64> %xx, %yy
1752  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1753  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1754  %r = add i64 %z, %a
1755  ret i64 %r
1756}
1757
; Signed two-lane variant: <2 x i32> inputs are sign-extended to i64,
; multiplied, selected on %b == 0, reduced, and added to the i64 accumulator.
; The expected assembly mirrors the zero-extending version except that the
; widened products come from vmullb.s32; the predicate is still built with
; cmp/csetm/bfi + vmsr p0, followed by vpsel and two adds/adc pairs.
1758define arm_aapcs_vfpcc i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, <2 x i32> %y, <2 x i32> %b, i64 %a) {
1759; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
1760; CHECK:       @ %bb.0: @ %entry
1761; CHECK-NEXT:    .save {r7, lr}
1762; CHECK-NEXT:    push {r7, lr}
1763; CHECK-NEXT:    vmov r2, s8
1764; CHECK-NEXT:    movs r3, #0
1765; CHECK-NEXT:    vmullb.s32 q3, q0, q1
1766; CHECK-NEXT:    vmov.i32 q0, #0x0
1767; CHECK-NEXT:    cmp r2, #0
1768; CHECK-NEXT:    csetm r2, eq
1769; CHECK-NEXT:    bfi r3, r2, #0, #8
1770; CHECK-NEXT:    vmov r2, s10
1771; CHECK-NEXT:    cmp r2, #0
1772; CHECK-NEXT:    csetm r2, eq
1773; CHECK-NEXT:    bfi r3, r2, #8, #8
1774; CHECK-NEXT:    vmsr p0, r3
1775; CHECK-NEXT:    vpsel q0, q3, q0
1776; CHECK-NEXT:    vmov lr, r12, d1
1777; CHECK-NEXT:    vmov r3, r2, d0
1778; CHECK-NEXT:    adds.w r3, r3, lr
1779; CHECK-NEXT:    adc.w r2, r2, r12
1780; CHECK-NEXT:    adds r0, r0, r3
1781; CHECK-NEXT:    adcs r1, r2
1782; CHECK-NEXT:    pop {r7, pc}
1783entry:
1784  %c = icmp eq <2 x i32> %b, zeroinitializer
1785  %xx = sext <2 x i32> %x to <2 x i64>
1786  %yy = sext <2 x i32> %y to <2 x i64>
1787  %m = mul <2 x i64> %xx, %yy
1788  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
1789  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
1790  %r = add i64 %z, %a
1791  ret i64 %r
1792}
1793
; Predicated multiply-add reduction of <8 x i16> inputs zero-extended to i32,
; selected on %b == 0 and added to the i32 accumulator %a.
; The expected assembly is vpt.i16 eq + vmlavat.u16 on r0, with no separate
; extension instructions.
1794define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
1795; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
1796; CHECK:       @ %bb.0: @ %entry
1797; CHECK-NEXT:    vpt.i16 eq, q2, zr
1798; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
1799; CHECK-NEXT:    bx lr
1800entry:
1801  %c = icmp eq <8 x i16> %b, zeroinitializer
1802  %xx = zext <8 x i16> %x to <8 x i32>
1803  %yy = zext <8 x i16> %y to <8 x i32>
1804  %m = mul <8 x i32> %xx, %yy
1805  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
1806  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
1807  %r = add i32 %z, %a
1808  ret i32 %r
1809}
1810
; Predicated multiply-add reduction of <8 x i16> inputs sign-extended to i32,
; selected on %b == 0 and added to the i32 accumulator %a.
; The expected assembly is vpt.i16 eq + vmlavat.s16 on r0, with no separate
; extension instructions.
1811define arm_aapcs_vfpcc i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i32 %a) {
1812; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
1813; CHECK:       @ %bb.0: @ %entry
1814; CHECK-NEXT:    vpt.i16 eq, q2, zr
1815; CHECK-NEXT:    vmlavat.s16 r0, q0, q1
1816; CHECK-NEXT:    bx lr
1817entry:
1818  %c = icmp eq <8 x i16> %b, zeroinitializer
1819  %xx = sext <8 x i16> %x to <8 x i32>
1820  %yy = sext <8 x i16> %y to <8 x i32>
1821  %m = mul <8 x i32> %xx, %yy
1822  %s = select <8 x i1> %c, <8 x i32> %m, <8 x i32> zeroinitializer
1823  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %s)
1824  %r = add i32 %z, %a
1825  ret i32 %r
1826}
1827
; Four-lane i16 variant: <4 x i16> inputs are zero-extended to i32,
; multiplied, selected on %b == 0, reduced, and added to the i32 accumulator.
; The expected assembly first widens q2, q1 and q0 with vmovlb.u16, then uses
; vpt.i32 eq + vmlavat.u32 on r0.
1828define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
1829; CHECK-LABEL: add_v4i16_v4i32_acc_zext:
1830; CHECK:       @ %bb.0: @ %entry
1831; CHECK-NEXT:    vmovlb.u16 q2, q2
1832; CHECK-NEXT:    vmovlb.u16 q1, q1
1833; CHECK-NEXT:    vmovlb.u16 q0, q0
1834; CHECK-NEXT:    vpt.i32 eq, q2, zr
1835; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
1836; CHECK-NEXT:    bx lr
1837entry:
1838  %c = icmp eq <4 x i16> %b, zeroinitializer
1839  %xx = zext <4 x i16> %x to <4 x i32>
1840  %yy = zext <4 x i16> %y to <4 x i32>
1841  %m = mul <4 x i32> %xx, %yy
1842  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1843  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1844  %r = add i32 %z, %a
1845  ret i32 %r
1846}
1847
; Signed four-lane i16 variant: <4 x i16> inputs are sign-extended to i32,
; multiplied, selected on %b == 0, reduced, and added to the i32 accumulator.
; The expected assembly widens q1 and q0 with vmovlb.s16 but q2 (the register
; compared against zero by vpt) with vmovlb.u16, then uses vpt.i32 eq +
; vmlavat.u32 on r0; the reduction itself is the .u32 form because the
; operands are already full 32-bit lanes at that point.
1848define arm_aapcs_vfpcc i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, <4 x i16> %y, <4 x i16> %b, i32 %a) {
1849; CHECK-LABEL: add_v4i16_v4i32_acc_sext:
1850; CHECK:       @ %bb.0: @ %entry
1851; CHECK-NEXT:    vmovlb.u16 q2, q2
1852; CHECK-NEXT:    vmovlb.s16 q1, q1
1853; CHECK-NEXT:    vmovlb.s16 q0, q0
1854; CHECK-NEXT:    vpt.i32 eq, q2, zr
1855; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
1856; CHECK-NEXT:    bx lr
1857entry:
1858  %c = icmp eq <4 x i16> %b, zeroinitializer
1859  %xx = sext <4 x i16> %x to <4 x i32>
1860  %yy = sext <4 x i16> %y to <4 x i32>
1861  %m = mul <4 x i32> %xx, %yy
1862  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
1863  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
1864  %r = add i32 %z, %a
1865  ret i32 %r
1866}
1867
; Non-widening <8 x i16> predicated multiply-add reduction with an i16
; accumulator %a and a zeroext i16 return value.
; The expected assembly is vpt.i16 eq + vmlavat.u16 on r0, followed by uxth
; to zero-extend the 16-bit result for the return.
1868define arm_aapcs_vfpcc zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i16 %a) {
1869; CHECK-LABEL: add_v8i16_v8i16_acc:
1870; CHECK:       @ %bb.0: @ %entry
1871; CHECK-NEXT:    vpt.i16 eq, q2, zr
1872; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
1873; CHECK-NEXT:    uxth r0, r0
1874; CHECK-NEXT:    bx lr
1875entry:
1876  %c = icmp eq <8 x i16> %b, zeroinitializer
1877  %m = mul <8 x i16> %x, %y
1878  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
1879  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
1880  %r = add i16 %z, %a
1881  ret i16 %r
1882}
1883
; Predicated multiply-add reduction of <8 x i16> inputs zero-extended
; directly to i64, selected on %b == 0 and added to the i64 accumulator %a.
; The expected assembly is vpt.i16 eq + vmlalvat.u16 on the r0/r1 pair.
1884define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1885; CHECK-LABEL: add_v8i16_v8i64_acc_zext:
1886; CHECK:       @ %bb.0: @ %entry
1887; CHECK-NEXT:    vpt.i16 eq, q2, zr
1888; CHECK-NEXT:    vmlalvat.u16 r0, r1, q0, q1
1889; CHECK-NEXT:    bx lr
1890entry:
1891  %c = icmp eq <8 x i16> %b, zeroinitializer
1892  %xx = zext <8 x i16> %x to <8 x i64>
1893  %yy = zext <8 x i16> %y to <8 x i64>
1894  %m = mul <8 x i64> %xx, %yy
1895  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1896  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1897  %r = add i64 %z, %a
1898  ret i64 %r
1899}
1900
; Predicated multiply-add reduction of <8 x i16> inputs sign-extended
; directly to i64, selected on %b == 0 and added to the i64 accumulator %a.
; The expected assembly is vpt.i16 eq + vmlalvat.s16 on the r0/r1 pair.
1901define arm_aapcs_vfpcc i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1902; CHECK-LABEL: add_v8i16_v8i64_acc_sext:
1903; CHECK:       @ %bb.0: @ %entry
1904; CHECK-NEXT:    vpt.i16 eq, q2, zr
1905; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q1
1906; CHECK-NEXT:    bx lr
1907entry:
1908  %c = icmp eq <8 x i16> %b, zeroinitializer
1909  %xx = sext <8 x i16> %x to <8 x i64>
1910  %yy = sext <8 x i16> %y to <8 x i64>
1911  %m = mul <8 x i64> %xx, %yy
1912  %s = select <8 x i1> %c, <8 x i64> %m, <8 x i64> zeroinitializer
1913  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1914  %r = add i64 %z, %a
1915  ret i64 %r
1916}
1917
; Two-step extension: <8 x i16> inputs are zero-extended to i32 and
; multiplied there, and the i32 product is then zero-extended to i64 before
; the select on %b == 0, the reduction, and the add of the i64 accumulator.
; The expected assembly is the same single vpt.i16 eq + vmlalvat.u16 sequence
; as for the direct i16 -> i64 extension.
1918define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_zext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1919; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_zext:
1920; CHECK:       @ %bb.0: @ %entry
1921; CHECK-NEXT:    vpt.i16 eq, q2, zr
1922; CHECK-NEXT:    vmlalvat.u16 r0, r1, q0, q1
1923; CHECK-NEXT:    bx lr
1924entry:
1925  %c = icmp eq <8 x i16> %b, zeroinitializer
1926  %xx = zext <8 x i16> %x to <8 x i32>
1927  %yy = zext <8 x i16> %y to <8 x i32>
1928  %m = mul <8 x i32> %xx, %yy
1929  %ma = zext <8 x i32> %m to <8 x i64>
1930  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1931  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1932  %r = add i64 %z, %a
1933  ret i64 %r
1934}
1935
; Two-step signed extension: <8 x i16> inputs are sign-extended to i32 and
; multiplied there, and the i32 product is then sign-extended to i64 before
; the select on %b == 0, the reduction, and the add of the i64 accumulator.
; The expected assembly is a single vpt.i16 eq + vmlalvat.s16 sequence.
1936define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1937; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sext:
1938; CHECK:       @ %bb.0: @ %entry
1939; CHECK-NEXT:    vpt.i16 eq, q2, zr
1940; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q1
1941; CHECK-NEXT:    bx lr
1942entry:
1943  %c = icmp eq <8 x i16> %b, zeroinitializer
1944  %xx = sext <8 x i16> %x to <8 x i32>
1945  %yy = sext <8 x i16> %y to <8 x i32>
1946  %m = mul <8 x i32> %xx, %yy
1947  %ma = sext <8 x i32> %m to <8 x i64>
1948  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1949  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1950  %r = add i64 %z, %a
1951  ret i64 %r
1952}
1953
; Mixed-extension squaring case: %x is sign-extended to i32 and multiplied by
; itself (%y is not used by the body), and the i32 square is then
; zero-extended to i64 before the select on %b == 0, the reduction, and the
; add of the i64 accumulator %a.
; The expected assembly still uses the signed long form with the same source
; register twice: vpt.i16 eq + vmlalvat.s16 r0, r1, q0, q0.
1954define arm_aapcs_vfpcc i64 @add_v8i16_v8i32_v8i64_acc_sextzext(<8 x i16> %x, <8 x i16> %y, <8 x i16> %b, i64 %a) {
1955; CHECK-LABEL: add_v8i16_v8i32_v8i64_acc_sextzext:
1956; CHECK:       @ %bb.0: @ %entry
1957; CHECK-NEXT:    vpt.i16 eq, q2, zr
1958; CHECK-NEXT:    vmlalvat.s16 r0, r1, q0, q0
1959; CHECK-NEXT:    bx lr
1960entry:
1961  %c = icmp eq <8 x i16> %b, zeroinitializer
1962  %xx = sext <8 x i16> %x to <8 x i32>
1963  %m = mul <8 x i32> %xx, %xx
1964  %ma = zext <8 x i32> %m to <8 x i64>
1965  %s = select <8 x i1> %c, <8 x i64> %ma, <8 x i64> zeroinitializer
1966  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %s)
1967  %r = add i64 %z, %a
1968  ret i64 %r
1969}
1970
; Two-lane i16 variant: <2 x i16> inputs are zero-extended to i64,
; multiplied, selected on %b == 0, reduced, and added to the i64 accumulator.
; The expected assembly is expanded: q0, q1 and q2 are masked with a
; vmov.i64 #0xffff constant via vand, the two products are computed with
; scalar umull, the predicate is built in r3 with cmp/csetm/bfi and written
; to p0 with vmsr, vpsel applies the select, and two adds/adc pairs sum the
; lanes and then the accumulator in r0/r1.
1971define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
1972; CHECK-LABEL: add_v2i16_v2i64_acc_zext:
1973; CHECK:       @ %bb.0: @ %entry
1974; CHECK-NEXT:    .save {r7, lr}
1975; CHECK-NEXT:    push {r7, lr}
1976; CHECK-NEXT:    vmov.i64 q3, #0xffff
1977; CHECK-NEXT:    vand q1, q1, q3
1978; CHECK-NEXT:    vand q0, q0, q3
1979; CHECK-NEXT:    vmov r2, s6
1980; CHECK-NEXT:    vmov r3, s2
1981; CHECK-NEXT:    umull lr, r12, r3, r2
1982; CHECK-NEXT:    vmov r3, s4
1983; CHECK-NEXT:    vmov r2, s0
1984; CHECK-NEXT:    vand q1, q2, q3
1985; CHECK-NEXT:    umull r2, r3, r2, r3
1986; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
1987; CHECK-NEXT:    vmov r2, s4
1988; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
1989; CHECK-NEXT:    movs r3, #0
1990; CHECK-NEXT:    cmp r2, #0
1991; CHECK-NEXT:    csetm r2, eq
1992; CHECK-NEXT:    bfi r3, r2, #0, #8
1993; CHECK-NEXT:    vmov r2, s6
1994; CHECK-NEXT:    vmov.i32 q1, #0x0
1995; CHECK-NEXT:    cmp r2, #0
1996; CHECK-NEXT:    csetm r2, eq
1997; CHECK-NEXT:    bfi r3, r2, #8, #8
1998; CHECK-NEXT:    vmsr p0, r3
1999; CHECK-NEXT:    vpsel q0, q0, q1
2000; CHECK-NEXT:    vmov lr, r12, d1
2001; CHECK-NEXT:    vmov r3, r2, d0
2002; CHECK-NEXT:    adds.w r3, r3, lr
2003; CHECK-NEXT:    adc.w r2, r2, r12
2004; CHECK-NEXT:    adds r0, r0, r3
2005; CHECK-NEXT:    adcs r1, r2
2006; CHECK-NEXT:    pop {r7, pc}
2007entry:
2008  %c = icmp eq <2 x i16> %b, zeroinitializer
2009  %xx = zext <2 x i16> %x to <2 x i64>
2010  %yy = zext <2 x i16> %y to <2 x i64>
2011  %m = mul <2 x i64> %xx, %yy
2012  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2013  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
2014  %r = add i64 %z, %a
2015  ret i64 %r
2016}
2017
; Signed two-lane i16 variant: <2 x i16> inputs are sign-extended to i64,
; multiplied, selected on %b == 0, reduced, and added to the i64 accumulator.
; The expected assembly is expanded: q2 is masked with a vmov.i32 #0xffff
; constant before the predicate is built with cmp/csetm/bfi + vmsr p0, the
; multiplicands are sign-extended in GPRs with sxth and multiplied with
; smull, vpsel applies the select, and two adds/adc pairs sum the lanes and
; then the accumulator in r0/r1.
2018define arm_aapcs_vfpcc i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, <2 x i16> %y, <2 x i16> %b, i64 %a) {
2019; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
2020; CHECK:       @ %bb.0: @ %entry
2021; CHECK-NEXT:    .save {r7, lr}
2022; CHECK-NEXT:    push {r7, lr}
2023; CHECK-NEXT:    vmov.i32 q3, #0xffff
2024; CHECK-NEXT:    movs r3, #0
2025; CHECK-NEXT:    vand q2, q2, q3
2026; CHECK-NEXT:    vmov r2, s8
2027; CHECK-NEXT:    cmp r2, #0
2028; CHECK-NEXT:    csetm r2, eq
2029; CHECK-NEXT:    bfi r3, r2, #0, #8
2030; CHECK-NEXT:    vmov r2, s10
2031; CHECK-NEXT:    cmp r2, #0
2032; CHECK-NEXT:    csetm r2, eq
2033; CHECK-NEXT:    bfi r3, r2, #8, #8
2034; CHECK-NEXT:    vmov r2, s6
2035; CHECK-NEXT:    vmsr p0, r3
2036; CHECK-NEXT:    vmov r3, s2
2037; CHECK-NEXT:    sxth r2, r2
2038; CHECK-NEXT:    sxth r3, r3
2039; CHECK-NEXT:    smull lr, r12, r3, r2
2040; CHECK-NEXT:    vmov r3, s4
2041; CHECK-NEXT:    vmov r2, s0
2042; CHECK-NEXT:    vmov.i32 q1, #0x0
2043; CHECK-NEXT:    sxth r3, r3
2044; CHECK-NEXT:    sxth r2, r2
2045; CHECK-NEXT:    smull r2, r3, r2, r3
2046; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
2047; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
2048; CHECK-NEXT:    vpsel q0, q0, q1
2049; CHECK-NEXT:    vmov lr, r12, d1
2050; CHECK-NEXT:    vmov r3, r2, d0
2051; CHECK-NEXT:    adds.w r3, r3, lr
2052; CHECK-NEXT:    adc.w r2, r2, r12
2053; CHECK-NEXT:    adds r0, r0, r3
2054; CHECK-NEXT:    adcs r1, r2
2055; CHECK-NEXT:    pop {r7, pc}
2056entry:
2057  %c = icmp eq <2 x i16> %b, zeroinitializer
2058  %xx = sext <2 x i16> %x to <2 x i64>
2059  %yy = sext <2 x i16> %y to <2 x i64>
2060  %m = mul <2 x i64> %xx, %yy
2061  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2062  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
2063  %r = add i64 %z, %a
2064  ret i64 %r
2065}
2066
; Predicated multiply-add reduction of <16 x i8> inputs zero-extended
; directly to i32, selected on %b == 0 and added to the i32 accumulator %a.
; The expected assembly is vpt.i8 eq + vmlavat.u8 on r0.
2067define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2068; CHECK-LABEL: add_v16i8_v16i32_acc_zext:
2069; CHECK:       @ %bb.0: @ %entry
2070; CHECK-NEXT:    vpt.i8 eq, q2, zr
2071; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
2072; CHECK-NEXT:    bx lr
2073entry:
2074  %c = icmp eq <16 x i8> %b, zeroinitializer
2075  %xx = zext <16 x i8> %x to <16 x i32>
2076  %yy = zext <16 x i8> %y to <16 x i32>
2077  %m = mul <16 x i32> %xx, %yy
2078  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
2079  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2080  %r = add i32 %z, %a
2081  ret i32 %r
2082}
2083
; Predicated multiply-add reduction of <16 x i8> inputs sign-extended
; directly to i32, selected on %b == 0 and added to the i32 accumulator %a.
; The expected assembly is vpt.i8 eq + vmlavat.s8 on r0.
2084define arm_aapcs_vfpcc i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2085; CHECK-LABEL: add_v16i8_v16i32_acc_sext:
2086; CHECK:       @ %bb.0: @ %entry
2087; CHECK-NEXT:    vpt.i8 eq, q2, zr
2088; CHECK-NEXT:    vmlavat.s8 r0, q0, q1
2089; CHECK-NEXT:    bx lr
2090entry:
2091  %c = icmp eq <16 x i8> %b, zeroinitializer
2092  %xx = sext <16 x i8> %x to <16 x i32>
2093  %yy = sext <16 x i8> %y to <16 x i32>
2094  %m = mul <16 x i32> %xx, %yy
2095  %s = select <16 x i1> %c, <16 x i32> %m, <16 x i32> zeroinitializer
2096  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2097  %r = add i32 %z, %a
2098  ret i32 %r
2099}
2100
; Two-step extension: <16 x i8> inputs are zero-extended to i16 and
; multiplied there, and the i16 product is then zero-extended to i32 before
; the select on %b == 0, the reduction, and the add of the i32 accumulator.
; The expected assembly is the same single vpt.i8 eq + vmlavat.u8 sequence as
; for the direct i8 -> i32 extension.
2101define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2102; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_zext:
2103; CHECK:       @ %bb.0: @ %entry
2104; CHECK-NEXT:    vpt.i8 eq, q2, zr
2105; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
2106; CHECK-NEXT:    bx lr
2107entry:
2108  %c = icmp eq <16 x i8> %b, zeroinitializer
2109  %xx = zext <16 x i8> %x to <16 x i16>
2110  %yy = zext <16 x i8> %y to <16 x i16>
2111  %m = mul <16 x i16> %xx, %yy
2112  %ma = zext <16 x i16> %m to <16 x i32>
2113  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2114  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2115  %r = add i32 %z, %a
2116  ret i32 %r
2117}
2118
; Two-step signed extension: <16 x i8> inputs are sign-extended to i16 and
; multiplied there, and the i16 product is then sign-extended to i32 before
; the select on %b == 0, the reduction, and the add of the i32 accumulator.
; The expected assembly is a single vpt.i8 eq + vmlavat.s8 sequence.
2119define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2120; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sext:
2121; CHECK:       @ %bb.0: @ %entry
2122; CHECK-NEXT:    vpt.i8 eq, q2, zr
2123; CHECK-NEXT:    vmlavat.s8 r0, q0, q1
2124; CHECK-NEXT:    bx lr
2125entry:
2126  %c = icmp eq <16 x i8> %b, zeroinitializer
2127  %xx = sext <16 x i8> %x to <16 x i16>
2128  %yy = sext <16 x i8> %y to <16 x i16>
2129  %m = mul <16 x i16> %xx, %yy
2130  %ma = sext <16 x i16> %m to <16 x i32>
2131  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2132  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2133  %r = add i32 %z, %a
2134  ret i32 %r
2135}
2136
; Mixed-extension squaring case: %x is sign-extended to i16 and multiplied by
; itself (%y is not used by the body), and the i16 square is then
; zero-extended to i32 before the select on %b == 0, the reduction, and the
; add of the i32 accumulator %a.
; The expected assembly still uses the signed form with the same source
; register twice: vpt.i8 eq + vmlavat.s8 r0, q0, q0.
2137define arm_aapcs_vfpcc i32 @add_v16i8_v16i16_v16i32_acc_sextzext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i32 %a) {
2138; CHECK-LABEL: add_v16i8_v16i16_v16i32_acc_sextzext:
2139; CHECK:       @ %bb.0: @ %entry
2140; CHECK-NEXT:    vpt.i8 eq, q2, zr
2141; CHECK-NEXT:    vmlavat.s8 r0, q0, q0
2142; CHECK-NEXT:    bx lr
2143entry:
2144  %c = icmp eq <16 x i8> %b, zeroinitializer
2145  %xx = sext <16 x i8> %x to <16 x i16>
2146  %m = mul <16 x i16> %xx, %xx
2147  %ma = zext <16 x i16> %m to <16 x i32>
2148  %s = select <16 x i1> %c, <16 x i32> %ma, <16 x i32> zeroinitializer
2149  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %s)
2150  %r = add i32 %z, %a
2151  ret i32 %r
2152}
2153
; Four-lane i8 variant: <4 x i8> inputs are zero-extended to i32,
; multiplied, selected on %b == 0, reduced, and added to the i32 accumulator.
; The expected assembly masks q2, q1 and q0 with a vmov.i32 #0xff constant
; via vand, then uses vpt.i32 eq + vmlavat.u32 on r0.
2154define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
2155; CHECK-LABEL: add_v4i8_v4i32_acc_zext:
2156; CHECK:       @ %bb.0: @ %entry
2157; CHECK-NEXT:    vmov.i32 q3, #0xff
2158; CHECK-NEXT:    vand q2, q2, q3
2159; CHECK-NEXT:    vand q1, q1, q3
2160; CHECK-NEXT:    vand q0, q0, q3
2161; CHECK-NEXT:    vpt.i32 eq, q2, zr
2162; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
2163; CHECK-NEXT:    bx lr
2164entry:
2165  %c = icmp eq <4 x i8> %b, zeroinitializer
2166  %xx = zext <4 x i8> %x to <4 x i32>
2167  %yy = zext <4 x i8> %y to <4 x i32>
2168  %m = mul <4 x i32> %xx, %yy
2169  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
2170  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
2171  %r = add i32 %z, %a
2172  ret i32 %r
2173}
2174
; Signed four-lane i8 variant: <4 x i8> inputs are sign-extended to i32,
; multiplied, selected on %b == 0, reduced, and added to the i32 accumulator.
; The expected assembly sign-extends q1 and q0 in two steps (vmovlb.s8 then
; vmovlb.s16), masks q2 (the register compared against zero by vpt) with a
; vmov.i32 #0xff constant, then uses vpt.i32 eq + vmlavat.u32 on r0.
2175define arm_aapcs_vfpcc i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, <4 x i8> %y, <4 x i8> %b, i32 %a) {
2176; CHECK-LABEL: add_v4i8_v4i32_acc_sext:
2177; CHECK:       @ %bb.0: @ %entry
2178; CHECK-NEXT:    vmov.i32 q3, #0xff
2179; CHECK-NEXT:    vmovlb.s8 q1, q1
2180; CHECK-NEXT:    vmovlb.s8 q0, q0
2181; CHECK-NEXT:    vand q2, q2, q3
2182; CHECK-NEXT:    vmovlb.s16 q1, q1
2183; CHECK-NEXT:    vmovlb.s16 q0, q0
2184; CHECK-NEXT:    vpt.i32 eq, q2, zr
2185; CHECK-NEXT:    vmlavat.u32 r0, q0, q1
2186; CHECK-NEXT:    bx lr
2187entry:
2188  %c = icmp eq <4 x i8> %b, zeroinitializer
2189  %xx = sext <4 x i8> %x to <4 x i32>
2190  %yy = sext <4 x i8> %y to <4 x i32>
2191  %m = mul <4 x i32> %xx, %yy
2192  %s = select <4 x i1> %c, <4 x i32> %m, <4 x i32> zeroinitializer
2193  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %s)
2194  %r = add i32 %z, %a
2195  ret i32 %r
2196}
2197
; Predicated multiply-add reduction of <16 x i8> inputs zero-extended to i16,
; selected on %b == 0, reduced to i16, and added to the i16 accumulator %a;
; the return value is declared zeroext.
; The expected assembly is vpt.i8 eq + vmlavat.u8 on r0, followed by uxth to
; zero-extend the 16-bit result for the return.
2198define arm_aapcs_vfpcc zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
2199; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
2200; CHECK:       @ %bb.0: @ %entry
2201; CHECK-NEXT:    vpt.i8 eq, q2, zr
2202; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
2203; CHECK-NEXT:    uxth r0, r0
2204; CHECK-NEXT:    bx lr
2205entry:
2206  %c = icmp eq <16 x i8> %b, zeroinitializer
2207  %xx = zext <16 x i8> %x to <16 x i16>
2208  %yy = zext <16 x i8> %y to <16 x i16>
2209  %m = mul <16 x i16> %xx, %yy
2210  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
2211  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
2212  %r = add i16 %z, %a
2213  ret i16 %r
2214}
2215
; Predicated multiply-add reduction of <16 x i8> inputs sign-extended to i16,
; selected on %b == 0, reduced to i16, and added to the i16 accumulator %a;
; the return value is declared signext.
; The expected assembly is vpt.i8 eq + vmlavat.s8 on r0, followed by sxth to
; sign-extend the 16-bit result for the return.
2216define arm_aapcs_vfpcc signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i16 %a) {
2217; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
2218; CHECK:       @ %bb.0: @ %entry
2219; CHECK-NEXT:    vpt.i8 eq, q2, zr
2220; CHECK-NEXT:    vmlavat.s8 r0, q0, q1
2221; CHECK-NEXT:    sxth r0, r0
2222; CHECK-NEXT:    bx lr
2223entry:
2224  %c = icmp eq <16 x i8> %b, zeroinitializer
2225  %xx = sext <16 x i8> %x to <16 x i16>
2226  %yy = sext <16 x i8> %y to <16 x i16>
2227  %m = mul <16 x i16> %xx, %yy
2228  %s = select <16 x i1> %c, <16 x i16> %m, <16 x i16> zeroinitializer
2229  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %s)
2230  %r = add i16 %z, %a
2231  ret i16 %r
2232}
2233
; Eight-lane i8 variant: <8 x i8> inputs are zero-extended to i16,
; multiplied, selected on %b == 0, reduced to i16, and added to the i16
; accumulator %a; the return value is declared zeroext.
; The expected assembly widens q2, q1 and q0 with vmovlb.u8, then uses
; vpt.i16 eq + vmlavat.u16 on r0 and finishes with uxth.
2234define arm_aapcs_vfpcc zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
2235; CHECK-LABEL: add_v8i8_v8i16_acc_zext:
2236; CHECK:       @ %bb.0: @ %entry
2237; CHECK-NEXT:    vmovlb.u8 q2, q2
2238; CHECK-NEXT:    vmovlb.u8 q1, q1
2239; CHECK-NEXT:    vmovlb.u8 q0, q0
2240; CHECK-NEXT:    vpt.i16 eq, q2, zr
2241; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
2242; CHECK-NEXT:    uxth r0, r0
2243; CHECK-NEXT:    bx lr
2244entry:
2245  %c = icmp eq <8 x i8> %b, zeroinitializer
2246  %xx = zext <8 x i8> %x to <8 x i16>
2247  %yy = zext <8 x i8> %y to <8 x i16>
2248  %m = mul <8 x i16> %xx, %yy
2249  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
2250  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
2251  %r = add i16 %z, %a
2252  ret i16 %r
2253}
2254
; Signed eight-lane i8 variant: <8 x i8> inputs are sign-extended to i16,
; multiplied, selected on %b == 0, reduced to i16, and added to the i16
; accumulator %a; the return value is declared signext.
; The expected assembly widens q1 and q0 with vmovlb.s8 but q2 (the register
; compared against zero by vpt) with vmovlb.u8, then uses vpt.i16 eq +
; vmlavat.u16 on r0 and finishes with sxth.
2255define arm_aapcs_vfpcc signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, <8 x i8> %y, <8 x i8> %b, i16 %a) {
2256; CHECK-LABEL: add_v8i8_v8i16_acc_sext:
2257; CHECK:       @ %bb.0: @ %entry
2258; CHECK-NEXT:    vmovlb.u8 q2, q2
2259; CHECK-NEXT:    vmovlb.s8 q1, q1
2260; CHECK-NEXT:    vmovlb.s8 q0, q0
2261; CHECK-NEXT:    vpt.i16 eq, q2, zr
2262; CHECK-NEXT:    vmlavat.u16 r0, q0, q1
2263; CHECK-NEXT:    sxth r0, r0
2264; CHECK-NEXT:    bx lr
2265entry:
2266  %c = icmp eq <8 x i8> %b, zeroinitializer
2267  %xx = sext <8 x i8> %x to <8 x i16>
2268  %yy = sext <8 x i8> %y to <8 x i16>
2269  %m = mul <8 x i16> %xx, %yy
2270  %s = select <8 x i1> %c, <8 x i16> %m, <8 x i16> zeroinitializer
2271  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %s)
2272  %r = add i16 %z, %a
2273  ret i16 %r
2274}
2275
; Non-widening <16 x i8> predicated multiply-add reduction with an i8
; accumulator %a and a zeroext i8 return value.
; The expected assembly is vpt.i8 eq + vmlavat.u8 on r0, followed by uxtb to
; zero-extend the 8-bit result for the return.
2276define arm_aapcs_vfpcc zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i8 %a) {
2277; CHECK-LABEL: add_v16i8_v16i8_acc:
2278; CHECK:       @ %bb.0: @ %entry
2279; CHECK-NEXT:    vpt.i8 eq, q2, zr
2280; CHECK-NEXT:    vmlavat.u8 r0, q0, q1
2281; CHECK-NEXT:    uxtb r0, r0
2282; CHECK-NEXT:    bx lr
2283entry:
2284  %c = icmp eq <16 x i8> %b, zeroinitializer
2285  %m = mul <16 x i8> %x, %y
2286  %s = select <16 x i1> %c, <16 x i8> %m, <16 x i8> zeroinitializer
2287  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %s)
2288  %r = add i8 %z, %a
2289  ret i8 %r
2290}
2291
2292define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
2293; CHECK-LABEL: add_v16i8_v16i64_acc_zext:
2294; CHECK:       @ %bb.0: @ %entry
2295; CHECK-NEXT:    .save {r4, r5, r7, lr}
2296; CHECK-NEXT:    push {r4, r5, r7, lr}
2297; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2298; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2299; CHECK-NEXT:    .pad #32
2300; CHECK-NEXT:    sub sp, #32
2301; CHECK-NEXT:    vmov q3, q0
2302; CHECK-NEXT:    vmov.i8 q0, #0x0
2303; CHECK-NEXT:    vcmp.i8 eq, q2, zr
2304; CHECK-NEXT:    vmov.i8 q2, #0xff
2305; CHECK-NEXT:    vpsel q6, q2, q0
2306; CHECK-NEXT:    vmov q4, q0
2307; CHECK-NEXT:    vmov.u8 r2, q6[0]
2308; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
2309; CHECK-NEXT:    vmov.16 q0[0], r2
2310; CHECK-NEXT:    vmov.u8 r2, q6[1]
2311; CHECK-NEXT:    vmov.16 q0[1], r2
2312; CHECK-NEXT:    vmov.u8 r2, q6[2]
2313; CHECK-NEXT:    vmov.16 q0[2], r2
2314; CHECK-NEXT:    vmov.u8 r2, q6[3]
2315; CHECK-NEXT:    vmov.16 q0[3], r2
2316; CHECK-NEXT:    vmov.u8 r2, q6[4]
2317; CHECK-NEXT:    vmov.16 q0[4], r2
2318; CHECK-NEXT:    vmov.u8 r2, q6[5]
2319; CHECK-NEXT:    vmov.16 q0[5], r2
2320; CHECK-NEXT:    vmov.u8 r2, q6[6]
2321; CHECK-NEXT:    vmov.16 q0[6], r2
2322; CHECK-NEXT:    vmov.u8 r2, q6[7]
2323; CHECK-NEXT:    vmov.16 q0[7], r2
2324; CHECK-NEXT:    vstrw.32 q2, [sp, #16] @ 16-byte Spill
2325; CHECK-NEXT:    vcmp.i16 ne, q0, zr
2326; CHECK-NEXT:    vmov.u8 r4, q3[2]
2327; CHECK-NEXT:    vpsel q7, q2, q4
2328; CHECK-NEXT:    vmov.u16 r2, q7[2]
2329; CHECK-NEXT:    vmov.u16 r3, q7[0]
2330; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
2331; CHECK-NEXT:    vmov.u16 r2, q7[3]
2332; CHECK-NEXT:    vmov.u16 r3, q7[1]
2333; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
2334; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2335; CHECK-NEXT:    vpsel q0, q2, q4
2336; CHECK-NEXT:    vmov r2, r3, d0
2337; CHECK-NEXT:    vmov q2[2], q2[0], r2, r3
2338; CHECK-NEXT:    vmov q2[3], q2[1], r2, r3
2339; CHECK-NEXT:    vmov.u8 r2, q1[1]
2340; CHECK-NEXT:    vmov.u8 r3, q1[0]
2341; CHECK-NEXT:    vcmp.i32 ne, q2, zr
2342; CHECK-NEXT:    vmov q5[2], q5[0], r3, r2
2343; CHECK-NEXT:    vmov.u8 r3, q3[1]
2344; CHECK-NEXT:    vmov.u8 r2, q3[0]
2345; CHECK-NEXT:    vmov.i64 q2, #0xff
2346; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
2347; CHECK-NEXT:    vand q5, q5, q2
2348; CHECK-NEXT:    vand q4, q4, q2
2349; CHECK-NEXT:    vmov r12, s22
2350; CHECK-NEXT:    vmov r2, s18
2351; CHECK-NEXT:    vmov r3, s20
2352; CHECK-NEXT:    vmov.i32 q5, #0x0
2353; CHECK-NEXT:    umull lr, r12, r2, r12
2354; CHECK-NEXT:    vmov r2, s16
2355; CHECK-NEXT:    umull r2, r3, r2, r3
2356; CHECK-NEXT:    vmov q4[2], q4[0], r2, lr
2357; CHECK-NEXT:    vmov q4[3], q4[1], r3, r12
2358; CHECK-NEXT:    vpsel q4, q4, q5
2359; CHECK-NEXT:    vmov lr, r12, d9
2360; CHECK-NEXT:    vmov r3, r2, d8
2361; CHECK-NEXT:    adds.w lr, lr, r3
2362; CHECK-NEXT:    adc.w r12, r12, r2
2363; CHECK-NEXT:    vmov r2, r3, d1
2364; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
2365; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
2366; CHECK-NEXT:    vmov.u8 r2, q1[3]
2367; CHECK-NEXT:    vmov.u8 r3, q1[2]
2368; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2369; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
2370; CHECK-NEXT:    vmov.u8 r3, q3[3]
2371; CHECK-NEXT:    vmov q4[2], q4[0], r4, r3
2372; CHECK-NEXT:    vand q0, q0, q2
2373; CHECK-NEXT:    vand q4, q4, q2
2374; CHECK-NEXT:    vmov r2, s2
2375; CHECK-NEXT:    vmov r3, s18
2376; CHECK-NEXT:    vmov r5, s16
2377; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
2378; CHECK-NEXT:    vmov r4, s0
2379; CHECK-NEXT:    umull r2, r3, r3, r2
2380; CHECK-NEXT:    umull r4, r5, r5, r4
2381; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
2382; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
2383; CHECK-NEXT:    vpsel q0, q0, q5
2384; CHECK-NEXT:    vmov r2, r3, d0
2385; CHECK-NEXT:    vmov r5, r4, d1
2386; CHECK-NEXT:    adds.w r2, r2, lr
2387; CHECK-NEXT:    adc.w r3, r3, r12
2388; CHECK-NEXT:    adds.w r12, r2, r5
2389; CHECK-NEXT:    adc.w lr, r3, r4
2390; CHECK-NEXT:    vmov.u16 r5, q7[6]
2391; CHECK-NEXT:    vmov.u16 r4, q7[4]
2392; CHECK-NEXT:    vmov.u8 r2, q3[4]
2393; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
2394; CHECK-NEXT:    vmov.u16 r5, q7[7]
2395; CHECK-NEXT:    vmov.u16 r4, q7[5]
2396; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
2397; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2398; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
2399; CHECK-NEXT:    vpsel q0, q0, q4
2400; CHECK-NEXT:    vmov r5, r4, d0
2401; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
2402; CHECK-NEXT:    vmov q4[3], q4[1], r5, r4
2403; CHECK-NEXT:    vmov.u8 r5, q1[5]
2404; CHECK-NEXT:    vmov.u8 r4, q1[4]
2405; CHECK-NEXT:    vcmp.i32 ne, q4, zr
2406; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
2407; CHECK-NEXT:    vmov.u8 r4, q3[5]
2408; CHECK-NEXT:    vmov q7[2], q7[0], r2, r4
2409; CHECK-NEXT:    vand q4, q4, q2
2410; CHECK-NEXT:    vand q7, q7, q2
2411; CHECK-NEXT:    vmov r5, s18
2412; CHECK-NEXT:    vmov r2, s30
2413; CHECK-NEXT:    vmov r3, s28
2414; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
2415; CHECK-NEXT:    vmov r4, s16
2416; CHECK-NEXT:    umull r2, r5, r2, r5
2417; CHECK-NEXT:    umull r3, r4, r3, r4
2418; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
2419; CHECK-NEXT:    vmov q4[3], q4[1], r4, r5
2420; CHECK-NEXT:    vpsel q4, q4, q5
2421; CHECK-NEXT:    vmov r2, r3, d8
2422; CHECK-NEXT:    vmov r5, r4, d9
2423; CHECK-NEXT:    adds.w r2, r2, r12
2424; CHECK-NEXT:    adc.w r3, r3, lr
2425; CHECK-NEXT:    adds.w r12, r2, r5
2426; CHECK-NEXT:    adc.w lr, r3, r4
2427; CHECK-NEXT:    vmov r5, r4, d1
2428; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
2429; CHECK-NEXT:    vmov.u8 r2, q3[6]
2430; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
2431; CHECK-NEXT:    vmov.u8 r5, q1[7]
2432; CHECK-NEXT:    vmov.u8 r4, q1[6]
2433; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2434; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
2435; CHECK-NEXT:    vmov.u8 r4, q3[7]
2436; CHECK-NEXT:    vmov q4[2], q4[0], r2, r4
2437; CHECK-NEXT:    vand q0, q0, q2
2438; CHECK-NEXT:    vand q4, q4, q2
2439; CHECK-NEXT:    vmov r5, s2
2440; CHECK-NEXT:    vmov r2, s18
2441; CHECK-NEXT:    vmov r3, s16
2442; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
2443; CHECK-NEXT:    vmov r4, s0
2444; CHECK-NEXT:    umull r2, r5, r2, r5
2445; CHECK-NEXT:    umull r3, r4, r3, r4
2446; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
2447; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
2448; CHECK-NEXT:    vpsel q0, q0, q5
2449; CHECK-NEXT:    vmov r2, r3, d0
2450; CHECK-NEXT:    vmov r5, r4, d1
2451; CHECK-NEXT:    adds.w r2, r2, r12
2452; CHECK-NEXT:    adc.w r3, r3, lr
2453; CHECK-NEXT:    adds.w r12, r2, r5
2454; CHECK-NEXT:    vmov.u8 r5, q6[8]
2455; CHECK-NEXT:    adc.w lr, r3, r4
2456; CHECK-NEXT:    vmov.16 q0[0], r5
2457; CHECK-NEXT:    vmov.u8 r5, q6[9]
2458; CHECK-NEXT:    vmov.16 q0[1], r5
2459; CHECK-NEXT:    vmov.u8 r5, q6[10]
2460; CHECK-NEXT:    vmov.16 q0[2], r5
2461; CHECK-NEXT:    vmov.u8 r5, q6[11]
2462; CHECK-NEXT:    vmov.16 q0[3], r5
2463; CHECK-NEXT:    vmov.u8 r5, q6[12]
2464; CHECK-NEXT:    vmov.16 q0[4], r5
2465; CHECK-NEXT:    vmov.u8 r5, q6[13]
2466; CHECK-NEXT:    vmov.16 q0[5], r5
2467; CHECK-NEXT:    vmov.u8 r5, q6[14]
2468; CHECK-NEXT:    vmov.16 q0[6], r5
2469; CHECK-NEXT:    vmov.u8 r5, q6[15]
2470; CHECK-NEXT:    vmov.16 q0[7], r5
2471; CHECK-NEXT:    vmov.u8 r2, q3[8]
2472; CHECK-NEXT:    vcmp.i16 ne, q0, zr
2473; CHECK-NEXT:    vpsel q6, q7, q4
2474; CHECK-NEXT:    vmov.u16 r5, q6[2]
2475; CHECK-NEXT:    vmov.u16 r4, q6[0]
2476; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
2477; CHECK-NEXT:    vmov.u16 r5, q6[3]
2478; CHECK-NEXT:    vmov.u16 r4, q6[1]
2479; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
2480; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2481; CHECK-NEXT:    vpsel q0, q7, q4
2482; CHECK-NEXT:    vmov r5, r4, d0
2483; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
2484; CHECK-NEXT:    vmov q4[3], q4[1], r5, r4
2485; CHECK-NEXT:    vmov.u8 r5, q1[9]
2486; CHECK-NEXT:    vmov.u8 r4, q1[8]
2487; CHECK-NEXT:    vcmp.i32 ne, q4, zr
2488; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
2489; CHECK-NEXT:    vmov.u8 r4, q3[9]
2490; CHECK-NEXT:    vmov q7[2], q7[0], r2, r4
2491; CHECK-NEXT:    vand q4, q4, q2
2492; CHECK-NEXT:    vand q7, q7, q2
2493; CHECK-NEXT:    vmov r5, s18
2494; CHECK-NEXT:    vmov r2, s30
2495; CHECK-NEXT:    vmov r4, s16
2496; CHECK-NEXT:    vmov r3, s28
2497; CHECK-NEXT:    umull r2, r5, r2, r5
2498; CHECK-NEXT:    umull r3, r4, r3, r4
2499; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
2500; CHECK-NEXT:    vmov q4[3], q4[1], r4, r5
2501; CHECK-NEXT:    vpsel q4, q4, q5
2502; CHECK-NEXT:    vmov r2, r3, d8
2503; CHECK-NEXT:    vmov r5, r4, d9
2504; CHECK-NEXT:    adds.w r2, r2, r12
2505; CHECK-NEXT:    adc.w r3, r3, lr
2506; CHECK-NEXT:    adds.w r12, r2, r5
2507; CHECK-NEXT:    adc.w lr, r3, r4
2508; CHECK-NEXT:    vmov r5, r4, d1
2509; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
2510; CHECK-NEXT:    vmov.u8 r2, q3[10]
2511; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
2512; CHECK-NEXT:    vmov.u8 r5, q1[11]
2513; CHECK-NEXT:    vmov.u8 r4, q1[10]
2514; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2515; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
2516; CHECK-NEXT:    vmov.u8 r4, q3[11]
2517; CHECK-NEXT:    vmov q4[2], q4[0], r2, r4
2518; CHECK-NEXT:    vand q0, q0, q2
2519; CHECK-NEXT:    vand q4, q4, q2
2520; CHECK-NEXT:    vmov r5, s2
2521; CHECK-NEXT:    vmov r2, s18
2522; CHECK-NEXT:    vmov r3, s16
2523; CHECK-NEXT:    vldrw.u32 q4, [sp] @ 16-byte Reload
2524; CHECK-NEXT:    vmov r4, s0
2525; CHECK-NEXT:    umull r2, r5, r2, r5
2526; CHECK-NEXT:    umull r3, r4, r3, r4
2527; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
2528; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
2529; CHECK-NEXT:    vpsel q0, q0, q5
2530; CHECK-NEXT:    vmov r2, r3, d0
2531; CHECK-NEXT:    vmov r5, r4, d1
2532; CHECK-NEXT:    adds.w r2, r2, r12
2533; CHECK-NEXT:    adc.w r3, r3, lr
2534; CHECK-NEXT:    adds.w r12, r2, r5
2535; CHECK-NEXT:    adc.w lr, r3, r4
2536; CHECK-NEXT:    vmov.u16 r5, q6[6]
2537; CHECK-NEXT:    vmov.u16 r4, q6[4]
2538; CHECK-NEXT:    vmov.u8 r2, q3[12]
2539; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
2540; CHECK-NEXT:    vmov.u16 r5, q6[7]
2541; CHECK-NEXT:    vmov.u16 r4, q6[5]
2542; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
2543; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2544; CHECK-NEXT:    vldrw.u32 q0, [sp, #16] @ 16-byte Reload
2545; CHECK-NEXT:    vpsel q0, q0, q4
2546; CHECK-NEXT:    vmov r5, r4, d0
2547; CHECK-NEXT:    vmov q4[2], q4[0], r5, r4
2548; CHECK-NEXT:    vmov q4[3], q4[1], r5, r4
2549; CHECK-NEXT:    vmov.u8 r5, q1[13]
2550; CHECK-NEXT:    vmov.u8 r4, q1[12]
2551; CHECK-NEXT:    vcmp.i32 ne, q4, zr
2552; CHECK-NEXT:    vmov q4[2], q4[0], r4, r5
2553; CHECK-NEXT:    vmov.u8 r4, q3[13]
2554; CHECK-NEXT:    vmov q6[2], q6[0], r2, r4
2555; CHECK-NEXT:    vand q4, q4, q2
2556; CHECK-NEXT:    vand q6, q6, q2
2557; CHECK-NEXT:    vmov r5, s18
2558; CHECK-NEXT:    vmov r2, s26
2559; CHECK-NEXT:    vmov r4, s16
2560; CHECK-NEXT:    vmov r3, s24
2561; CHECK-NEXT:    umull r2, r5, r2, r5
2562; CHECK-NEXT:    umull r3, r4, r3, r4
2563; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
2564; CHECK-NEXT:    vmov q4[3], q4[1], r4, r5
2565; CHECK-NEXT:    vpsel q4, q4, q5
2566; CHECK-NEXT:    vmov r2, r3, d8
2567; CHECK-NEXT:    vmov r5, r4, d9
2568; CHECK-NEXT:    adds.w r2, r2, r12
2569; CHECK-NEXT:    adc.w r3, r3, lr
2570; CHECK-NEXT:    adds.w r12, r2, r5
2571; CHECK-NEXT:    adc.w lr, r3, r4
2572; CHECK-NEXT:    vmov r5, r4, d1
2573; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
2574; CHECK-NEXT:    vmov.u8 r2, q3[14]
2575; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
2576; CHECK-NEXT:    vmov.u8 r5, q1[15]
2577; CHECK-NEXT:    vmov.u8 r4, q1[14]
2578; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2579; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
2580; CHECK-NEXT:    vmov.u8 r4, q3[15]
2581; CHECK-NEXT:    vmov q1[2], q1[0], r2, r4
2582; CHECK-NEXT:    vand q0, q0, q2
2583; CHECK-NEXT:    vand q1, q1, q2
2584; CHECK-NEXT:    vmov r5, s2
2585; CHECK-NEXT:    vmov r2, s6
2586; CHECK-NEXT:    vmov r4, s0
2587; CHECK-NEXT:    vmov r3, s4
2588; CHECK-NEXT:    umull r2, r5, r2, r5
2589; CHECK-NEXT:    umull r3, r4, r3, r4
2590; CHECK-NEXT:    vmov q0[2], q0[0], r3, r2
2591; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
2592; CHECK-NEXT:    vpsel q0, q0, q5
2593; CHECK-NEXT:    vmov r2, r3, d0
2594; CHECK-NEXT:    vmov r5, r4, d1
2595; CHECK-NEXT:    adds.w r2, r2, r12
2596; CHECK-NEXT:    adc.w r3, r3, lr
2597; CHECK-NEXT:    adds r2, r2, r5
2598; CHECK-NEXT:    adcs r3, r4
2599; CHECK-NEXT:    adds r0, r0, r2
2600; CHECK-NEXT:    adcs r1, r3
2601; CHECK-NEXT:    add sp, #32
2602; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2603; CHECK-NEXT:    pop {r4, r5, r7, pc}
2604entry:
2605  %c = icmp eq <16 x i8> %b, zeroinitializer
2606  %xx = zext <16 x i8> %x to <16 x i64>
2607  %yy = zext <16 x i8> %y to <16 x i64>
2608  %m = mul <16 x i64> %xx, %yy
2609  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
2610  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
2611  %r = add i64 %z, %a
2612  ret i64 %r
2613}
2614
; Predicated multiply-accumulate reduction with an i64 accumulator:
; sign-extends the <16 x i8> operands %x and %y to <16 x i64>, multiplies them,
; keeps only the products in lanes where %b is zero (other lanes become 0),
; sums all sixteen lanes with llvm.vector.reduce.add.v16i64 and adds %a.
; The assertion lines inside the function pin the llc output for this pattern;
; per the NOTE at the top of the file they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them instead of hand-editing.
2615define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y, <16 x i8> %b, i64 %a) {
2616; CHECK-LABEL: add_v16i8_v16i64_acc_sext:
2617; CHECK:       @ %bb.0: @ %entry
2618; CHECK-NEXT:    .save {r4, r5, r7, lr}
2619; CHECK-NEXT:    push {r4, r5, r7, lr}
2620; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
2621; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
2622; CHECK-NEXT:    .pad #16
2623; CHECK-NEXT:    sub sp, #16
2624; CHECK-NEXT:    vmov q3, q0
2625; CHECK-NEXT:    vcmp.i8 eq, q2, zr
2626; CHECK-NEXT:    vmov.i8 q0, #0x0
2627; CHECK-NEXT:    vmov.i8 q2, #0xff
2628; CHECK-NEXT:    vpsel q5, q2, q0
2629; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
2630; CHECK-NEXT:    vmov.u8 r2, q5[0]
2631; CHECK-NEXT:    vmov.s8 r4, q1[2]
2632; CHECK-NEXT:    vmov.16 q4[0], r2
2633; CHECK-NEXT:    vmov.u8 r2, q5[1]
2634; CHECK-NEXT:    vmov.16 q4[1], r2
2635; CHECK-NEXT:    vmov.u8 r2, q5[2]
2636; CHECK-NEXT:    vmov.16 q4[2], r2
2637; CHECK-NEXT:    vmov.u8 r2, q5[3]
2638; CHECK-NEXT:    vmov.16 q4[3], r2
2639; CHECK-NEXT:    vmov.u8 r2, q5[4]
2640; CHECK-NEXT:    vmov.16 q4[4], r2
2641; CHECK-NEXT:    vmov.u8 r2, q5[5]
2642; CHECK-NEXT:    vmov.16 q4[5], r2
2643; CHECK-NEXT:    vmov.u8 r2, q5[6]
2644; CHECK-NEXT:    vmov.16 q4[6], r2
2645; CHECK-NEXT:    vmov.u8 r2, q5[7]
2646; CHECK-NEXT:    vmov.16 q4[7], r2
2647; CHECK-NEXT:    vmov.s8 r5, q3[2]
2648; CHECK-NEXT:    vcmp.i16 ne, q4, zr
2649; CHECK-NEXT:    smull r4, r5, r5, r4
2650; CHECK-NEXT:    vpsel q6, q2, q0
2651; CHECK-NEXT:    vmov.u16 r2, q6[2]
2652; CHECK-NEXT:    vmov.u16 r3, q6[0]
2653; CHECK-NEXT:    vmov q4[2], q4[0], r3, r2
2654; CHECK-NEXT:    vmov.u16 r2, q6[3]
2655; CHECK-NEXT:    vmov.u16 r3, q6[1]
2656; CHECK-NEXT:    vmov q4[3], q4[1], r3, r2
2657; CHECK-NEXT:    vcmp.i32 ne, q4, zr
2658; CHECK-NEXT:    vpsel q7, q2, q0
2659; CHECK-NEXT:    vmov r2, r3, d14
2660; CHECK-NEXT:    vmov q4[2], q4[0], r2, r3
2661; CHECK-NEXT:    vmov q4[3], q4[1], r2, r3
2662; CHECK-NEXT:    vmov.s8 r2, q1[1]
2663; CHECK-NEXT:    vmov.s8 r3, q3[1]
2664; CHECK-NEXT:    vcmp.i32 ne, q4, zr
2665; CHECK-NEXT:    smull lr, r12, r3, r2
2666; CHECK-NEXT:    vmov.s8 r3, q1[0]
2667; CHECK-NEXT:    vmov.s8 r2, q3[0]
2668; CHECK-NEXT:    vmov.i32 q4, #0x0
2669; CHECK-NEXT:    smull r2, r3, r2, r3
2670; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
2671; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
2672; CHECK-NEXT:    vpsel q0, q0, q4
2673; CHECK-NEXT:    vmov lr, r12, d1
2674; CHECK-NEXT:    vmov r3, r2, d0
2675; CHECK-NEXT:    adds.w lr, lr, r3
2676; CHECK-NEXT:    adc.w r12, r12, r2
2677; CHECK-NEXT:    vmov r2, r3, d15
2678; CHECK-NEXT:    vmov q0[2], q0[0], r2, r3
2679; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
2680; CHECK-NEXT:    vmov q0[3], q0[1], r2, r3
2681; CHECK-NEXT:    vmov.s8 r2, q1[3]
2682; CHECK-NEXT:    vmov.s8 r3, q3[3]
2683; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2684; CHECK-NEXT:    smull r2, r3, r3, r2
2685; CHECK-NEXT:    vmov q0[2], q0[0], r4, r2
2686; CHECK-NEXT:    vmov q0[3], q0[1], r5, r3
2687; CHECK-NEXT:    vpsel q0, q0, q4
2688; CHECK-NEXT:    vmov r2, r3, d0
2689; CHECK-NEXT:    vmov r5, r4, d1
2690; CHECK-NEXT:    adds.w r2, r2, lr
2691; CHECK-NEXT:    adc.w r3, r3, r12
2692; CHECK-NEXT:    adds.w r12, r2, r5
2693; CHECK-NEXT:    adc.w lr, r3, r4
2694; CHECK-NEXT:    vmov.u16 r5, q6[6]
2695; CHECK-NEXT:    vmov.u16 r4, q6[4]
2696; CHECK-NEXT:    vmov.s8 r2, q1[4]
2697; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
2698; CHECK-NEXT:    vmov.u16 r5, q6[7]
2699; CHECK-NEXT:    vmov.u16 r4, q6[5]
2700; CHECK-NEXT:    vmov.s8 r3, q3[4]
2701; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
2702; CHECK-NEXT:    smull r2, r3, r3, r2
2703; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2704; CHECK-NEXT:    vpsel q6, q2, q7
2705; CHECK-NEXT:    vmov r5, r4, d12
2706; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
2707; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
2708; CHECK-NEXT:    vmov.s8 r5, q1[5]
2709; CHECK-NEXT:    vmov.s8 r4, q3[5]
2710; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2711; CHECK-NEXT:    smull r5, r4, r4, r5
2712; CHECK-NEXT:    vmov q0[2], q0[0], r2, r5
2713; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
2714; CHECK-NEXT:    vpsel q0, q0, q4
2715; CHECK-NEXT:    vmov r2, r3, d0
2716; CHECK-NEXT:    vmov r5, r4, d1
2717; CHECK-NEXT:    adds.w r2, r2, r12
2718; CHECK-NEXT:    adc.w r3, r3, lr
2719; CHECK-NEXT:    adds.w r12, r2, r5
2720; CHECK-NEXT:    adc.w lr, r3, r4
2721; CHECK-NEXT:    vmov r5, r4, d13
2722; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
2723; CHECK-NEXT:    vmov.s8 r2, q1[6]
2724; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
2725; CHECK-NEXT:    vmov.s8 r3, q3[6]
2726; CHECK-NEXT:    vmov.s8 r5, q1[7]
2727; CHECK-NEXT:    vmov.s8 r4, q3[7]
2728; CHECK-NEXT:    smull r5, r4, r4, r5
2729; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2730; CHECK-NEXT:    smull r2, r3, r3, r2
2731; CHECK-NEXT:    vmov q0[2], q0[0], r2, r5
2732; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
2733; CHECK-NEXT:    vpsel q0, q0, q4
2734; CHECK-NEXT:    vmov r2, r3, d0
2735; CHECK-NEXT:    vmov r5, r4, d1
2736; CHECK-NEXT:    adds.w r2, r2, r12
2737; CHECK-NEXT:    adc.w r3, r3, lr
2738; CHECK-NEXT:    adds.w r12, r2, r5
2739; CHECK-NEXT:    vmov.u8 r5, q5[8]
2740; CHECK-NEXT:    adc.w lr, r3, r4
2741; CHECK-NEXT:    vmov.16 q6[0], r5
2742; CHECK-NEXT:    vmov.u8 r5, q5[9]
2743; CHECK-NEXT:    vmov.16 q6[1], r5
2744; CHECK-NEXT:    vmov.u8 r5, q5[10]
2745; CHECK-NEXT:    vmov.16 q6[2], r5
2746; CHECK-NEXT:    vmov.u8 r5, q5[11]
2747; CHECK-NEXT:    vmov.16 q6[3], r5
2748; CHECK-NEXT:    vmov.u8 r5, q5[12]
2749; CHECK-NEXT:    vmov.16 q6[4], r5
2750; CHECK-NEXT:    vmov.u8 r5, q5[13]
2751; CHECK-NEXT:    vmov.16 q6[5], r5
2752; CHECK-NEXT:    vmov.u8 r5, q5[14]
2753; CHECK-NEXT:    vmov.16 q6[6], r5
2754; CHECK-NEXT:    vmov.u8 r5, q5[15]
2755; CHECK-NEXT:    vmov.16 q6[7], r5
2756; CHECK-NEXT:    vmov.s8 r2, q1[8]
2757; CHECK-NEXT:    vcmp.i16 ne, q6, zr
2758; CHECK-NEXT:    vmov.s8 r3, q3[8]
2759; CHECK-NEXT:    vpsel q5, q2, q7
2760; CHECK-NEXT:    smull r2, r3, r3, r2
2761; CHECK-NEXT:    vmov.u16 r5, q5[2]
2762; CHECK-NEXT:    vmov.u16 r4, q5[0]
2763; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
2764; CHECK-NEXT:    vmov.u16 r5, q5[3]
2765; CHECK-NEXT:    vmov.u16 r4, q5[1]
2766; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
2767; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2768; CHECK-NEXT:    vpsel q6, q2, q7
2769; CHECK-NEXT:    vmov r5, r4, d12
2770; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
2771; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
2772; CHECK-NEXT:    vmov.s8 r5, q1[9]
2773; CHECK-NEXT:    vmov.s8 r4, q3[9]
2774; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2775; CHECK-NEXT:    smull r5, r4, r4, r5
2776; CHECK-NEXT:    vmov q0[2], q0[0], r2, r5
2777; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
2778; CHECK-NEXT:    vpsel q0, q0, q4
2779; CHECK-NEXT:    vmov r2, r3, d0
2780; CHECK-NEXT:    vmov r5, r4, d1
2781; CHECK-NEXT:    adds.w r2, r2, r12
2782; CHECK-NEXT:    adc.w r3, r3, lr
2783; CHECK-NEXT:    adds.w r12, r2, r5
2784; CHECK-NEXT:    adc.w lr, r3, r4
2785; CHECK-NEXT:    vmov r5, r4, d13
2786; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
2787; CHECK-NEXT:    vmov.s8 r2, q1[10]
2788; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
2789; CHECK-NEXT:    vmov.s8 r3, q3[10]
2790; CHECK-NEXT:    vmov.s8 r5, q1[11]
2791; CHECK-NEXT:    vmov.s8 r4, q3[11]
2792; CHECK-NEXT:    smull r5, r4, r4, r5
2793; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2794; CHECK-NEXT:    smull r2, r3, r3, r2
2795; CHECK-NEXT:    vmov q0[2], q0[0], r2, r5
2796; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
2797; CHECK-NEXT:    vpsel q0, q0, q4
2798; CHECK-NEXT:    vmov r2, r3, d0
2799; CHECK-NEXT:    vmov r5, r4, d1
2800; CHECK-NEXT:    adds.w r2, r2, r12
2801; CHECK-NEXT:    adc.w r3, r3, lr
2802; CHECK-NEXT:    adds.w r12, r2, r5
2803; CHECK-NEXT:    adc.w lr, r3, r4
2804; CHECK-NEXT:    vmov.u16 r5, q5[6]
2805; CHECK-NEXT:    vmov.u16 r4, q5[4]
2806; CHECK-NEXT:    vmov.s8 r2, q1[12]
2807; CHECK-NEXT:    vmov q0[2], q0[0], r4, r5
2808; CHECK-NEXT:    vmov.u16 r5, q5[7]
2809; CHECK-NEXT:    vmov.u16 r4, q5[5]
2810; CHECK-NEXT:    vmov.s8 r3, q3[12]
2811; CHECK-NEXT:    vmov q0[3], q0[1], r4, r5
2812; CHECK-NEXT:    smull r2, r3, r3, r2
2813; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2814; CHECK-NEXT:    vpsel q2, q2, q7
2815; CHECK-NEXT:    vmov r5, r4, d4
2816; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
2817; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
2818; CHECK-NEXT:    vmov.s8 r5, q1[13]
2819; CHECK-NEXT:    vmov.s8 r4, q3[13]
2820; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2821; CHECK-NEXT:    smull r5, r4, r4, r5
2822; CHECK-NEXT:    vmov q0[2], q0[0], r2, r5
2823; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
2824; CHECK-NEXT:    vpsel q0, q0, q4
2825; CHECK-NEXT:    vmov r2, r3, d0
2826; CHECK-NEXT:    vmov r5, r4, d1
2827; CHECK-NEXT:    adds.w r2, r2, r12
2828; CHECK-NEXT:    adc.w r3, r3, lr
2829; CHECK-NEXT:    adds.w r12, r2, r5
2830; CHECK-NEXT:    adc.w lr, r3, r4
2831; CHECK-NEXT:    vmov r5, r4, d5
2832; CHECK-NEXT:    vmov q0[2], q0[0], r5, r4
2833; CHECK-NEXT:    vmov.s8 r2, q1[14]
2834; CHECK-NEXT:    vmov q0[3], q0[1], r5, r4
2835; CHECK-NEXT:    vmov.s8 r3, q3[14]
2836; CHECK-NEXT:    vmov.s8 r5, q1[15]
2837; CHECK-NEXT:    vmov.s8 r4, q3[15]
2838; CHECK-NEXT:    smull r5, r4, r4, r5
2839; CHECK-NEXT:    vcmp.i32 ne, q0, zr
2840; CHECK-NEXT:    smull r2, r3, r3, r2
2841; CHECK-NEXT:    vmov q0[2], q0[0], r2, r5
2842; CHECK-NEXT:    vmov q0[3], q0[1], r3, r4
2843; CHECK-NEXT:    vpsel q0, q0, q4
2844; CHECK-NEXT:    vmov r2, r3, d0
2845; CHECK-NEXT:    vmov r5, r4, d1
2846; CHECK-NEXT:    adds.w r2, r2, r12
2847; CHECK-NEXT:    adc.w r3, r3, lr
2848; CHECK-NEXT:    adds r2, r2, r5
2849; CHECK-NEXT:    adcs r3, r4
2850; CHECK-NEXT:    adds r0, r0, r2
2851; CHECK-NEXT:    adcs r1, r3
2852; CHECK-NEXT:    add sp, #16
2853; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
2854; CHECK-NEXT:    pop {r4, r5, r7, pc}
2855entry:
  ; Lane predicate: true where the corresponding byte of %b is zero.
2856  %c = icmp eq <16 x i8> %b, zeroinitializer
  ; Widen both operands with sign extension before the multiply.
2857  %xx = sext <16 x i8> %x to <16 x i64>
2858  %yy = sext <16 x i8> %y to <16 x i64>
2859  %m = mul <16 x i64> %xx, %yy
  ; Inactive lanes are replaced by 0 so they do not affect the sum.
2860  %s = select <16 x i1> %c, <16 x i64> %m, <16 x i64> zeroinitializer
2861  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %s)
  ; Fold the reduction result into the incoming accumulator.
2862  %r = add i64 %z, %a
2863  ret i64 %r
2864}
2865
; Predicated multiply-accumulate reduction with an i64 accumulator:
; zero-extends the <2 x i8> operands %x and %y to <2 x i64>, multiplies them,
; keeps only the products in lanes where %b is zero (other lanes become 0),
; sums both lanes with llvm.vector.reduce.add.v2i64 and adds %a.
; The assertion lines inside the function pin the llc output for this pattern;
; per the NOTE at the top of the file they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them instead of hand-editing.
2866define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
2867; CHECK-LABEL: add_v2i8_v2i64_acc_zext:
2868; CHECK:       @ %bb.0: @ %entry
2869; CHECK-NEXT:    .save {r7, lr}
2870; CHECK-NEXT:    push {r7, lr}
2871; CHECK-NEXT:    vmov.i64 q3, #0xff
2872; CHECK-NEXT:    vand q1, q1, q3
2873; CHECK-NEXT:    vand q0, q0, q3
2874; CHECK-NEXT:    vmov r2, s6
2875; CHECK-NEXT:    vmov r3, s2
2876; CHECK-NEXT:    umull lr, r12, r3, r2
2877; CHECK-NEXT:    vmov r3, s4
2878; CHECK-NEXT:    vmov r2, s0
2879; CHECK-NEXT:    vand q1, q2, q3
2880; CHECK-NEXT:    umull r2, r3, r2, r3
2881; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
2882; CHECK-NEXT:    vmov r2, s4
2883; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
2884; CHECK-NEXT:    movs r3, #0
2885; CHECK-NEXT:    cmp r2, #0
2886; CHECK-NEXT:    csetm r2, eq
2887; CHECK-NEXT:    bfi r3, r2, #0, #8
2888; CHECK-NEXT:    vmov r2, s6
2889; CHECK-NEXT:    vmov.i32 q1, #0x0
2890; CHECK-NEXT:    cmp r2, #0
2891; CHECK-NEXT:    csetm r2, eq
2892; CHECK-NEXT:    bfi r3, r2, #8, #8
2893; CHECK-NEXT:    vmsr p0, r3
2894; CHECK-NEXT:    vpsel q0, q0, q1
2895; CHECK-NEXT:    vmov lr, r12, d1
2896; CHECK-NEXT:    vmov r3, r2, d0
2897; CHECK-NEXT:    adds.w r3, r3, lr
2898; CHECK-NEXT:    adc.w r2, r2, r12
2899; CHECK-NEXT:    adds r0, r0, r3
2900; CHECK-NEXT:    adcs r1, r2
2901; CHECK-NEXT:    pop {r7, pc}
2902entry:
  ; Lane predicate: true where the corresponding element of %b is zero.
2903  %c = icmp eq <2 x i8> %b, zeroinitializer
  ; Widen both operands with zero extension before the multiply.
2904  %xx = zext <2 x i8> %x to <2 x i64>
2905  %yy = zext <2 x i8> %y to <2 x i64>
2906  %m = mul <2 x i64> %xx, %yy
  ; Inactive lanes are replaced by 0 so they do not affect the sum.
2907  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2908  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ; Fold the reduction result into the incoming accumulator.
2909  %r = add i64 %z, %a
2910  ret i64 %r
2911}
2912
; Predicated multiply-accumulate reduction with an i64 accumulator:
; sign-extends the <2 x i8> operands %x and %y to <2 x i64>, multiplies them,
; keeps only the products in lanes where %b is zero (other lanes become 0),
; sums both lanes with llvm.vector.reduce.add.v2i64 and adds %a.
; The assertion lines inside the function pin the llc output for this pattern;
; per the NOTE at the top of the file they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them instead of hand-editing.
2913define arm_aapcs_vfpcc i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, <2 x i8> %y, <2 x i8> %b, i64 %a) {
2914; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
2915; CHECK:       @ %bb.0: @ %entry
2916; CHECK-NEXT:    .save {r7, lr}
2917; CHECK-NEXT:    push {r7, lr}
2918; CHECK-NEXT:    vmov.i32 q3, #0xff
2919; CHECK-NEXT:    movs r3, #0
2920; CHECK-NEXT:    vand q2, q2, q3
2921; CHECK-NEXT:    vmov r2, s8
2922; CHECK-NEXT:    cmp r2, #0
2923; CHECK-NEXT:    csetm r2, eq
2924; CHECK-NEXT:    bfi r3, r2, #0, #8
2925; CHECK-NEXT:    vmov r2, s10
2926; CHECK-NEXT:    cmp r2, #0
2927; CHECK-NEXT:    csetm r2, eq
2928; CHECK-NEXT:    bfi r3, r2, #8, #8
2929; CHECK-NEXT:    vmov r2, s6
2930; CHECK-NEXT:    vmsr p0, r3
2931; CHECK-NEXT:    vmov r3, s2
2932; CHECK-NEXT:    sxtb r2, r2
2933; CHECK-NEXT:    sxtb r3, r3
2934; CHECK-NEXT:    smull lr, r12, r3, r2
2935; CHECK-NEXT:    vmov r3, s4
2936; CHECK-NEXT:    vmov r2, s0
2937; CHECK-NEXT:    vmov.i32 q1, #0x0
2938; CHECK-NEXT:    sxtb r3, r3
2939; CHECK-NEXT:    sxtb r2, r2
2940; CHECK-NEXT:    smull r2, r3, r2, r3
2941; CHECK-NEXT:    vmov q0[2], q0[0], r2, lr
2942; CHECK-NEXT:    vmov q0[3], q0[1], r3, r12
2943; CHECK-NEXT:    vpsel q0, q0, q1
2944; CHECK-NEXT:    vmov lr, r12, d1
2945; CHECK-NEXT:    vmov r3, r2, d0
2946; CHECK-NEXT:    adds.w r3, r3, lr
2947; CHECK-NEXT:    adc.w r2, r2, r12
2948; CHECK-NEXT:    adds r0, r0, r3
2949; CHECK-NEXT:    adcs r1, r2
2950; CHECK-NEXT:    pop {r7, pc}
2951entry:
  ; Lane predicate: true where the corresponding element of %b is zero.
2952  %c = icmp eq <2 x i8> %b, zeroinitializer
  ; Widen both operands with sign extension before the multiply.
2953  %xx = sext <2 x i8> %x to <2 x i64>
2954  %yy = sext <2 x i8> %y to <2 x i64>
2955  %m = mul <2 x i64> %xx, %yy
  ; Inactive lanes are replaced by 0 so they do not affect the sum.
2956  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
2957  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ; Fold the reduction result into the incoming accumulator.
2958  %r = add i64 %z, %a
2959  ret i64 %r
2960}
2961
; Predicated multiply-accumulate reduction on native i64 lanes:
; multiplies the <2 x i64> operands %x and %y directly (no extension step),
; keeps only the products in lanes where %b is zero (other lanes become 0),
; sums both lanes with llvm.vector.reduce.add.v2i64 and adds the i64 %a.
; The assertion lines inside the function pin the llc output for this pattern;
; per the NOTE at the top of the file they are autogenerated by
; utils/update_llc_test_checks.py, so regenerate them instead of hand-editing.
2962define arm_aapcs_vfpcc i64 @add_v2i64_v2i64_acc(<2 x i64> %x, <2 x i64> %y, <2 x i64> %b, i64 %a) {
2963; CHECK-LABEL: add_v2i64_v2i64_acc:
2964; CHECK:       @ %bb.0: @ %entry
2965; CHECK-NEXT:    .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
2966; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
2967; CHECK-NEXT:    vmov r2, r12, d3
2968; CHECK-NEXT:    vmov r3, lr, d1
2969; CHECK-NEXT:    vmov r6, r9, d2
2970; CHECK-NEXT:    vmov.i32 q1, #0x0
2971; CHECK-NEXT:    vmov r5, r11, d0
2972; CHECK-NEXT:    umull r10, r8, r3, r2
2973; CHECK-NEXT:    umull r4, r7, r5, r6
2974; CHECK-NEXT:    mla r3, r3, r12, r8
2975; CHECK-NEXT:    vmov q0[2], q0[0], r4, r10
2976; CHECK-NEXT:    mla r2, lr, r2, r3
2977; CHECK-NEXT:    mla r3, r5, r9, r7
2978; CHECK-NEXT:    mla r3, r11, r6, r3
2979; CHECK-NEXT:    vmov q0[3], q0[1], r3, r2
2980; CHECK-NEXT:    vmov r2, r3, d4
2981; CHECK-NEXT:    orrs r2, r3
2982; CHECK-NEXT:    mov.w r3, #0
2983; CHECK-NEXT:    csetm r2, eq
2984; CHECK-NEXT:    bfi r3, r2, #0, #8
2985; CHECK-NEXT:    vmov r2, r7, d5
2986; CHECK-NEXT:    orrs r2, r7
2987; CHECK-NEXT:    csetm r2, eq
2988; CHECK-NEXT:    bfi r3, r2, #8, #8
2989; CHECK-NEXT:    vmsr p0, r3
2990; CHECK-NEXT:    vpsel q0, q0, q1
2991; CHECK-NEXT:    vmov r2, r3, d1
2992; CHECK-NEXT:    vmov r7, r6, d0
2993; CHECK-NEXT:    adds r2, r2, r7
2994; CHECK-NEXT:    adcs r3, r6
2995; CHECK-NEXT:    adds r0, r0, r2
2996; CHECK-NEXT:    adcs r1, r3
2997; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
2998entry:
  ; Lane predicate: true where the corresponding element of %b is zero.
2999  %c = icmp eq <2 x i64> %b, zeroinitializer
3000  %m = mul <2 x i64> %x, %y
  ; Inactive lanes are replaced by 0 so they do not affect the sum.
3001  %s = select <2 x i1> %c, <2 x i64> %m, <2 x i64> zeroinitializer
3002  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %s)
  ; Fold the reduction result into the incoming accumulator.
3003  %r = add i64 %z, %a
3004  ret i64 %r
3005}
3006
; Declarations of the llvm.vector.reduce.add intrinsic, one per vector type
; listed below. Each takes a single vector operand and returns a scalar of the
; vector's element type.
3007declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
3008declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
3009declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
3010declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
3011declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
3012declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
3013declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
3014declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
3015declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
3016declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
3017