xref: /llvm-project/llvm/test/CodeGen/AArch64/vecreduce-add.ll (revision 4a890c2c605640f48ecbaefebda8f3a42043ff3d)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64-none-linux-gnu %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-DOT
; RUN: llc -mtriple=aarch64-none-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-BASE
; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+dotprod -global-isel %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-DOT
6
; Plain add-reduction of a <2 x i32> vector to i32.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i32 @addv_v2i32(<2 x i32> %a) {
; CHECK-LABEL: addv_v2i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %arg1
}
17
; Plain add-reduction of a <4 x i16> vector to i16.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i16 @addv_v4i16(<4 x i16> %a) {
; CHECK-LABEL: addv_v4i16:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv h0, v0.4h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %arg1
}
28
; Plain add-reduction of a <4 x i32> vector to i32.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i32 @add_v4i32_v4i32(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  ret i32 %z
}
39
; Plain add-reduction of a <8 x i8> vector to i8.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i8 @addv_v8i8(<8 x i8> %a) {
; CHECK-LABEL: addv_v8i8:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv b0, v0.8b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %arg1 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %arg1
}
50
; Zero-extends <4 x i32> to <4 x i64>, then add-reduces to i64.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v4i32_v4i64_zext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddlv d0, v0.4s
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}
62
; Sign-extends <4 x i32> to <4 x i64>, then add-reduces to i64.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v4i32_v4i64_sext(<4 x i32> %x) {
; CHECK-LABEL: add_v4i32_v4i64_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddlv d0, v0.4s
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}
74
; Mixed-extension case: %xi is split with shufflevector into its low half
; (elements 0,1) and high half (elements 2,3); the low half is zero-extended,
; the high half sign-extended, the two <2 x i64> values are added and the sum
; is add-reduced to i64.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v4i32_v4i64_zsext(<4 x i32> %xi) {
; CHECK-LABEL: add_v4i32_v4i64_zsext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v1.2d, v0.2s, #0
; CHECK-NEXT:    saddw2 v0.2d, v1.2d, v0.4s
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %x = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 0, i32 1>
  %y = shufflevector <4 x i32> %xi, <4 x i32> %xi, <2 x i32> <i32 2, i32 3>
  %xx = zext <2 x i32> %x to <2 x i64>
  %yy = sext <2 x i32> %y to <2 x i64>
  %zz = add <2 x i64> %xx, %yy
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %zz)
  ret i64 %z
}
92
; Zero-extends <2 x i32> to <2 x i64>, then add-reduces to i64.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v2i32_v2i64_zext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}
105
; Sign-extends <2 x i32> to <2 x i64>, then add-reduces to i64.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v2i32_v2i64_sext(<2 x i32> %x) {
; CHECK-LABEL: add_v2i32_v2i64_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}
118
; Zero-extends <8 x i16> to <8 x i32>, then add-reduces to i32.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i32 @add_v8i16_v8i32_zext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddlv s0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}
130
; Sign-extends <8 x i16> to <8 x i32>, then add-reduces to i32.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i32 @add_v8i16_v8i32_sext(<8 x i16> %x) {
; CHECK-LABEL: add_v8i16_v8i32_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddlv s0, v0.8h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
entry:
  %xx = sext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}
142
; Zero-extends <4 x i16> to <4 x i32>, then add-reduces to i32.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i32 @add_v4i16_v4i32_zext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i32_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}
161
; Sign-extends <4 x i16> to <4 x i32>, then add-reduces to i32.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i32 @add_v4i16_v4i32_sext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i32_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i32_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w0, s0
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i16> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}
180
; Plain add-reduction of <8 x i16> to i16; the return value is marked zeroext.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define zeroext i16 @add_v8i16_v8i16(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i16:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i16_v8i16:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addv h0, v0.8h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    uxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
  ret i16 %z
}
198
; Zero-extends <8 x i16> to <8 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v8i16_v8i64_zext(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv s0, v0.8h
; CHECK-GI-NEXT:    mov w0, v0.s[0]
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}
221
; Sign-extends <8 x i16> to <8 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v8i16_v8i64_sext(<8 x i16> %x) {
; CHECK-SD-LABEL: add_v8i16_v8i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i16_v8i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv s0, v0.8h
; CHECK-GI-NEXT:    smov x0, v0.s[0]
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <8 x i16> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}
244
; Zero-extends <4 x i16> to <4 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v4i16_v4i64_zext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddlv d0, v0.4s
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    mov w0, v0.s[0]
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}
263
; Sign-extends <4 x i16> to <4 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v4i16_v4i64_sext(<4 x i16> %x) {
; CHECK-SD-LABEL: add_v4i16_v4i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddlv d0, v0.4s
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i16_v4i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    smov x0, v0.s[0]
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i16> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}
282
; Zero-extends <2 x i16> to <2 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v2i16_v2i64_zext(<2 x i16> %x) {
; CHECK-SD-LABEL: add_v2i16_v2i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v2i16_v2i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    addp d0, v0.2d
; CHECK-GI-NEXT:    fmov x0, d0
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}
306
; Sign-extends <2 x i16> to <2 x i64>, then add-reduces to i64.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v2i16_v2i64_sext(<2 x i16> %x) {
; CHECK-LABEL: add_v2i16_v2i64_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    shl v0.2d, v0.2d, #48
; CHECK-NEXT:    sshr v0.2d, v0.2d, #48
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = sext <2 x i16> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}
321
; Zero-extends <16 x i8> to <16 x i32>, then add-reduces to i32.
; Each of the four llc configurations (+dotprod on/off, -global-isel on/off)
; has its own expected sequence.
define i32 @add_v16i8_v16i32_zext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-BASE-NEXT:    fmov w8, s0
; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_zext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
; CHECK-GI-DOT-NEXT:    fmov w0, s0
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = zext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  ret i32 %z
}
363
; Sign-extends <16 x i8> to <16 x i32>, then add-reduces to i32.
; Each of the four llc configurations (+dotprod on/off, -global-isel on/off)
; has its own expected sequence.
define i32 @add_v16i8_v16i32_sext(<16 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
; CHECK-GI-BASE-NEXT:    fmov w8, s0
; CHECK-GI-BASE-NEXT:    sxth w0, w8
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_sext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
; CHECK-GI-DOT-NEXT:    fmov w0, s0
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = sext <16 x i8> %x to <16 x i32>
  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
  ret i32 %z
}
405
; Zero-extends <8 x i8> to <8 x i32>, then add-reduces to i32.
; Each of the four llc configurations (+dotprod on/off, -global-isel on/off)
; has its own expected sequence.
define i32 @add_v8i8_v8i32_zext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
; CHECK-SD-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-BASE-NEXT:    fmov w8, s0
; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_zext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT:    fmov w0, s0
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = zext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}
443
; Sign-extends <8 x i8> to <8 x i32>, then add-reduces to i32.
; Each of the four llc configurations (+dotprod on/off, -global-isel on/off)
; has its own expected sequence.
define i32 @add_v8i8_v8i32_sext(<8 x i8> %x) {
; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-BASE:       // %bb.0: // %entry
; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
; CHECK-SD-BASE-NEXT:    fmov w0, s0
; CHECK-SD-BASE-NEXT:    ret
;
; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-SD-DOT:       // %bb.0: // %entry
; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
; CHECK-SD-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b
; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
; CHECK-SD-DOT-NEXT:    fmov w0, s0
; CHECK-SD-DOT-NEXT:    ret
;
; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-BASE:       // %bb.0: // %entry
; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b
; CHECK-GI-BASE-NEXT:    fmov w8, s0
; CHECK-GI-BASE-NEXT:    sxth w0, w8
; CHECK-GI-BASE-NEXT:    ret
;
; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_sext:
; CHECK-GI-DOT:       // %bb.0: // %entry
; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
; CHECK-GI-DOT-NEXT:    sdot v2.2s, v0.8b, v1.8b
; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
; CHECK-GI-DOT-NEXT:    fmov w0, s0
; CHECK-GI-DOT-NEXT:    ret
entry:
  %xx = sext <8 x i8> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  ret i32 %z
}
481
; Zero-extends <4 x i8> to <4 x i32>, then add-reduces to i32.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i32 @add_v4i8_v4i32_zext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i32_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}
504
; Sign-extends <4 x i8> to <4 x i32>, then add-reduces to i32.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i32 @add_v4i8_v4i32_sext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i32_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
; CHECK-SD-NEXT:    addv s0, v0.4s
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i8_v4i32_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i8> %x to <4 x i32>
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
  ret i32 %z
}
528
; Zero-extends <16 x i8> to <16 x i16>, then add-reduces to i16; the return
; value is marked zeroext.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define zeroext i16 @add_v16i8_v16i16_zext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i16_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    uaddlp v0.8h, v0.16b
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i16_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  ret i16 %z
}
548
; Sign-extends <16 x i8> to <16 x i16>, then add-reduces to i16; the return
; value is marked signext.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define signext i16 @add_v16i8_v16i16_sext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i16_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    saddlp v0.8h, v0.16b
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    smov w0, v0.h[0]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i16_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <16 x i8> %x to <16 x i16>
  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
  ret i16 %z
}
568
; Zero-extends <8 x i8> to <8 x i16>, then add-reduces to i16; the return
; value is marked zeroext.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define zeroext i16 @add_v8i8_v8i16_zext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i16_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and w0, w8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  ret i16 %z
}
588
; Sign-extends <8 x i8> to <8 x i16>, then add-reduces to i16; the return
; value is marked signext.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define signext i16 @add_v8i8_v8i16_sext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i16_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    addv h0, v0.8h
; CHECK-SD-NEXT:    smov w0, v0.h[0]
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i8_v8i16_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <8 x i8> %x to <8 x i16>
  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
  ret i16 %z
}
608
; Plain add-reduction of <16 x i8> to i8; the return value is marked zeroext.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define zeroext i8 @add_v16i8_v16i8(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i8:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    addv b0, v0.16b
; CHECK-SD-NEXT:    fmov w0, s0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i8:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    addv b0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    uxtb w0, w8
; CHECK-GI-NEXT:    ret
entry:
  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
  ret i8 %z
}
626
; Zero-extends <16 x i8> to <16 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v16i8_v16i64_zext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    ushll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT:    ushll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT:    uaddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and x0, x8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  ret i64 %z
}
658
; Sign-extends <16 x i8> to <16 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v16i8_v16i64_sext(<16 x i8> %x) {
; CHECK-SD-LABEL: add_v16i8_v16i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll2 v1.8h, v0.16b, #0
; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    sshll2 v2.4s, v1.8h, #0
; CHECK-SD-NEXT:    sshll2 v3.4s, v0.8h, #0
; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
; CHECK-SD-NEXT:    saddl v2.2d, v3.2s, v2.2s
; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v16i8_v16i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.16b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth x0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <16 x i8> %x to <16 x i64>
  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  ret i64 %z
}
690
; Zero-extends <8 x i8> to <8 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v8i8_v8i64_zext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    uaddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and x0, x8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}
715
; Sign-extends <8 x i8> to <8 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v8i8_v8i64_sext(<8 x i8> %x) {
; CHECK-SD-LABEL: add_v8i8_v8i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v8i8_v8i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    saddlv h0, v0.8b
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth x0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <8 x i8> %x to <8 x i64>
  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
  ret i64 %z
}
740
; Zero-extends <4 x i8> to <4 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v4i8_v4i64_zext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    uaddlv d0, v0.4s
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-GI-NEXT:    uaddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    and x0, x8, #0xffff
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}
763
; Sign-extends <4 x i8> to <4 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v4i8_v4i64_sext(<4 x i8> %x) {
; CHECK-SD-LABEL: add_v4i8_v4i64_sext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0
; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #56
; CHECK-SD-NEXT:    ssra v1.2d, v0.2d, #56
; CHECK-SD-NEXT:    addp d0, v1.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v4i8_v4i64_sext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
; CHECK-GI-NEXT:    saddlv s0, v0.4h
; CHECK-GI-NEXT:    fmov w8, s0
; CHECK-GI-NEXT:    sxth x0, w8
; CHECK-GI-NEXT:    ret
entry:
  %xx = sext <4 x i8> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  ret i64 %z
}
791
; Zero-extends <2 x i8> to <2 x i64>, then add-reduces to i64.
; Expectations differ between the llc runs without -global-isel (CHECK-SD)
; and the runs with -global-isel (CHECK-GI).
define i64 @add_v2i8_v2i64_zext(<2 x i8> %x) {
; CHECK-SD-LABEL: add_v2i8_v2i64_zext:
; CHECK-SD:       // %bb.0: // %entry
; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-SD-NEXT:    addp d0, v0.2d
; CHECK-SD-NEXT:    fmov x0, d0
; CHECK-SD-NEXT:    ret
;
; CHECK-GI-LABEL: add_v2i8_v2i64_zext:
; CHECK-GI:       // %bb.0: // %entry
; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-GI-NEXT:    addp d0, v0.2d
; CHECK-GI-NEXT:    fmov x0, d0
; CHECK-GI-NEXT:    ret
entry:
  %xx = zext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}
815
; Sign-extends <2 x i8> to <2 x i64>, then add-reduces to i64.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v2i8_v2i64_sext(<2 x i8> %x) {
; CHECK-LABEL: add_v2i8_v2i64_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    shl v0.2d, v0.2d, #56
; CHECK-NEXT:    sshr v0.2d, v0.2d, #56
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %xx = sext <2 x i8> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  ret i64 %z
}
830
; Plain add-reduction of a <2 x i64> vector to i64.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v2i64_v2i64(<2 x i64> %x) {
; CHECK-LABEL: add_v2i64_v2i64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
entry:
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
  ret i64 %z
}
841
; Add-reduces <4 x i32> to i32 and then adds the scalar accumulator %a.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i32 @add_v4i32_v4i32_acc(<4 x i32> %x, i32 %a) {
; CHECK-LABEL: add_v4i32_v4i32_acc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    addv s0, v0.4s
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    add w0, w8, w0
; CHECK-NEXT:    ret
entry:
  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
  %r = add i32 %z, %a
  ret i32 %r
}
854
; Zero-extends <4 x i32> to <4 x i64>, add-reduces to i64, then adds the
; scalar accumulator %a.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v4i32_v4i64_acc_zext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddlv d0, v0.4s
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    add x0, x8, x0
; CHECK-NEXT:    ret
entry:
  %xx = zext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}
868
; Sign-extends <4 x i32> to <4 x i64>, add-reduces to i64, then adds the
; scalar accumulator %a.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v4i32_v4i64_acc_sext(<4 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v4i32_v4i64_acc_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    saddlv d0, v0.4s
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    add x0, x8, x0
; CHECK-NEXT:    ret
entry:
  %xx = sext <4 x i32> %x to <4 x i64>
  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}
882
; Zero-extends <2 x i32> to <2 x i64>, add-reduces to i64, then adds the
; scalar accumulator %a.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v2i32_v2i64_acc_zext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    add x0, x8, x0
; CHECK-NEXT:    ret
entry:
  %xx = zext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}
897
; Sign-extends <2 x i32> to <2 x i64>, add-reduces to i64, then adds the
; scalar accumulator %a.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i64 @add_v2i32_v2i64_acc_sext(<2 x i32> %x, i64 %a) {
; CHECK-LABEL: add_v2i32_v2i64_acc_sext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sshll v0.2d, v0.2s, #0
; CHECK-NEXT:    addp d0, v0.2d
; CHECK-NEXT:    fmov x8, d0
; CHECK-NEXT:    add x0, x8, x0
; CHECK-NEXT:    ret
entry:
  %xx = sext <2 x i32> %x to <2 x i64>
  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
  %r = add i64 %z, %a
  ret i64 %r
}
912
; Zero-extends <8 x i16> to <8 x i32>, add-reduces to i32, then adds the
; scalar accumulator %a.
; A single expected sequence (prefix CHECK) is shared by all four llc runs.
define i32 @add_v8i16_v8i32_acc_zext(<8 x i16> %x, i32 %a) {
; CHECK-LABEL: add_v8i16_v8i32_acc_zext:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uaddlv s0, v0.8h
; CHECK-NEXT:    fmov w8, s0
; CHECK-NEXT:    add w0, w8, w0
; CHECK-NEXT:    ret
entry:
  %xx = zext <8 x i16> %x to <8 x i32>
  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
  %r = add i32 %z, %a
  ret i32 %r
}
926
; Sign-extend <8 x i16> %x to <8 x i32>, add-reduce the vector, then add the
; scalar i32 accumulator %a.
927define i32 @add_v8i16_v8i32_acc_sext(<8 x i16> %x, i32 %a) {
928; CHECK-LABEL: add_v8i16_v8i32_acc_sext:
929; CHECK:       // %bb.0: // %entry
930; CHECK-NEXT:    saddlv s0, v0.8h
931; CHECK-NEXT:    fmov w8, s0
932; CHECK-NEXT:    add w0, w8, w0
933; CHECK-NEXT:    ret
934entry:
935  %xx = sext <8 x i16> %x to <8 x i32>
936  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
937  %r = add i32 %z, %a
938  ret i32 %r
939}
940
; Zero-extend <4 x i16> %x to <4 x i32>, add-reduce the vector, then add the
; scalar i32 accumulator %a. The SelectionDAG and GlobalISel runs have
; separate expectations here.
941define i32 @add_v4i16_v4i32_acc_zext(<4 x i16> %x, i32 %a) {
942; CHECK-SD-LABEL: add_v4i16_v4i32_acc_zext:
943; CHECK-SD:       // %bb.0: // %entry
944; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
945; CHECK-SD-NEXT:    addv s0, v0.4s
946; CHECK-SD-NEXT:    fmov w8, s0
947; CHECK-SD-NEXT:    add w0, w8, w0
948; CHECK-SD-NEXT:    ret
949;
950; CHECK-GI-LABEL: add_v4i16_v4i32_acc_zext:
951; CHECK-GI:       // %bb.0: // %entry
952; CHECK-GI-NEXT:    uaddlv s0, v0.4h
953; CHECK-GI-NEXT:    fmov w8, s0
954; CHECK-GI-NEXT:    add w0, w8, w0
955; CHECK-GI-NEXT:    ret
956entry:
957  %xx = zext <4 x i16> %x to <4 x i32>
958  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
959  %r = add i32 %z, %a
960  ret i32 %r
961}
962
; Sign-extend <4 x i16> %x to <4 x i32>, add-reduce the vector, then add the
; scalar i32 accumulator %a. The SelectionDAG and GlobalISel runs have
; separate expectations here.
963define i32 @add_v4i16_v4i32_acc_sext(<4 x i16> %x, i32 %a) {
964; CHECK-SD-LABEL: add_v4i16_v4i32_acc_sext:
965; CHECK-SD:       // %bb.0: // %entry
966; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
967; CHECK-SD-NEXT:    addv s0, v0.4s
968; CHECK-SD-NEXT:    fmov w8, s0
969; CHECK-SD-NEXT:    add w0, w8, w0
970; CHECK-SD-NEXT:    ret
971;
972; CHECK-GI-LABEL: add_v4i16_v4i32_acc_sext:
973; CHECK-GI:       // %bb.0: // %entry
974; CHECK-GI-NEXT:    saddlv s0, v0.4h
975; CHECK-GI-NEXT:    fmov w8, s0
976; CHECK-GI-NEXT:    add w0, w8, w0
977; CHECK-GI-NEXT:    ret
978entry:
979  %xx = sext <4 x i16> %x to <4 x i32>
980  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
981  %r = add i32 %z, %a
982  ret i32 %r
983}
984
; Add-reduce <8 x i16> %x without any widening, then add the i16 accumulator
; %a. The i16 result is returned with the zeroext attribute.
985define zeroext i16 @add_v8i16_v8i16_acc(<8 x i16> %x, i16 %a) {
986; CHECK-SD-LABEL: add_v8i16_v8i16_acc:
987; CHECK-SD:       // %bb.0: // %entry
988; CHECK-SD-NEXT:    addv h0, v0.8h
989; CHECK-SD-NEXT:    fmov w8, s0
990; CHECK-SD-NEXT:    add w8, w8, w0
991; CHECK-SD-NEXT:    and w0, w8, #0xffff
992; CHECK-SD-NEXT:    ret
993;
994; CHECK-GI-LABEL: add_v8i16_v8i16_acc:
995; CHECK-GI:       // %bb.0: // %entry
996; CHECK-GI-NEXT:    addv h0, v0.8h
997; CHECK-GI-NEXT:    fmov w8, s0
998; CHECK-GI-NEXT:    add w8, w0, w8, uxth
999; CHECK-GI-NEXT:    and w0, w8, #0xffff
1000; CHECK-GI-NEXT:    ret
1001entry:
1002  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
1003  %r = add i16 %z, %a
1004  ret i16 %r
1005}
1006
; Zero-extend <8 x i16> %x to <8 x i64> (a 4x widening), add-reduce the
; vector, then add the scalar i64 accumulator %a.
1007define i64 @add_v8i16_v8i64_acc_zext(<8 x i16> %x, i64 %a) {
1008; CHECK-SD-LABEL: add_v8i16_v8i64_acc_zext:
1009; CHECK-SD:       // %bb.0: // %entry
1010; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
1011; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
1012; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
1013; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
1014; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
1015; CHECK-SD-NEXT:    addp d0, v0.2d
1016; CHECK-SD-NEXT:    fmov x8, d0
1017; CHECK-SD-NEXT:    add x0, x8, x0
1018; CHECK-SD-NEXT:    ret
1019;
1020; CHECK-GI-LABEL: add_v8i16_v8i64_acc_zext:
1021; CHECK-GI:       // %bb.0: // %entry
1022; CHECK-GI-NEXT:    uaddlv s0, v0.8h
1023; CHECK-GI-NEXT:    fmov w8, s0
1024; CHECK-GI-NEXT:    add x0, x0, w8, uxtw
1025; CHECK-GI-NEXT:    ret
1026entry:
1027  %xx = zext <8 x i16> %x to <8 x i64>
1028  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
1029  %r = add i64 %z, %a
1030  ret i64 %r
1031}
1032
; Sign-extend <8 x i16> %x to <8 x i64> (a 4x widening), add-reduce the
; vector, then add the scalar i64 accumulator %a.
1033define i64 @add_v8i16_v8i64_acc_sext(<8 x i16> %x, i64 %a) {
1034; CHECK-SD-LABEL: add_v8i16_v8i64_acc_sext:
1035; CHECK-SD:       // %bb.0: // %entry
1036; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
1037; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
1038; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
1039; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
1040; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
1041; CHECK-SD-NEXT:    addp d0, v0.2d
1042; CHECK-SD-NEXT:    fmov x8, d0
1043; CHECK-SD-NEXT:    add x0, x8, x0
1044; CHECK-SD-NEXT:    ret
1045;
1046; CHECK-GI-LABEL: add_v8i16_v8i64_acc_sext:
1047; CHECK-GI:       // %bb.0: // %entry
1048; CHECK-GI-NEXT:    saddlv s0, v0.8h
1049; CHECK-GI-NEXT:    fmov w8, s0
1050; CHECK-GI-NEXT:    add x0, x0, w8, sxtw
1051; CHECK-GI-NEXT:    ret
1052entry:
1053  %xx = sext <8 x i16> %x to <8 x i64>
1054  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
1055  %r = add i64 %z, %a
1056  ret i64 %r
1057}
1058
; Zero-extend <4 x i16> %x to <4 x i64> (a 4x widening), add-reduce the
; vector, then add the scalar i64 accumulator %a.
1059define i64 @add_v4i16_v4i64_acc_zext(<4 x i16> %x, i64 %a) {
1060; CHECK-SD-LABEL: add_v4i16_v4i64_acc_zext:
1061; CHECK-SD:       // %bb.0: // %entry
1062; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
1063; CHECK-SD-NEXT:    uaddlv d0, v0.4s
1064; CHECK-SD-NEXT:    fmov x8, d0
1065; CHECK-SD-NEXT:    add x0, x8, x0
1066; CHECK-SD-NEXT:    ret
1067;
1068; CHECK-GI-LABEL: add_v4i16_v4i64_acc_zext:
1069; CHECK-GI:       // %bb.0: // %entry
1070; CHECK-GI-NEXT:    uaddlv s0, v0.4h
1071; CHECK-GI-NEXT:    fmov w8, s0
1072; CHECK-GI-NEXT:    add x0, x0, w8, uxtw
1073; CHECK-GI-NEXT:    ret
1074entry:
1075  %xx = zext <4 x i16> %x to <4 x i64>
1076  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1077  %r = add i64 %z, %a
1078  ret i64 %r
1079}
1080
; Sign-extend <4 x i16> %x to <4 x i64> (a 4x widening), add-reduce the
; vector, then add the scalar i64 accumulator %a.
1081define i64 @add_v4i16_v4i64_acc_sext(<4 x i16> %x, i64 %a) {
1082; CHECK-SD-LABEL: add_v4i16_v4i64_acc_sext:
1083; CHECK-SD:       // %bb.0: // %entry
1084; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
1085; CHECK-SD-NEXT:    saddlv d0, v0.4s
1086; CHECK-SD-NEXT:    fmov x8, d0
1087; CHECK-SD-NEXT:    add x0, x8, x0
1088; CHECK-SD-NEXT:    ret
1089;
1090; CHECK-GI-LABEL: add_v4i16_v4i64_acc_sext:
1091; CHECK-GI:       // %bb.0: // %entry
1092; CHECK-GI-NEXT:    saddlv s0, v0.4h
1093; CHECK-GI-NEXT:    fmov w8, s0
1094; CHECK-GI-NEXT:    add x0, x0, w8, sxtw
1095; CHECK-GI-NEXT:    ret
1096entry:
1097  %xx = sext <4 x i16> %x to <4 x i64>
1098  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1099  %r = add i64 %z, %a
1100  ret i64 %r
1101}
1102
; Zero-extend <2 x i16> %x to <2 x i64>, add-reduce the vector, then add the
; scalar i64 accumulator %a. Both expected sequences mask the lanes with a
; 0xffff pattern, before the widening for SelectionDAG and after it for
; GlobalISel.
1103define i64 @add_v2i16_v2i64_acc_zext(<2 x i16> %x, i64 %a) {
1104; CHECK-SD-LABEL: add_v2i16_v2i64_acc_zext:
1105; CHECK-SD:       // %bb.0: // %entry
1106; CHECK-SD-NEXT:    movi d1, #0x00ffff0000ffff
1107; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
1108; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
1109; CHECK-SD-NEXT:    addp d0, v0.2d
1110; CHECK-SD-NEXT:    fmov x8, d0
1111; CHECK-SD-NEXT:    add x0, x8, x0
1112; CHECK-SD-NEXT:    ret
1113;
1114; CHECK-GI-LABEL: add_v2i16_v2i64_acc_zext:
1115; CHECK-GI:       // %bb.0: // %entry
1116; CHECK-GI-NEXT:    movi v1.2d, #0x0000000000ffff
1117; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
1118; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
1119; CHECK-GI-NEXT:    addp d0, v0.2d
1120; CHECK-GI-NEXT:    fmov x8, d0
1121; CHECK-GI-NEXT:    add x0, x8, x0
1122; CHECK-GI-NEXT:    ret
1123entry:
1124  %xx = zext <2 x i16> %x to <2 x i64>
1125  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1126  %r = add i64 %z, %a
1127  ret i64 %r
1128}
1129
; Sign-extend <2 x i16> %x to <2 x i64>, add-reduce the vector, then add the
; scalar i64 accumulator %a. A single expectation block is shared by all four
; RUN configurations.
1130define i64 @add_v2i16_v2i64_acc_sext(<2 x i16> %x, i64 %a) {
1131; CHECK-LABEL: add_v2i16_v2i64_acc_sext:
1132; CHECK:       // %bb.0: // %entry
1133; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
1134; CHECK-NEXT:    shl v0.2d, v0.2d, #48
1135; CHECK-NEXT:    sshr v0.2d, v0.2d, #48
1136; CHECK-NEXT:    addp d0, v0.2d
1137; CHECK-NEXT:    fmov x8, d0
1138; CHECK-NEXT:    add x0, x8, x0
1139; CHECK-NEXT:    ret
1140entry:
1141  %xx = sext <2 x i16> %x to <2 x i64>
1142  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1143  %r = add i64 %z, %a
1144  ret i64 %r
1145}
1146
; Zero-extend <16 x i8> %x to <16 x i32>, add-reduce the vector, then add the
; scalar i32 accumulator %a. Expectations are split four ways: SelectionDAG
; and GlobalISel, each with and without +dotprod. Both +dotprod variants
; expect a udot against a vector of ones.
1147define i32 @add_v16i8_v16i32_acc_zext(<16 x i8> %x, i32 %a) {
1148; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_zext:
1149; CHECK-SD-BASE:       // %bb.0: // %entry
1150; CHECK-SD-BASE-NEXT:    ushll2 v1.8h, v0.16b, #0
1151; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
1152; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v0.8h, v1.8h
1153; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
1154; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
1155; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
1156; CHECK-SD-BASE-NEXT:    fmov w8, s0
1157; CHECK-SD-BASE-NEXT:    add w0, w8, w0
1158; CHECK-SD-BASE-NEXT:    ret
1159;
1160; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_zext:
1161; CHECK-SD-DOT:       // %bb.0: // %entry
1162; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
1163; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
1164; CHECK-SD-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
1165; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
1166; CHECK-SD-DOT-NEXT:    fmov w8, s0
1167; CHECK-SD-DOT-NEXT:    add w0, w8, w0
1168; CHECK-SD-DOT-NEXT:    ret
1169;
1170; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_zext:
1171; CHECK-GI-BASE:       // %bb.0: // %entry
1172; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
1173; CHECK-GI-BASE-NEXT:    fmov w8, s0
1174; CHECK-GI-BASE-NEXT:    add w0, w0, w8, uxth
1175; CHECK-GI-BASE-NEXT:    ret
1176;
1177; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_zext:
1178; CHECK-GI-DOT:       // %bb.0: // %entry
1179; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
1180; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
1181; CHECK-GI-DOT-NEXT:    udot v2.4s, v0.16b, v1.16b
1182; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
1183; CHECK-GI-DOT-NEXT:    fmov w8, s0
1184; CHECK-GI-DOT-NEXT:    add w0, w8, w0
1185; CHECK-GI-DOT-NEXT:    ret
1186entry:
1187  %xx = zext <16 x i8> %x to <16 x i32>
1188  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
1189  %r = add i32 %z, %a
1190  ret i32 %r
1191}
1192
; Sign-extend <16 x i8> %x to <16 x i32>, add-reduce the vector, then add the
; scalar i32 accumulator %a. Expectations are split four ways: SelectionDAG
; and GlobalISel, each with and without +dotprod. Both +dotprod variants
; expect an sdot against a vector of ones.
1193define i32 @add_v16i8_v16i32_acc_sext(<16 x i8> %x, i32 %a) {
1194; CHECK-SD-BASE-LABEL: add_v16i8_v16i32_acc_sext:
1195; CHECK-SD-BASE:       // %bb.0: // %entry
1196; CHECK-SD-BASE-NEXT:    sshll2 v1.8h, v0.16b, #0
1197; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
1198; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v0.8h, v1.8h
1199; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
1200; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
1201; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
1202; CHECK-SD-BASE-NEXT:    fmov w8, s0
1203; CHECK-SD-BASE-NEXT:    add w0, w8, w0
1204; CHECK-SD-BASE-NEXT:    ret
1205;
1206; CHECK-SD-DOT-LABEL: add_v16i8_v16i32_acc_sext:
1207; CHECK-SD-DOT:       // %bb.0: // %entry
1208; CHECK-SD-DOT-NEXT:    movi v1.16b, #1
1209; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
1210; CHECK-SD-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
1211; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
1212; CHECK-SD-DOT-NEXT:    fmov w8, s0
1213; CHECK-SD-DOT-NEXT:    add w0, w8, w0
1214; CHECK-SD-DOT-NEXT:    ret
1215;
1216; CHECK-GI-BASE-LABEL: add_v16i8_v16i32_acc_sext:
1217; CHECK-GI-BASE:       // %bb.0: // %entry
1218; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
1219; CHECK-GI-BASE-NEXT:    fmov w8, s0
1220; CHECK-GI-BASE-NEXT:    add w0, w0, w8, sxth
1221; CHECK-GI-BASE-NEXT:    ret
1222;
1223; CHECK-GI-DOT-LABEL: add_v16i8_v16i32_acc_sext:
1224; CHECK-GI-DOT:       // %bb.0: // %entry
1225; CHECK-GI-DOT-NEXT:    movi v1.16b, #1
1226; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
1227; CHECK-GI-DOT-NEXT:    sdot v2.4s, v0.16b, v1.16b
1228; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
1229; CHECK-GI-DOT-NEXT:    fmov w8, s0
1230; CHECK-GI-DOT-NEXT:    add w0, w8, w0
1231; CHECK-GI-DOT-NEXT:    ret
1232entry:
1233  %xx = sext <16 x i8> %x to <16 x i32>
1234  %z = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
1235  %r = add i32 %z, %a
1236  ret i32 %r
1237}
1238
; Zero-extend <8 x i8> %x to <8 x i32>, add-reduce the vector, then add the
; scalar i32 accumulator %a. Expectations are split four ways: SelectionDAG
; and GlobalISel, each with and without +dotprod. Both +dotprod variants
; expect the 64-bit (.2s/.8b) form of udot.
1239define i32 @add_v8i8_v8i32_acc_zext(<8 x i8> %x, i32 %a) {
1240; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_zext:
1241; CHECK-SD-BASE:       // %bb.0: // %entry
1242; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
1243; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
1244; CHECK-SD-BASE-NEXT:    fmov w8, s0
1245; CHECK-SD-BASE-NEXT:    add w0, w8, w0
1246; CHECK-SD-BASE-NEXT:    ret
1247;
1248; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_zext:
1249; CHECK-SD-DOT:       // %bb.0: // %entry
1250; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
1251; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
1252; CHECK-SD-DOT-NEXT:    udot v1.2s, v0.8b, v2.8b
1253; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
1254; CHECK-SD-DOT-NEXT:    fmov w8, s0
1255; CHECK-SD-DOT-NEXT:    add w0, w8, w0
1256; CHECK-SD-DOT-NEXT:    ret
1257;
1258; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_zext:
1259; CHECK-GI-BASE:       // %bb.0: // %entry
1260; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
1261; CHECK-GI-BASE-NEXT:    fmov w8, s0
1262; CHECK-GI-BASE-NEXT:    add w0, w0, w8, uxth
1263; CHECK-GI-BASE-NEXT:    ret
1264;
1265; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_zext:
1266; CHECK-GI-DOT:       // %bb.0: // %entry
1267; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
1268; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
1269; CHECK-GI-DOT-NEXT:    udot v2.2s, v0.8b, v1.8b
1270; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
1271; CHECK-GI-DOT-NEXT:    fmov w8, s0
1272; CHECK-GI-DOT-NEXT:    add w0, w8, w0
1273; CHECK-GI-DOT-NEXT:    ret
1274entry:
1275  %xx = zext <8 x i8> %x to <8 x i32>
1276  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
1277  %r = add i32 %z, %a
1278  ret i32 %r
1279}
1280
; Sign-extend <8 x i8> %x to <8 x i32>, add-reduce the vector, then add the
; scalar i32 accumulator %a. Expectations are split four ways: SelectionDAG
; and GlobalISel, each with and without +dotprod. Both +dotprod variants
; expect the 64-bit (.2s/.8b) form of sdot.
1281define i32 @add_v8i8_v8i32_acc_sext(<8 x i8> %x, i32 %a) {
1282; CHECK-SD-BASE-LABEL: add_v8i8_v8i32_acc_sext:
1283; CHECK-SD-BASE:       // %bb.0: // %entry
1284; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
1285; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
1286; CHECK-SD-BASE-NEXT:    fmov w8, s0
1287; CHECK-SD-BASE-NEXT:    add w0, w8, w0
1288; CHECK-SD-BASE-NEXT:    ret
1289;
1290; CHECK-SD-DOT-LABEL: add_v8i8_v8i32_acc_sext:
1291; CHECK-SD-DOT:       // %bb.0: // %entry
1292; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
1293; CHECK-SD-DOT-NEXT:    movi v2.8b, #1
1294; CHECK-SD-DOT-NEXT:    sdot v1.2s, v0.8b, v2.8b
1295; CHECK-SD-DOT-NEXT:    addp v0.2s, v1.2s, v1.2s
1296; CHECK-SD-DOT-NEXT:    fmov w8, s0
1297; CHECK-SD-DOT-NEXT:    add w0, w8, w0
1298; CHECK-SD-DOT-NEXT:    ret
1299;
1300; CHECK-GI-BASE-LABEL: add_v8i8_v8i32_acc_sext:
1301; CHECK-GI-BASE:       // %bb.0: // %entry
1302; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b
1303; CHECK-GI-BASE-NEXT:    fmov w8, s0
1304; CHECK-GI-BASE-NEXT:    add w0, w0, w8, sxth
1305; CHECK-GI-BASE-NEXT:    ret
1306;
1307; CHECK-GI-DOT-LABEL: add_v8i8_v8i32_acc_sext:
1308; CHECK-GI-DOT:       // %bb.0: // %entry
1309; CHECK-GI-DOT-NEXT:    movi v1.8b, #1
1310; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
1311; CHECK-GI-DOT-NEXT:    sdot v2.2s, v0.8b, v1.8b
1312; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
1313; CHECK-GI-DOT-NEXT:    fmov w8, s0
1314; CHECK-GI-DOT-NEXT:    add w0, w8, w0
1315; CHECK-GI-DOT-NEXT:    ret
1316entry:
1317  %xx = sext <8 x i8> %x to <8 x i32>
1318  %z = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
1319  %r = add i32 %z, %a
1320  ret i32 %r
1321}
1322
; Zero-extend <4 x i8> %x to <4 x i32>, add-reduce the vector, then add the
; scalar i32 accumulator %a. Both expected sequences first clear the high
; byte of each 16-bit lane of the incoming register.
1323define i32 @add_v4i8_v4i32_acc_zext(<4 x i8> %x, i32 %a) {
1324; CHECK-SD-LABEL: add_v4i8_v4i32_acc_zext:
1325; CHECK-SD:       // %bb.0: // %entry
1326; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
1327; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
1328; CHECK-SD-NEXT:    addv s0, v0.4s
1329; CHECK-SD-NEXT:    fmov w8, s0
1330; CHECK-SD-NEXT:    add w0, w8, w0
1331; CHECK-SD-NEXT:    ret
1332;
1333; CHECK-GI-LABEL: add_v4i8_v4i32_acc_zext:
1334; CHECK-GI:       // %bb.0: // %entry
1335; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
1336; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
1337; CHECK-GI-NEXT:    uaddlv s0, v0.4h
1338; CHECK-GI-NEXT:    fmov w8, s0
1339; CHECK-GI-NEXT:    add w0, w0, w8, uxth
1340; CHECK-GI-NEXT:    ret
1341entry:
1342  %xx = zext <4 x i8> %x to <4 x i32>
1343  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
1344  %r = add i32 %z, %a
1345  ret i32 %r
1346}
1347
; Sign-extend <4 x i8> %x to <4 x i32>, add-reduce the vector, then add the
; scalar i32 accumulator %a. Both expected sequences sign-extend in-register
; with a shl/sshr pair.
1348define i32 @add_v4i8_v4i32_acc_sext(<4 x i8> %x, i32 %a) {
1349; CHECK-SD-LABEL: add_v4i8_v4i32_acc_sext:
1350; CHECK-SD:       // %bb.0: // %entry
1351; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
1352; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
1353; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
1354; CHECK-SD-NEXT:    addv s0, v0.4s
1355; CHECK-SD-NEXT:    fmov w8, s0
1356; CHECK-SD-NEXT:    add w0, w8, w0
1357; CHECK-SD-NEXT:    ret
1358;
1359; CHECK-GI-LABEL: add_v4i8_v4i32_acc_sext:
1360; CHECK-GI:       // %bb.0: // %entry
1361; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
1362; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
1363; CHECK-GI-NEXT:    saddlv s0, v0.4h
1364; CHECK-GI-NEXT:    fmov w8, s0
1365; CHECK-GI-NEXT:    add w0, w0, w8, sxth
1366; CHECK-GI-NEXT:    ret
1367entry:
1368  %xx = sext <4 x i8> %x to <4 x i32>
1369  %z = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
1370  %r = add i32 %z, %a
1371  ret i32 %r
1372}
1373
; Zero-extend <16 x i8> %x to <16 x i16>, add-reduce the vector, then add the
; i16 accumulator %a. The i16 result is returned with the zeroext attribute.
1374define zeroext i16 @add_v16i8_v16i16_acc_zext(<16 x i8> %x, i16 %a) {
1375; CHECK-LABEL: add_v16i8_v16i16_acc_zext:
1376; CHECK:       // %bb.0: // %entry
1377; CHECK-NEXT:    uaddlv h0, v0.16b
1378; CHECK-NEXT:    fmov w8, s0
1379; CHECK-NEXT:    add w8, w8, w0
1380; CHECK-NEXT:    and w0, w8, #0xffff
1381; CHECK-NEXT:    ret
1382entry:
1383  %xx = zext <16 x i8> %x to <16 x i16>
1384  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
1385  %r = add i16 %z, %a
1386  ret i16 %r
1387}
1388
; Sign-extend <16 x i8> %x to <16 x i16>, add-reduce the vector, then add the
; i16 accumulator %a. The i16 result is returned with the signext attribute.
1389define signext i16 @add_v16i8_v16i16_acc_sext(<16 x i8> %x, i16 %a) {
1390; CHECK-LABEL: add_v16i8_v16i16_acc_sext:
1391; CHECK:       // %bb.0: // %entry
1392; CHECK-NEXT:    saddlv h0, v0.16b
1393; CHECK-NEXT:    fmov w8, s0
1394; CHECK-NEXT:    add w8, w8, w0
1395; CHECK-NEXT:    sxth w0, w8
1396; CHECK-NEXT:    ret
1397entry:
1398  %xx = sext <16 x i8> %x to <16 x i16>
1399  %z = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
1400  %r = add i16 %z, %a
1401  ret i16 %r
1402}
1403
; Zero-extend <8 x i8> %x to <8 x i16>, add-reduce the vector, then add the
; i16 accumulator %a. The i16 result is returned with the zeroext attribute.
1404define zeroext i16 @add_v8i8_v8i16_acc_zext(<8 x i8> %x, i16 %a) {
1405; CHECK-SD-LABEL: add_v8i8_v8i16_acc_zext:
1406; CHECK-SD:       // %bb.0: // %entry
1407; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
1408; CHECK-SD-NEXT:    addv h0, v0.8h
1409; CHECK-SD-NEXT:    fmov w8, s0
1410; CHECK-SD-NEXT:    add w8, w8, w0
1411; CHECK-SD-NEXT:    and w0, w8, #0xffff
1412; CHECK-SD-NEXT:    ret
1413;
1414; CHECK-GI-LABEL: add_v8i8_v8i16_acc_zext:
1415; CHECK-GI:       // %bb.0: // %entry
1416; CHECK-GI-NEXT:    uaddlv h0, v0.8b
1417; CHECK-GI-NEXT:    fmov w8, s0
1418; CHECK-GI-NEXT:    add w8, w8, w0
1419; CHECK-GI-NEXT:    and w0, w8, #0xffff
1420; CHECK-GI-NEXT:    ret
1421entry:
1422  %xx = zext <8 x i8> %x to <8 x i16>
1423  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
1424  %r = add i16 %z, %a
1425  ret i16 %r
1426}
1427
; Sign-extend <8 x i8> %x to <8 x i16>, add-reduce the vector, then add the
; i16 accumulator %a. The i16 result is returned with the signext attribute.
1428define signext i16 @add_v8i8_v8i16_acc_sext(<8 x i8> %x, i16 %a) {
1429; CHECK-SD-LABEL: add_v8i8_v8i16_acc_sext:
1430; CHECK-SD:       // %bb.0: // %entry
1431; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
1432; CHECK-SD-NEXT:    addv h0, v0.8h
1433; CHECK-SD-NEXT:    fmov w8, s0
1434; CHECK-SD-NEXT:    add w8, w8, w0
1435; CHECK-SD-NEXT:    sxth w0, w8
1436; CHECK-SD-NEXT:    ret
1437;
1438; CHECK-GI-LABEL: add_v8i8_v8i16_acc_sext:
1439; CHECK-GI:       // %bb.0: // %entry
1440; CHECK-GI-NEXT:    saddlv h0, v0.8b
1441; CHECK-GI-NEXT:    fmov w8, s0
1442; CHECK-GI-NEXT:    add w8, w8, w0
1443; CHECK-GI-NEXT:    sxth w0, w8
1444; CHECK-GI-NEXT:    ret
1445entry:
1446  %xx = sext <8 x i8> %x to <8 x i16>
1447  %z = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
1448  %r = add i16 %z, %a
1449  ret i16 %r
1450}
1451
; Add-reduce <16 x i8> %x without any widening, then add the i8 accumulator
; %a. The i8 result is returned with the zeroext attribute.
1452define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) {
1453; CHECK-SD-LABEL: add_v16i8_v16i8_acc:
1454; CHECK-SD:       // %bb.0: // %entry
1455; CHECK-SD-NEXT:    addv b0, v0.16b
1456; CHECK-SD-NEXT:    fmov w8, s0
1457; CHECK-SD-NEXT:    add w8, w8, w0
1458; CHECK-SD-NEXT:    and w0, w8, #0xff
1459; CHECK-SD-NEXT:    ret
1460;
1461; CHECK-GI-LABEL: add_v16i8_v16i8_acc:
1462; CHECK-GI:       // %bb.0: // %entry
1463; CHECK-GI-NEXT:    addv b0, v0.16b
1464; CHECK-GI-NEXT:    fmov w8, s0
1465; CHECK-GI-NEXT:    add w8, w0, w8, uxtb
1466; CHECK-GI-NEXT:    and w0, w8, #0xff
1467; CHECK-GI-NEXT:    ret
1468entry:
1469  %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
1470  %r = add i8 %z, %a
1471  ret i8 %r
1472}
1473
; Zero-extend <16 x i8> %x to <16 x i64> (an 8x widening), add-reduce the
; vector, then add the scalar i64 accumulator %a. The SelectionDAG
; expectation is a long widen-and-add tree; the GlobalISel one is a single
; uaddlv.
1474define i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, i64 %a) {
1475; CHECK-SD-LABEL: add_v16i8_v16i64_acc_zext:
1476; CHECK-SD:       // %bb.0: // %entry
1477; CHECK-SD-NEXT:    ushll2 v1.8h, v0.16b, #0
1478; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
1479; CHECK-SD-NEXT:    ushll2 v2.4s, v1.8h, #0
1480; CHECK-SD-NEXT:    ushll2 v3.4s, v0.8h, #0
1481; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
1482; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
1483; CHECK-SD-NEXT:    uaddl2 v4.2d, v3.4s, v2.4s
1484; CHECK-SD-NEXT:    uaddl v2.2d, v3.2s, v2.2s
1485; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v1.4s
1486; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
1487; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
1488; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
1489; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
1490; CHECK-SD-NEXT:    addp d0, v0.2d
1491; CHECK-SD-NEXT:    fmov x8, d0
1492; CHECK-SD-NEXT:    add x0, x8, x0
1493; CHECK-SD-NEXT:    ret
1494;
1495; CHECK-GI-LABEL: add_v16i8_v16i64_acc_zext:
1496; CHECK-GI:       // %bb.0: // %entry
1497; CHECK-GI-NEXT:    uaddlv h0, v0.16b
1498; CHECK-GI-NEXT:    fmov w8, s0
1499; CHECK-GI-NEXT:    add x0, x0, w8, uxth
1500; CHECK-GI-NEXT:    ret
1501entry:
1502  %xx = zext <16 x i8> %x to <16 x i64>
1503  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
1504  %r = add i64 %z, %a
1505  ret i64 %r
1506}
1507
; Sign-extend <16 x i8> %x to <16 x i64> (an 8x widening), add-reduce the
; vector, then add the scalar i64 accumulator %a. The SelectionDAG
; expectation is a long widen-and-add tree; the GlobalISel one is a single
; saddlv.
1508define i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, i64 %a) {
1509; CHECK-SD-LABEL: add_v16i8_v16i64_acc_sext:
1510; CHECK-SD:       // %bb.0: // %entry
1511; CHECK-SD-NEXT:    sshll2 v1.8h, v0.16b, #0
1512; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
1513; CHECK-SD-NEXT:    sshll2 v2.4s, v1.8h, #0
1514; CHECK-SD-NEXT:    sshll2 v3.4s, v0.8h, #0
1515; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
1516; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
1517; CHECK-SD-NEXT:    saddl2 v4.2d, v3.4s, v2.4s
1518; CHECK-SD-NEXT:    saddl v2.2d, v3.2s, v2.2s
1519; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v1.4s
1520; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
1521; CHECK-SD-NEXT:    add v1.2d, v5.2d, v4.2d
1522; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
1523; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
1524; CHECK-SD-NEXT:    addp d0, v0.2d
1525; CHECK-SD-NEXT:    fmov x8, d0
1526; CHECK-SD-NEXT:    add x0, x8, x0
1527; CHECK-SD-NEXT:    ret
1528;
1529; CHECK-GI-LABEL: add_v16i8_v16i64_acc_sext:
1530; CHECK-GI:       // %bb.0: // %entry
1531; CHECK-GI-NEXT:    saddlv h0, v0.16b
1532; CHECK-GI-NEXT:    fmov w8, s0
1533; CHECK-GI-NEXT:    add x0, x0, w8, sxth
1534; CHECK-GI-NEXT:    ret
1535entry:
1536  %xx = sext <16 x i8> %x to <16 x i64>
1537  %z = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
1538  %r = add i64 %z, %a
1539  ret i64 %r
1540}
1541
; Zero-extend <8 x i8> %x to <8 x i64> (an 8x widening), add-reduce the
; vector, then add the scalar i64 accumulator %a.
1542define i64 @add_v8i8_v8i64_acc_zext(<8 x i8> %x, i64 %a) {
1543; CHECK-SD-LABEL: add_v8i8_v8i64_acc_zext:
1544; CHECK-SD:       // %bb.0: // %entry
1545; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
1546; CHECK-SD-NEXT:    ushll2 v1.4s, v0.8h, #0
1547; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
1548; CHECK-SD-NEXT:    uaddl2 v2.2d, v0.4s, v1.4s
1549; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v1.2s
1550; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
1551; CHECK-SD-NEXT:    addp d0, v0.2d
1552; CHECK-SD-NEXT:    fmov x8, d0
1553; CHECK-SD-NEXT:    add x0, x8, x0
1554; CHECK-SD-NEXT:    ret
1555;
1556; CHECK-GI-LABEL: add_v8i8_v8i64_acc_zext:
1557; CHECK-GI:       // %bb.0: // %entry
1558; CHECK-GI-NEXT:    uaddlv h0, v0.8b
1559; CHECK-GI-NEXT:    fmov w8, s0
1560; CHECK-GI-NEXT:    add x0, x0, w8, uxth
1561; CHECK-GI-NEXT:    ret
1562entry:
1563  %xx = zext <8 x i8> %x to <8 x i64>
1564  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
1565  %r = add i64 %z, %a
1566  ret i64 %r
1567}
1568
; Sign-extend <8 x i8> %x to <8 x i64> (an 8x widening), add-reduce the
; vector, then add the scalar i64 accumulator %a.
1569define i64 @add_v8i8_v8i64_acc_sext(<8 x i8> %x, i64 %a) {
1570; CHECK-SD-LABEL: add_v8i8_v8i64_acc_sext:
1571; CHECK-SD:       // %bb.0: // %entry
1572; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
1573; CHECK-SD-NEXT:    sshll2 v1.4s, v0.8h, #0
1574; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
1575; CHECK-SD-NEXT:    saddl2 v2.2d, v0.4s, v1.4s
1576; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
1577; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
1578; CHECK-SD-NEXT:    addp d0, v0.2d
1579; CHECK-SD-NEXT:    fmov x8, d0
1580; CHECK-SD-NEXT:    add x0, x8, x0
1581; CHECK-SD-NEXT:    ret
1582;
1583; CHECK-GI-LABEL: add_v8i8_v8i64_acc_sext:
1584; CHECK-GI:       // %bb.0: // %entry
1585; CHECK-GI-NEXT:    saddlv h0, v0.8b
1586; CHECK-GI-NEXT:    fmov w8, s0
1587; CHECK-GI-NEXT:    add x0, x0, w8, sxth
1588; CHECK-GI-NEXT:    ret
1589entry:
1590  %xx = sext <8 x i8> %x to <8 x i64>
1591  %z = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
1592  %r = add i64 %z, %a
1593  ret i64 %r
1594}
1595
; Zero-extend <4 x i8> %x to <4 x i64>, add-reduce the vector, then add the
; scalar i64 accumulator %a. Both expected sequences first clear the high
; byte of each 16-bit lane of the incoming register.
1596define i64 @add_v4i8_v4i64_acc_zext(<4 x i8> %x, i64 %a) {
1597; CHECK-SD-LABEL: add_v4i8_v4i64_acc_zext:
1598; CHECK-SD:       // %bb.0: // %entry
1599; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
1600; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
1601; CHECK-SD-NEXT:    uaddlv d0, v0.4s
1602; CHECK-SD-NEXT:    fmov x8, d0
1603; CHECK-SD-NEXT:    add x0, x8, x0
1604; CHECK-SD-NEXT:    ret
1605;
1606; CHECK-GI-LABEL: add_v4i8_v4i64_acc_zext:
1607; CHECK-GI:       // %bb.0: // %entry
1608; CHECK-GI-NEXT:    movi d1, #0xff00ff00ff00ff
1609; CHECK-GI-NEXT:    and v0.8b, v0.8b, v1.8b
1610; CHECK-GI-NEXT:    uaddlv s0, v0.4h
1611; CHECK-GI-NEXT:    fmov w8, s0
1612; CHECK-GI-NEXT:    add x0, x0, w8, uxth
1613; CHECK-GI-NEXT:    ret
1614entry:
1615  %xx = zext <4 x i8> %x to <4 x i64>
1616  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1617  %r = add i64 %z, %a
1618  ret i64 %r
1619}
1620
; Sign-extend <4 x i8> %x to <4 x i64>, add-reduce the vector, then add the
; scalar i64 accumulator %a. Both expected sequences sign-extend in-register
; with shifts; the SelectionDAG one does so at 64-bit lane width (#56).
1621define i64 @add_v4i8_v4i64_acc_sext(<4 x i8> %x, i64 %a) {
1622; CHECK-SD-LABEL: add_v4i8_v4i64_acc_sext:
1623; CHECK-SD:       // %bb.0: // %entry
1624; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
1625; CHECK-SD-NEXT:    ushll v1.2d, v0.2s, #0
1626; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
1627; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
1628; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
1629; CHECK-SD-NEXT:    sshr v1.2d, v1.2d, #56
1630; CHECK-SD-NEXT:    ssra v1.2d, v0.2d, #56
1631; CHECK-SD-NEXT:    addp d0, v1.2d
1632; CHECK-SD-NEXT:    fmov x8, d0
1633; CHECK-SD-NEXT:    add x0, x8, x0
1634; CHECK-SD-NEXT:    ret
1635;
1636; CHECK-GI-LABEL: add_v4i8_v4i64_acc_sext:
1637; CHECK-GI:       // %bb.0: // %entry
1638; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
1639; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
1640; CHECK-GI-NEXT:    saddlv s0, v0.4h
1641; CHECK-GI-NEXT:    fmov w8, s0
1642; CHECK-GI-NEXT:    add x0, x0, w8, sxth
1643; CHECK-GI-NEXT:    ret
1644entry:
1645  %xx = sext <4 x i8> %x to <4 x i64>
1646  %z = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1647  %r = add i64 %z, %a
1648  ret i64 %r
1649}
1650
; Zero-extend <2 x i8> %x to <2 x i64>, add-reduce the vector, then add the
; scalar i64 accumulator %a. Both expected sequences mask the lanes with a
; 0xff pattern, before the widening for SelectionDAG and after it for
; GlobalISel.
1651define i64 @add_v2i8_v2i64_acc_zext(<2 x i8> %x, i64 %a) {
1652; CHECK-SD-LABEL: add_v2i8_v2i64_acc_zext:
1653; CHECK-SD:       // %bb.0: // %entry
1654; CHECK-SD-NEXT:    movi d1, #0x0000ff000000ff
1655; CHECK-SD-NEXT:    and v0.8b, v0.8b, v1.8b
1656; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
1657; CHECK-SD-NEXT:    addp d0, v0.2d
1658; CHECK-SD-NEXT:    fmov x8, d0
1659; CHECK-SD-NEXT:    add x0, x8, x0
1660; CHECK-SD-NEXT:    ret
1661;
1662; CHECK-GI-LABEL: add_v2i8_v2i64_acc_zext:
1663; CHECK-GI:       // %bb.0: // %entry
1664; CHECK-GI-NEXT:    movi v1.2d, #0x000000000000ff
1665; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
1666; CHECK-GI-NEXT:    and v0.16b, v0.16b, v1.16b
1667; CHECK-GI-NEXT:    addp d0, v0.2d
1668; CHECK-GI-NEXT:    fmov x8, d0
1669; CHECK-GI-NEXT:    add x0, x8, x0
1670; CHECK-GI-NEXT:    ret
1671entry:
1672  %xx = zext <2 x i8> %x to <2 x i64>
1673  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1674  %r = add i64 %z, %a
1675  ret i64 %r
1676}
1677
; Sign-extend <2 x i8> %x to <2 x i64>, add-reduce the vector, then add the
; scalar i64 accumulator %a. A single expectation block is shared by all four
; RUN configurations.
1678define i64 @add_v2i8_v2i64_acc_sext(<2 x i8> %x, i64 %a) {
1679; CHECK-LABEL: add_v2i8_v2i64_acc_sext:
1680; CHECK:       // %bb.0: // %entry
1681; CHECK-NEXT:    ushll v0.2d, v0.2s, #0
1682; CHECK-NEXT:    shl v0.2d, v0.2d, #56
1683; CHECK-NEXT:    sshr v0.2d, v0.2d, #56
1684; CHECK-NEXT:    addp d0, v0.2d
1685; CHECK-NEXT:    fmov x8, d0
1686; CHECK-NEXT:    add x0, x8, x0
1687; CHECK-NEXT:    ret
1688entry:
1689  %xx = sext <2 x i8> %x to <2 x i64>
1690  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1691  %r = add i64 %z, %a
1692  ret i64 %r
1693}
1694
; Add-reduce <2 x i64> %x without any widening, then add the scalar i64
; accumulator %a.
1695define i64 @add_v2i64_v2i64_acc(<2 x i64> %x, i64 %a) {
1696; CHECK-LABEL: add_v2i64_v2i64_acc:
1697; CHECK:       // %bb.0: // %entry
1698; CHECK-NEXT:    addp d0, v0.2d
1699; CHECK-NEXT:    fmov x8, d0
1700; CHECK-NEXT:    add x0, x8, x0
1701; CHECK-NEXT:    ret
1702entry:
1703  %z = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
1704  %r = add i64 %z, %a
1705  ret i64 %r
1706}
1707
; Add-reduce <4 x i32> %x and %y independently and return the sum of the two
; scalar results. The SelectionDAG expectation adds the vectors first and
; reduces once; the GlobalISel one reduces each vector and adds the scalars.
1708define i32 @add_pair_v4i32_v4i32(<4 x i32> %x, <4 x i32> %y) {
1709; CHECK-SD-LABEL: add_pair_v4i32_v4i32:
1710; CHECK-SD:       // %bb.0: // %entry
1711; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
1712; CHECK-SD-NEXT:    addv s0, v0.4s
1713; CHECK-SD-NEXT:    fmov w0, s0
1714; CHECK-SD-NEXT:    ret
1715;
1716; CHECK-GI-LABEL: add_pair_v4i32_v4i32:
1717; CHECK-GI:       // %bb.0: // %entry
1718; CHECK-GI-NEXT:    addv s0, v0.4s
1719; CHECK-GI-NEXT:    addv s1, v1.4s
1720; CHECK-GI-NEXT:    fmov w8, s0
1721; CHECK-GI-NEXT:    fmov w9, s1
1722; CHECK-GI-NEXT:    add w0, w8, w9
1723; CHECK-GI-NEXT:    ret
1724entry:
1725  %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %x)
1726  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %y)
1727  %z = add i32 %z1, %z2
1728  ret i32 %z
1729}
1730
; Zero-extend <4 x i32> %x and %y to <4 x i64>, add-reduce each, and return
; the sum of the two scalar results.
1731define i64 @add_pair_v4i32_v4i64_zext(<4 x i32> %x, <4 x i32> %y) {
1732; CHECK-SD-LABEL: add_pair_v4i32_v4i64_zext:
1733; CHECK-SD:       // %bb.0: // %entry
1734; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s
1735; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s
1736; CHECK-SD-NEXT:    addp d0, v1.2d
1737; CHECK-SD-NEXT:    fmov x0, d0
1738; CHECK-SD-NEXT:    ret
1739;
1740; CHECK-GI-LABEL: add_pair_v4i32_v4i64_zext:
1741; CHECK-GI:       // %bb.0: // %entry
1742; CHECK-GI-NEXT:    uaddlv d0, v0.4s
1743; CHECK-GI-NEXT:    uaddlv d1, v1.4s
1744; CHECK-GI-NEXT:    fmov x8, d0
1745; CHECK-GI-NEXT:    fmov x9, d1
1746; CHECK-GI-NEXT:    add x0, x8, x9
1747; CHECK-GI-NEXT:    ret
1748entry:
1749  %xx = zext <4 x i32> %x to <4 x i64>
1750  %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1751  %yy = zext <4 x i32> %y to <4 x i64>
1752  %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
1753  %z = add i64 %z1, %z2
1754  ret i64 %z
1755}
1756
; Sign-extend <4 x i32> %x and %y to <4 x i64>, add-reduce each, and return
; the sum of the two scalar results.
1757define i64 @add_pair_v4i32_v4i64_sext(<4 x i32> %x, <4 x i32> %y) {
1758; CHECK-SD-LABEL: add_pair_v4i32_v4i64_sext:
1759; CHECK-SD:       // %bb.0: // %entry
1760; CHECK-SD-NEXT:    saddlp v1.2d, v1.4s
1761; CHECK-SD-NEXT:    sadalp v1.2d, v0.4s
1762; CHECK-SD-NEXT:    addp d0, v1.2d
1763; CHECK-SD-NEXT:    fmov x0, d0
1764; CHECK-SD-NEXT:    ret
1765;
1766; CHECK-GI-LABEL: add_pair_v4i32_v4i64_sext:
1767; CHECK-GI:       // %bb.0: // %entry
1768; CHECK-GI-NEXT:    saddlv d0, v0.4s
1769; CHECK-GI-NEXT:    saddlv d1, v1.4s
1770; CHECK-GI-NEXT:    fmov x8, d0
1771; CHECK-GI-NEXT:    fmov x9, d1
1772; CHECK-GI-NEXT:    add x0, x8, x9
1773; CHECK-GI-NEXT:    ret
1774entry:
1775  %xx = sext <4 x i32> %x to <4 x i64>
1776  %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
1777  %yy = sext <4 x i32> %y to <4 x i64>
1778  %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
1779  %z = add i64 %z1, %z2
1780  ret i64 %z
1781}
1782
; Zero-extend <2 x i32> %x and %y to <2 x i64>, add-reduce each, and return
; the sum of the two scalar results. The SelectionDAG expectation packs both
; inputs into one q register and reduces with a single uaddlv.
1783define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) {
1784; CHECK-SD-LABEL: add_pair_v2i32_v2i64_zext:
1785; CHECK-SD:       // %bb.0: // %entry
1786; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
1787; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
1788; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
1789; CHECK-SD-NEXT:    uaddlv d0, v0.4s
1790; CHECK-SD-NEXT:    fmov x0, d0
1791; CHECK-SD-NEXT:    ret
1792;
1793; CHECK-GI-LABEL: add_pair_v2i32_v2i64_zext:
1794; CHECK-GI:       // %bb.0: // %entry
1795; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
1796; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
1797; CHECK-GI-NEXT:    addp d0, v0.2d
1798; CHECK-GI-NEXT:    addp d1, v1.2d
1799; CHECK-GI-NEXT:    fmov x8, d0
1800; CHECK-GI-NEXT:    fmov x9, d1
1801; CHECK-GI-NEXT:    add x0, x8, x9
1802; CHECK-GI-NEXT:    ret
1803entry:
1804  %xx = zext <2 x i32> %x to <2 x i64>
1805  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1806  %yy = zext <2 x i32> %y to <2 x i64>
1807  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
1808  %z = add i64 %z1, %z2
1809  ret i64 %z
1810}
1811
; Adds together two add-reductions, each taken over a <2 x i32> argument that
; is first sign-extended to <2 x i64>.
; Expected output without -global-isel (SD prefixes): a single saddl widens and
; adds the two inputs lane-wise, then addp reduces the two 64-bit lanes.
; Expected output with -global-isel (GI prefixes): each input is widened with
; sshll, reduced with addp, and the two scalars are added.
1812define i64 @add_pair_v2i32_v2i64_sext(<2 x i32> %x, <2 x i32> %y) {
1813; CHECK-SD-LABEL: add_pair_v2i32_v2i64_sext:
1814; CHECK-SD:       // %bb.0: // %entry
1815; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v1.2s
1816; CHECK-SD-NEXT:    addp d0, v0.2d
1817; CHECK-SD-NEXT:    fmov x0, d0
1818; CHECK-SD-NEXT:    ret
1819;
1820; CHECK-GI-LABEL: add_pair_v2i32_v2i64_sext:
1821; CHECK-GI:       // %bb.0: // %entry
1822; CHECK-GI-NEXT:    sshll v0.2d, v0.2s, #0
1823; CHECK-GI-NEXT:    sshll v1.2d, v1.2s, #0
1824; CHECK-GI-NEXT:    addp d0, v0.2d
1825; CHECK-GI-NEXT:    addp d1, v1.2d
1826; CHECK-GI-NEXT:    fmov x8, d0
1827; CHECK-GI-NEXT:    fmov x9, d1
1828; CHECK-GI-NEXT:    add x0, x8, x9
1829; CHECK-GI-NEXT:    ret
1830entry:
1831  %xx = sext <2 x i32> %x to <2 x i64>
1832  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
1833  %yy = sext <2 x i32> %y to <2 x i64>
1834  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
1835  %z = add i64 %z1, %z2
1836  ret i64 %z
1837}
1838
; Adds together two add-reductions, each taken over an <8 x i16> argument that
; is first zero-extended to <8 x i32>.
; Expected output without -global-isel (SD prefixes): one input is widened
; pairwise with uaddlp, the other is accumulated into it with uadalp, and a
; single addv produces the scalar result.
; Expected output with -global-isel (GI prefixes): each input is reduced on its
; own with uaddlv and the two scalars are added in general-purpose registers.
1839define i32 @add_pair_v8i16_v8i32_zext(<8 x i16> %x, <8 x i16> %y) {
1840; CHECK-SD-LABEL: add_pair_v8i16_v8i32_zext:
1841; CHECK-SD:       // %bb.0: // %entry
1842; CHECK-SD-NEXT:    uaddlp v1.4s, v1.8h
1843; CHECK-SD-NEXT:    uadalp v1.4s, v0.8h
1844; CHECK-SD-NEXT:    addv s0, v1.4s
1845; CHECK-SD-NEXT:    fmov w0, s0
1846; CHECK-SD-NEXT:    ret
1847;
1848; CHECK-GI-LABEL: add_pair_v8i16_v8i32_zext:
1849; CHECK-GI:       // %bb.0: // %entry
1850; CHECK-GI-NEXT:    uaddlv s0, v0.8h
1851; CHECK-GI-NEXT:    uaddlv s1, v1.8h
1852; CHECK-GI-NEXT:    fmov w8, s0
1853; CHECK-GI-NEXT:    fmov w9, s1
1854; CHECK-GI-NEXT:    add w0, w8, w9
1855; CHECK-GI-NEXT:    ret
1856entry:
1857  %xx = zext <8 x i16> %x to <8 x i32>
1858  %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
1859  %yy = zext <8 x i16> %y to <8 x i32>
1860  %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
1861  %z = add i32 %z1, %z2
1862  ret i32 %z
1863}
1864
; Adds together two add-reductions, each taken over an <8 x i16> argument that
; is first sign-extended to <8 x i32>.
; Expected output without -global-isel (SD prefixes): one input is widened
; pairwise with saddlp, the other is accumulated into it with sadalp, and a
; single addv produces the scalar result.
; Expected output with -global-isel (GI prefixes): each input is reduced on its
; own with saddlv and the two scalars are added in general-purpose registers.
1865define i32 @add_pair_v8i16_v8i32_sext(<8 x i16> %x, <8 x i16> %y) {
1866; CHECK-SD-LABEL: add_pair_v8i16_v8i32_sext:
1867; CHECK-SD:       // %bb.0: // %entry
1868; CHECK-SD-NEXT:    saddlp v1.4s, v1.8h
1869; CHECK-SD-NEXT:    sadalp v1.4s, v0.8h
1870; CHECK-SD-NEXT:    addv s0, v1.4s
1871; CHECK-SD-NEXT:    fmov w0, s0
1872; CHECK-SD-NEXT:    ret
1873;
1874; CHECK-GI-LABEL: add_pair_v8i16_v8i32_sext:
1875; CHECK-GI:       // %bb.0: // %entry
1876; CHECK-GI-NEXT:    saddlv s0, v0.8h
1877; CHECK-GI-NEXT:    saddlv s1, v1.8h
1878; CHECK-GI-NEXT:    fmov w8, s0
1879; CHECK-GI-NEXT:    fmov w9, s1
1880; CHECK-GI-NEXT:    add w0, w8, w9
1881; CHECK-GI-NEXT:    ret
1882entry:
1883  %xx = sext <8 x i16> %x to <8 x i32>
1884  %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
1885  %yy = sext <8 x i16> %y to <8 x i32>
1886  %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
1887  %z = add i32 %z1, %z2
1888  ret i32 %z
1889}
1890
; Adds together two add-reductions, each taken over a <4 x i16> argument that
; is first zero-extended to <4 x i32>.
; Expected output without -global-isel (SD prefixes): the two 64-bit inputs are
; packed into one 128-bit register (mov v0.d[1], v1.d[0]) and reduced by a
; single uaddlv over eight 16-bit lanes.
; Expected output with -global-isel (GI prefixes): each input is reduced on its
; own with a 4-lane uaddlv and the two scalars are added.
1891define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) {
1892; CHECK-SD-LABEL: add_pair_v4i16_v4i32_zext:
1893; CHECK-SD:       // %bb.0: // %entry
1894; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
1895; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
1896; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
1897; CHECK-SD-NEXT:    uaddlv s0, v0.8h
1898; CHECK-SD-NEXT:    fmov w0, s0
1899; CHECK-SD-NEXT:    ret
1900;
1901; CHECK-GI-LABEL: add_pair_v4i16_v4i32_zext:
1902; CHECK-GI:       // %bb.0: // %entry
1903; CHECK-GI-NEXT:    uaddlv s0, v0.4h
1904; CHECK-GI-NEXT:    uaddlv s1, v1.4h
1905; CHECK-GI-NEXT:    fmov w8, s0
1906; CHECK-GI-NEXT:    fmov w9, s1
1907; CHECK-GI-NEXT:    add w0, w8, w9
1908; CHECK-GI-NEXT:    ret
1909entry:
1910  %xx = zext <4 x i16> %x to <4 x i32>
1911  %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
1912  %yy = zext <4 x i16> %y to <4 x i32>
1913  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
1914  %z = add i32 %z1, %z2
1915  ret i32 %z
1916}
1917
; Adds together two add-reductions, each taken over a <4 x i16> argument that
; is first sign-extended to <4 x i32>.
; Expected output without -global-isel (SD prefixes): a single saddl widens and
; adds the two inputs lane-wise, then addv reduces the four 32-bit lanes.
; Expected output with -global-isel (GI prefixes): each input is reduced on its
; own with a 4-lane saddlv and the two scalars are added.
1918define i32 @add_pair_v4i16_v4i32_sext(<4 x i16> %x, <4 x i16> %y) {
1919; CHECK-SD-LABEL: add_pair_v4i16_v4i32_sext:
1920; CHECK-SD:       // %bb.0: // %entry
1921; CHECK-SD-NEXT:    saddl v0.4s, v0.4h, v1.4h
1922; CHECK-SD-NEXT:    addv s0, v0.4s
1923; CHECK-SD-NEXT:    fmov w0, s0
1924; CHECK-SD-NEXT:    ret
1925;
1926; CHECK-GI-LABEL: add_pair_v4i16_v4i32_sext:
1927; CHECK-GI:       // %bb.0: // %entry
1928; CHECK-GI-NEXT:    saddlv s0, v0.4h
1929; CHECK-GI-NEXT:    saddlv s1, v1.4h
1930; CHECK-GI-NEXT:    fmov w8, s0
1931; CHECK-GI-NEXT:    fmov w9, s1
1932; CHECK-GI-NEXT:    add w0, w8, w9
1933; CHECK-GI-NEXT:    ret
1934entry:
1935  %xx = sext <4 x i16> %x to <4 x i32>
1936  %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
1937  %yy = sext <4 x i16> %y to <4 x i32>
1938  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
1939  %z = add i32 %z1, %z2
1940  ret i32 %z
1941}
1942
; Unsigned dot-product pattern on <8 x i8>: both arguments are zero-extended to
; <8 x i32>, multiplied lane-wise, and the products are add-reduced to an i32.
; Expected output with -mattr=+dotprod (DOT prefixes, with and without
; -global-isel): a zeroed accumulator, one 8-byte udot, and an addp over the
; two 32-bit lanes.
; Expected output without dotprod (BASE prefixes): without -global-isel a
; widening umull to 16-bit lanes followed by uaddlv; with -global-isel the
; inputs are widened with ushll, multiplied with umull/umlal2 into 32-bit
; lanes, and reduced with addv.
1943define i32 @test_udot_v8i8(<8 x i8> %a, <8 x i8> %b) {
1944; CHECK-SD-BASE-LABEL: test_udot_v8i8:
1945; CHECK-SD-BASE:       // %bb.0: // %entry
1946; CHECK-SD-BASE-NEXT:    umull v0.8h, v1.8b, v0.8b
1947; CHECK-SD-BASE-NEXT:    uaddlv s0, v0.8h
1948; CHECK-SD-BASE-NEXT:    fmov w0, s0
1949; CHECK-SD-BASE-NEXT:    ret
1950;
1951; CHECK-SD-DOT-LABEL: test_udot_v8i8:
1952; CHECK-SD-DOT:       // %bb.0: // %entry
1953; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
1954; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
1955; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
1956; CHECK-SD-DOT-NEXT:    fmov w0, s0
1957; CHECK-SD-DOT-NEXT:    ret
1958;
1959; CHECK-GI-BASE-LABEL: test_udot_v8i8:
1960; CHECK-GI-BASE:       // %bb.0: // %entry
1961; CHECK-GI-BASE-NEXT:    ushll v0.8h, v0.8b, #0
1962; CHECK-GI-BASE-NEXT:    ushll v1.8h, v1.8b, #0
1963; CHECK-GI-BASE-NEXT:    umull v2.4s, v1.4h, v0.4h
1964; CHECK-GI-BASE-NEXT:    umlal2 v2.4s, v1.8h, v0.8h
1965; CHECK-GI-BASE-NEXT:    addv s0, v2.4s
1966; CHECK-GI-BASE-NEXT:    fmov w0, s0
1967; CHECK-GI-BASE-NEXT:    ret
1968;
1969; CHECK-GI-DOT-LABEL: test_udot_v8i8:
1970; CHECK-GI-DOT:       // %bb.0: // %entry
1971; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
1972; CHECK-GI-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
1973; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
1974; CHECK-GI-DOT-NEXT:    fmov w0, s0
1975; CHECK-GI-DOT-NEXT:    ret
1976entry:
1977  %0 = zext <8 x i8> %a to <8 x i32>
1978  %1 = zext <8 x i8> %b to <8 x i32>
1979  %2 = mul nuw nsw <8 x i32> %1, %0
1980  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
1981  ret i32 %3
1982}
1983
; Unsigned dot-product pattern on <16 x i8>: both arguments are zero-extended
; to <16 x i32>, multiplied lane-wise, and the products are add-reduced to an
; i32.
; Expected output with -mattr=+dotprod (DOT prefixes, with and without
; -global-isel): a zeroed accumulator, one 16-byte udot, and an addv.
; Expected output without dotprod (BASE prefixes): without -global-isel the
; halves are multiplied with umull/umull2 to 16-bit lanes and combined with
; uaddl/uaddl2 before the final addv; with -global-isel the inputs are widened
; with ushll/ushll2 and multiplied with umull/umlal2 into two 32-bit
; accumulators that are added and then reduced with addv.
1984define i32 @test_udot_v16i8(<16 x i8> %a, <16 x i8> %b) {
1985; CHECK-SD-BASE-LABEL: test_udot_v16i8:
1986; CHECK-SD-BASE:       // %bb.0: // %entry
1987; CHECK-SD-BASE-NEXT:    umull2 v2.8h, v1.16b, v0.16b
1988; CHECK-SD-BASE-NEXT:    umull v0.8h, v1.8b, v0.8b
1989; CHECK-SD-BASE-NEXT:    uaddl2 v1.4s, v0.8h, v2.8h
1990; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v2.4h
1991; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
1992; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
1993; CHECK-SD-BASE-NEXT:    fmov w0, s0
1994; CHECK-SD-BASE-NEXT:    ret
1995;
1996; CHECK-SD-DOT-LABEL: test_udot_v16i8:
1997; CHECK-SD-DOT:       // %bb.0: // %entry
1998; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
1999; CHECK-SD-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b
2000; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
2001; CHECK-SD-DOT-NEXT:    fmov w0, s0
2002; CHECK-SD-DOT-NEXT:    ret
2003;
2004; CHECK-GI-BASE-LABEL: test_udot_v16i8:
2005; CHECK-GI-BASE:       // %bb.0: // %entry
2006; CHECK-GI-BASE-NEXT:    ushll v2.8h, v0.8b, #0
2007; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
2008; CHECK-GI-BASE-NEXT:    ushll v3.8h, v1.8b, #0
2009; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
2010; CHECK-GI-BASE-NEXT:    umull v4.4s, v3.4h, v2.4h
2011; CHECK-GI-BASE-NEXT:    umull v5.4s, v1.4h, v0.4h
2012; CHECK-GI-BASE-NEXT:    umlal2 v4.4s, v3.8h, v2.8h
2013; CHECK-GI-BASE-NEXT:    umlal2 v5.4s, v1.8h, v0.8h
2014; CHECK-GI-BASE-NEXT:    add v0.4s, v4.4s, v5.4s
2015; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
2016; CHECK-GI-BASE-NEXT:    fmov w0, s0
2017; CHECK-GI-BASE-NEXT:    ret
2018;
2019; CHECK-GI-DOT-LABEL: test_udot_v16i8:
2020; CHECK-GI-DOT:       // %bb.0: // %entry
2021; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
2022; CHECK-GI-DOT-NEXT:    udot v2.4s, v1.16b, v0.16b
2023; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
2024; CHECK-GI-DOT-NEXT:    fmov w0, s0
2025; CHECK-GI-DOT-NEXT:    ret
2026entry:
2027  %0 = zext <16 x i8> %a to <16 x i32>
2028  %1 = zext <16 x i8> %b to <16 x i32>
2029  %2 = mul nuw nsw <16 x i32> %1, %0
2030  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
2031  ret i32 %3
2032}
2033
; Unsigned dot-product pattern on a non-power-of-two width: <24 x i8> vectors
; are loaded from %p1 and %p2, zero-extended to <24 x i32>, multiplied
; lane-wise, and the products are add-reduced to an i32.
; In every expected sequence the 24 bytes are fetched as a 16-byte load plus an
; 8-byte load at offset 16.
; Expected output with -mattr=+dotprod (DOT prefixes): without -global-isel a
; 16-byte udot and an 8-byte udot are reduced separately (addv / addp) and the
; scalars added; with -global-isel both parts go through the 16-byte udot form
; into separate accumulators that are added before a single addv.
; Expected output without dotprod (BASE prefixes): without -global-isel a
; umull/uaddl/uaddw chain ending in one addv; with -global-isel six umull
; products are each reduced with addv and summed with scalar adds.
2034define i32 @test_udot_v24i8(ptr %p1, ptr %p2) {
2035; CHECK-SD-BASE-LABEL: test_udot_v24i8:
2036; CHECK-SD-BASE:       // %bb.0: // %entry
2037; CHECK-SD-BASE-NEXT:    ldr q0, [x0]
2038; CHECK-SD-BASE-NEXT:    ldr q1, [x1]
2039; CHECK-SD-BASE-NEXT:    ldr d2, [x0, #16]
2040; CHECK-SD-BASE-NEXT:    ldr d3, [x1, #16]
2041; CHECK-SD-BASE-NEXT:    umull v2.8h, v3.8b, v2.8b
2042; CHECK-SD-BASE-NEXT:    umull v3.8h, v1.8b, v0.8b
2043; CHECK-SD-BASE-NEXT:    umull2 v0.8h, v1.16b, v0.16b
2044; CHECK-SD-BASE-NEXT:    uaddl2 v1.4s, v3.8h, v2.8h
2045; CHECK-SD-BASE-NEXT:    uaddl v2.4s, v3.4h, v2.4h
2046; CHECK-SD-BASE-NEXT:    uaddw2 v1.4s, v1.4s, v0.8h
2047; CHECK-SD-BASE-NEXT:    uaddw v0.4s, v2.4s, v0.4h
2048; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
2049; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
2050; CHECK-SD-BASE-NEXT:    fmov w0, s0
2051; CHECK-SD-BASE-NEXT:    ret
2052;
2053; CHECK-SD-DOT-LABEL: test_udot_v24i8:
2054; CHECK-SD-DOT:       // %bb.0: // %entry
2055; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
2056; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
2057; CHECK-SD-DOT-NEXT:    ldr q2, [x0]
2058; CHECK-SD-DOT-NEXT:    ldr q3, [x1]
2059; CHECK-SD-DOT-NEXT:    ldr d4, [x0, #16]
2060; CHECK-SD-DOT-NEXT:    ldr d5, [x1, #16]
2061; CHECK-SD-DOT-NEXT:    udot v1.2s, v5.8b, v4.8b
2062; CHECK-SD-DOT-NEXT:    udot v0.4s, v3.16b, v2.16b
2063; CHECK-SD-DOT-NEXT:    addp v1.2s, v1.2s, v1.2s
2064; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
2065; CHECK-SD-DOT-NEXT:    fmov w8, s1
2066; CHECK-SD-DOT-NEXT:    fmov w9, s0
2067; CHECK-SD-DOT-NEXT:    add w0, w9, w8
2068; CHECK-SD-DOT-NEXT:    ret
2069;
2070; CHECK-GI-BASE-LABEL: test_udot_v24i8:
2071; CHECK-GI-BASE:       // %bb.0: // %entry
2072; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
2073; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
2074; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
2075; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
2076; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
2077; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
2078; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
2079; CHECK-GI-BASE-NEXT:    ushll v2.8h, v2.8b, #0
2080; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
2081; CHECK-GI-BASE-NEXT:    ushll v3.8h, v3.8b, #0
2082; CHECK-GI-BASE-NEXT:    umull v6.4s, v5.4h, v4.4h
2083; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v5.8h, v4.8h
2084; CHECK-GI-BASE-NEXT:    umull2 v5.4s, v1.8h, v0.8h
2085; CHECK-GI-BASE-NEXT:    umull v7.4s, v3.4h, v2.4h
2086; CHECK-GI-BASE-NEXT:    umull v0.4s, v1.4h, v0.4h
2087; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v3.8h, v2.8h
2088; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
2089; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
2090; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
2091; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
2092; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
2093; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
2094; CHECK-GI-BASE-NEXT:    fmov w8, s2
2095; CHECK-GI-BASE-NEXT:    fmov w9, s3
2096; CHECK-GI-BASE-NEXT:    fmov w10, s4
2097; CHECK-GI-BASE-NEXT:    fmov w11, s5
2098; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2099; CHECK-GI-BASE-NEXT:    fmov w9, s0
2100; CHECK-GI-BASE-NEXT:    add w10, w10, w11
2101; CHECK-GI-BASE-NEXT:    fmov w11, s1
2102; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2103; CHECK-GI-BASE-NEXT:    add w9, w10, w11
2104; CHECK-GI-BASE-NEXT:    add w0, w8, w9
2105; CHECK-GI-BASE-NEXT:    ret
2106;
2107; CHECK-GI-DOT-LABEL: test_udot_v24i8:
2108; CHECK-GI-DOT:       // %bb.0: // %entry
2109; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
2110; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
2111; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
2112; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
2113; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
2114; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
2115; CHECK-GI-DOT-NEXT:    udot v1.4s, v4.16b, v2.16b
2116; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
2117; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
2118; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
2119; CHECK-GI-DOT-NEXT:    fmov w0, s0
2120; CHECK-GI-DOT-NEXT:    ret
2121entry:
2122  %a = load <24 x i8>, ptr %p1
2123  %b = load <24 x i8>, ptr %p2
2124  %0 = zext <24 x i8> %a to <24 x i32>
2125  %1 = zext <24 x i8> %b to <24 x i32>
2126  %2 = mul nuw nsw <24 x i32> %1, %0
2127  %3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
2128  ret i32 %3
2129}
2130
; Unsigned dot-product pattern on a non-power-of-two width: <48 x i8> vectors
; are loaded from %p1 and %p2, zero-extended to <48 x i32>, multiplied
; lane-wise, and the products are add-reduced to an i32.
; In every expected sequence the 48 bytes are fetched as three 16-byte
; registers (an ldp pair plus one ldr).
; Expected output with -mattr=+dotprod (DOT prefixes): without -global-isel
; three udot instructions chain into one accumulator followed by a single
; addv; with -global-isel three separate accumulators are each reduced with
; addv and summed with scalar adds.
; Expected output without dotprod (BASE prefixes): without -global-isel a
; umull/uaddl/uaddw chain ending in one addv; with -global-isel twelve umull
; products are each reduced with addv and summed with scalar adds.
2131define i32 @test_udot_v48i8(ptr %p1, ptr %p2) {
2132; CHECK-SD-BASE-LABEL: test_udot_v48i8:
2133; CHECK-SD-BASE:       // %bb.0: // %entry
2134; CHECK-SD-BASE-NEXT:    ldp q4, q0, [x0, #16]
2135; CHECK-SD-BASE-NEXT:    ldr q2, [x1, #32]
2136; CHECK-SD-BASE-NEXT:    ldp q1, q5, [x1]
2137; CHECK-SD-BASE-NEXT:    ldr q3, [x0]
2138; CHECK-SD-BASE-NEXT:    umull2 v6.8h, v2.16b, v0.16b
2139; CHECK-SD-BASE-NEXT:    umull v0.8h, v2.8b, v0.8b
2140; CHECK-SD-BASE-NEXT:    umull2 v7.8h, v1.16b, v3.16b
2141; CHECK-SD-BASE-NEXT:    umull v1.8h, v1.8b, v3.8b
2142; CHECK-SD-BASE-NEXT:    umull2 v2.8h, v5.16b, v4.16b
2143; CHECK-SD-BASE-NEXT:    umull v3.8h, v5.8b, v4.8b
2144; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v7.8h, v6.8h
2145; CHECK-SD-BASE-NEXT:    uaddl2 v5.4s, v1.8h, v0.8h
2146; CHECK-SD-BASE-NEXT:    uaddl v6.4s, v7.4h, v6.4h
2147; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v1.4h, v0.4h
2148; CHECK-SD-BASE-NEXT:    uaddw2 v1.4s, v4.4s, v2.8h
2149; CHECK-SD-BASE-NEXT:    uaddw2 v4.4s, v5.4s, v3.8h
2150; CHECK-SD-BASE-NEXT:    uaddw v2.4s, v6.4s, v2.4h
2151; CHECK-SD-BASE-NEXT:    uaddw v0.4s, v0.4s, v3.4h
2152; CHECK-SD-BASE-NEXT:    add v1.4s, v4.4s, v1.4s
2153; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
2154; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
2155; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
2156; CHECK-SD-BASE-NEXT:    fmov w0, s0
2157; CHECK-SD-BASE-NEXT:    ret
2158;
2159; CHECK-SD-DOT-LABEL: test_udot_v48i8:
2160; CHECK-SD-DOT:       // %bb.0: // %entry
2161; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
2162; CHECK-SD-DOT-NEXT:    ldr q1, [x0, #32]
2163; CHECK-SD-DOT-NEXT:    ldr q2, [x1, #32]
2164; CHECK-SD-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
2165; CHECK-SD-DOT-NEXT:    ldp q3, q1, [x0]
2166; CHECK-SD-DOT-NEXT:    ldp q4, q2, [x1]
2167; CHECK-SD-DOT-NEXT:    udot v0.4s, v4.16b, v3.16b
2168; CHECK-SD-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
2169; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
2170; CHECK-SD-DOT-NEXT:    fmov w0, s0
2171; CHECK-SD-DOT-NEXT:    ret
2172;
2173; CHECK-GI-BASE-LABEL: test_udot_v48i8:
2174; CHECK-GI-BASE:       // %bb.0: // %entry
2175; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
2176; CHECK-GI-BASE-NEXT:    ldr q6, [x0, #32]
2177; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
2178; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
2179; CHECK-GI-BASE-NEXT:    ushll v20.8h, v6.8b, #0
2180; CHECK-GI-BASE-NEXT:    ushll2 v6.8h, v6.16b, #0
2181; CHECK-GI-BASE-NEXT:    ushll v4.8h, v0.8b, #0
2182; CHECK-GI-BASE-NEXT:    ushll2 v0.8h, v0.16b, #0
2183; CHECK-GI-BASE-NEXT:    ushll v16.8h, v3.8b, #0
2184; CHECK-GI-BASE-NEXT:    ushll v5.8h, v1.8b, #0
2185; CHECK-GI-BASE-NEXT:    ushll2 v1.8h, v1.16b, #0
2186; CHECK-GI-BASE-NEXT:    ushll v17.8h, v2.8b, #0
2187; CHECK-GI-BASE-NEXT:    ushll2 v3.8h, v3.16b, #0
2188; CHECK-GI-BASE-NEXT:    ushll2 v2.8h, v2.16b, #0
2189; CHECK-GI-BASE-NEXT:    umull v18.4s, v4.4h, v5.4h
2190; CHECK-GI-BASE-NEXT:    umull2 v4.4s, v4.8h, v5.8h
2191; CHECK-GI-BASE-NEXT:    umull v5.4s, v0.4h, v1.4h
2192; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v0.8h, v1.8h
2193; CHECK-GI-BASE-NEXT:    umull v19.4s, v16.4h, v17.4h
2194; CHECK-GI-BASE-NEXT:    ushll v1.8h, v7.8b, #0
2195; CHECK-GI-BASE-NEXT:    umull2 v16.4s, v16.8h, v17.8h
2196; CHECK-GI-BASE-NEXT:    umull v17.4s, v3.4h, v2.4h
2197; CHECK-GI-BASE-NEXT:    umull2 v2.4s, v3.8h, v2.8h
2198; CHECK-GI-BASE-NEXT:    ushll2 v7.8h, v7.16b, #0
2199; CHECK-GI-BASE-NEXT:    addv s18, v18.4s
2200; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
2201; CHECK-GI-BASE-NEXT:    addv s5, v5.4s
2202; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
2203; CHECK-GI-BASE-NEXT:    addv s19, v19.4s
2204; CHECK-GI-BASE-NEXT:    umull v3.4s, v1.4h, v20.4h
2205; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
2206; CHECK-GI-BASE-NEXT:    umull2 v1.4s, v1.8h, v20.8h
2207; CHECK-GI-BASE-NEXT:    umull v20.4s, v7.4h, v6.4h
2208; CHECK-GI-BASE-NEXT:    fmov w8, s18
2209; CHECK-GI-BASE-NEXT:    fmov w9, s4
2210; CHECK-GI-BASE-NEXT:    fmov w10, s5
2211; CHECK-GI-BASE-NEXT:    fmov w11, s0
2212; CHECK-GI-BASE-NEXT:    fmov w12, s19
2213; CHECK-GI-BASE-NEXT:    addv s4, v16.4s
2214; CHECK-GI-BASE-NEXT:    addv s5, v17.4s
2215; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
2216; CHECK-GI-BASE-NEXT:    umull2 v0.4s, v7.8h, v6.8h
2217; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2218; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
2219; CHECK-GI-BASE-NEXT:    add w9, w11, w12
2220; CHECK-GI-BASE-NEXT:    add w8, w8, w10
2221; CHECK-GI-BASE-NEXT:    fmov w10, s4
2222; CHECK-GI-BASE-NEXT:    fmov w11, s5
2223; CHECK-GI-BASE-NEXT:    fmov w12, s2
2224; CHECK-GI-BASE-NEXT:    addv s4, v20.4s
2225; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
2226; CHECK-GI-BASE-NEXT:    add w9, w9, w10
2227; CHECK-GI-BASE-NEXT:    add w10, w11, w12
2228; CHECK-GI-BASE-NEXT:    fmov w11, s3
2229; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2230; CHECK-GI-BASE-NEXT:    add w9, w10, w11
2231; CHECK-GI-BASE-NEXT:    fmov w10, s1
2232; CHECK-GI-BASE-NEXT:    fmov w11, s0
2233; CHECK-GI-BASE-NEXT:    add w9, w9, w10
2234; CHECK-GI-BASE-NEXT:    fmov w10, s4
2235; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2236; CHECK-GI-BASE-NEXT:    add w9, w10, w11
2237; CHECK-GI-BASE-NEXT:    add w0, w8, w9
2238; CHECK-GI-BASE-NEXT:    ret
2239;
2240; CHECK-GI-DOT-LABEL: test_udot_v48i8:
2241; CHECK-GI-DOT:       // %bb.0: // %entry
2242; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
2243; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
2244; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
2245; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
2246; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
2247; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
2248; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
2249; CHECK-GI-DOT-NEXT:    udot v0.4s, v5.16b, v3.16b
2250; CHECK-GI-DOT-NEXT:    udot v1.4s, v6.16b, v4.16b
2251; CHECK-GI-DOT-NEXT:    udot v2.4s, v16.16b, v7.16b
2252; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
2253; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
2254; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
2255; CHECK-GI-DOT-NEXT:    fmov w8, s0
2256; CHECK-GI-DOT-NEXT:    fmov w9, s1
2257; CHECK-GI-DOT-NEXT:    add w8, w8, w9
2258; CHECK-GI-DOT-NEXT:    fmov w9, s2
2259; CHECK-GI-DOT-NEXT:    add w0, w8, w9
2260; CHECK-GI-DOT-NEXT:    ret
2261entry:
2262  %a = load <48 x i8>, ptr %p1
2263  %b = load <48 x i8>, ptr %p2
2264  %0 = zext <48 x i8> %a to <48 x i32>
2265  %1 = zext <48 x i8> %b to <48 x i32>
2266  %2 = mul nuw nsw <48 x i32> %1, %0
2267  %3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
2268  ret i32 %3
2269}
2270
; Signed dot-product pattern on <8 x i8>: both arguments are sign-extended to
; <8 x i32>, multiplied lane-wise, and the products are add-reduced to an i32.
; Expected output with -mattr=+dotprod (DOT prefixes, with and without
; -global-isel): a zeroed accumulator, one 8-byte sdot, and an addp over the
; two 32-bit lanes.
; Expected output without dotprod (BASE prefixes): without -global-isel a
; widening smull to 16-bit lanes followed by saddlv; with -global-isel the
; inputs are widened with sshll, multiplied with smull/smlal2 into 32-bit
; lanes, and reduced with addv.
; NOTE(review): the multiply is flagged nuw although both operands come from
; sext; this looks carried over from the udot variant -- confirm it is
; intended.
2271define i32 @test_sdot_v8i8(<8 x i8> %a, <8 x i8> %b) {
2272; CHECK-SD-BASE-LABEL: test_sdot_v8i8:
2273; CHECK-SD-BASE:       // %bb.0: // %entry
2274; CHECK-SD-BASE-NEXT:    smull v0.8h, v1.8b, v0.8b
2275; CHECK-SD-BASE-NEXT:    saddlv s0, v0.8h
2276; CHECK-SD-BASE-NEXT:    fmov w0, s0
2277; CHECK-SD-BASE-NEXT:    ret
2278;
2279; CHECK-SD-DOT-LABEL: test_sdot_v8i8:
2280; CHECK-SD-DOT:       // %bb.0: // %entry
2281; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
2282; CHECK-SD-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b
2283; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
2284; CHECK-SD-DOT-NEXT:    fmov w0, s0
2285; CHECK-SD-DOT-NEXT:    ret
2286;
2287; CHECK-GI-BASE-LABEL: test_sdot_v8i8:
2288; CHECK-GI-BASE:       // %bb.0: // %entry
2289; CHECK-GI-BASE-NEXT:    sshll v0.8h, v0.8b, #0
2290; CHECK-GI-BASE-NEXT:    sshll v1.8h, v1.8b, #0
2291; CHECK-GI-BASE-NEXT:    smull v2.4s, v1.4h, v0.4h
2292; CHECK-GI-BASE-NEXT:    smlal2 v2.4s, v1.8h, v0.8h
2293; CHECK-GI-BASE-NEXT:    addv s0, v2.4s
2294; CHECK-GI-BASE-NEXT:    fmov w0, s0
2295; CHECK-GI-BASE-NEXT:    ret
2296;
2297; CHECK-GI-DOT-LABEL: test_sdot_v8i8:
2298; CHECK-GI-DOT:       // %bb.0: // %entry
2299; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
2300; CHECK-GI-DOT-NEXT:    sdot v2.2s, v1.8b, v0.8b
2301; CHECK-GI-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
2302; CHECK-GI-DOT-NEXT:    fmov w0, s0
2303; CHECK-GI-DOT-NEXT:    ret
2304entry:
2305  %0 = sext <8 x i8> %a to <8 x i32>
2306  %1 = sext <8 x i8> %b to <8 x i32>
2307  %2 = mul nuw nsw <8 x i32> %1, %0
2308  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
2309  ret i32 %3
2310}
2311
; Signed dot-product pattern on <16 x i8>: both arguments are sign-extended to
; <16 x i32>, multiplied lane-wise, and the products are add-reduced to an i32.
; Expected output with -mattr=+dotprod (DOT prefixes, with and without
; -global-isel): a zeroed accumulator, one 16-byte sdot, and an addv.
; Expected output without dotprod (BASE prefixes): without -global-isel the
; halves are multiplied with smull/smull2 to 16-bit lanes and combined with
; saddl/saddl2 before the final addv; with -global-isel the inputs are widened
; with sshll/sshll2 and multiplied with smull/smlal2 into two 32-bit
; accumulators that are added and then reduced with addv.
; NOTE(review): the multiply is flagged nuw although both operands come from
; sext; this looks carried over from the udot variant -- confirm it is
; intended.
2312define i32 @test_sdot_v16i8(<16 x i8> %a, <16 x i8> %b) {
2313; CHECK-SD-BASE-LABEL: test_sdot_v16i8:
2314; CHECK-SD-BASE:       // %bb.0: // %entry
2315; CHECK-SD-BASE-NEXT:    smull2 v2.8h, v1.16b, v0.16b
2316; CHECK-SD-BASE-NEXT:    smull v0.8h, v1.8b, v0.8b
2317; CHECK-SD-BASE-NEXT:    saddl2 v1.4s, v0.8h, v2.8h
2318; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v2.4h
2319; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
2320; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
2321; CHECK-SD-BASE-NEXT:    fmov w0, s0
2322; CHECK-SD-BASE-NEXT:    ret
2323;
2324; CHECK-SD-DOT-LABEL: test_sdot_v16i8:
2325; CHECK-SD-DOT:       // %bb.0: // %entry
2326; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
2327; CHECK-SD-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b
2328; CHECK-SD-DOT-NEXT:    addv s0, v2.4s
2329; CHECK-SD-DOT-NEXT:    fmov w0, s0
2330; CHECK-SD-DOT-NEXT:    ret
2331;
2332; CHECK-GI-BASE-LABEL: test_sdot_v16i8:
2333; CHECK-GI-BASE:       // %bb.0: // %entry
2334; CHECK-GI-BASE-NEXT:    sshll v2.8h, v0.8b, #0
2335; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
2336; CHECK-GI-BASE-NEXT:    sshll v3.8h, v1.8b, #0
2337; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
2338; CHECK-GI-BASE-NEXT:    smull v4.4s, v3.4h, v2.4h
2339; CHECK-GI-BASE-NEXT:    smull v5.4s, v1.4h, v0.4h
2340; CHECK-GI-BASE-NEXT:    smlal2 v4.4s, v3.8h, v2.8h
2341; CHECK-GI-BASE-NEXT:    smlal2 v5.4s, v1.8h, v0.8h
2342; CHECK-GI-BASE-NEXT:    add v0.4s, v4.4s, v5.4s
2343; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
2344; CHECK-GI-BASE-NEXT:    fmov w0, s0
2345; CHECK-GI-BASE-NEXT:    ret
2346;
2347; CHECK-GI-DOT-LABEL: test_sdot_v16i8:
2348; CHECK-GI-DOT:       // %bb.0: // %entry
2349; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
2350; CHECK-GI-DOT-NEXT:    sdot v2.4s, v1.16b, v0.16b
2351; CHECK-GI-DOT-NEXT:    addv s0, v2.4s
2352; CHECK-GI-DOT-NEXT:    fmov w0, s0
2353; CHECK-GI-DOT-NEXT:    ret
2354entry:
2355  %0 = sext <16 x i8> %a to <16 x i32>
2356  %1 = sext <16 x i8> %b to <16 x i32>
2357  %2 = mul nuw nsw <16 x i32> %1, %0
2358  %3 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %2)
2359  ret i32 %3
2360}
2361
; Signed dot-product pattern on a non-power-of-two width: <24 x i8> vectors are
; loaded from %p1 and %p2, sign-extended to <24 x i32>, multiplied lane-wise,
; and the products are add-reduced to an i32.
; In every expected sequence the 24 bytes are fetched as a 16-byte load plus an
; 8-byte load at offset 16.
; Expected output with -mattr=+dotprod (DOT prefixes): without -global-isel a
; 16-byte sdot and an 8-byte sdot are reduced separately (addv / addp) and the
; scalars added; with -global-isel both parts go through the 16-byte sdot form
; into separate accumulators that are added before a single addv.
; Expected output without dotprod (BASE prefixes): without -global-isel an
; smull/saddl/saddw chain ending in one addv; with -global-isel six smull
; products are each reduced with addv and summed with scalar adds.
; NOTE(review): the multiply is flagged nuw although both operands come from
; sext; this looks carried over from the udot variant -- confirm it is
; intended.
2362define i32 @test_sdot_v24i8(ptr %p1, ptr %p2) {
2363; CHECK-SD-BASE-LABEL: test_sdot_v24i8:
2364; CHECK-SD-BASE:       // %bb.0: // %entry
2365; CHECK-SD-BASE-NEXT:    ldr q0, [x0]
2366; CHECK-SD-BASE-NEXT:    ldr q1, [x1]
2367; CHECK-SD-BASE-NEXT:    ldr d2, [x0, #16]
2368; CHECK-SD-BASE-NEXT:    ldr d3, [x1, #16]
2369; CHECK-SD-BASE-NEXT:    smull v2.8h, v3.8b, v2.8b
2370; CHECK-SD-BASE-NEXT:    smull v3.8h, v1.8b, v0.8b
2371; CHECK-SD-BASE-NEXT:    smull2 v0.8h, v1.16b, v0.16b
2372; CHECK-SD-BASE-NEXT:    saddl2 v1.4s, v3.8h, v2.8h
2373; CHECK-SD-BASE-NEXT:    saddl v2.4s, v3.4h, v2.4h
2374; CHECK-SD-BASE-NEXT:    saddw2 v1.4s, v1.4s, v0.8h
2375; CHECK-SD-BASE-NEXT:    saddw v0.4s, v2.4s, v0.4h
2376; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
2377; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
2378; CHECK-SD-BASE-NEXT:    fmov w0, s0
2379; CHECK-SD-BASE-NEXT:    ret
2380;
2381; CHECK-SD-DOT-LABEL: test_sdot_v24i8:
2382; CHECK-SD-DOT:       // %bb.0: // %entry
2383; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
2384; CHECK-SD-DOT-NEXT:    movi v1.2d, #0000000000000000
2385; CHECK-SD-DOT-NEXT:    ldr q2, [x0]
2386; CHECK-SD-DOT-NEXT:    ldr q3, [x1]
2387; CHECK-SD-DOT-NEXT:    ldr d4, [x0, #16]
2388; CHECK-SD-DOT-NEXT:    ldr d5, [x1, #16]
2389; CHECK-SD-DOT-NEXT:    sdot v1.2s, v5.8b, v4.8b
2390; CHECK-SD-DOT-NEXT:    sdot v0.4s, v3.16b, v2.16b
2391; CHECK-SD-DOT-NEXT:    addp v1.2s, v1.2s, v1.2s
2392; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
2393; CHECK-SD-DOT-NEXT:    fmov w8, s1
2394; CHECK-SD-DOT-NEXT:    fmov w9, s0
2395; CHECK-SD-DOT-NEXT:    add w0, w9, w8
2396; CHECK-SD-DOT-NEXT:    ret
2397;
2398; CHECK-GI-BASE-LABEL: test_sdot_v24i8:
2399; CHECK-GI-BASE:       // %bb.0: // %entry
2400; CHECK-GI-BASE-NEXT:    ldr q0, [x0]
2401; CHECK-GI-BASE-NEXT:    ldr q1, [x1]
2402; CHECK-GI-BASE-NEXT:    ldr d2, [x0, #16]
2403; CHECK-GI-BASE-NEXT:    ldr d3, [x1, #16]
2404; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
2405; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
2406; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
2407; CHECK-GI-BASE-NEXT:    sshll v2.8h, v2.8b, #0
2408; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
2409; CHECK-GI-BASE-NEXT:    sshll v3.8h, v3.8b, #0
2410; CHECK-GI-BASE-NEXT:    smull v6.4s, v5.4h, v4.4h
2411; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v5.8h, v4.8h
2412; CHECK-GI-BASE-NEXT:    smull2 v5.4s, v1.8h, v0.8h
2413; CHECK-GI-BASE-NEXT:    smull v7.4s, v3.4h, v2.4h
2414; CHECK-GI-BASE-NEXT:    smull v0.4s, v1.4h, v0.4h
2415; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v3.8h, v2.8h
2416; CHECK-GI-BASE-NEXT:    addv s2, v6.4s
2417; CHECK-GI-BASE-NEXT:    addv s3, v4.4s
2418; CHECK-GI-BASE-NEXT:    addv s4, v5.4s
2419; CHECK-GI-BASE-NEXT:    addv s5, v7.4s
2420; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
2421; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
2422; CHECK-GI-BASE-NEXT:    fmov w8, s2
2423; CHECK-GI-BASE-NEXT:    fmov w9, s3
2424; CHECK-GI-BASE-NEXT:    fmov w10, s4
2425; CHECK-GI-BASE-NEXT:    fmov w11, s5
2426; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2427; CHECK-GI-BASE-NEXT:    fmov w9, s0
2428; CHECK-GI-BASE-NEXT:    add w10, w10, w11
2429; CHECK-GI-BASE-NEXT:    fmov w11, s1
2430; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2431; CHECK-GI-BASE-NEXT:    add w9, w10, w11
2432; CHECK-GI-BASE-NEXT:    add w0, w8, w9
2433; CHECK-GI-BASE-NEXT:    ret
2434;
2435; CHECK-GI-DOT-LABEL: test_sdot_v24i8:
2436; CHECK-GI-DOT:       // %bb.0: // %entry
2437; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
2438; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
2439; CHECK-GI-DOT-NEXT:    ldr q2, [x0]
2440; CHECK-GI-DOT-NEXT:    ldr d3, [x0, #16]
2441; CHECK-GI-DOT-NEXT:    ldr q4, [x1]
2442; CHECK-GI-DOT-NEXT:    ldr d5, [x1, #16]
2443; CHECK-GI-DOT-NEXT:    sdot v1.4s, v4.16b, v2.16b
2444; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
2445; CHECK-GI-DOT-NEXT:    add v0.4s, v1.4s, v0.4s
2446; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
2447; CHECK-GI-DOT-NEXT:    fmov w0, s0
2448; CHECK-GI-DOT-NEXT:    ret
2449entry:
2450  %a = load <24 x i8>, ptr %p1
2451  %b = load <24 x i8>, ptr %p2
2452  %0 = sext <24 x i8> %a to <24 x i32>
2453  %1 = sext <24 x i8> %b to <24 x i32>
2454  %2 = mul nuw nsw <24 x i32> %1, %0
2455  %3 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %2)
2456  ret i32 %3
2457}
2458
2459define i32 @test_sdot_v48i8(ptr %p1, ptr %p2) {
2460; CHECK-SD-BASE-LABEL: test_sdot_v48i8:
2461; CHECK-SD-BASE:       // %bb.0: // %entry
2462; CHECK-SD-BASE-NEXT:    ldp q4, q0, [x0, #16]
2463; CHECK-SD-BASE-NEXT:    ldr q2, [x1, #32]
2464; CHECK-SD-BASE-NEXT:    ldp q1, q5, [x1]
2465; CHECK-SD-BASE-NEXT:    ldr q3, [x0]
2466; CHECK-SD-BASE-NEXT:    smull2 v6.8h, v2.16b, v0.16b
2467; CHECK-SD-BASE-NEXT:    smull v0.8h, v2.8b, v0.8b
2468; CHECK-SD-BASE-NEXT:    smull2 v7.8h, v1.16b, v3.16b
2469; CHECK-SD-BASE-NEXT:    smull v1.8h, v1.8b, v3.8b
2470; CHECK-SD-BASE-NEXT:    smull2 v2.8h, v5.16b, v4.16b
2471; CHECK-SD-BASE-NEXT:    smull v3.8h, v5.8b, v4.8b
2472; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v7.8h, v6.8h
2473; CHECK-SD-BASE-NEXT:    saddl2 v5.4s, v1.8h, v0.8h
2474; CHECK-SD-BASE-NEXT:    saddl v6.4s, v7.4h, v6.4h
2475; CHECK-SD-BASE-NEXT:    saddl v0.4s, v1.4h, v0.4h
2476; CHECK-SD-BASE-NEXT:    saddw2 v1.4s, v4.4s, v2.8h
2477; CHECK-SD-BASE-NEXT:    saddw2 v4.4s, v5.4s, v3.8h
2478; CHECK-SD-BASE-NEXT:    saddw v2.4s, v6.4s, v2.4h
2479; CHECK-SD-BASE-NEXT:    saddw v0.4s, v0.4s, v3.4h
2480; CHECK-SD-BASE-NEXT:    add v1.4s, v4.4s, v1.4s
2481; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
2482; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
2483; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
2484; CHECK-SD-BASE-NEXT:    fmov w0, s0
2485; CHECK-SD-BASE-NEXT:    ret
2486;
2487; CHECK-SD-DOT-LABEL: test_sdot_v48i8:
2488; CHECK-SD-DOT:       // %bb.0: // %entry
2489; CHECK-SD-DOT-NEXT:    movi v0.2d, #0000000000000000
2490; CHECK-SD-DOT-NEXT:    ldr q1, [x0, #32]
2491; CHECK-SD-DOT-NEXT:    ldr q2, [x1, #32]
2492; CHECK-SD-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
2493; CHECK-SD-DOT-NEXT:    ldp q3, q1, [x0]
2494; CHECK-SD-DOT-NEXT:    ldp q4, q2, [x1]
2495; CHECK-SD-DOT-NEXT:    sdot v0.4s, v4.16b, v3.16b
2496; CHECK-SD-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
2497; CHECK-SD-DOT-NEXT:    addv s0, v0.4s
2498; CHECK-SD-DOT-NEXT:    fmov w0, s0
2499; CHECK-SD-DOT-NEXT:    ret
2500;
2501; CHECK-GI-BASE-LABEL: test_sdot_v48i8:
2502; CHECK-GI-BASE:       // %bb.0: // %entry
2503; CHECK-GI-BASE-NEXT:    ldp q0, q3, [x1]
2504; CHECK-GI-BASE-NEXT:    ldr q6, [x0, #32]
2505; CHECK-GI-BASE-NEXT:    ldp q1, q2, [x0]
2506; CHECK-GI-BASE-NEXT:    ldr q7, [x1, #32]
2507; CHECK-GI-BASE-NEXT:    sshll v20.8h, v6.8b, #0
2508; CHECK-GI-BASE-NEXT:    sshll2 v6.8h, v6.16b, #0
2509; CHECK-GI-BASE-NEXT:    sshll v4.8h, v0.8b, #0
2510; CHECK-GI-BASE-NEXT:    sshll2 v0.8h, v0.16b, #0
2511; CHECK-GI-BASE-NEXT:    sshll v16.8h, v3.8b, #0
2512; CHECK-GI-BASE-NEXT:    sshll v5.8h, v1.8b, #0
2513; CHECK-GI-BASE-NEXT:    sshll2 v1.8h, v1.16b, #0
2514; CHECK-GI-BASE-NEXT:    sshll v17.8h, v2.8b, #0
2515; CHECK-GI-BASE-NEXT:    sshll2 v3.8h, v3.16b, #0
2516; CHECK-GI-BASE-NEXT:    sshll2 v2.8h, v2.16b, #0
2517; CHECK-GI-BASE-NEXT:    smull v18.4s, v4.4h, v5.4h
2518; CHECK-GI-BASE-NEXT:    smull2 v4.4s, v4.8h, v5.8h
2519; CHECK-GI-BASE-NEXT:    smull v5.4s, v0.4h, v1.4h
2520; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v0.8h, v1.8h
2521; CHECK-GI-BASE-NEXT:    smull v19.4s, v16.4h, v17.4h
2522; CHECK-GI-BASE-NEXT:    sshll v1.8h, v7.8b, #0
2523; CHECK-GI-BASE-NEXT:    smull2 v16.4s, v16.8h, v17.8h
2524; CHECK-GI-BASE-NEXT:    smull v17.4s, v3.4h, v2.4h
2525; CHECK-GI-BASE-NEXT:    smull2 v2.4s, v3.8h, v2.8h
2526; CHECK-GI-BASE-NEXT:    sshll2 v7.8h, v7.16b, #0
2527; CHECK-GI-BASE-NEXT:    addv s18, v18.4s
2528; CHECK-GI-BASE-NEXT:    addv s4, v4.4s
2529; CHECK-GI-BASE-NEXT:    addv s5, v5.4s
2530; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
2531; CHECK-GI-BASE-NEXT:    addv s19, v19.4s
2532; CHECK-GI-BASE-NEXT:    smull v3.4s, v1.4h, v20.4h
2533; CHECK-GI-BASE-NEXT:    addv s2, v2.4s
2534; CHECK-GI-BASE-NEXT:    smull2 v1.4s, v1.8h, v20.8h
2535; CHECK-GI-BASE-NEXT:    smull v20.4s, v7.4h, v6.4h
2536; CHECK-GI-BASE-NEXT:    fmov w8, s18
2537; CHECK-GI-BASE-NEXT:    fmov w9, s4
2538; CHECK-GI-BASE-NEXT:    fmov w10, s5
2539; CHECK-GI-BASE-NEXT:    fmov w11, s0
2540; CHECK-GI-BASE-NEXT:    fmov w12, s19
2541; CHECK-GI-BASE-NEXT:    addv s4, v16.4s
2542; CHECK-GI-BASE-NEXT:    addv s5, v17.4s
2543; CHECK-GI-BASE-NEXT:    addv s3, v3.4s
2544; CHECK-GI-BASE-NEXT:    smull2 v0.4s, v7.8h, v6.8h
2545; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2546; CHECK-GI-BASE-NEXT:    addv s1, v1.4s
2547; CHECK-GI-BASE-NEXT:    add w9, w11, w12
2548; CHECK-GI-BASE-NEXT:    add w8, w8, w10
2549; CHECK-GI-BASE-NEXT:    fmov w10, s4
2550; CHECK-GI-BASE-NEXT:    fmov w11, s5
2551; CHECK-GI-BASE-NEXT:    fmov w12, s2
2552; CHECK-GI-BASE-NEXT:    addv s4, v20.4s
2553; CHECK-GI-BASE-NEXT:    addv s0, v0.4s
2554; CHECK-GI-BASE-NEXT:    add w9, w9, w10
2555; CHECK-GI-BASE-NEXT:    add w10, w11, w12
2556; CHECK-GI-BASE-NEXT:    fmov w11, s3
2557; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2558; CHECK-GI-BASE-NEXT:    add w9, w10, w11
2559; CHECK-GI-BASE-NEXT:    fmov w10, s1
2560; CHECK-GI-BASE-NEXT:    fmov w11, s0
2561; CHECK-GI-BASE-NEXT:    add w9, w9, w10
2562; CHECK-GI-BASE-NEXT:    fmov w10, s4
2563; CHECK-GI-BASE-NEXT:    add w8, w8, w9
2564; CHECK-GI-BASE-NEXT:    add w9, w10, w11
2565; CHECK-GI-BASE-NEXT:    add w0, w8, w9
2566; CHECK-GI-BASE-NEXT:    ret
2567;
2568; CHECK-GI-DOT-LABEL: test_sdot_v48i8:
2569; CHECK-GI-DOT:       // %bb.0: // %entry
2570; CHECK-GI-DOT-NEXT:    movi v0.2d, #0000000000000000
2571; CHECK-GI-DOT-NEXT:    movi v1.2d, #0000000000000000
2572; CHECK-GI-DOT-NEXT:    ldr q7, [x0, #32]
2573; CHECK-GI-DOT-NEXT:    ldp q3, q4, [x0]
2574; CHECK-GI-DOT-NEXT:    movi v2.2d, #0000000000000000
2575; CHECK-GI-DOT-NEXT:    ldp q5, q6, [x1]
2576; CHECK-GI-DOT-NEXT:    ldr q16, [x1, #32]
2577; CHECK-GI-DOT-NEXT:    sdot v0.4s, v5.16b, v3.16b
2578; CHECK-GI-DOT-NEXT:    sdot v1.4s, v6.16b, v4.16b
2579; CHECK-GI-DOT-NEXT:    sdot v2.4s, v16.16b, v7.16b
2580; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
2581; CHECK-GI-DOT-NEXT:    addv s1, v1.4s
2582; CHECK-GI-DOT-NEXT:    addv s2, v2.4s
2583; CHECK-GI-DOT-NEXT:    fmov w8, s0
2584; CHECK-GI-DOT-NEXT:    fmov w9, s1
2585; CHECK-GI-DOT-NEXT:    add w8, w8, w9
2586; CHECK-GI-DOT-NEXT:    fmov w9, s2
2587; CHECK-GI-DOT-NEXT:    add w0, w8, w9
2588; CHECK-GI-DOT-NEXT:    ret
2589entry:
2590  %a = load <48 x i8>, ptr %p1
2591  %b = load <48 x i8>, ptr %p2
2592  %0 = sext <48 x i8> %a to <48 x i32>
2593  %1 = sext <48 x i8> %b to <48 x i32>
2594  %2 = mul nuw nsw <48 x i32> %1, %0
2595  %3 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %2)
2596  ret i32 %3
2597}
2598
2599; Test to ensure that if G_MUL has more than 1 use, it should not be combined to UDOT
; The widened product %2 feeds both the add-reduction and an extractelement of
; lane 0, and the two scalars are then summed. The GlobalISel expectations are
; shared by the runs with and without +dotprod and contain no udot; the
; SelectionDAG +dotprod expectations keep a udot next to a separate umull.
2600define i32 @test_udot_v8i8_multi_use(<8 x i8> %a, <8 x i8> %b) {
2601; CHECK-SD-BASE-LABEL: test_udot_v8i8_multi_use:
2602; CHECK-SD-BASE:       // %bb.0: // %entry
2603; CHECK-SD-BASE-NEXT:    umull v0.8h, v1.8b, v0.8b
2604; CHECK-SD-BASE-NEXT:    uaddlv s1, v0.8h
2605; CHECK-SD-BASE-NEXT:    ushll v0.4s, v0.4h, #0
2606; CHECK-SD-BASE-NEXT:    fmov w9, s0
2607; CHECK-SD-BASE-NEXT:    fmov w8, s1
2608; CHECK-SD-BASE-NEXT:    add w0, w8, w9
2609; CHECK-SD-BASE-NEXT:    ret
2610;
2611; CHECK-SD-DOT-LABEL: test_udot_v8i8_multi_use:
2612; CHECK-SD-DOT:       // %bb.0: // %entry
2613; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
2614; CHECK-SD-DOT-NEXT:    umull v3.8h, v1.8b, v0.8b
2615; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v0.8b
2616; CHECK-SD-DOT-NEXT:    ushll v0.4s, v3.4h, #0
2617; CHECK-SD-DOT-NEXT:    fmov w9, s0
2618; CHECK-SD-DOT-NEXT:    addp v1.2s, v2.2s, v2.2s
2619; CHECK-SD-DOT-NEXT:    fmov w8, s1
2620; CHECK-SD-DOT-NEXT:    add w0, w8, w9
2621; CHECK-SD-DOT-NEXT:    ret
2622;
2623; CHECK-GI-LABEL: test_udot_v8i8_multi_use:
2624; CHECK-GI:       // %bb.0: // %entry
2625; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
2626; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
2627; CHECK-GI-NEXT:    umull v2.4s, v1.4h, v0.4h
2628; CHECK-GI-NEXT:    mov v3.16b, v2.16b
2629; CHECK-GI-NEXT:    fmov w8, s2
2630; CHECK-GI-NEXT:    umlal2 v3.4s, v1.8h, v0.8h
2631; CHECK-GI-NEXT:    addv s0, v3.4s
2632; CHECK-GI-NEXT:    fmov w9, s0
2633; CHECK-GI-NEXT:    add w0, w9, w8
2634; CHECK-GI-NEXT:    ret
2635entry:
2636  %0 = zext <8 x i8> %a to <8 x i32>
2637  %1 = zext <8 x i8> %b to <8 x i32>
2638  %2 = mul nuw nsw <8 x i32> %1, %0
2639  %3 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %2)
2640  %4 = extractelement <8 x i32> %2, i32 0
2641  %5 = add nuw nsw i32 %3, %4
2642  ret i32 %5
2643}
2644
; Sum of two independent i16 add-reductions over <8 x i16> inputs, with no
; widening of the elements; the result is returned zeroext.
; SelectionDAG expectation: one vector add followed by a single addv.
; GlobalISel expectation: one addv per input, a scalar add, then a 0xffff mask.
2645define zeroext i16 @add_pair_v8i16_v8i16(<8 x i16> %x, <8 x i16> %y) {
2646; CHECK-SD-LABEL: add_pair_v8i16_v8i16:
2647; CHECK-SD:       // %bb.0: // %entry
2648; CHECK-SD-NEXT:    add v0.8h, v0.8h, v1.8h
2649; CHECK-SD-NEXT:    addv h0, v0.8h
2650; CHECK-SD-NEXT:    fmov w0, s0
2651; CHECK-SD-NEXT:    ret
2652;
2653; CHECK-GI-LABEL: add_pair_v8i16_v8i16:
2654; CHECK-GI:       // %bb.0: // %entry
2655; CHECK-GI-NEXT:    addv h0, v0.8h
2656; CHECK-GI-NEXT:    addv h1, v1.8h
2657; CHECK-GI-NEXT:    fmov w8, s0
2658; CHECK-GI-NEXT:    fmov w9, s1
2659; CHECK-GI-NEXT:    add w8, w9, w8, uxth
2660; CHECK-GI-NEXT:    and w0, w8, #0xffff
2661; CHECK-GI-NEXT:    ret
2662entry:
2663  %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %x)
2664  %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %y)
2665  %z = add i16 %z1, %z2
2666  ret i16 %z
2667}
2668
; Each <8 x i16> input is zero-extended to <8 x i64> and add-reduced; the two
; i64 sums are then added.
; SelectionDAG expectation: widen with ushll/uaddl, combine with vector adds,
; and finish with one addp. GlobalISel expectation: one uaddlv per input and a
; scalar add that zero-extends the 32-bit partial sum (uxtw).
2669define i64 @add_pair_v8i16_v8i64_zext(<8 x i16> %x, <8 x i16> %y) {
2670; CHECK-SD-LABEL: add_pair_v8i16_v8i64_zext:
2671; CHECK-SD:       // %bb.0: // %entry
2672; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
2673; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
2674; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0
2675; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
2676; CHECK-SD-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
2677; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v2.2s
2678; CHECK-SD-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
2679; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s
2680; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
2681; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
2682; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
2683; CHECK-SD-NEXT:    addp d0, v0.2d
2684; CHECK-SD-NEXT:    fmov x0, d0
2685; CHECK-SD-NEXT:    ret
2686;
2687; CHECK-GI-LABEL: add_pair_v8i16_v8i64_zext:
2688; CHECK-GI:       // %bb.0: // %entry
2689; CHECK-GI-NEXT:    uaddlv s1, v1.8h
2690; CHECK-GI-NEXT:    uaddlv s0, v0.8h
2691; CHECK-GI-NEXT:    mov w8, v1.s[0]
2692; CHECK-GI-NEXT:    fmov w9, s0
2693; CHECK-GI-NEXT:    add x0, x8, w9, uxtw
2694; CHECK-GI-NEXT:    ret
2695entry:
2696  %xx = zext <8 x i16> %x to <8 x i64>
2697  %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
2698  %yy = zext <8 x i16> %y to <8 x i64>
2699  %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
2700  %z = add i64 %z1, %z2
2701  ret i64 %z
2702}
2703
; Each <8 x i16> input is sign-extended to <8 x i64> and add-reduced; the two
; i64 sums are then added.
; SelectionDAG expectation: widen with sshll/saddl, combine with vector adds,
; and finish with one addp. GlobalISel expectation: one saddlv per input and a
; scalar add that sign-extends the 32-bit partial sum (smov / sxtw).
2704define i64 @add_pair_v8i16_v8i64_sext(<8 x i16> %x, <8 x i16> %y) {
2705; CHECK-SD-LABEL: add_pair_v8i16_v8i64_sext:
2706; CHECK-SD:       // %bb.0: // %entry
2707; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
2708; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
2709; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
2710; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
2711; CHECK-SD-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
2712; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v2.2s
2713; CHECK-SD-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
2714; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s
2715; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
2716; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
2717; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
2718; CHECK-SD-NEXT:    addp d0, v0.2d
2719; CHECK-SD-NEXT:    fmov x0, d0
2720; CHECK-SD-NEXT:    ret
2721;
2722; CHECK-GI-LABEL: add_pair_v8i16_v8i64_sext:
2723; CHECK-GI:       // %bb.0: // %entry
2724; CHECK-GI-NEXT:    saddlv s1, v1.8h
2725; CHECK-GI-NEXT:    saddlv s0, v0.8h
2726; CHECK-GI-NEXT:    smov x8, v1.s[0]
2727; CHECK-GI-NEXT:    fmov w9, s0
2728; CHECK-GI-NEXT:    add x0, x8, w9, sxtw
2729; CHECK-GI-NEXT:    ret
2730entry:
2731  %xx = sext <8 x i16> %x to <8 x i64>
2732  %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
2733  %yy = sext <8 x i16> %y to <8 x i64>
2734  %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
2735  %z = add i64 %z1, %z2
2736  ret i64 %z
2737}
2738
; Each <4 x i16> input is zero-extended to <4 x i64> and add-reduced; the two
; i64 sums are then added.
; SelectionDAG expectation: uaddlp on one input, uadalp accumulating the other,
; then one addp. GlobalISel expectation: one uaddlv per input plus a scalar add
; with uxtw.
2739define i64 @add_pair_v4i16_v4i64_zext(<4 x i16> %x, <4 x i16> %y) {
2740; CHECK-SD-LABEL: add_pair_v4i16_v4i64_zext:
2741; CHECK-SD:       // %bb.0: // %entry
2742; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
2743; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
2744; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s
2745; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s
2746; CHECK-SD-NEXT:    addp d0, v1.2d
2747; CHECK-SD-NEXT:    fmov x0, d0
2748; CHECK-SD-NEXT:    ret
2749;
2750; CHECK-GI-LABEL: add_pair_v4i16_v4i64_zext:
2751; CHECK-GI:       // %bb.0: // %entry
2752; CHECK-GI-NEXT:    uaddlv s1, v1.4h
2753; CHECK-GI-NEXT:    uaddlv s0, v0.4h
2754; CHECK-GI-NEXT:    mov w8, v1.s[0]
2755; CHECK-GI-NEXT:    fmov w9, s0
2756; CHECK-GI-NEXT:    add x0, x8, w9, uxtw
2757; CHECK-GI-NEXT:    ret
2758entry:
2759  %xx = zext <4 x i16> %x to <4 x i64>
2760  %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
2761  %yy = zext <4 x i16> %y to <4 x i64>
2762  %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
2763  %z = add i64 %z1, %z2
2764  ret i64 %z
2765}
2766
; Each <4 x i16> input is sign-extended to <4 x i64> and add-reduced; the two
; i64 sums are then added.
; SelectionDAG expectation: saddlp on one input, sadalp accumulating the other,
; then one addp. GlobalISel expectation: one saddlv per input plus a scalar add
; with smov / sxtw.
2767define i64 @add_pair_v4i16_v4i64_sext(<4 x i16> %x, <4 x i16> %y) {
2768; CHECK-SD-LABEL: add_pair_v4i16_v4i64_sext:
2769; CHECK-SD:       // %bb.0: // %entry
2770; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
2771; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
2772; CHECK-SD-NEXT:    saddlp v1.2d, v1.4s
2773; CHECK-SD-NEXT:    sadalp v1.2d, v0.4s
2774; CHECK-SD-NEXT:    addp d0, v1.2d
2775; CHECK-SD-NEXT:    fmov x0, d0
2776; CHECK-SD-NEXT:    ret
2777;
2778; CHECK-GI-LABEL: add_pair_v4i16_v4i64_sext:
2779; CHECK-GI:       // %bb.0: // %entry
2780; CHECK-GI-NEXT:    saddlv s1, v1.4h
2781; CHECK-GI-NEXT:    saddlv s0, v0.4h
2782; CHECK-GI-NEXT:    smov x8, v1.s[0]
2783; CHECK-GI-NEXT:    fmov w9, s0
2784; CHECK-GI-NEXT:    add x0, x8, w9, sxtw
2785; CHECK-GI-NEXT:    ret
2786entry:
2787  %xx = sext <4 x i16> %x to <4 x i64>
2788  %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
2789  %yy = sext <4 x i16> %y to <4 x i64>
2790  %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
2791  %z = add i64 %z1, %z2
2792  ret i64 %z
2793}
2794
; Each <2 x i16> input is zero-extended to <2 x i64> and add-reduced; the two
; i64 sums are then added.
; SelectionDAG expectation: place both inputs in one q register, mask every
; element down to its low 16 bits, and reduce once with uaddlv.
; GlobalISel expectation: widen and mask each input separately, reduce each
; with addp, and add the scalars.
2795define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) {
2796; CHECK-SD-LABEL: add_pair_v2i16_v2i64_zext:
2797; CHECK-SD:       // %bb.0: // %entry
2798; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
2799; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
2800; CHECK-SD-NEXT:    movi v2.2d, #0x00ffff0000ffff
2801; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
2802; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
2803; CHECK-SD-NEXT:    uaddlv d0, v0.4s
2804; CHECK-SD-NEXT:    fmov x0, d0
2805; CHECK-SD-NEXT:    ret
2806;
2807; CHECK-GI-LABEL: add_pair_v2i16_v2i64_zext:
2808; CHECK-GI:       // %bb.0: // %entry
2809; CHECK-GI-NEXT:    movi v2.2d, #0x0000000000ffff
2810; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
2811; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
2812; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
2813; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
2814; CHECK-GI-NEXT:    addp d0, v0.2d
2815; CHECK-GI-NEXT:    addp d1, v1.2d
2816; CHECK-GI-NEXT:    fmov x8, d0
2817; CHECK-GI-NEXT:    fmov x9, d1
2818; CHECK-GI-NEXT:    add x0, x8, x9
2819; CHECK-GI-NEXT:    ret
2820entry:
2821  %xx = zext <2 x i16> %x to <2 x i64>
2822  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
2823  %yy = zext <2 x i16> %y to <2 x i64>
2824  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
2825  %z = add i64 %z1, %z2
2826  ret i64 %z
2827}
2828
; Each <2 x i16> input is sign-extended to <2 x i64> and add-reduced; the two
; i64 sums are then added.
; Both selectors are expected to sign-extend in-register with a shl #48 /
; arithmetic-shift-right #48 pair. SelectionDAG folds the second shift into an
; accumulating ssra and reduces once; GlobalISel reduces each input with its
; own addp and adds the scalars.
2829define i64 @add_pair_v2i16_v2i64_sext(<2 x i16> %x, <2 x i16> %y) {
2830; CHECK-SD-LABEL: add_pair_v2i16_v2i64_sext:
2831; CHECK-SD:       // %bb.0: // %entry
2832; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
2833; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
2834; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #48
2835; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #48
2836; CHECK-SD-NEXT:    sshr v0.2d, v0.2d, #48
2837; CHECK-SD-NEXT:    ssra v0.2d, v1.2d, #48
2838; CHECK-SD-NEXT:    addp d0, v0.2d
2839; CHECK-SD-NEXT:    fmov x0, d0
2840; CHECK-SD-NEXT:    ret
2841;
2842; CHECK-GI-LABEL: add_pair_v2i16_v2i64_sext:
2843; CHECK-GI:       // %bb.0: // %entry
2844; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
2845; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
2846; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #48
2847; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #48
2848; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #48
2849; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #48
2850; CHECK-GI-NEXT:    addp d0, v0.2d
2851; CHECK-GI-NEXT:    addp d1, v1.2d
2852; CHECK-GI-NEXT:    fmov x8, d0
2853; CHECK-GI-NEXT:    fmov x9, d1
2854; CHECK-GI-NEXT:    add x0, x8, x9
2855; CHECK-GI-NEXT:    ret
2856entry:
2857  %xx = sext <2 x i16> %x to <2 x i64>
2858  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
2859  %yy = sext <2 x i16> %y to <2 x i64>
2860  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
2861  %z = add i64 %z1, %z2
2862  ret i64 %z
2863}
2864
; Each <16 x i8> input is zero-extended to <16 x i32> and add-reduced; the two
; i32 sums are then added. Expectations differ per selector and per +dotprod:
;  - SelectionDAG, no dotprod: ushll/uaddl widening tree and a single addv.
;  - SelectionDAG, dotprod: two udot against a splat of 1 accumulating into the
;    same register, then a single addv.
;  - GlobalISel, no dotprod: one uaddlv per input, scalars combined with
;    16-bit zero-extension.
;  - GlobalISel, dotprod: one udot and one addv per input, then a scalar add.
2865define i32 @add_pair_v16i8_v16i32_zext(<16 x i8> %x, <16 x i8> %y) {
2866; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_zext:
2867; CHECK-SD-BASE:       // %bb.0: // %entry
2868; CHECK-SD-BASE-NEXT:    ushll2 v2.8h, v0.16b, #0
2869; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
2870; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v1.16b, #0
2871; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
2872; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v0.8h, v2.8h
2873; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v2.4h
2874; CHECK-SD-BASE-NEXT:    uaddl2 v2.4s, v1.8h, v3.8h
2875; CHECK-SD-BASE-NEXT:    uaddl v1.4s, v1.4h, v3.4h
2876; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v4.4s
2877; CHECK-SD-BASE-NEXT:    add v1.4s, v1.4s, v2.4s
2878; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
2879; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
2880; CHECK-SD-BASE-NEXT:    fmov w0, s0
2881; CHECK-SD-BASE-NEXT:    ret
2882;
2883; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_zext:
2884; CHECK-SD-DOT:       // %bb.0: // %entry
2885; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
2886; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
2887; CHECK-SD-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
2888; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b
2889; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
2890; CHECK-SD-DOT-NEXT:    fmov w0, s0
2891; CHECK-SD-DOT-NEXT:    ret
2892;
2893; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_zext:
2894; CHECK-GI-BASE:       // %bb.0: // %entry
2895; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.16b
2896; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
2897; CHECK-GI-BASE-NEXT:    fmov w8, s1
2898; CHECK-GI-BASE-NEXT:    fmov w9, s0
2899; CHECK-GI-BASE-NEXT:    and w8, w8, #0xffff
2900; CHECK-GI-BASE-NEXT:    add w0, w8, w9, uxth
2901; CHECK-GI-BASE-NEXT:    ret
2902;
2903; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_zext:
2904; CHECK-GI-DOT:       // %bb.0: // %entry
2905; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
2906; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
2907; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
2908; CHECK-GI-DOT-NEXT:    udot v4.4s, v0.16b, v2.16b
2909; CHECK-GI-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
2910; CHECK-GI-DOT-NEXT:    addv s0, v4.4s
2911; CHECK-GI-DOT-NEXT:    addv s1, v3.4s
2912; CHECK-GI-DOT-NEXT:    fmov w8, s0
2913; CHECK-GI-DOT-NEXT:    fmov w9, s1
2914; CHECK-GI-DOT-NEXT:    add w0, w8, w9
2915; CHECK-GI-DOT-NEXT:    ret
2916entry:
2917  %xx = zext <16 x i8> %x to <16 x i32>
2918  %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
2919  %yy = zext <16 x i8> %y to <16 x i32>
2920  %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
2921  %z = add i32 %z1, %z2
2922  ret i32 %z
2923}
2924
; Each <16 x i8> input is sign-extended to <16 x i32> and add-reduced; the two
; i32 sums are then added. Expectations differ per selector and per +dotprod:
;  - SelectionDAG, no dotprod: sshll/saddl widening tree and a single addv.
;  - SelectionDAG, dotprod: two sdot against a splat of 1 accumulating into the
;    same register, then a single addv.
;  - GlobalISel, no dotprod: one saddlv per input, scalars combined with
;    16-bit sign-extension.
;  - GlobalISel, dotprod: one sdot and one addv per input, then a scalar add.
2925define i32 @add_pair_v16i8_v16i32_sext(<16 x i8> %x, <16 x i8> %y) {
2926; CHECK-SD-BASE-LABEL: add_pair_v16i8_v16i32_sext:
2927; CHECK-SD-BASE:       // %bb.0: // %entry
2928; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v0.16b, #0
2929; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
2930; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v1.16b, #0
2931; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
2932; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v0.8h, v2.8h
2933; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v2.4h
2934; CHECK-SD-BASE-NEXT:    saddl2 v2.4s, v1.8h, v3.8h
2935; CHECK-SD-BASE-NEXT:    saddl v1.4s, v1.4h, v3.4h
2936; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v4.4s
2937; CHECK-SD-BASE-NEXT:    add v1.4s, v1.4s, v2.4s
2938; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
2939; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
2940; CHECK-SD-BASE-NEXT:    fmov w0, s0
2941; CHECK-SD-BASE-NEXT:    ret
2942;
2943; CHECK-SD-DOT-LABEL: add_pair_v16i8_v16i32_sext:
2944; CHECK-SD-DOT:       // %bb.0: // %entry
2945; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
2946; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
2947; CHECK-SD-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
2948; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
2949; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
2950; CHECK-SD-DOT-NEXT:    fmov w0, s0
2951; CHECK-SD-DOT-NEXT:    ret
2952;
2953; CHECK-GI-BASE-LABEL: add_pair_v16i8_v16i32_sext:
2954; CHECK-GI-BASE:       // %bb.0: // %entry
2955; CHECK-GI-BASE-NEXT:    saddlv h1, v1.16b
2956; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
2957; CHECK-GI-BASE-NEXT:    fmov w8, s1
2958; CHECK-GI-BASE-NEXT:    fmov w9, s0
2959; CHECK-GI-BASE-NEXT:    sxth w8, w8
2960; CHECK-GI-BASE-NEXT:    add w0, w8, w9, sxth
2961; CHECK-GI-BASE-NEXT:    ret
2962;
2963; CHECK-GI-DOT-LABEL: add_pair_v16i8_v16i32_sext:
2964; CHECK-GI-DOT:       // %bb.0: // %entry
2965; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
2966; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
2967; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
2968; CHECK-GI-DOT-NEXT:    sdot v4.4s, v0.16b, v2.16b
2969; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
2970; CHECK-GI-DOT-NEXT:    addv s0, v4.4s
2971; CHECK-GI-DOT-NEXT:    addv s1, v3.4s
2972; CHECK-GI-DOT-NEXT:    fmov w8, s0
2973; CHECK-GI-DOT-NEXT:    fmov w9, s1
2974; CHECK-GI-DOT-NEXT:    add w0, w8, w9
2975; CHECK-GI-DOT-NEXT:    ret
2976entry:
2977  %xx = sext <16 x i8> %x to <16 x i32>
2978  %z1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %xx)
2979  %yy = sext <16 x i8> %y to <16 x i32>
2980  %z2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %yy)
2981  %z = add i32 %z1, %z2
2982  ret i32 %z
2983}
2984
; Each <8 x i8> input is zero-extended to <8 x i32> and add-reduced; the two
; i32 sums are then added. Expectations differ per selector and per +dotprod:
;  - SelectionDAG, no dotprod: ushll, then uaddlp + uadalp, then a single addv.
;  - SelectionDAG, dotprod: two 64-bit udot against a splat of 1 into one
;    accumulator, then addp.
;  - GlobalISel, no dotprod: one uaddlv per input, scalars combined with
;    16-bit zero-extension.
;  - GlobalISel, dotprod: one udot and one addp per input, then a scalar add.
2985define i32 @add_pair_v8i8_v8i32_zext(<8 x i8> %x, <8 x i8> %y) {
2986; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_zext:
2987; CHECK-SD-BASE:       // %bb.0: // %entry
2988; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
2989; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
2990; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
2991; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
2992; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
2993; CHECK-SD-BASE-NEXT:    fmov w0, s0
2994; CHECK-SD-BASE-NEXT:    ret
2995;
2996; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_zext:
2997; CHECK-SD-DOT:       // %bb.0: // %entry
2998; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
2999; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
3000; CHECK-SD-DOT-NEXT:    udot v2.2s, v1.8b, v3.8b
3001; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
3002; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
3003; CHECK-SD-DOT-NEXT:    fmov w0, s0
3004; CHECK-SD-DOT-NEXT:    ret
3005;
3006; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_zext:
3007; CHECK-GI-BASE:       // %bb.0: // %entry
3008; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
3009; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
3010; CHECK-GI-BASE-NEXT:    fmov w8, s1
3011; CHECK-GI-BASE-NEXT:    fmov w9, s0
3012; CHECK-GI-BASE-NEXT:    and w8, w8, #0xffff
3013; CHECK-GI-BASE-NEXT:    add w0, w8, w9, uxth
3014; CHECK-GI-BASE-NEXT:    ret
3015;
3016; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_zext:
3017; CHECK-GI-DOT:       // %bb.0: // %entry
3018; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
3019; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
3020; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
3021; CHECK-GI-DOT-NEXT:    udot v4.2s, v0.8b, v2.8b
3022; CHECK-GI-DOT-NEXT:    udot v3.2s, v1.8b, v2.8b
3023; CHECK-GI-DOT-NEXT:    addp v0.2s, v4.2s, v4.2s
3024; CHECK-GI-DOT-NEXT:    addp v1.2s, v3.2s, v3.2s
3025; CHECK-GI-DOT-NEXT:    fmov w8, s0
3026; CHECK-GI-DOT-NEXT:    fmov w9, s1
3027; CHECK-GI-DOT-NEXT:    add w0, w8, w9
3028; CHECK-GI-DOT-NEXT:    ret
3029entry:
3030  %xx = zext <8 x i8> %x to <8 x i32>
3031  %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
3032  %yy = zext <8 x i8> %y to <8 x i32>
3033  %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
3034  %z = add i32 %z1, %z2
3035  ret i32 %z
3036}
3037
; Each <8 x i8> input is sign-extended to <8 x i32> and add-reduced; the two
; i32 sums are then added. Expectations differ per selector and per +dotprod:
;  - SelectionDAG, no dotprod: sshll, then saddlp + sadalp, then a single addv.
;  - SelectionDAG, dotprod: two 64-bit sdot against a splat of 1 into one
;    accumulator, then addp.
;  - GlobalISel, no dotprod: one saddlv per input, scalars combined with
;    16-bit sign-extension.
;  - GlobalISel, dotprod: one sdot and one addp per input, then a scalar add.
3038define i32 @add_pair_v8i8_v8i32_sext(<8 x i8> %x, <8 x i8> %y) {
3039; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_sext:
3040; CHECK-SD-BASE:       // %bb.0: // %entry
3041; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
3042; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
3043; CHECK-SD-BASE-NEXT:    saddlp v1.4s, v1.8h
3044; CHECK-SD-BASE-NEXT:    sadalp v1.4s, v0.8h
3045; CHECK-SD-BASE-NEXT:    addv s0, v1.4s
3046; CHECK-SD-BASE-NEXT:    fmov w0, s0
3047; CHECK-SD-BASE-NEXT:    ret
3048;
3049; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_sext:
3050; CHECK-SD-DOT:       // %bb.0: // %entry
3051; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
3052; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
3053; CHECK-SD-DOT-NEXT:    sdot v2.2s, v1.8b, v3.8b
3054; CHECK-SD-DOT-NEXT:    sdot v2.2s, v0.8b, v3.8b
3055; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
3056; CHECK-SD-DOT-NEXT:    fmov w0, s0
3057; CHECK-SD-DOT-NEXT:    ret
3058;
3059; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_sext:
3060; CHECK-GI-BASE:       // %bb.0: // %entry
3061; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b
3062; CHECK-GI-BASE-NEXT:    saddlv h0, v0.8b
3063; CHECK-GI-BASE-NEXT:    fmov w8, s1
3064; CHECK-GI-BASE-NEXT:    fmov w9, s0
3065; CHECK-GI-BASE-NEXT:    sxth w8, w8
3066; CHECK-GI-BASE-NEXT:    add w0, w8, w9, sxth
3067; CHECK-GI-BASE-NEXT:    ret
3068;
3069; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_sext:
3070; CHECK-GI-DOT:       // %bb.0: // %entry
3071; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
3072; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
3073; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
3074; CHECK-GI-DOT-NEXT:    sdot v4.2s, v0.8b, v2.8b
3075; CHECK-GI-DOT-NEXT:    sdot v3.2s, v1.8b, v2.8b
3076; CHECK-GI-DOT-NEXT:    addp v0.2s, v4.2s, v4.2s
3077; CHECK-GI-DOT-NEXT:    addp v1.2s, v3.2s, v3.2s
3078; CHECK-GI-DOT-NEXT:    fmov w8, s0
3079; CHECK-GI-DOT-NEXT:    fmov w9, s1
3080; CHECK-GI-DOT-NEXT:    add w0, w8, w9
3081; CHECK-GI-DOT-NEXT:    ret
3082entry:
3083  %xx = sext <8 x i8> %x to <8 x i32>
3084  %z1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %xx)
3085  %yy = sext <8 x i8> %y to <8 x i32>
3086  %z2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %yy)
3087  %z = add i32 %z1, %z2
3088  ret i32 %z
3089}
3090
; Each <4 x i8> input is zero-extended to <4 x i32> and add-reduced; the two
; i32 sums are then added.
; SelectionDAG expectation: clear the upper byte of every halfword with bic,
; place both inputs in one q register, and reduce once with uaddlv.
; GlobalISel expectation: mask each input with 0x00ff per halfword, run one
; uaddlv per input, and combine the scalars with 16-bit zero-extension.
3091define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) {
3092; CHECK-SD-LABEL: add_pair_v4i8_v4i32_zext:
3093; CHECK-SD:       // %bb.0: // %entry
3094; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
3095; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
3096; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
3097; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
3098; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
3099; CHECK-SD-NEXT:    uaddlv s0, v0.8h
3100; CHECK-SD-NEXT:    fmov w0, s0
3101; CHECK-SD-NEXT:    ret
3102;
3103; CHECK-GI-LABEL: add_pair_v4i8_v4i32_zext:
3104; CHECK-GI:       // %bb.0: // %entry
3105; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
3106; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
3107; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
3108; CHECK-GI-NEXT:    uaddlv s1, v1.4h
3109; CHECK-GI-NEXT:    uaddlv s0, v0.4h
3110; CHECK-GI-NEXT:    fmov w8, s1
3111; CHECK-GI-NEXT:    fmov w9, s0
3112; CHECK-GI-NEXT:    and w8, w8, #0xffff
3113; CHECK-GI-NEXT:    add w0, w8, w9, uxth
3114; CHECK-GI-NEXT:    ret
3115entry:
3116  %xx = zext <4 x i8> %x to <4 x i32>
3117  %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
3118  %yy = zext <4 x i8> %y to <4 x i32>
3119  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
3120  %z = add i32 %z1, %z2
3121  ret i32 %z
3122}
3123
; Each <4 x i8> input is sign-extended to <4 x i32> and add-reduced; the two
; i32 sums are then added.
; SelectionDAG expectation: widen to 4s lanes, sign-extend in-register with
; shl #24 and sshr / accumulating ssra #24, then a single addv.
; GlobalISel expectation: sign-extend within 4h lanes (shl #8 / sshr #8), run
; one saddlv per input, and combine the scalars with 16-bit sign-extension.
3124define i32 @add_pair_v4i8_v4i32_sext(<4 x i8> %x, <4 x i8> %y) {
3125; CHECK-SD-LABEL: add_pair_v4i8_v4i32_sext:
3126; CHECK-SD:       // %bb.0: // %entry
3127; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
3128; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
3129; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
3130; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
3131; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
3132; CHECK-SD-NEXT:    ssra v0.4s, v1.4s, #24
3133; CHECK-SD-NEXT:    addv s0, v0.4s
3134; CHECK-SD-NEXT:    fmov w0, s0
3135; CHECK-SD-NEXT:    ret
3136;
3137; CHECK-GI-LABEL: add_pair_v4i8_v4i32_sext:
3138; CHECK-GI:       // %bb.0: // %entry
3139; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
3140; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
3141; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
3142; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
3143; CHECK-GI-NEXT:    saddlv s1, v1.4h
3144; CHECK-GI-NEXT:    saddlv s0, v0.4h
3145; CHECK-GI-NEXT:    fmov w8, s1
3146; CHECK-GI-NEXT:    fmov w9, s0
3147; CHECK-GI-NEXT:    sxth w8, w8
3148; CHECK-GI-NEXT:    add w0, w8, w9, sxth
3149; CHECK-GI-NEXT:    ret
3150entry:
3151  %xx = sext <4 x i8> %x to <4 x i32>
3152  %z1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %xx)
3153  %yy = sext <4 x i8> %y to <4 x i32>
3154  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %yy)
3155  %z = add i32 %z1, %z2
3156  ret i32 %z
3157}
3158
; Each <16 x i8> input is zero-extended to <16 x i16> and add-reduced; the two
; i16 sums are added and returned zeroext.
; SelectionDAG expectation: uaddlp on one input, uadalp accumulating the other,
; then a single addv. GlobalISel expectation: one uaddlv per input, a scalar
; add, and a 0xffff mask.
3159define zeroext i16 @add_pair_v16i8_v16i16_zext(<16 x i8> %x, <16 x i8> %y) {
3160; CHECK-SD-LABEL: add_pair_v16i8_v16i16_zext:
3161; CHECK-SD:       // %bb.0: // %entry
3162; CHECK-SD-NEXT:    uaddlp v1.8h, v1.16b
3163; CHECK-SD-NEXT:    uadalp v1.8h, v0.16b
3164; CHECK-SD-NEXT:    addv h0, v1.8h
3165; CHECK-SD-NEXT:    fmov w0, s0
3166; CHECK-SD-NEXT:    ret
3167;
3168; CHECK-GI-LABEL: add_pair_v16i8_v16i16_zext:
3169; CHECK-GI:       // %bb.0: // %entry
3170; CHECK-GI-NEXT:    uaddlv h0, v0.16b
3171; CHECK-GI-NEXT:    uaddlv h1, v1.16b
3172; CHECK-GI-NEXT:    fmov w8, s0
3173; CHECK-GI-NEXT:    fmov w9, s1
3174; CHECK-GI-NEXT:    add w8, w8, w9
3175; CHECK-GI-NEXT:    and w0, w8, #0xffff
3176; CHECK-GI-NEXT:    ret
3177entry:
3178  %xx = zext <16 x i8> %x to <16 x i16>
3179  %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
3180  %yy = zext <16 x i8> %y to <16 x i16>
3181  %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
3182  %z = add i16 %z1, %z2
3183  ret i16 %z
3184}
3185
; Each <16 x i8> input is sign-extended to <16 x i16> and add-reduced; the two
; i16 sums are added and returned signext.
; SelectionDAG expectation: saddlp on one input, sadalp accumulating the other,
; a single addv, and smov for the signed return. GlobalISel expectation: one
; saddlv per input, a scalar add, and sxth.
3186define signext i16 @add_pair_v16i8_v16i16_sext(<16 x i8> %x, <16 x i8> %y) {
3187; CHECK-SD-LABEL: add_pair_v16i8_v16i16_sext:
3188; CHECK-SD:       // %bb.0: // %entry
3189; CHECK-SD-NEXT:    saddlp v1.8h, v1.16b
3190; CHECK-SD-NEXT:    sadalp v1.8h, v0.16b
3191; CHECK-SD-NEXT:    addv h0, v1.8h
3192; CHECK-SD-NEXT:    smov w0, v0.h[0]
3193; CHECK-SD-NEXT:    ret
3194;
3195; CHECK-GI-LABEL: add_pair_v16i8_v16i16_sext:
3196; CHECK-GI:       // %bb.0: // %entry
3197; CHECK-GI-NEXT:    saddlv h0, v0.16b
3198; CHECK-GI-NEXT:    saddlv h1, v1.16b
3199; CHECK-GI-NEXT:    fmov w8, s0
3200; CHECK-GI-NEXT:    fmov w9, s1
3201; CHECK-GI-NEXT:    add w8, w8, w9
3202; CHECK-GI-NEXT:    sxth w0, w8
3203; CHECK-GI-NEXT:    ret
3204entry:
3205  %xx = sext <16 x i8> %x to <16 x i16>
3206  %z1 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %xx)
3207  %yy = sext <16 x i8> %y to <16 x i16>
3208  %z2 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %yy)
3209  %z = add i16 %z1, %z2
3210  ret i16 %z
3211}
3212
; Each <8 x i8> input is zero-extended to <8 x i16> and add-reduced; the two
; i16 sums are added and returned zeroext.
; SelectionDAG expectation: place both 64-bit inputs in one q register and
; reduce all 16 bytes with a single uaddlv. GlobalISel expectation: one uaddlv
; per input, a scalar add, and a 0xffff mask.
3213define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) {
3214; CHECK-SD-LABEL: add_pair_v8i8_v8i16_zext:
3215; CHECK-SD:       // %bb.0: // %entry
3216; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
3217; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
3218; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
3219; CHECK-SD-NEXT:    uaddlv h0, v0.16b
3220; CHECK-SD-NEXT:    umov w0, v0.h[0]
3221; CHECK-SD-NEXT:    ret
3222;
3223; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext:
3224; CHECK-GI:       // %bb.0: // %entry
3225; CHECK-GI-NEXT:    uaddlv h0, v0.8b
3226; CHECK-GI-NEXT:    uaddlv h1, v1.8b
3227; CHECK-GI-NEXT:    fmov w8, s0
3228; CHECK-GI-NEXT:    fmov w9, s1
3229; CHECK-GI-NEXT:    add w8, w8, w9
3230; CHECK-GI-NEXT:    and w0, w8, #0xffff
3231; CHECK-GI-NEXT:    ret
3232entry:
3233  %xx = zext <8 x i8> %x to <8 x i16>
3234  %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
3235  %yy = zext <8 x i8> %y to <8 x i16>
3236  %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
3237  %z = add i16 %z1, %z2
3238  ret i16 %z
3239}
3240
; Each <8 x i8> input is sign-extended to <8 x i16> and add-reduced; the two
; i16 sums are added and returned signext.
; SelectionDAG expectation: a single widening saddl of the two inputs, one
; addv, and smov for the signed return. GlobalISel expectation: one saddlv per
; input, a scalar add, and sxth.
3241define signext i16 @add_pair_v8i8_v8i16_sext(<8 x i8> %x, <8 x i8> %y) {
3242; CHECK-SD-LABEL: add_pair_v8i8_v8i16_sext:
3243; CHECK-SD:       // %bb.0: // %entry
3244; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
3245; CHECK-SD-NEXT:    addv h0, v0.8h
3246; CHECK-SD-NEXT:    smov w0, v0.h[0]
3247; CHECK-SD-NEXT:    ret
3248;
3249; CHECK-GI-LABEL: add_pair_v8i8_v8i16_sext:
3250; CHECK-GI:       // %bb.0: // %entry
3251; CHECK-GI-NEXT:    saddlv h0, v0.8b
3252; CHECK-GI-NEXT:    saddlv h1, v1.8b
3253; CHECK-GI-NEXT:    fmov w8, s0
3254; CHECK-GI-NEXT:    fmov w9, s1
3255; CHECK-GI-NEXT:    add w8, w8, w9
3256; CHECK-GI-NEXT:    sxth w0, w8
3257; CHECK-GI-NEXT:    ret
3258entry:
3259  %xx = sext <8 x i8> %x to <8 x i16>
3260  %z1 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %xx)
3261  %yy = sext <8 x i8> %y to <8 x i16>
3262  %z2 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %yy)
3263  %z = add i16 %z1, %z2
3264  ret i16 %z
3265}
3266
; Sum of two independent i8 add-reductions over <16 x i8> inputs, with no
; widening of the elements; the result is returned zeroext.
; SelectionDAG expectation: one vector add followed by a single addv.
; GlobalISel expectation: one addv per input, a scalar add, then a 0xff mask.
3267define zeroext i8 @add_pair_v16i8_v16i8(<16 x i8> %x, <16 x i8> %y) {
3268; CHECK-SD-LABEL: add_pair_v16i8_v16i8:
3269; CHECK-SD:       // %bb.0: // %entry
3270; CHECK-SD-NEXT:    add v0.16b, v0.16b, v1.16b
3271; CHECK-SD-NEXT:    addv b0, v0.16b
3272; CHECK-SD-NEXT:    fmov w0, s0
3273; CHECK-SD-NEXT:    ret
3274;
3275; CHECK-GI-LABEL: add_pair_v16i8_v16i8:
3276; CHECK-GI:       // %bb.0: // %entry
3277; CHECK-GI-NEXT:    addv b0, v0.16b
3278; CHECK-GI-NEXT:    addv b1, v1.16b
3279; CHECK-GI-NEXT:    fmov w8, s0
3280; CHECK-GI-NEXT:    fmov w9, s1
3281; CHECK-GI-NEXT:    add w8, w9, w8, uxtb
3282; CHECK-GI-NEXT:    and w0, w8, #0xff
3283; CHECK-GI-NEXT:    ret
3284entry:
3285  %z1 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x)
3286  %z2 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %y)
3287  %z = add i8 %z1, %z2
3288  ret i8 %z
3289}
3290
; Each <16 x i8> input is zero-extended to <16 x i64> and add-reduced; the two
; i64 sums are then added.
; SelectionDAG expectation: widen through 8h and 4s lanes with ushll, go to 2d
; with uaddl, combine with vector adds, and finish with one addp.
; GlobalISel expectation: one uaddlv per input, then a 64-bit scalar add with
; the partial sums zero-extended from 16 bits.
3291define i64 @add_pair_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y) {
3292; CHECK-SD-LABEL: add_pair_v16i8_v16i64_zext:
3293; CHECK-SD:       // %bb.0: // %entry
3294; CHECK-SD-NEXT:    ushll2 v2.8h, v0.16b, #0
3295; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
3296; CHECK-SD-NEXT:    ushll2 v3.8h, v1.16b, #0
3297; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
3298; CHECK-SD-NEXT:    ushll v4.4s, v2.4h, #0
3299; CHECK-SD-NEXT:    ushll2 v2.4s, v2.8h, #0
3300; CHECK-SD-NEXT:    ushll2 v5.4s, v0.8h, #0
3301; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
3302; CHECK-SD-NEXT:    ushll2 v6.4s, v3.8h, #0
3303; CHECK-SD-NEXT:    ushll2 v7.4s, v1.8h, #0
3304; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
3305; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
3306; CHECK-SD-NEXT:    uaddl2 v16.2d, v5.4s, v2.4s
3307; CHECK-SD-NEXT:    uaddl v2.2d, v5.2s, v2.2s
3308; CHECK-SD-NEXT:    uaddl2 v5.2d, v0.4s, v4.4s
3309; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v4.2s
3310; CHECK-SD-NEXT:    uaddl2 v4.2d, v7.4s, v6.4s
3311; CHECK-SD-NEXT:    uaddl v6.2d, v7.2s, v6.2s
3312; CHECK-SD-NEXT:    uaddl2 v7.2d, v1.4s, v3.4s
3313; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s
3314; CHECK-SD-NEXT:    add v3.2d, v5.2d, v16.2d
3315; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
3316; CHECK-SD-NEXT:    add v2.2d, v7.2d, v4.2d
3317; CHECK-SD-NEXT:    add v1.2d, v1.2d, v6.2d
3318; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d
3319; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
3320; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
3321; CHECK-SD-NEXT:    addp d0, v0.2d
3322; CHECK-SD-NEXT:    fmov x0, d0
3323; CHECK-SD-NEXT:    ret
3324;
3325; CHECK-GI-LABEL: add_pair_v16i8_v16i64_zext:
3326; CHECK-GI:       // %bb.0: // %entry
3327; CHECK-GI-NEXT:    uaddlv h1, v1.16b
3328; CHECK-GI-NEXT:    uaddlv h0, v0.16b
3329; CHECK-GI-NEXT:    fmov w8, s1
3330; CHECK-GI-NEXT:    fmov w9, s0
3331; CHECK-GI-NEXT:    and x8, x8, #0xffff
3332; CHECK-GI-NEXT:    add x0, x8, w9, uxth
3333; CHECK-GI-NEXT:    ret
3334entry:
3335  %xx = zext <16 x i8> %x to <16 x i64>
3336  %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
3337  %yy = zext <16 x i8> %y to <16 x i64>
3338  %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
3339  %z = add i64 %z1, %z2
3340  ret i64 %z
3341}
3342
; Pair reduction with sign extension: both <16 x i8> operands are widened to
; <16 x i64>, each is summed with llvm.vector.reduce.add, and the two i64
; totals are added. The expected SelectionDAG code widens in steps
; (sshll/saddl) and finishes with a single addp; the expected GlobalISel code
; issues one saddlv per operand and combines the sign-extended scalars.
3343define i64 @add_pair_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y) {
3344; CHECK-SD-LABEL: add_pair_v16i8_v16i64_sext:
3345; CHECK-SD:       // %bb.0: // %entry
3346; CHECK-SD-NEXT:    sshll2 v2.8h, v0.16b, #0
3347; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
3348; CHECK-SD-NEXT:    sshll2 v3.8h, v1.16b, #0
3349; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
3350; CHECK-SD-NEXT:    sshll v4.4s, v2.4h, #0
3351; CHECK-SD-NEXT:    sshll2 v2.4s, v2.8h, #0
3352; CHECK-SD-NEXT:    sshll2 v5.4s, v0.8h, #0
3353; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
3354; CHECK-SD-NEXT:    sshll2 v6.4s, v3.8h, #0
3355; CHECK-SD-NEXT:    sshll2 v7.4s, v1.8h, #0
3356; CHECK-SD-NEXT:    sshll v3.4s, v3.4h, #0
3357; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
3358; CHECK-SD-NEXT:    saddl2 v16.2d, v5.4s, v2.4s
3359; CHECK-SD-NEXT:    saddl v2.2d, v5.2s, v2.2s
3360; CHECK-SD-NEXT:    saddl2 v5.2d, v0.4s, v4.4s
3361; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v4.2s
3362; CHECK-SD-NEXT:    saddl2 v4.2d, v7.4s, v6.4s
3363; CHECK-SD-NEXT:    saddl v6.2d, v7.2s, v6.2s
3364; CHECK-SD-NEXT:    saddl2 v7.2d, v1.4s, v3.4s
3365; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s
3366; CHECK-SD-NEXT:    add v3.2d, v5.2d, v16.2d
3367; CHECK-SD-NEXT:    add v0.2d, v0.2d, v2.2d
3368; CHECK-SD-NEXT:    add v2.2d, v7.2d, v4.2d
3369; CHECK-SD-NEXT:    add v1.2d, v1.2d, v6.2d
3370; CHECK-SD-NEXT:    add v0.2d, v0.2d, v3.2d
3371; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
3372; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
3373; CHECK-SD-NEXT:    addp d0, v0.2d
3374; CHECK-SD-NEXT:    fmov x0, d0
3375; CHECK-SD-NEXT:    ret
3376;
3377; CHECK-GI-LABEL: add_pair_v16i8_v16i64_sext:
3378; CHECK-GI:       // %bb.0: // %entry
3379; CHECK-GI-NEXT:    saddlv h1, v1.16b
3380; CHECK-GI-NEXT:    saddlv h0, v0.16b
3381; CHECK-GI-NEXT:    fmov w8, s1
3382; CHECK-GI-NEXT:    fmov w9, s0
3383; CHECK-GI-NEXT:    sxth x8, w8
3384; CHECK-GI-NEXT:    add x0, x8, w9, sxth
3385; CHECK-GI-NEXT:    ret
3386entry:
  ; First operand: sign-extend to i64 lanes, then horizontal add.
3387  %xx = sext <16 x i8> %x to <16 x i64>
3388  %z1 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %xx)
  ; Second operand: same treatment.
3389  %yy = sext <16 x i8> %y to <16 x i64>
3390  %z2 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %yy)
  ; Combine the two scalar sums.
3391  %z = add i64 %z1, %z2
3392  ret i64 %z
3393}
3394
; Pair reduction with zero extension on 8-element inputs: each <8 x i8>
; operand is widened to <8 x i64>, summed with llvm.vector.reduce.add, and the
; two i64 totals are added. Expected SelectionDAG code: ushll/uaddl widening,
; vector adds, one addp. Expected GlobalISel code: one uaddlv per operand
; followed by a scalar add with zero-extended inputs.
3395define i64 @add_pair_v8i8_v8i64_zext(<8 x i8> %x, <8 x i8> %y) {
3396; CHECK-SD-LABEL: add_pair_v8i8_v8i64_zext:
3397; CHECK-SD:       // %bb.0: // %entry
3398; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
3399; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
3400; CHECK-SD-NEXT:    ushll2 v2.4s, v0.8h, #0
3401; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
3402; CHECK-SD-NEXT:    ushll2 v3.4s, v1.8h, #0
3403; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
3404; CHECK-SD-NEXT:    uaddl2 v4.2d, v0.4s, v2.4s
3405; CHECK-SD-NEXT:    uaddl v0.2d, v0.2s, v2.2s
3406; CHECK-SD-NEXT:    uaddl2 v2.2d, v1.4s, v3.4s
3407; CHECK-SD-NEXT:    uaddl v1.2d, v1.2s, v3.2s
3408; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
3409; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
3410; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
3411; CHECK-SD-NEXT:    addp d0, v0.2d
3412; CHECK-SD-NEXT:    fmov x0, d0
3413; CHECK-SD-NEXT:    ret
3414;
3415; CHECK-GI-LABEL: add_pair_v8i8_v8i64_zext:
3416; CHECK-GI:       // %bb.0: // %entry
3417; CHECK-GI-NEXT:    uaddlv h1, v1.8b
3418; CHECK-GI-NEXT:    uaddlv h0, v0.8b
3419; CHECK-GI-NEXT:    fmov w8, s1
3420; CHECK-GI-NEXT:    fmov w9, s0
3421; CHECK-GI-NEXT:    and x8, x8, #0xffff
3422; CHECK-GI-NEXT:    add x0, x8, w9, uxth
3423; CHECK-GI-NEXT:    ret
3424entry:
  ; Zero-extend and reduce each operand independently.
3425  %xx = zext <8 x i8> %x to <8 x i64>
3426  %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
3427  %yy = zext <8 x i8> %y to <8 x i64>
3428  %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
  ; Combine the two scalar sums.
3429  %z = add i64 %z1, %z2
3430  ret i64 %z
3431}
3432
; Pair reduction with sign extension on 8-element inputs: each <8 x i8>
; operand is widened to <8 x i64>, summed with llvm.vector.reduce.add, and the
; two i64 totals are added. Expected SelectionDAG code: sshll/saddl widening,
; vector adds, one addp. Expected GlobalISel code: one saddlv per operand
; followed by a scalar add with sign-extended inputs.
3433define i64 @add_pair_v8i8_v8i64_sext(<8 x i8> %x, <8 x i8> %y) {
3434; CHECK-SD-LABEL: add_pair_v8i8_v8i64_sext:
3435; CHECK-SD:       // %bb.0: // %entry
3436; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
3437; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
3438; CHECK-SD-NEXT:    sshll2 v2.4s, v0.8h, #0
3439; CHECK-SD-NEXT:    sshll v0.4s, v0.4h, #0
3440; CHECK-SD-NEXT:    sshll2 v3.4s, v1.8h, #0
3441; CHECK-SD-NEXT:    sshll v1.4s, v1.4h, #0
3442; CHECK-SD-NEXT:    saddl2 v4.2d, v0.4s, v2.4s
3443; CHECK-SD-NEXT:    saddl v0.2d, v0.2s, v2.2s
3444; CHECK-SD-NEXT:    saddl2 v2.2d, v1.4s, v3.4s
3445; CHECK-SD-NEXT:    saddl v1.2d, v1.2s, v3.2s
3446; CHECK-SD-NEXT:    add v0.2d, v0.2d, v4.2d
3447; CHECK-SD-NEXT:    add v1.2d, v1.2d, v2.2d
3448; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
3449; CHECK-SD-NEXT:    addp d0, v0.2d
3450; CHECK-SD-NEXT:    fmov x0, d0
3451; CHECK-SD-NEXT:    ret
3452;
3453; CHECK-GI-LABEL: add_pair_v8i8_v8i64_sext:
3454; CHECK-GI:       // %bb.0: // %entry
3455; CHECK-GI-NEXT:    saddlv h1, v1.8b
3456; CHECK-GI-NEXT:    saddlv h0, v0.8b
3457; CHECK-GI-NEXT:    fmov w8, s1
3458; CHECK-GI-NEXT:    fmov w9, s0
3459; CHECK-GI-NEXT:    sxth x8, w8
3460; CHECK-GI-NEXT:    add x0, x8, w9, sxth
3461; CHECK-GI-NEXT:    ret
3462entry:
  ; Sign-extend and reduce each operand independently.
3463  %xx = sext <8 x i8> %x to <8 x i64>
3464  %z1 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %xx)
3465  %yy = sext <8 x i8> %y to <8 x i64>
3466  %z2 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %yy)
  ; Combine the two scalar sums.
3467  %z = add i64 %z1, %z2
3468  ret i64 %z
3469}
3470
; Pair reduction with zero extension on <4 x i8> inputs widened to <4 x i64>.
; In the expected output the operands are handled as .4h lanes: SelectionDAG
; clears the upper byte of each lane with bic, widens with ushll, and
; accumulates both operands via uaddlp/uadalp before one addp; GlobalISel
; masks with a 0x00ff-per-lane constant and uses one uaddlv per operand.
3471define i64 @add_pair_v4i8_v4i64_zext(<4 x i8> %x, <4 x i8> %y) {
3472; CHECK-SD-LABEL: add_pair_v4i8_v4i64_zext:
3473; CHECK-SD:       // %bb.0: // %entry
3474; CHECK-SD-NEXT:    bic v1.4h, #255, lsl #8
3475; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
3476; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
3477; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
3478; CHECK-SD-NEXT:    uaddlp v1.2d, v1.4s
3479; CHECK-SD-NEXT:    uadalp v1.2d, v0.4s
3480; CHECK-SD-NEXT:    addp d0, v1.2d
3481; CHECK-SD-NEXT:    fmov x0, d0
3482; CHECK-SD-NEXT:    ret
3483;
3484; CHECK-GI-LABEL: add_pair_v4i8_v4i64_zext:
3485; CHECK-GI:       // %bb.0: // %entry
3486; CHECK-GI-NEXT:    movi d2, #0xff00ff00ff00ff
3487; CHECK-GI-NEXT:    and v1.8b, v1.8b, v2.8b
3488; CHECK-GI-NEXT:    and v0.8b, v0.8b, v2.8b
3489; CHECK-GI-NEXT:    uaddlv s1, v1.4h
3490; CHECK-GI-NEXT:    uaddlv s0, v0.4h
3491; CHECK-GI-NEXT:    fmov w8, s1
3492; CHECK-GI-NEXT:    fmov w9, s0
3493; CHECK-GI-NEXT:    and x8, x8, #0xffff
3494; CHECK-GI-NEXT:    add x0, x8, w9, uxth
3495; CHECK-GI-NEXT:    ret
3496entry:
  ; Zero-extend and reduce each operand independently.
3497  %xx = zext <4 x i8> %x to <4 x i64>
3498  %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
3499  %yy = zext <4 x i8> %y to <4 x i64>
3500  %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
  ; Combine the two scalar sums.
3501  %z = add i64 %z1, %z2
3502  ret i64 %z
3503}
3504
; Pair reduction with sign extension on <4 x i8> inputs widened to <4 x i64>.
; Expected SelectionDAG code widens to .2d lanes with ushll and then
; sign-extends in-register using shl/sshr by 56 (ssra folds the final shift
; into the accumulate). Expected GlobalISel code sign-extends within .4h lanes
; (shl/sshr by 8) and uses one saddlv per operand.
3505define i64 @add_pair_v4i8_v4i64_sext(<4 x i8> %x, <4 x i8> %y) {
3506; CHECK-SD-LABEL: add_pair_v4i8_v4i64_sext:
3507; CHECK-SD:       // %bb.0: // %entry
3508; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
3509; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
3510; CHECK-SD-NEXT:    ushll v2.2d, v1.2s, #0
3511; CHECK-SD-NEXT:    ushll v3.2d, v0.2s, #0
3512; CHECK-SD-NEXT:    ushll2 v0.2d, v0.4s, #0
3513; CHECK-SD-NEXT:    ushll2 v1.2d, v1.4s, #0
3514; CHECK-SD-NEXT:    shl v3.2d, v3.2d, #56
3515; CHECK-SD-NEXT:    shl v2.2d, v2.2d, #56
3516; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
3517; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
3518; CHECK-SD-NEXT:    sshr v3.2d, v3.2d, #56
3519; CHECK-SD-NEXT:    sshr v2.2d, v2.2d, #56
3520; CHECK-SD-NEXT:    ssra v3.2d, v0.2d, #56
3521; CHECK-SD-NEXT:    ssra v2.2d, v1.2d, #56
3522; CHECK-SD-NEXT:    add v0.2d, v3.2d, v2.2d
3523; CHECK-SD-NEXT:    addp d0, v0.2d
3524; CHECK-SD-NEXT:    fmov x0, d0
3525; CHECK-SD-NEXT:    ret
3526;
3527; CHECK-GI-LABEL: add_pair_v4i8_v4i64_sext:
3528; CHECK-GI:       // %bb.0: // %entry
3529; CHECK-GI-NEXT:    shl v1.4h, v1.4h, #8
3530; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
3531; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
3532; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
3533; CHECK-GI-NEXT:    saddlv s1, v1.4h
3534; CHECK-GI-NEXT:    saddlv s0, v0.4h
3535; CHECK-GI-NEXT:    fmov w8, s1
3536; CHECK-GI-NEXT:    fmov w9, s0
3537; CHECK-GI-NEXT:    sxth x8, w8
3538; CHECK-GI-NEXT:    add x0, x8, w9, sxth
3539; CHECK-GI-NEXT:    ret
3540entry:
  ; Sign-extend and reduce each operand independently.
3541  %xx = sext <4 x i8> %x to <4 x i64>
3542  %z1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %xx)
3543  %yy = sext <4 x i8> %y to <4 x i64>
3544  %z2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %yy)
  ; Combine the two scalar sums.
3545  %z = add i64 %z1, %z2
3546  ret i64 %z
3547}
3548
; Pair reduction with zero extension on <2 x i8> inputs widened to <2 x i64>.
; Expected SelectionDAG code packs both operands into one q register
; (mov v0.d[1], v1.d[0]), masks each 32-bit lane to its low byte, and sums
; everything with a single uaddlv. Expected GlobalISel code widens and masks
; each operand separately, reduces each with addp, then adds the scalars.
3549define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) {
3550; CHECK-SD-LABEL: add_pair_v2i8_v2i64_zext:
3551; CHECK-SD:       // %bb.0: // %entry
3552; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
3553; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
3554; CHECK-SD-NEXT:    movi v2.2d, #0x0000ff000000ff
3555; CHECK-SD-NEXT:    mov v0.d[1], v1.d[0]
3556; CHECK-SD-NEXT:    and v0.16b, v0.16b, v2.16b
3557; CHECK-SD-NEXT:    uaddlv d0, v0.4s
3558; CHECK-SD-NEXT:    fmov x0, d0
3559; CHECK-SD-NEXT:    ret
3560;
3561; CHECK-GI-LABEL: add_pair_v2i8_v2i64_zext:
3562; CHECK-GI:       // %bb.0: // %entry
3563; CHECK-GI-NEXT:    movi v2.2d, #0x000000000000ff
3564; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
3565; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
3566; CHECK-GI-NEXT:    and v0.16b, v0.16b, v2.16b
3567; CHECK-GI-NEXT:    and v1.16b, v1.16b, v2.16b
3568; CHECK-GI-NEXT:    addp d0, v0.2d
3569; CHECK-GI-NEXT:    addp d1, v1.2d
3570; CHECK-GI-NEXT:    fmov x8, d0
3571; CHECK-GI-NEXT:    fmov x9, d1
3572; CHECK-GI-NEXT:    add x0, x8, x9
3573; CHECK-GI-NEXT:    ret
3574entry:
  ; Zero-extend and reduce each operand independently.
3575  %xx = zext <2 x i8> %x to <2 x i64>
3576  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
3577  %yy = zext <2 x i8> %y to <2 x i64>
3578  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
  ; Combine the two scalar sums.
3579  %z = add i64 %z1, %z2
3580  ret i64 %z
3581}
3582
; Pair reduction with sign extension on <2 x i8> inputs widened to <2 x i64>.
; Both expected outputs widen to .2d lanes and sign-extend in-register with
; shl/sshr by 56. SelectionDAG folds the second operand's shift into an
; accumulating ssra and reduces once with addp; GlobalISel reduces each
; operand with its own addp and adds the resulting scalars.
3583define i64 @add_pair_v2i8_v2i64_sext(<2 x i8> %x, <2 x i8> %y) {
3584; CHECK-SD-LABEL: add_pair_v2i8_v2i64_sext:
3585; CHECK-SD:       // %bb.0: // %entry
3586; CHECK-SD-NEXT:    ushll v0.2d, v0.2s, #0
3587; CHECK-SD-NEXT:    ushll v1.2d, v1.2s, #0
3588; CHECK-SD-NEXT:    shl v0.2d, v0.2d, #56
3589; CHECK-SD-NEXT:    shl v1.2d, v1.2d, #56
3590; CHECK-SD-NEXT:    sshr v0.2d, v0.2d, #56
3591; CHECK-SD-NEXT:    ssra v0.2d, v1.2d, #56
3592; CHECK-SD-NEXT:    addp d0, v0.2d
3593; CHECK-SD-NEXT:    fmov x0, d0
3594; CHECK-SD-NEXT:    ret
3595;
3596; CHECK-GI-LABEL: add_pair_v2i8_v2i64_sext:
3597; CHECK-GI:       // %bb.0: // %entry
3598; CHECK-GI-NEXT:    ushll v0.2d, v0.2s, #0
3599; CHECK-GI-NEXT:    ushll v1.2d, v1.2s, #0
3600; CHECK-GI-NEXT:    shl v0.2d, v0.2d, #56
3601; CHECK-GI-NEXT:    shl v1.2d, v1.2d, #56
3602; CHECK-GI-NEXT:    sshr v0.2d, v0.2d, #56
3603; CHECK-GI-NEXT:    sshr v1.2d, v1.2d, #56
3604; CHECK-GI-NEXT:    addp d0, v0.2d
3605; CHECK-GI-NEXT:    addp d1, v1.2d
3606; CHECK-GI-NEXT:    fmov x8, d0
3607; CHECK-GI-NEXT:    fmov x9, d1
3608; CHECK-GI-NEXT:    add x0, x8, x9
3609; CHECK-GI-NEXT:    ret
3610entry:
  ; Sign-extend and reduce each operand independently.
3611  %xx = sext <2 x i8> %x to <2 x i64>
3612  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %xx)
3613  %yy = sext <2 x i8> %y to <2 x i64>
3614  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %yy)
  ; Combine the two scalar sums.
3615  %z = add i64 %z1, %z2
3616  ret i64 %z
3617}
3618
; Two pair reductions summed together: %ax and %ay are zero-extended to
; <8 x i32>, %bx and %by are sign-extended to <8 x i32>; each is reduced with
; llvm.vector.reduce.add and all four i32 totals are added.
; Expected output is listed separately for the base and +dotprod
; configurations of both instruction selectors. With +dotprod the reductions
; are expected as udot/sdot against a vector of ones (movi #1); without it,
; SelectionDAG uses the pairwise long adds (uaddlp/uadalp, saddlp/sadalp) and
; GlobalISel uses one uaddlv/saddlv per operand.
3619define i32 @add_pair_v8i8_v8i32_double_sext_zext(<8 x i8> %ax, <8 x i8> %ay, <8 x i8> %bx, <8 x i8> %by) {
3620; CHECK-SD-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
3621; CHECK-SD-BASE:       // %bb.0: // %entry
3622; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
3623; CHECK-SD-BASE-NEXT:    sshll v3.8h, v3.8b, #0
3624; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
3625; CHECK-SD-BASE-NEXT:    sshll v2.8h, v2.8b, #0
3626; CHECK-SD-BASE-NEXT:    uaddlp v1.4s, v1.8h
3627; CHECK-SD-BASE-NEXT:    saddlp v3.4s, v3.8h
3628; CHECK-SD-BASE-NEXT:    uadalp v1.4s, v0.8h
3629; CHECK-SD-BASE-NEXT:    sadalp v3.4s, v2.8h
3630; CHECK-SD-BASE-NEXT:    add v0.4s, v3.4s, v1.4s
3631; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
3632; CHECK-SD-BASE-NEXT:    fmov w0, s0
3633; CHECK-SD-BASE-NEXT:    ret
3634;
3635; CHECK-SD-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
3636; CHECK-SD-DOT:       // %bb.0: // %entry
3637; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000
3638; CHECK-SD-DOT-NEXT:    movi v5.8b, #1
3639; CHECK-SD-DOT-NEXT:    movi v6.2d, #0000000000000000
3640; CHECK-SD-DOT-NEXT:    udot v6.2s, v1.8b, v5.8b
3641; CHECK-SD-DOT-NEXT:    sdot v4.2s, v3.8b, v5.8b
3642; CHECK-SD-DOT-NEXT:    udot v6.2s, v0.8b, v5.8b
3643; CHECK-SD-DOT-NEXT:    sdot v4.2s, v2.8b, v5.8b
3644; CHECK-SD-DOT-NEXT:    add v0.2s, v6.2s, v4.2s
3645; CHECK-SD-DOT-NEXT:    addp v0.2s, v0.2s, v0.2s
3646; CHECK-SD-DOT-NEXT:    fmov w0, s0
3647; CHECK-SD-DOT-NEXT:    ret
3648;
3649; CHECK-GI-BASE-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
3650; CHECK-GI-BASE:       // %bb.0: // %entry
3651; CHECK-GI-BASE-NEXT:    saddlv h3, v3.8b
3652; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
3653; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.8b
3654; CHECK-GI-BASE-NEXT:    saddlv h2, v2.8b
3655; CHECK-GI-BASE-NEXT:    fmov w8, s3
3656; CHECK-GI-BASE-NEXT:    fmov w9, s1
3657; CHECK-GI-BASE-NEXT:    fmov w10, s0
3658; CHECK-GI-BASE-NEXT:    fmov w11, s2
3659; CHECK-GI-BASE-NEXT:    sxth w8, w8
3660; CHECK-GI-BASE-NEXT:    and w9, w9, #0xffff
3661; CHECK-GI-BASE-NEXT:    add w9, w9, w10, uxth
3662; CHECK-GI-BASE-NEXT:    add w8, w8, w11, sxth
3663; CHECK-GI-BASE-NEXT:    add w0, w9, w8
3664; CHECK-GI-BASE-NEXT:    ret
3665;
3666; CHECK-GI-DOT-LABEL: add_pair_v8i8_v8i32_double_sext_zext:
3667; CHECK-GI-DOT:       // %bb.0: // %entry
3668; CHECK-GI-DOT-NEXT:    movi v4.8b, #1
3669; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
3670; CHECK-GI-DOT-NEXT:    movi v6.2d, #0000000000000000
3671; CHECK-GI-DOT-NEXT:    movi v7.2d, #0000000000000000
3672; CHECK-GI-DOT-NEXT:    movi v16.2d, #0000000000000000
3673; CHECK-GI-DOT-NEXT:    udot v5.2s, v0.8b, v4.8b
3674; CHECK-GI-DOT-NEXT:    sdot v6.2s, v3.8b, v4.8b
3675; CHECK-GI-DOT-NEXT:    udot v7.2s, v1.8b, v4.8b
3676; CHECK-GI-DOT-NEXT:    sdot v16.2s, v2.8b, v4.8b
3677; CHECK-GI-DOT-NEXT:    addp v0.2s, v5.2s, v5.2s
3678; CHECK-GI-DOT-NEXT:    addp v3.2s, v6.2s, v6.2s
3679; CHECK-GI-DOT-NEXT:    addp v1.2s, v7.2s, v7.2s
3680; CHECK-GI-DOT-NEXT:    addp v2.2s, v16.2s, v16.2s
3681; CHECK-GI-DOT-NEXT:    fmov w8, s0
3682; CHECK-GI-DOT-NEXT:    fmov w11, s3
3683; CHECK-GI-DOT-NEXT:    fmov w9, s1
3684; CHECK-GI-DOT-NEXT:    fmov w10, s2
3685; CHECK-GI-DOT-NEXT:    add w8, w8, w9
3686; CHECK-GI-DOT-NEXT:    add w9, w10, w11
3687; CHECK-GI-DOT-NEXT:    add w0, w8, w9
3688; CHECK-GI-DOT-NEXT:    ret
3689entry:
  ; Unsigned pair: zero-extend %ax and %ay, reduce each, add the sums.
3690  %axx = zext <8 x i8> %ax to <8 x i32>
3691  %az1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %axx)
3692  %ayy = zext <8 x i8> %ay to <8 x i32>
3693  %az2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %ayy)
3694  %az = add i32 %az1, %az2
  ; Signed pair: sign-extend %bx and %by, reduce each, add the sums.
3695  %bxx = sext <8 x i8> %bx to <8 x i32>
3696  %bz1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %bxx)
3697  %byy = sext <8 x i8> %by to <8 x i32>
3698  %bz2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %byy)
3699  %bz = add i32 %bz1, %bz2
  ; Final result is the sum of the unsigned and signed pair totals.
3700  %z = add i32 %az, %bz
3701  ret i32 %z
3702}
3703
; Reduction written with explicit shuffles: each <8 x i16> operand is
; zero-extended to <8 x i32>, split into its low (elements 0-3) and high
; (elements 4-7) halves with shufflevector, and the halves are added. The four
; <4 x i32> partial sums are then added together and reduced once with
; llvm.vector.reduce.add.v4i32.
; NOTE: despite "sext_zext" in the name, all four operands are zero-extended
; here, and the expected code below is unsigned-only (uaddlp/uadalp, ushll).
; The shuffle masks only select indices 0-7, i.e. the first source vector, so
; the second shufflevector operand is a never-read placeholder; it is spelled
; poison rather than the deprecated undef.
3704define i32 @add_pair_v8i16_v4i32_double_sext_zext_shuffle(<8 x i16> %ax, <8 x i16> %ay, <8 x i16> %bx, <8 x i16> %by) {
3705; CHECK-SD-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
3706; CHECK-SD:       // %bb.0: // %entry
3707; CHECK-SD-NEXT:    uaddlp v1.4s, v1.8h
3708; CHECK-SD-NEXT:    uaddlp v3.4s, v3.8h
3709; CHECK-SD-NEXT:    uadalp v1.4s, v0.8h
3710; CHECK-SD-NEXT:    uadalp v3.4s, v2.8h
3711; CHECK-SD-NEXT:    add v0.4s, v3.4s, v1.4s
3712; CHECK-SD-NEXT:    addv s0, v0.4s
3713; CHECK-SD-NEXT:    fmov w0, s0
3714; CHECK-SD-NEXT:    ret
3715;
3716; CHECK-GI-LABEL: add_pair_v8i16_v4i32_double_sext_zext_shuffle:
3717; CHECK-GI:       // %bb.0: // %entry
3718; CHECK-GI-NEXT:    ushll v4.4s, v0.4h, #0
3719; CHECK-GI-NEXT:    ushll2 v0.4s, v0.8h, #0
3720; CHECK-GI-NEXT:    ushll v5.4s, v1.4h, #0
3721; CHECK-GI-NEXT:    ushll2 v1.4s, v1.8h, #0
3722; CHECK-GI-NEXT:    ushll v6.4s, v2.4h, #0
3723; CHECK-GI-NEXT:    ushll2 v2.4s, v2.8h, #0
3724; CHECK-GI-NEXT:    ushll v7.4s, v3.4h, #0
3725; CHECK-GI-NEXT:    ushll2 v3.4s, v3.8h, #0
3726; CHECK-GI-NEXT:    add v0.4s, v4.4s, v0.4s
3727; CHECK-GI-NEXT:    add v1.4s, v5.4s, v1.4s
3728; CHECK-GI-NEXT:    add v2.4s, v6.4s, v2.4s
3729; CHECK-GI-NEXT:    add v3.4s, v7.4s, v3.4s
3730; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
3731; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
3732; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
3733; CHECK-GI-NEXT:    addv s0, v0.4s
3734; CHECK-GI-NEXT:    fmov w0, s0
3735; CHECK-GI-NEXT:    ret
3736entry:
  ; %ax: widen, split into elements 0-3 and 4-7, add the halves.
3737  %axx = zext <8 x i16> %ax to <8 x i32>
3738  %s1h = shufflevector <8 x i32> %axx, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3739  %s1l = shufflevector <8 x i32> %axx, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3740  %axs = add <4 x i32> %s1h, %s1l
  ; %ay: same, then combine with %ax's partial sum.
3741  %ayy = zext <8 x i16> %ay to <8 x i32>
3742  %s2h = shufflevector <8 x i32> %ayy, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3743  %s2l = shufflevector <8 x i32> %ayy, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3744  %ays = add <4 x i32> %s2h, %s2l
3745  %az = add <4 x i32> %axs, %ays
  ; %bx: widen, split, add the halves.
3746  %bxx = zext <8 x i16> %bx to <8 x i32>
3747  %s3h = shufflevector <8 x i32> %bxx, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3748  %s3l = shufflevector <8 x i32> %bxx, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3749  %bxs = add <4 x i32> %s3h, %s3l
  ; %by: same, then combine with %bx's partial sum.
3750  %byy = zext <8 x i16> %by to <8 x i32>
3751  %s4h = shufflevector <8 x i32> %byy, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
3752  %s4l = shufflevector <8 x i32> %byy, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
3753  %bys = add <4 x i32> %s4h, %s4l
3754  %bz = add <4 x i32> %bxs, %bys
  ; Add both pair totals lane-wise, then do the single horizontal reduction.
3755  %z = add <4 x i32> %az, %bz
3756  %z2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z)
3757  ret i32 %z2
3758}
3759
; Pair reduction with no extension: each <2 x i64> operand is reduced with
; llvm.vector.reduce.add and the two i64 sums are added. Expected SelectionDAG
; code adds the vectors first and reduces once with addp; expected GlobalISel
; code reduces each operand with its own addp and adds the scalars.
3760define i64 @add_pair_v2i64_v2i64(<2 x i64> %x, <2 x i64> %y) {
3761; CHECK-SD-LABEL: add_pair_v2i64_v2i64:
3762; CHECK-SD:       // %bb.0: // %entry
3763; CHECK-SD-NEXT:    add v0.2d, v0.2d, v1.2d
3764; CHECK-SD-NEXT:    addp d0, v0.2d
3765; CHECK-SD-NEXT:    fmov x0, d0
3766; CHECK-SD-NEXT:    ret
3767;
3768; CHECK-GI-LABEL: add_pair_v2i64_v2i64:
3769; CHECK-GI:       // %bb.0: // %entry
3770; CHECK-GI-NEXT:    addp d0, v0.2d
3771; CHECK-GI-NEXT:    addp d1, v1.2d
3772; CHECK-GI-NEXT:    fmov x8, d0
3773; CHECK-GI-NEXT:    fmov x9, d1
3774; CHECK-GI-NEXT:    add x0, x8, x9
3775; CHECK-GI-NEXT:    ret
3776entry:
  ; Reduce each operand, then add the two scalar sums.
3777  %z1 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %x)
3778  %z2 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %y)
3779  %z = add i64 %z1, %z2
3780  ret i64 %z
3781}
3782
3783; Irregularly sized vectors
; Single reduction over a 24-element vector: %x is zero-extended from
; <24 x i8> to <24 x i16> and summed with llvm.vector.reduce.add.v24i16.
; In the expected output the first eight elements arrive in w0-w7 and the
; remaining sixteen are loaded from the stack at [sp] .. [sp, #120].
; SelectionDAG is expected to assemble three 8-byte vectors and combine them
; with uaddl + uaddw before one addv; GlobalISel is expected to assemble a
; 16-byte and an 8-byte vector and reduce each with uaddlv.
3784define i16 @add_v24i8_v24i16_zext(<24 x i8> %x) {
3785; CHECK-SD-LABEL: add_v24i8_v24i16_zext:
3786; CHECK-SD:       // %bb.0: // %entry
3787; CHECK-SD-NEXT:    fmov s0, w0
3788; CHECK-SD-NEXT:    ldr b1, [sp, #64]
3789; CHECK-SD-NEXT:    add x8, sp, #72
3790; CHECK-SD-NEXT:    ldr b2, [sp]
3791; CHECK-SD-NEXT:    add x9, sp, #80
3792; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x8]
3793; CHECK-SD-NEXT:    add x8, sp, #8
3794; CHECK-SD-NEXT:    mov v0.b[1], w1
3795; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x8]
3796; CHECK-SD-NEXT:    add x8, sp, #16
3797; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x9]
3798; CHECK-SD-NEXT:    add x9, sp, #88
3799; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x8]
3800; CHECK-SD-NEXT:    add x8, sp, #24
3801; CHECK-SD-NEXT:    mov v0.b[2], w2
3802; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x9]
3803; CHECK-SD-NEXT:    add x9, sp, #96
3804; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x8]
3805; CHECK-SD-NEXT:    add x8, sp, #32
3806; CHECK-SD-NEXT:    mov v0.b[3], w3
3807; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x9]
3808; CHECK-SD-NEXT:    add x9, sp, #104
3809; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x8]
3810; CHECK-SD-NEXT:    add x8, sp, #40
3811; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x9]
3812; CHECK-SD-NEXT:    add x9, sp, #112
3813; CHECK-SD-NEXT:    mov v0.b[4], w4
3814; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x8]
3815; CHECK-SD-NEXT:    add x8, sp, #48
3816; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x9]
3817; CHECK-SD-NEXT:    add x9, sp, #120
3818; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x8]
3819; CHECK-SD-NEXT:    add x8, sp, #56
3820; CHECK-SD-NEXT:    mov v0.b[5], w5
3821; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x9]
3822; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x8]
3823; CHECK-SD-NEXT:    mov v0.b[6], w6
3824; CHECK-SD-NEXT:    mov v0.b[7], w7
3825; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
3826; CHECK-SD-NEXT:    uaddw v0.8h, v0.8h, v2.8b
3827; CHECK-SD-NEXT:    addv h0, v0.8h
3828; CHECK-SD-NEXT:    fmov w0, s0
3829; CHECK-SD-NEXT:    ret
3830;
3831; CHECK-GI-LABEL: add_v24i8_v24i16_zext:
3832; CHECK-GI:       // %bb.0: // %entry
3833; CHECK-GI-NEXT:    fmov s0, w0
3834; CHECK-GI-NEXT:    ldr w8, [sp]
3835; CHECK-GI-NEXT:    ldr w9, [sp, #8]
3836; CHECK-GI-NEXT:    ldr w10, [sp, #72]
3837; CHECK-GI-NEXT:    mov v0.b[1], w1
3838; CHECK-GI-NEXT:    mov v0.b[2], w2
3839; CHECK-GI-NEXT:    mov v0.b[3], w3
3840; CHECK-GI-NEXT:    mov v0.b[4], w4
3841; CHECK-GI-NEXT:    mov v0.b[5], w5
3842; CHECK-GI-NEXT:    mov v0.b[6], w6
3843; CHECK-GI-NEXT:    mov v0.b[7], w7
3844; CHECK-GI-NEXT:    mov v0.b[8], w8
3845; CHECK-GI-NEXT:    ldr w8, [sp, #64]
3846; CHECK-GI-NEXT:    fmov s1, w8
3847; CHECK-GI-NEXT:    ldr w8, [sp, #16]
3848; CHECK-GI-NEXT:    mov v0.b[9], w9
3849; CHECK-GI-NEXT:    ldr w9, [sp, #80]
3850; CHECK-GI-NEXT:    mov v1.b[1], w10
3851; CHECK-GI-NEXT:    mov v0.b[10], w8
3852; CHECK-GI-NEXT:    ldr w8, [sp, #24]
3853; CHECK-GI-NEXT:    mov v1.b[2], w9
3854; CHECK-GI-NEXT:    ldr w9, [sp, #88]
3855; CHECK-GI-NEXT:    mov v0.b[11], w8
3856; CHECK-GI-NEXT:    ldr w8, [sp, #32]
3857; CHECK-GI-NEXT:    mov v1.b[3], w9
3858; CHECK-GI-NEXT:    ldr w9, [sp, #96]
3859; CHECK-GI-NEXT:    mov v0.b[12], w8
3860; CHECK-GI-NEXT:    ldr w8, [sp, #40]
3861; CHECK-GI-NEXT:    mov v1.b[4], w9
3862; CHECK-GI-NEXT:    ldr w9, [sp, #104]
3863; CHECK-GI-NEXT:    mov v0.b[13], w8
3864; CHECK-GI-NEXT:    ldr w8, [sp, #48]
3865; CHECK-GI-NEXT:    mov v1.b[5], w9
3866; CHECK-GI-NEXT:    ldr w9, [sp, #112]
3867; CHECK-GI-NEXT:    mov v0.b[14], w8
3868; CHECK-GI-NEXT:    ldr w8, [sp, #56]
3869; CHECK-GI-NEXT:    mov v1.b[6], w9
3870; CHECK-GI-NEXT:    ldr w9, [sp, #120]
3871; CHECK-GI-NEXT:    mov v0.b[15], w8
3872; CHECK-GI-NEXT:    mov v1.b[7], w9
3873; CHECK-GI-NEXT:    uaddlv h0, v0.16b
3874; CHECK-GI-NEXT:    uaddlv h1, v1.8b
3875; CHECK-GI-NEXT:    fmov w8, s0
3876; CHECK-GI-NEXT:    fmov w9, s1
3877; CHECK-GI-NEXT:    add w0, w8, w9
3878; CHECK-GI-NEXT:    ret
3879entry:
  ; Zero-extend all 24 lanes to i16, then sum them.
3880  %xx = zext <24 x i8> %x to <24 x i16>
3881  %z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx)
3882  ret i16 %z
3883}
3884
; Single reduction over a 32-element vector: %x is zero-extended from
; <32 x i8> to <32 x i16> and summed with llvm.vector.reduce.add.v32i16.
; The expected output takes the input in v0 and v1. SelectionDAG is expected
; to combine the two with uaddl2/uaddl before one addv; GlobalISel is expected
; to reduce each register with uaddlv and add the scalars.
3885define i16 @add_v32i8_v32i16_zext(<32 x i8> %x) {
3886; CHECK-SD-LABEL: add_v32i8_v32i16_zext:
3887; CHECK-SD:       // %bb.0: // %entry
3888; CHECK-SD-NEXT:    uaddl2 v2.8h, v0.16b, v1.16b
3889; CHECK-SD-NEXT:    uaddl v0.8h, v0.8b, v1.8b
3890; CHECK-SD-NEXT:    add v0.8h, v0.8h, v2.8h
3891; CHECK-SD-NEXT:    addv h0, v0.8h
3892; CHECK-SD-NEXT:    fmov w0, s0
3893; CHECK-SD-NEXT:    ret
3894;
3895; CHECK-GI-LABEL: add_v32i8_v32i16_zext:
3896; CHECK-GI:       // %bb.0: // %entry
3897; CHECK-GI-NEXT:    uaddlv h0, v0.16b
3898; CHECK-GI-NEXT:    uaddlv h1, v1.16b
3899; CHECK-GI-NEXT:    fmov w8, s0
3900; CHECK-GI-NEXT:    fmov w9, s1
3901; CHECK-GI-NEXT:    add w0, w8, w9
3902; CHECK-GI-NEXT:    ret
3903entry:
  ; Zero-extend all 32 lanes to i16, then sum them.
3904  %xx = zext <32 x i8> %x to <32 x i16>
3905  %z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx)
3906  ret i16 %z
3907}
3908
; Single reduction over a 24-element vector: %x is sign-extended from
; <24 x i8> to <24 x i16> and summed with llvm.vector.reduce.add.v24i16.
; In the expected output the first eight elements arrive in w0-w7 and the
; remaining sixteen are loaded from the stack at [sp] .. [sp, #120].
; SelectionDAG is expected to assemble three 8-byte vectors and combine them
; with saddl + saddw before one addv; GlobalISel is expected to assemble a
; 16-byte and an 8-byte vector and reduce each with saddlv.
3909define i16 @add_v24i8_v24i16_sext(<24 x i8> %x) {
3910; CHECK-SD-LABEL: add_v24i8_v24i16_sext:
3911; CHECK-SD:       // %bb.0: // %entry
3912; CHECK-SD-NEXT:    fmov s0, w0
3913; CHECK-SD-NEXT:    ldr b1, [sp, #64]
3914; CHECK-SD-NEXT:    add x8, sp, #72
3915; CHECK-SD-NEXT:    ldr b2, [sp]
3916; CHECK-SD-NEXT:    add x9, sp, #80
3917; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x8]
3918; CHECK-SD-NEXT:    add x8, sp, #8
3919; CHECK-SD-NEXT:    mov v0.b[1], w1
3920; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x8]
3921; CHECK-SD-NEXT:    add x8, sp, #16
3922; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x9]
3923; CHECK-SD-NEXT:    add x9, sp, #88
3924; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x8]
3925; CHECK-SD-NEXT:    add x8, sp, #24
3926; CHECK-SD-NEXT:    mov v0.b[2], w2
3927; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x9]
3928; CHECK-SD-NEXT:    add x9, sp, #96
3929; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x8]
3930; CHECK-SD-NEXT:    add x8, sp, #32
3931; CHECK-SD-NEXT:    mov v0.b[3], w3
3932; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x9]
3933; CHECK-SD-NEXT:    add x9, sp, #104
3934; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x8]
3935; CHECK-SD-NEXT:    add x8, sp, #40
3936; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x9]
3937; CHECK-SD-NEXT:    add x9, sp, #112
3938; CHECK-SD-NEXT:    mov v0.b[4], w4
3939; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x8]
3940; CHECK-SD-NEXT:    add x8, sp, #48
3941; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x9]
3942; CHECK-SD-NEXT:    add x9, sp, #120
3943; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x8]
3944; CHECK-SD-NEXT:    add x8, sp, #56
3945; CHECK-SD-NEXT:    mov v0.b[5], w5
3946; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x9]
3947; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x8]
3948; CHECK-SD-NEXT:    mov v0.b[6], w6
3949; CHECK-SD-NEXT:    mov v0.b[7], w7
3950; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
3951; CHECK-SD-NEXT:    saddw v0.8h, v0.8h, v2.8b
3952; CHECK-SD-NEXT:    addv h0, v0.8h
3953; CHECK-SD-NEXT:    fmov w0, s0
3954; CHECK-SD-NEXT:    ret
3955;
3956; CHECK-GI-LABEL: add_v24i8_v24i16_sext:
3957; CHECK-GI:       // %bb.0: // %entry
3958; CHECK-GI-NEXT:    fmov s0, w0
3959; CHECK-GI-NEXT:    ldr w8, [sp]
3960; CHECK-GI-NEXT:    ldr w9, [sp, #8]
3961; CHECK-GI-NEXT:    ldr w10, [sp, #72]
3962; CHECK-GI-NEXT:    mov v0.b[1], w1
3963; CHECK-GI-NEXT:    mov v0.b[2], w2
3964; CHECK-GI-NEXT:    mov v0.b[3], w3
3965; CHECK-GI-NEXT:    mov v0.b[4], w4
3966; CHECK-GI-NEXT:    mov v0.b[5], w5
3967; CHECK-GI-NEXT:    mov v0.b[6], w6
3968; CHECK-GI-NEXT:    mov v0.b[7], w7
3969; CHECK-GI-NEXT:    mov v0.b[8], w8
3970; CHECK-GI-NEXT:    ldr w8, [sp, #64]
3971; CHECK-GI-NEXT:    fmov s1, w8
3972; CHECK-GI-NEXT:    ldr w8, [sp, #16]
3973; CHECK-GI-NEXT:    mov v0.b[9], w9
3974; CHECK-GI-NEXT:    ldr w9, [sp, #80]
3975; CHECK-GI-NEXT:    mov v1.b[1], w10
3976; CHECK-GI-NEXT:    mov v0.b[10], w8
3977; CHECK-GI-NEXT:    ldr w8, [sp, #24]
3978; CHECK-GI-NEXT:    mov v1.b[2], w9
3979; CHECK-GI-NEXT:    ldr w9, [sp, #88]
3980; CHECK-GI-NEXT:    mov v0.b[11], w8
3981; CHECK-GI-NEXT:    ldr w8, [sp, #32]
3982; CHECK-GI-NEXT:    mov v1.b[3], w9
3983; CHECK-GI-NEXT:    ldr w9, [sp, #96]
3984; CHECK-GI-NEXT:    mov v0.b[12], w8
3985; CHECK-GI-NEXT:    ldr w8, [sp, #40]
3986; CHECK-GI-NEXT:    mov v1.b[4], w9
3987; CHECK-GI-NEXT:    ldr w9, [sp, #104]
3988; CHECK-GI-NEXT:    mov v0.b[13], w8
3989; CHECK-GI-NEXT:    ldr w8, [sp, #48]
3990; CHECK-GI-NEXT:    mov v1.b[5], w9
3991; CHECK-GI-NEXT:    ldr w9, [sp, #112]
3992; CHECK-GI-NEXT:    mov v0.b[14], w8
3993; CHECK-GI-NEXT:    ldr w8, [sp, #56]
3994; CHECK-GI-NEXT:    mov v1.b[6], w9
3995; CHECK-GI-NEXT:    ldr w9, [sp, #120]
3996; CHECK-GI-NEXT:    mov v0.b[15], w8
3997; CHECK-GI-NEXT:    mov v1.b[7], w9
3998; CHECK-GI-NEXT:    saddlv h0, v0.16b
3999; CHECK-GI-NEXT:    saddlv h1, v1.8b
4000; CHECK-GI-NEXT:    fmov w8, s0
4001; CHECK-GI-NEXT:    fmov w9, s1
4002; CHECK-GI-NEXT:    add w0, w8, w9
4003; CHECK-GI-NEXT:    ret
4004entry:
  ; Sign-extend all 24 lanes to i16, then sum them.
4005  %xx = sext <24 x i8> %x to <24 x i16>
4006  %z = call i16 @llvm.vector.reduce.add.v24i16(<24 x i16> %xx)
4007  ret i16 %z
4008}
4009
; Single reduction over a 32-element vector: %x is sign-extended from
; <32 x i8> to <32 x i16> and summed with llvm.vector.reduce.add.v32i16.
; The expected output takes the input in v0 and v1. SelectionDAG is expected
; to combine the two with saddl2/saddl before one addv; GlobalISel is expected
; to reduce each register with saddlv and add the scalars.
4010define i16 @add_v32i8_v32i16_sext(<32 x i8> %x) {
4011; CHECK-SD-LABEL: add_v32i8_v32i16_sext:
4012; CHECK-SD:       // %bb.0: // %entry
4013; CHECK-SD-NEXT:    saddl2 v2.8h, v0.16b, v1.16b
4014; CHECK-SD-NEXT:    saddl v0.8h, v0.8b, v1.8b
4015; CHECK-SD-NEXT:    add v0.8h, v0.8h, v2.8h
4016; CHECK-SD-NEXT:    addv h0, v0.8h
4017; CHECK-SD-NEXT:    fmov w0, s0
4018; CHECK-SD-NEXT:    ret
4019;
4020; CHECK-GI-LABEL: add_v32i8_v32i16_sext:
4021; CHECK-GI:       // %bb.0: // %entry
4022; CHECK-GI-NEXT:    saddlv h0, v0.16b
4023; CHECK-GI-NEXT:    saddlv h1, v1.16b
4024; CHECK-GI-NEXT:    fmov w8, s0
4025; CHECK-GI-NEXT:    fmov w9, s1
4026; CHECK-GI-NEXT:    add w0, w8, w9
4027; CHECK-GI-NEXT:    ret
4028entry:
  ; Sign-extend all 32 lanes to i16, then sum them.
4029  %xx = sext <32 x i8> %x to <32 x i16>
4030  %z = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %xx)
4031  ret i16 %z
4032}
4033
4034; Irregularly sized vectors and larger extends
4035define i32 @add_v24i8_v24i32_zext(<24 x i8> %x) {
4036; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_zext:
4037; CHECK-SD-BASE:       // %bb.0: // %entry
4038; CHECK-SD-BASE-NEXT:    fmov s0, w0
4039; CHECK-SD-BASE-NEXT:    ldr b1, [sp, #64]
4040; CHECK-SD-BASE-NEXT:    add x8, sp, #72
4041; CHECK-SD-BASE-NEXT:    ldr b2, [sp]
4042; CHECK-SD-BASE-NEXT:    add x9, sp, #80
4043; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[1], [x8]
4044; CHECK-SD-BASE-NEXT:    add x8, sp, #8
4045; CHECK-SD-BASE-NEXT:    mov v0.b[1], w1
4046; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[1], [x8]
4047; CHECK-SD-BASE-NEXT:    add x8, sp, #16
4048; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[2], [x9]
4049; CHECK-SD-BASE-NEXT:    add x9, sp, #88
4050; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[2], [x8]
4051; CHECK-SD-BASE-NEXT:    add x8, sp, #24
4052; CHECK-SD-BASE-NEXT:    mov v0.b[2], w2
4053; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[3], [x9]
4054; CHECK-SD-BASE-NEXT:    add x9, sp, #96
4055; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[3], [x8]
4056; CHECK-SD-BASE-NEXT:    add x8, sp, #32
4057; CHECK-SD-BASE-NEXT:    mov v0.b[3], w3
4058; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[4], [x9]
4059; CHECK-SD-BASE-NEXT:    add x9, sp, #104
4060; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[4], [x8]
4061; CHECK-SD-BASE-NEXT:    add x8, sp, #40
4062; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[5], [x9]
4063; CHECK-SD-BASE-NEXT:    add x9, sp, #112
4064; CHECK-SD-BASE-NEXT:    mov v0.b[4], w4
4065; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[5], [x8]
4066; CHECK-SD-BASE-NEXT:    add x8, sp, #48
4067; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[6], [x9]
4068; CHECK-SD-BASE-NEXT:    add x9, sp, #120
4069; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[6], [x8]
4070; CHECK-SD-BASE-NEXT:    add x8, sp, #56
4071; CHECK-SD-BASE-NEXT:    mov v0.b[5], w5
4072; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[7], [x9]
4073; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[7], [x8]
4074; CHECK-SD-BASE-NEXT:    mov v0.b[6], w6
4075; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
4076; CHECK-SD-BASE-NEXT:    ushll v2.8h, v2.8b, #0
4077; CHECK-SD-BASE-NEXT:    mov v0.b[7], w7
4078; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
4079; CHECK-SD-BASE-NEXT:    uaddl2 v3.4s, v0.8h, v1.8h
4080; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
4081; CHECK-SD-BASE-NEXT:    uaddw2 v1.4s, v3.4s, v2.8h
4082; CHECK-SD-BASE-NEXT:    uaddw v0.4s, v0.4s, v2.4h
4083; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
4084; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
4085; CHECK-SD-BASE-NEXT:    fmov w0, s0
4086; CHECK-SD-BASE-NEXT:    ret
4087;
4088; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_zext:
4089; CHECK-SD-DOT:       // %bb.0: // %entry
4090; CHECK-SD-DOT-NEXT:    fmov s0, w0
4091; CHECK-SD-DOT-NEXT:    mov x8, sp
4092; CHECK-SD-DOT-NEXT:    ldr b1, [sp, #64]
4093; CHECK-SD-DOT-NEXT:    add x9, sp, #72
4094; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
4095; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
4096; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[1], [x9]
4097; CHECK-SD-DOT-NEXT:    add x9, sp, #80
4098; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000
4099; CHECK-SD-DOT-NEXT:    mov v0.b[1], w1
4100; CHECK-SD-DOT-NEXT:    movi v5.8b, #1
4101; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[2], [x9]
4102; CHECK-SD-DOT-NEXT:    add x9, sp, #88
4103; CHECK-SD-DOT-NEXT:    mov v0.b[2], w2
4104; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[3], [x9]
4105; CHECK-SD-DOT-NEXT:    add x9, sp, #96
4106; CHECK-SD-DOT-NEXT:    mov v0.b[3], w3
4107; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[4], [x9]
4108; CHECK-SD-DOT-NEXT:    add x9, sp, #104
4109; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[5], [x9]
4110; CHECK-SD-DOT-NEXT:    add x9, sp, #112
4111; CHECK-SD-DOT-NEXT:    mov v0.b[4], w4
4112; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[6], [x9]
4113; CHECK-SD-DOT-NEXT:    add x9, sp, #120
4114; CHECK-SD-DOT-NEXT:    mov v0.b[5], w5
4115; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[7], [x9]
4116; CHECK-SD-DOT-NEXT:    mov v0.b[6], w6
4117; CHECK-SD-DOT-NEXT:    udot v4.2s, v1.8b, v5.8b
4118; CHECK-SD-DOT-NEXT:    mov v0.b[7], w7
4119; CHECK-SD-DOT-NEXT:    addp v1.2s, v4.2s, v4.2s
4120; CHECK-SD-DOT-NEXT:    fmov w9, s1
4121; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[8], [x8]
4122; CHECK-SD-DOT-NEXT:    add x8, sp, #8
4123; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[9], [x8]
4124; CHECK-SD-DOT-NEXT:    add x8, sp, #16
4125; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[10], [x8]
4126; CHECK-SD-DOT-NEXT:    add x8, sp, #24
4127; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[11], [x8]
4128; CHECK-SD-DOT-NEXT:    add x8, sp, #32
4129; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[12], [x8]
4130; CHECK-SD-DOT-NEXT:    add x8, sp, #40
4131; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[13], [x8]
4132; CHECK-SD-DOT-NEXT:    add x8, sp, #48
4133; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[14], [x8]
4134; CHECK-SD-DOT-NEXT:    add x8, sp, #56
4135; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[15], [x8]
4136; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b
4137; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
4138; CHECK-SD-DOT-NEXT:    fmov w8, s0
4139; CHECK-SD-DOT-NEXT:    add w0, w8, w9
4140; CHECK-SD-DOT-NEXT:    ret
4141;
4142; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_zext:
4143; CHECK-GI-BASE:       // %bb.0: // %entry
4144; CHECK-GI-BASE-NEXT:    fmov s0, w0
4145; CHECK-GI-BASE-NEXT:    ldr w8, [sp]
4146; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #8]
4147; CHECK-GI-BASE-NEXT:    ldr w10, [sp, #72]
4148; CHECK-GI-BASE-NEXT:    mov v0.b[1], w1
4149; CHECK-GI-BASE-NEXT:    mov v0.b[2], w2
4150; CHECK-GI-BASE-NEXT:    mov v0.b[3], w3
4151; CHECK-GI-BASE-NEXT:    mov v0.b[4], w4
4152; CHECK-GI-BASE-NEXT:    mov v0.b[5], w5
4153; CHECK-GI-BASE-NEXT:    mov v0.b[6], w6
4154; CHECK-GI-BASE-NEXT:    mov v0.b[7], w7
4155; CHECK-GI-BASE-NEXT:    mov v0.b[8], w8
4156; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #64]
4157; CHECK-GI-BASE-NEXT:    fmov s1, w8
4158; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #16]
4159; CHECK-GI-BASE-NEXT:    mov v0.b[9], w9
4160; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #80]
4161; CHECK-GI-BASE-NEXT:    mov v1.b[1], w10
4162; CHECK-GI-BASE-NEXT:    mov v0.b[10], w8
4163; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #24]
4164; CHECK-GI-BASE-NEXT:    mov v1.b[2], w9
4165; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88]
4166; CHECK-GI-BASE-NEXT:    mov v0.b[11], w8
4167; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #32]
4168; CHECK-GI-BASE-NEXT:    mov v1.b[3], w9
4169; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96]
4170; CHECK-GI-BASE-NEXT:    mov v0.b[12], w8
4171; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #40]
4172; CHECK-GI-BASE-NEXT:    mov v1.b[4], w9
4173; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104]
4174; CHECK-GI-BASE-NEXT:    mov v0.b[13], w8
4175; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #48]
4176; CHECK-GI-BASE-NEXT:    mov v1.b[5], w9
4177; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112]
4178; CHECK-GI-BASE-NEXT:    mov v0.b[14], w8
4179; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #56]
4180; CHECK-GI-BASE-NEXT:    mov v1.b[6], w9
4181; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120]
4182; CHECK-GI-BASE-NEXT:    mov v0.b[15], w8
4183; CHECK-GI-BASE-NEXT:    mov v1.b[7], w9
4184; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
4185; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.8b
4186; CHECK-GI-BASE-NEXT:    fmov w8, s0
4187; CHECK-GI-BASE-NEXT:    fmov w9, s1
4188; CHECK-GI-BASE-NEXT:    add w8, w8, w9
4189; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
4190; CHECK-GI-BASE-NEXT:    ret
4191;
4192; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_zext:
4193; CHECK-GI-DOT:       // %bb.0: // %entry
4194; CHECK-GI-DOT-NEXT:    fmov s0, w0
4195; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #64]
4196; CHECK-GI-DOT-NEXT:    ldr w8, [sp]
4197; CHECK-GI-DOT-NEXT:    ldr w10, [sp, #72]
4198; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
4199; CHECK-GI-DOT-NEXT:    movi v3.8b, #1
4200; CHECK-GI-DOT-NEXT:    fmov s1, w9
4201; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #80]
4202; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
4203; CHECK-GI-DOT-NEXT:    mov v0.b[1], w1
4204; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
4205; CHECK-GI-DOT-NEXT:    mov v1.b[1], w10
4206; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0]
4207; CHECK-GI-DOT-NEXT:    mov v0.b[2], w2
4208; CHECK-GI-DOT-NEXT:    mov v1.b[2], w9
4209; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #88]
4210; CHECK-GI-DOT-NEXT:    mov v0.b[3], w3
4211; CHECK-GI-DOT-NEXT:    mov v1.b[3], w9
4212; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #96]
4213; CHECK-GI-DOT-NEXT:    mov v0.b[4], w4
4214; CHECK-GI-DOT-NEXT:    mov v1.b[4], w9
4215; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #104]
4216; CHECK-GI-DOT-NEXT:    mov v0.b[5], w5
4217; CHECK-GI-DOT-NEXT:    mov v1.b[5], w9
4218; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #112]
4219; CHECK-GI-DOT-NEXT:    mov v0.b[6], w6
4220; CHECK-GI-DOT-NEXT:    mov v1.b[6], w9
4221; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #120]
4222; CHECK-GI-DOT-NEXT:    mov v0.b[7], w7
4223; CHECK-GI-DOT-NEXT:    mov v1.b[7], w9
4224; CHECK-GI-DOT-NEXT:    mov v0.b[8], w8
4225; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8]
4226; CHECK-GI-DOT-NEXT:    fmov d1, d1
4227; CHECK-GI-DOT-NEXT:    mov v0.b[9], w8
4228; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16]
4229; CHECK-GI-DOT-NEXT:    udot v4.4s, v1.16b, v2.16b
4230; CHECK-GI-DOT-NEXT:    mov v0.b[10], w8
4231; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24]
4232; CHECK-GI-DOT-NEXT:    mov v0.b[11], w8
4233; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32]
4234; CHECK-GI-DOT-NEXT:    mov v0.b[12], w8
4235; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40]
4236; CHECK-GI-DOT-NEXT:    mov v0.b[13], w8
4237; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #48]
4238; CHECK-GI-DOT-NEXT:    mov v0.b[14], w8
4239; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #56]
4240; CHECK-GI-DOT-NEXT:    mov v0.b[15], w8
4241; CHECK-GI-DOT-NEXT:    udot v5.4s, v0.16b, v3.16b
4242; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s
4243; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
4244; CHECK-GI-DOT-NEXT:    fmov w0, s0
4245; CHECK-GI-DOT-NEXT:    ret
4246entry:
4247  %xx = zext <24 x i8> %x to <24 x i32>
4248  %z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx)
4249  ret i32 %z
4250}
4251
; Adds up all lanes of <32 x i8> %x after zero-extending each lane to i32.
; The expected AArch64 output is pinned for all four configurations from the
; RUN lines (default isel and -global-isel, each with and without +dotprod):
; the +dotprod variants reduce with udot against an all-ones byte vector, the
; default-isel base variant uses ushll/uaddl widening adds, and the
; -global-isel base variant uses uaddlv per 16-byte half plus a 16-bit mask.
4252define i32 @add_v32i8_v32i32_zext(<32 x i8> %x) {
4253; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_zext:
4254; CHECK-SD-BASE:       // %bb.0: // %entry
4255; CHECK-SD-BASE-NEXT:    ushll2 v2.8h, v1.16b, #0
4256; CHECK-SD-BASE-NEXT:    ushll2 v3.8h, v0.16b, #0
4257; CHECK-SD-BASE-NEXT:    ushll v1.8h, v1.8b, #0
4258; CHECK-SD-BASE-NEXT:    ushll v0.8h, v0.8b, #0
4259; CHECK-SD-BASE-NEXT:    uaddl2 v4.4s, v3.8h, v2.8h
4260; CHECK-SD-BASE-NEXT:    uaddl v2.4s, v3.4h, v2.4h
4261; CHECK-SD-BASE-NEXT:    uaddl2 v5.4s, v0.8h, v1.8h
4262; CHECK-SD-BASE-NEXT:    uaddl v0.4s, v0.4h, v1.4h
4263; CHECK-SD-BASE-NEXT:    add v1.4s, v5.4s, v4.4s
4264; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
4265; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
4266; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
4267; CHECK-SD-BASE-NEXT:    fmov w0, s0
4268; CHECK-SD-BASE-NEXT:    ret
4269;
4270; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_zext:
4271; CHECK-SD-DOT:       // %bb.0: // %entry
4272; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
4273; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
4274; CHECK-SD-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
4275; CHECK-SD-DOT-NEXT:    udot v3.4s, v0.16b, v2.16b
4276; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
4277; CHECK-SD-DOT-NEXT:    fmov w0, s0
4278; CHECK-SD-DOT-NEXT:    ret
4279;
4280; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_zext:
4281; CHECK-GI-BASE:       // %bb.0: // %entry
4282; CHECK-GI-BASE-NEXT:    uaddlv h0, v0.16b
4283; CHECK-GI-BASE-NEXT:    uaddlv h1, v1.16b
4284; CHECK-GI-BASE-NEXT:    fmov w8, s0
4285; CHECK-GI-BASE-NEXT:    fmov w9, s1
4286; CHECK-GI-BASE-NEXT:    add w8, w8, w9
4287; CHECK-GI-BASE-NEXT:    and w0, w8, #0xffff
4288; CHECK-GI-BASE-NEXT:    ret
4289;
4290; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_zext:
4291; CHECK-GI-DOT:       // %bb.0: // %entry
4292; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
4293; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
4294; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
4295; CHECK-GI-DOT-NEXT:    udot v4.4s, v0.16b, v2.16b
4296; CHECK-GI-DOT-NEXT:    udot v3.4s, v1.16b, v2.16b
4297; CHECK-GI-DOT-NEXT:    add v0.4s, v4.4s, v3.4s
4298; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
4299; CHECK-GI-DOT-NEXT:    fmov w0, s0
4300; CHECK-GI-DOT-NEXT:    ret
4301entry:
4302  %xx = zext <32 x i8> %x to <32 x i32>
4303  %z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx)
4304  ret i32 %z
4305}
4306
; Adds up all lanes of <24 x i8> %x after sign-extending each lane to i32.
; In the expected output the 24 byte elements are gathered from w0-w7 and from
; stack slots at [sp] .. [sp, #120] before being reduced. The +dotprod
; variants reduce with sdot, the default-isel base variant uses
; sshll/saddl/saddw widening adds, and the -global-isel base variant uses
; saddlv on a 16-byte and an 8-byte piece followed by sxth.
4307define i32 @add_v24i8_v24i32_sext(<24 x i8> %x) {
4308; CHECK-SD-BASE-LABEL: add_v24i8_v24i32_sext:
4309; CHECK-SD-BASE:       // %bb.0: // %entry
4310; CHECK-SD-BASE-NEXT:    fmov s0, w0
4311; CHECK-SD-BASE-NEXT:    ldr b1, [sp, #64]
4312; CHECK-SD-BASE-NEXT:    add x8, sp, #72
4313; CHECK-SD-BASE-NEXT:    ldr b2, [sp]
4314; CHECK-SD-BASE-NEXT:    add x9, sp, #80
4315; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[1], [x8]
4316; CHECK-SD-BASE-NEXT:    add x8, sp, #8
4317; CHECK-SD-BASE-NEXT:    mov v0.b[1], w1
4318; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[1], [x8]
4319; CHECK-SD-BASE-NEXT:    add x8, sp, #16
4320; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[2], [x9]
4321; CHECK-SD-BASE-NEXT:    add x9, sp, #88
4322; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[2], [x8]
4323; CHECK-SD-BASE-NEXT:    add x8, sp, #24
4324; CHECK-SD-BASE-NEXT:    mov v0.b[2], w2
4325; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[3], [x9]
4326; CHECK-SD-BASE-NEXT:    add x9, sp, #96
4327; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[3], [x8]
4328; CHECK-SD-BASE-NEXT:    add x8, sp, #32
4329; CHECK-SD-BASE-NEXT:    mov v0.b[3], w3
4330; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[4], [x9]
4331; CHECK-SD-BASE-NEXT:    add x9, sp, #104
4332; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[4], [x8]
4333; CHECK-SD-BASE-NEXT:    add x8, sp, #40
4334; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[5], [x9]
4335; CHECK-SD-BASE-NEXT:    add x9, sp, #112
4336; CHECK-SD-BASE-NEXT:    mov v0.b[4], w4
4337; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[5], [x8]
4338; CHECK-SD-BASE-NEXT:    add x8, sp, #48
4339; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[6], [x9]
4340; CHECK-SD-BASE-NEXT:    add x9, sp, #120
4341; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[6], [x8]
4342; CHECK-SD-BASE-NEXT:    add x8, sp, #56
4343; CHECK-SD-BASE-NEXT:    mov v0.b[5], w5
4344; CHECK-SD-BASE-NEXT:    ld1 { v1.b }[7], [x9]
4345; CHECK-SD-BASE-NEXT:    ld1 { v2.b }[7], [x8]
4346; CHECK-SD-BASE-NEXT:    mov v0.b[6], w6
4347; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
4348; CHECK-SD-BASE-NEXT:    sshll v2.8h, v2.8b, #0
4349; CHECK-SD-BASE-NEXT:    mov v0.b[7], w7
4350; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
4351; CHECK-SD-BASE-NEXT:    saddl2 v3.4s, v0.8h, v1.8h
4352; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
4353; CHECK-SD-BASE-NEXT:    saddw2 v1.4s, v3.4s, v2.8h
4354; CHECK-SD-BASE-NEXT:    saddw v0.4s, v0.4s, v2.4h
4355; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
4356; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
4357; CHECK-SD-BASE-NEXT:    fmov w0, s0
4358; CHECK-SD-BASE-NEXT:    ret
4359;
4360; CHECK-SD-DOT-LABEL: add_v24i8_v24i32_sext:
4361; CHECK-SD-DOT:       // %bb.0: // %entry
4362; CHECK-SD-DOT-NEXT:    fmov s0, w0
4363; CHECK-SD-DOT-NEXT:    mov x8, sp
4364; CHECK-SD-DOT-NEXT:    ldr b1, [sp, #64]
4365; CHECK-SD-DOT-NEXT:    add x9, sp, #72
4366; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
4367; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
4368; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[1], [x9]
4369; CHECK-SD-DOT-NEXT:    add x9, sp, #80
4370; CHECK-SD-DOT-NEXT:    movi v4.2d, #0000000000000000
4371; CHECK-SD-DOT-NEXT:    mov v0.b[1], w1
4372; CHECK-SD-DOT-NEXT:    movi v5.8b, #1
4373; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[2], [x9]
4374; CHECK-SD-DOT-NEXT:    add x9, sp, #88
4375; CHECK-SD-DOT-NEXT:    mov v0.b[2], w2
4376; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[3], [x9]
4377; CHECK-SD-DOT-NEXT:    add x9, sp, #96
4378; CHECK-SD-DOT-NEXT:    mov v0.b[3], w3
4379; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[4], [x9]
4380; CHECK-SD-DOT-NEXT:    add x9, sp, #104
4381; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[5], [x9]
4382; CHECK-SD-DOT-NEXT:    add x9, sp, #112
4383; CHECK-SD-DOT-NEXT:    mov v0.b[4], w4
4384; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[6], [x9]
4385; CHECK-SD-DOT-NEXT:    add x9, sp, #120
4386; CHECK-SD-DOT-NEXT:    mov v0.b[5], w5
4387; CHECK-SD-DOT-NEXT:    ld1 { v1.b }[7], [x9]
4388; CHECK-SD-DOT-NEXT:    mov v0.b[6], w6
4389; CHECK-SD-DOT-NEXT:    sdot v4.2s, v1.8b, v5.8b
4390; CHECK-SD-DOT-NEXT:    mov v0.b[7], w7
4391; CHECK-SD-DOT-NEXT:    addp v1.2s, v4.2s, v4.2s
4392; CHECK-SD-DOT-NEXT:    fmov w9, s1
4393; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[8], [x8]
4394; CHECK-SD-DOT-NEXT:    add x8, sp, #8
4395; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[9], [x8]
4396; CHECK-SD-DOT-NEXT:    add x8, sp, #16
4397; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[10], [x8]
4398; CHECK-SD-DOT-NEXT:    add x8, sp, #24
4399; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[11], [x8]
4400; CHECK-SD-DOT-NEXT:    add x8, sp, #32
4401; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[12], [x8]
4402; CHECK-SD-DOT-NEXT:    add x8, sp, #40
4403; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[13], [x8]
4404; CHECK-SD-DOT-NEXT:    add x8, sp, #48
4405; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[14], [x8]
4406; CHECK-SD-DOT-NEXT:    add x8, sp, #56
4407; CHECK-SD-DOT-NEXT:    ld1 { v0.b }[15], [x8]
4408; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
4409; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
4410; CHECK-SD-DOT-NEXT:    fmov w8, s0
4411; CHECK-SD-DOT-NEXT:    add w0, w8, w9
4412; CHECK-SD-DOT-NEXT:    ret
4413;
4414; CHECK-GI-BASE-LABEL: add_v24i8_v24i32_sext:
4415; CHECK-GI-BASE:       // %bb.0: // %entry
4416; CHECK-GI-BASE-NEXT:    fmov s0, w0
4417; CHECK-GI-BASE-NEXT:    ldr w8, [sp]
4418; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #8]
4419; CHECK-GI-BASE-NEXT:    ldr w10, [sp, #72]
4420; CHECK-GI-BASE-NEXT:    mov v0.b[1], w1
4421; CHECK-GI-BASE-NEXT:    mov v0.b[2], w2
4422; CHECK-GI-BASE-NEXT:    mov v0.b[3], w3
4423; CHECK-GI-BASE-NEXT:    mov v0.b[4], w4
4424; CHECK-GI-BASE-NEXT:    mov v0.b[5], w5
4425; CHECK-GI-BASE-NEXT:    mov v0.b[6], w6
4426; CHECK-GI-BASE-NEXT:    mov v0.b[7], w7
4427; CHECK-GI-BASE-NEXT:    mov v0.b[8], w8
4428; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #64]
4429; CHECK-GI-BASE-NEXT:    fmov s1, w8
4430; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #16]
4431; CHECK-GI-BASE-NEXT:    mov v0.b[9], w9
4432; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #80]
4433; CHECK-GI-BASE-NEXT:    mov v1.b[1], w10
4434; CHECK-GI-BASE-NEXT:    mov v0.b[10], w8
4435; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #24]
4436; CHECK-GI-BASE-NEXT:    mov v1.b[2], w9
4437; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #88]
4438; CHECK-GI-BASE-NEXT:    mov v0.b[11], w8
4439; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #32]
4440; CHECK-GI-BASE-NEXT:    mov v1.b[3], w9
4441; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #96]
4442; CHECK-GI-BASE-NEXT:    mov v0.b[12], w8
4443; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #40]
4444; CHECK-GI-BASE-NEXT:    mov v1.b[4], w9
4445; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #104]
4446; CHECK-GI-BASE-NEXT:    mov v0.b[13], w8
4447; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #48]
4448; CHECK-GI-BASE-NEXT:    mov v1.b[5], w9
4449; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #112]
4450; CHECK-GI-BASE-NEXT:    mov v0.b[14], w8
4451; CHECK-GI-BASE-NEXT:    ldr w8, [sp, #56]
4452; CHECK-GI-BASE-NEXT:    mov v1.b[6], w9
4453; CHECK-GI-BASE-NEXT:    ldr w9, [sp, #120]
4454; CHECK-GI-BASE-NEXT:    mov v0.b[15], w8
4455; CHECK-GI-BASE-NEXT:    mov v1.b[7], w9
4456; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
4457; CHECK-GI-BASE-NEXT:    saddlv h1, v1.8b
4458; CHECK-GI-BASE-NEXT:    fmov w8, s0
4459; CHECK-GI-BASE-NEXT:    fmov w9, s1
4460; CHECK-GI-BASE-NEXT:    add w8, w8, w9
4461; CHECK-GI-BASE-NEXT:    sxth w0, w8
4462; CHECK-GI-BASE-NEXT:    ret
4463;
4464; CHECK-GI-DOT-LABEL: add_v24i8_v24i32_sext:
4465; CHECK-GI-DOT:       // %bb.0: // %entry
4466; CHECK-GI-DOT-NEXT:    fmov s0, w0
4467; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #64]
4468; CHECK-GI-DOT-NEXT:    ldr w8, [sp]
4469; CHECK-GI-DOT-NEXT:    ldr w10, [sp, #72]
4470; CHECK-GI-DOT-NEXT:    movi v2.8b, #1
4471; CHECK-GI-DOT-NEXT:    movi v3.8b, #1
4472; CHECK-GI-DOT-NEXT:    fmov s1, w9
4473; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #80]
4474; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
4475; CHECK-GI-DOT-NEXT:    mov v0.b[1], w1
4476; CHECK-GI-DOT-NEXT:    movi v5.2d, #0000000000000000
4477; CHECK-GI-DOT-NEXT:    mov v1.b[1], w10
4478; CHECK-GI-DOT-NEXT:    mov v3.d[1], v2.d[0]
4479; CHECK-GI-DOT-NEXT:    mov v0.b[2], w2
4480; CHECK-GI-DOT-NEXT:    mov v1.b[2], w9
4481; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #88]
4482; CHECK-GI-DOT-NEXT:    mov v0.b[3], w3
4483; CHECK-GI-DOT-NEXT:    mov v1.b[3], w9
4484; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #96]
4485; CHECK-GI-DOT-NEXT:    mov v0.b[4], w4
4486; CHECK-GI-DOT-NEXT:    mov v1.b[4], w9
4487; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #104]
4488; CHECK-GI-DOT-NEXT:    mov v0.b[5], w5
4489; CHECK-GI-DOT-NEXT:    mov v1.b[5], w9
4490; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #112]
4491; CHECK-GI-DOT-NEXT:    mov v0.b[6], w6
4492; CHECK-GI-DOT-NEXT:    mov v1.b[6], w9
4493; CHECK-GI-DOT-NEXT:    ldr w9, [sp, #120]
4494; CHECK-GI-DOT-NEXT:    mov v0.b[7], w7
4495; CHECK-GI-DOT-NEXT:    mov v1.b[7], w9
4496; CHECK-GI-DOT-NEXT:    mov v0.b[8], w8
4497; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #8]
4498; CHECK-GI-DOT-NEXT:    fmov d1, d1
4499; CHECK-GI-DOT-NEXT:    mov v0.b[9], w8
4500; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #16]
4501; CHECK-GI-DOT-NEXT:    sdot v4.4s, v1.16b, v2.16b
4502; CHECK-GI-DOT-NEXT:    mov v0.b[10], w8
4503; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #24]
4504; CHECK-GI-DOT-NEXT:    mov v0.b[11], w8
4505; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #32]
4506; CHECK-GI-DOT-NEXT:    mov v0.b[12], w8
4507; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #40]
4508; CHECK-GI-DOT-NEXT:    mov v0.b[13], w8
4509; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #48]
4510; CHECK-GI-DOT-NEXT:    mov v0.b[14], w8
4511; CHECK-GI-DOT-NEXT:    ldr w8, [sp, #56]
4512; CHECK-GI-DOT-NEXT:    mov v0.b[15], w8
4513; CHECK-GI-DOT-NEXT:    sdot v5.4s, v0.16b, v3.16b
4514; CHECK-GI-DOT-NEXT:    add v0.4s, v5.4s, v4.4s
4515; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
4516; CHECK-GI-DOT-NEXT:    fmov w0, s0
4517; CHECK-GI-DOT-NEXT:    ret
4518entry:
4519  %xx = sext <24 x i8> %x to <24 x i32>
4520  %z = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %xx)
4521  ret i32 %z
4522}
4523
; Adds up all lanes of <32 x i8> %x after sign-extending each lane to i32.
; The expected AArch64 output is pinned for all four configurations from the
; RUN lines (default isel and -global-isel, each with and without +dotprod):
; the +dotprod variants reduce with sdot against an all-ones byte vector, the
; default-isel base variant uses sshll/saddl widening adds, and the
; -global-isel base variant uses saddlv per 16-byte half followed by sxth.
4524define i32 @add_v32i8_v32i32_sext(<32 x i8> %x) {
4525; CHECK-SD-BASE-LABEL: add_v32i8_v32i32_sext:
4526; CHECK-SD-BASE:       // %bb.0: // %entry
4527; CHECK-SD-BASE-NEXT:    sshll2 v2.8h, v1.16b, #0
4528; CHECK-SD-BASE-NEXT:    sshll2 v3.8h, v0.16b, #0
4529; CHECK-SD-BASE-NEXT:    sshll v1.8h, v1.8b, #0
4530; CHECK-SD-BASE-NEXT:    sshll v0.8h, v0.8b, #0
4531; CHECK-SD-BASE-NEXT:    saddl2 v4.4s, v3.8h, v2.8h
4532; CHECK-SD-BASE-NEXT:    saddl v2.4s, v3.4h, v2.4h
4533; CHECK-SD-BASE-NEXT:    saddl2 v5.4s, v0.8h, v1.8h
4534; CHECK-SD-BASE-NEXT:    saddl v0.4s, v0.4h, v1.4h
4535; CHECK-SD-BASE-NEXT:    add v1.4s, v5.4s, v4.4s
4536; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v2.4s
4537; CHECK-SD-BASE-NEXT:    add v0.4s, v0.4s, v1.4s
4538; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
4539; CHECK-SD-BASE-NEXT:    fmov w0, s0
4540; CHECK-SD-BASE-NEXT:    ret
4541;
4542; CHECK-SD-DOT-LABEL: add_v32i8_v32i32_sext:
4543; CHECK-SD-DOT:       // %bb.0: // %entry
4544; CHECK-SD-DOT-NEXT:    movi v2.16b, #1
4545; CHECK-SD-DOT-NEXT:    movi v3.2d, #0000000000000000
4546; CHECK-SD-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
4547; CHECK-SD-DOT-NEXT:    sdot v3.4s, v0.16b, v2.16b
4548; CHECK-SD-DOT-NEXT:    addv s0, v3.4s
4549; CHECK-SD-DOT-NEXT:    fmov w0, s0
4550; CHECK-SD-DOT-NEXT:    ret
4551;
4552; CHECK-GI-BASE-LABEL: add_v32i8_v32i32_sext:
4553; CHECK-GI-BASE:       // %bb.0: // %entry
4554; CHECK-GI-BASE-NEXT:    saddlv h0, v0.16b
4555; CHECK-GI-BASE-NEXT:    saddlv h1, v1.16b
4556; CHECK-GI-BASE-NEXT:    fmov w8, s0
4557; CHECK-GI-BASE-NEXT:    fmov w9, s1
4558; CHECK-GI-BASE-NEXT:    add w8, w8, w9
4559; CHECK-GI-BASE-NEXT:    sxth w0, w8
4560; CHECK-GI-BASE-NEXT:    ret
4561;
4562; CHECK-GI-DOT-LABEL: add_v32i8_v32i32_sext:
4563; CHECK-GI-DOT:       // %bb.0: // %entry
4564; CHECK-GI-DOT-NEXT:    movi v2.16b, #1
4565; CHECK-GI-DOT-NEXT:    movi v3.2d, #0000000000000000
4566; CHECK-GI-DOT-NEXT:    movi v4.2d, #0000000000000000
4567; CHECK-GI-DOT-NEXT:    sdot v4.4s, v0.16b, v2.16b
4568; CHECK-GI-DOT-NEXT:    sdot v3.4s, v1.16b, v2.16b
4569; CHECK-GI-DOT-NEXT:    add v0.4s, v4.4s, v3.4s
4570; CHECK-GI-DOT-NEXT:    addv s0, v0.4s
4571; CHECK-GI-DOT-NEXT:    fmov w0, s0
4572; CHECK-GI-DOT-NEXT:    ret
4573entry:
4574  %xx = sext <32 x i8> %x to <32 x i32>
4575  %z = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %xx)
4576  ret i32 %z
4577}
4578
4579define i32 @full(ptr %p1, i32 noundef %s1, ptr %p2, i32 noundef %s2) {
4580; CHECK-SD-BASE-LABEL: full:
4581; CHECK-SD-BASE:       // %bb.0: // %entry
4582; CHECK-SD-BASE-NEXT:    // kill: def $w3 killed $w3 def $x3
4583; CHECK-SD-BASE-NEXT:    // kill: def $w1 killed $w1 def $x1
4584; CHECK-SD-BASE-NEXT:    sxtw x8, w3
4585; CHECK-SD-BASE-NEXT:    sxtw x9, w1
4586; CHECK-SD-BASE-NEXT:    ldr d0, [x0]
4587; CHECK-SD-BASE-NEXT:    ldr d1, [x2]
4588; CHECK-SD-BASE-NEXT:    add x10, x0, x9
4589; CHECK-SD-BASE-NEXT:    add x11, x2, x8
4590; CHECK-SD-BASE-NEXT:    uabdl v0.8h, v0.8b, v1.8b
4591; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
4592; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
4593; CHECK-SD-BASE-NEXT:    add x10, x10, x9
4594; CHECK-SD-BASE-NEXT:    add x11, x11, x8
4595; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
4596; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
4597; CHECK-SD-BASE-NEXT:    add x11, x11, x8
4598; CHECK-SD-BASE-NEXT:    uaddlp v0.4s, v0.8h
4599; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
4600; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
4601; CHECK-SD-BASE-NEXT:    add x10, x10, x9
4602; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
4603; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
4604; CHECK-SD-BASE-NEXT:    add x11, x11, x8
4605; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
4606; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
4607; CHECK-SD-BASE-NEXT:    add x10, x10, x9
4608; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
4609; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
4610; CHECK-SD-BASE-NEXT:    add x11, x11, x8
4611; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
4612; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
4613; CHECK-SD-BASE-NEXT:    add x10, x10, x9
4614; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
4615; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
4616; CHECK-SD-BASE-NEXT:    add x11, x11, x8
4617; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
4618; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
4619; CHECK-SD-BASE-NEXT:    add x10, x10, x9
4620; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
4621; CHECK-SD-BASE-NEXT:    ldr d2, [x11]
4622; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
4623; CHECK-SD-BASE-NEXT:    ldr d1, [x10]
4624; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
4625; CHECK-SD-BASE-NEXT:    ldr d2, [x11, x8]
4626; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
4627; CHECK-SD-BASE-NEXT:    ldr d1, [x10, x9]
4628; CHECK-SD-BASE-NEXT:    uabdl v1.8h, v1.8b, v2.8b
4629; CHECK-SD-BASE-NEXT:    uadalp v0.4s, v1.8h
4630; CHECK-SD-BASE-NEXT:    addv s0, v0.4s
4631; CHECK-SD-BASE-NEXT:    fmov w0, s0
4632; CHECK-SD-BASE-NEXT:    ret
4633;
4634; CHECK-SD-DOT-LABEL: full:
4635; CHECK-SD-DOT:       // %bb.0: // %entry
4636; CHECK-SD-DOT-NEXT:    ldr d0, [x0]
4637; CHECK-SD-DOT-NEXT:    ldr d1, [x2]
4638; CHECK-SD-DOT-NEXT:    // kill: def $w3 killed $w3 def $x3
4639; CHECK-SD-DOT-NEXT:    // kill: def $w1 killed $w1 def $x1
4640; CHECK-SD-DOT-NEXT:    sxtw x8, w3
4641; CHECK-SD-DOT-NEXT:    sxtw x9, w1
4642; CHECK-SD-DOT-NEXT:    movi v2.2d, #0000000000000000
4643; CHECK-SD-DOT-NEXT:    movi v3.8b, #1
4644; CHECK-SD-DOT-NEXT:    uabd v0.8b, v0.8b, v1.8b
4645; CHECK-SD-DOT-NEXT:    add x11, x2, x8
4646; CHECK-SD-DOT-NEXT:    add x10, x0, x9
4647; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
4648; CHECK-SD-DOT-NEXT:    add x11, x11, x8
4649; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
4650; CHECK-SD-DOT-NEXT:    add x10, x10, x9
4651; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
4652; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
4653; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
4654; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
4655; CHECK-SD-DOT-NEXT:    add x10, x10, x9
4656; CHECK-SD-DOT-NEXT:    add x11, x11, x8
4657; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
4658; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
4659; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
4660; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
4661; CHECK-SD-DOT-NEXT:    add x10, x10, x9
4662; CHECK-SD-DOT-NEXT:    add x11, x11, x8
4663; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
4664; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
4665; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
4666; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
4667; CHECK-SD-DOT-NEXT:    add x10, x10, x9
4668; CHECK-SD-DOT-NEXT:    add x11, x11, x8
4669; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
4670; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
4671; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
4672; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
4673; CHECK-SD-DOT-NEXT:    add x10, x10, x9
4674; CHECK-SD-DOT-NEXT:    add x11, x11, x8
4675; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
4676; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
4677; CHECK-SD-DOT-NEXT:    ldr d1, [x10]
4678; CHECK-SD-DOT-NEXT:    ldr d4, [x11]
4679; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
4680; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
4681; CHECK-SD-DOT-NEXT:    ldr d1, [x10, x9]
4682; CHECK-SD-DOT-NEXT:    ldr d4, [x11, x8]
4683; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
4684; CHECK-SD-DOT-NEXT:    uabd v0.8b, v1.8b, v4.8b
4685; CHECK-SD-DOT-NEXT:    udot v2.2s, v0.8b, v3.8b
4686; CHECK-SD-DOT-NEXT:    addp v0.2s, v2.2s, v2.2s
4687; CHECK-SD-DOT-NEXT:    fmov w0, s0
4688; CHECK-SD-DOT-NEXT:    ret
4689;
4690; CHECK-GI-LABEL: full:
4691; CHECK-GI:       // %bb.0: // %entry
4692; CHECK-GI-NEXT:    // kill: def $w1 killed $w1 def $x1
4693; CHECK-GI-NEXT:    // kill: def $w3 killed $w3 def $x3
4694; CHECK-GI-NEXT:    sxtw x9, w1
4695; CHECK-GI-NEXT:    sxtw x8, w3
4696; CHECK-GI-NEXT:    ldr d0, [x0]
4697; CHECK-GI-NEXT:    ldr d1, [x2]
4698; CHECK-GI-NEXT:    add x10, x0, x9
4699; CHECK-GI-NEXT:    add x11, x2, x8
4700; CHECK-GI-NEXT:    usubl v0.8h, v0.8b, v1.8b
4701; CHECK-GI-NEXT:    ldr d1, [x10]
4702; CHECK-GI-NEXT:    ldr d2, [x11]
4703; CHECK-GI-NEXT:    add x10, x10, x9
4704; CHECK-GI-NEXT:    add x11, x11, x8
4705; CHECK-GI-NEXT:    usubl v1.8h, v1.8b, v2.8b
4706; CHECK-GI-NEXT:    ldr d3, [x10]
4707; CHECK-GI-NEXT:    ldr d4, [x11]
4708; CHECK-GI-NEXT:    sshll v5.4s, v0.4h, #0
4709; CHECK-GI-NEXT:    sshll2 v0.4s, v0.8h, #0
4710; CHECK-GI-NEXT:    add x10, x10, x9
4711; CHECK-GI-NEXT:    add x11, x11, x8
4712; CHECK-GI-NEXT:    ldr d2, [x10]
4713; CHECK-GI-NEXT:    add x10, x10, x9
4714; CHECK-GI-NEXT:    sshll v7.4s, v1.4h, #0
4715; CHECK-GI-NEXT:    sshll2 v1.4s, v1.8h, #0
4716; CHECK-GI-NEXT:    ldr d6, [x11]
4717; CHECK-GI-NEXT:    add x11, x11, x8
4718; CHECK-GI-NEXT:    usubl v3.8h, v3.8b, v4.8b
4719; CHECK-GI-NEXT:    abs v5.4s, v5.4s
4720; CHECK-GI-NEXT:    abs v0.4s, v0.4s
4721; CHECK-GI-NEXT:    ldr d4, [x10]
4722; CHECK-GI-NEXT:    ldr d16, [x11]
4723; CHECK-GI-NEXT:    abs v7.4s, v7.4s
4724; CHECK-GI-NEXT:    abs v1.4s, v1.4s
4725; CHECK-GI-NEXT:    add x10, x10, x9
4726; CHECK-GI-NEXT:    add x11, x11, x8
4727; CHECK-GI-NEXT:    usubl v2.8h, v2.8b, v6.8b
4728; CHECK-GI-NEXT:    ldr d6, [x10]
4729; CHECK-GI-NEXT:    ldr d17, [x11]
4730; CHECK-GI-NEXT:    add x10, x10, x9
4731; CHECK-GI-NEXT:    add x11, x11, x8
4732; CHECK-GI-NEXT:    usubl v4.8h, v4.8b, v16.8b
4733; CHECK-GI-NEXT:    sshll v16.4s, v3.4h, #0
4734; CHECK-GI-NEXT:    sshll2 v3.4s, v3.8h, #0
4735; CHECK-GI-NEXT:    add v0.4s, v5.4s, v0.4s
4736; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
4737; CHECK-GI-NEXT:    ldr d5, [x10]
4738; CHECK-GI-NEXT:    ldr d7, [x11]
4739; CHECK-GI-NEXT:    sshll v18.4s, v2.4h, #0
4740; CHECK-GI-NEXT:    sshll2 v2.4s, v2.8h, #0
4741; CHECK-GI-NEXT:    usubl v6.8h, v6.8b, v17.8b
4742; CHECK-GI-NEXT:    ldr d17, [x11, x8]
4743; CHECK-GI-NEXT:    sshll v19.4s, v4.4h, #0
4744; CHECK-GI-NEXT:    usubl v5.8h, v5.8b, v7.8b
4745; CHECK-GI-NEXT:    ldr d7, [x10, x9]
4746; CHECK-GI-NEXT:    sshll2 v4.4s, v4.8h, #0
4747; CHECK-GI-NEXT:    abs v16.4s, v16.4s
4748; CHECK-GI-NEXT:    abs v3.4s, v3.4s
4749; CHECK-GI-NEXT:    abs v18.4s, v18.4s
4750; CHECK-GI-NEXT:    abs v2.4s, v2.4s
4751; CHECK-GI-NEXT:    usubl v7.8h, v7.8b, v17.8b
4752; CHECK-GI-NEXT:    sshll v17.4s, v6.4h, #0
4753; CHECK-GI-NEXT:    sshll2 v6.4s, v6.8h, #0
4754; CHECK-GI-NEXT:    abs v19.4s, v19.4s
4755; CHECK-GI-NEXT:    abs v4.4s, v4.4s
4756; CHECK-GI-NEXT:    add v3.4s, v16.4s, v3.4s
4757; CHECK-GI-NEXT:    sshll v16.4s, v5.4h, #0
4758; CHECK-GI-NEXT:    sshll2 v5.4s, v5.8h, #0
4759; CHECK-GI-NEXT:    add v2.4s, v18.4s, v2.4s
4760; CHECK-GI-NEXT:    abs v17.4s, v17.4s
4761; CHECK-GI-NEXT:    addv s1, v1.4s
4762; CHECK-GI-NEXT:    abs v6.4s, v6.4s
4763; CHECK-GI-NEXT:    addv s0, v0.4s
4764; CHECK-GI-NEXT:    add v4.4s, v19.4s, v4.4s
4765; CHECK-GI-NEXT:    addv s3, v3.4s
4766; CHECK-GI-NEXT:    sshll v18.4s, v7.4h, #0
4767; CHECK-GI-NEXT:    sshll2 v7.4s, v7.8h, #0
4768; CHECK-GI-NEXT:    abs v16.4s, v16.4s
4769; CHECK-GI-NEXT:    abs v5.4s, v5.4s
4770; CHECK-GI-NEXT:    fmov w8, s1
4771; CHECK-GI-NEXT:    add v6.4s, v17.4s, v6.4s
4772; CHECK-GI-NEXT:    addv s2, v2.4s
4773; CHECK-GI-NEXT:    fmov w9, s0
4774; CHECK-GI-NEXT:    addv s4, v4.4s
4775; CHECK-GI-NEXT:    fmov w10, s3
4776; CHECK-GI-NEXT:    abs v18.4s, v18.4s
4777; CHECK-GI-NEXT:    abs v7.4s, v7.4s
4778; CHECK-GI-NEXT:    add v1.4s, v16.4s, v5.4s
4779; CHECK-GI-NEXT:    add w8, w8, w9
4780; CHECK-GI-NEXT:    addv s3, v6.4s
4781; CHECK-GI-NEXT:    fmov w9, s2
4782; CHECK-GI-NEXT:    add w8, w10, w8
4783; CHECK-GI-NEXT:    fmov w10, s4
4784; CHECK-GI-NEXT:    add v0.4s, v18.4s, v7.4s
4785; CHECK-GI-NEXT:    addv s1, v1.4s
4786; CHECK-GI-NEXT:    add w8, w9, w8
4787; CHECK-GI-NEXT:    fmov w9, s3
4788; CHECK-GI-NEXT:    add w8, w10, w8
4789; CHECK-GI-NEXT:    addv s0, v0.4s
4790; CHECK-GI-NEXT:    add w8, w9, w8
4791; CHECK-GI-NEXT:    fmov w9, s1
4792; CHECK-GI-NEXT:    add w8, w9, w8
4793; CHECK-GI-NEXT:    fmov w9, s0
4794; CHECK-GI-NEXT:    add w0, w9, w8
4795; CHECK-GI-NEXT:    ret
4796entry:
4797  %idx.ext8 = sext i32 %s2 to i64
4798  %idx.ext = sext i32 %s1 to i64
4799  %0 = load <8 x i8>, ptr %p1, align 1
4800  %1 = zext <8 x i8> %0 to <8 x i32>
4801  %2 = load <8 x i8>, ptr %p2, align 1
4802  %3 = zext <8 x i8> %2 to <8 x i32>
4803  %4 = sub nsw <8 x i32> %1, %3
4804  %5 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %4, i1 true)
4805  %6 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %5)
4806  %add.ptr = getelementptr inbounds i8, ptr %p1, i64 %idx.ext
4807  %add.ptr9 = getelementptr inbounds i8, ptr %p2, i64 %idx.ext8
4808  %7 = load <8 x i8>, ptr %add.ptr, align 1
4809  %8 = zext <8 x i8> %7 to <8 x i32>
4810  %9 = load <8 x i8>, ptr %add.ptr9, align 1
4811  %10 = zext <8 x i8> %9 to <8 x i32>
4812  %11 = sub nsw <8 x i32> %8, %10
4813  %12 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %11, i1 true)
4814  %13 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %12)
4815  %op.rdx.1 = add i32 %13, %6
4816  %add.ptr.1 = getelementptr inbounds i8, ptr %add.ptr, i64 %idx.ext
4817  %add.ptr9.1 = getelementptr inbounds i8, ptr %add.ptr9, i64 %idx.ext8
4818  %14 = load <8 x i8>, ptr %add.ptr.1, align 1
4819  %15 = zext <8 x i8> %14 to <8 x i32>
4820  %16 = load <8 x i8>, ptr %add.ptr9.1, align 1
4821  %17 = zext <8 x i8> %16 to <8 x i32>
4822  %18 = sub nsw <8 x i32> %15, %17
4823  %19 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %18, i1 true)
4824  %20 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %19)
4825  %op.rdx.2 = add i32 %20, %op.rdx.1
4826  %add.ptr.2 = getelementptr inbounds i8, ptr %add.ptr.1, i64 %idx.ext
4827  %add.ptr9.2 = getelementptr inbounds i8, ptr %add.ptr9.1, i64 %idx.ext8
4828  %21 = load <8 x i8>, ptr %add.ptr.2, align 1
4829  %22 = zext <8 x i8> %21 to <8 x i32>
4830  %23 = load <8 x i8>, ptr %add.ptr9.2, align 1
4831  %24 = zext <8 x i8> %23 to <8 x i32>
4832  %25 = sub nsw <8 x i32> %22, %24
4833  %26 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %25, i1 true)
4834  %27 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %26)
4835  %op.rdx.3 = add i32 %27, %op.rdx.2
4836  %add.ptr.3 = getelementptr inbounds i8, ptr %add.ptr.2, i64 %idx.ext
4837  %add.ptr9.3 = getelementptr inbounds i8, ptr %add.ptr9.2, i64 %idx.ext8
4838  %28 = load <8 x i8>, ptr %add.ptr.3, align 1
4839  %29 = zext <8 x i8> %28 to <8 x i32>
4840  %30 = load <8 x i8>, ptr %add.ptr9.3, align 1
4841  %31 = zext <8 x i8> %30 to <8 x i32>
4842  %32 = sub nsw <8 x i32> %29, %31
4843  %33 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %32, i1 true)
4844  %34 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %33)
4845  %op.rdx.4 = add i32 %34, %op.rdx.3
4846  %add.ptr.4 = getelementptr inbounds i8, ptr %add.ptr.3, i64 %idx.ext
4847  %add.ptr9.4 = getelementptr inbounds i8, ptr %add.ptr9.3, i64 %idx.ext8
4848  %35 = load <8 x i8>, ptr %add.ptr.4, align 1
4849  %36 = zext <8 x i8> %35 to <8 x i32>
4850  %37 = load <8 x i8>, ptr %add.ptr9.4, align 1
4851  %38 = zext <8 x i8> %37 to <8 x i32>
4852  %39 = sub nsw <8 x i32> %36, %38
4853  %40 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %39, i1 true)
4854  %41 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %40)
4855  %op.rdx.5 = add i32 %41, %op.rdx.4
4856  %add.ptr.5 = getelementptr inbounds i8, ptr %add.ptr.4, i64 %idx.ext
4857  %add.ptr9.5 = getelementptr inbounds i8, ptr %add.ptr9.4, i64 %idx.ext8
4858  %42 = load <8 x i8>, ptr %add.ptr.5, align 1
4859  %43 = zext <8 x i8> %42 to <8 x i32>
4860  %44 = load <8 x i8>, ptr %add.ptr9.5, align 1
4861  %45 = zext <8 x i8> %44 to <8 x i32>
4862  %46 = sub nsw <8 x i32> %43, %45
4863  %47 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %46, i1 true)
4864  %48 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %47)
4865  %op.rdx.6 = add i32 %48, %op.rdx.5
4866  %add.ptr.6 = getelementptr inbounds i8, ptr %add.ptr.5, i64 %idx.ext
4867  %add.ptr9.6 = getelementptr inbounds i8, ptr %add.ptr9.5, i64 %idx.ext8
4868  %49 = load <8 x i8>, ptr %add.ptr.6, align 1
4869  %50 = zext <8 x i8> %49 to <8 x i32>
4870  %51 = load <8 x i8>, ptr %add.ptr9.6, align 1
4871  %52 = zext <8 x i8> %51 to <8 x i32>
4872  %53 = sub nsw <8 x i32> %50, %52
4873  %54 = tail call <8 x i32> @llvm.abs.v8i32(<8 x i32> %53, i1 true)
4874  %55 = tail call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %54)
4875  %op.rdx.7 = add i32 %55, %op.rdx.6
4876  ret i32 %op.rdx.7
4877}
4878
; extract_hi_lo: split %a into its low (lanes 0-3) and high (lanes 4-7) halves,
; zero-extend both to i32, add them lane-wise and reduce-add the result.
; The CHECK-SD runs expect this to become a single uaddlv over all eight lanes;
; the CHECK-GI (-global-isel) runs expect ushll + uaddw2 followed by addv.
; The second shufflevector operand is an unused placeholder (no mask index
; refers to it), so it is spelled poison rather than the deprecated undef.
4879define i32 @extract_hi_lo(<8 x i16> %a) {
4880; CHECK-SD-LABEL: extract_hi_lo:
4881; CHECK-SD:       // %bb.0: // %entry
4882; CHECK-SD-NEXT:    uaddlv s0, v0.8h
4883; CHECK-SD-NEXT:    fmov w0, s0
4884; CHECK-SD-NEXT:    ret
4885;
4886; CHECK-GI-LABEL: extract_hi_lo:
4887; CHECK-GI:       // %bb.0: // %entry
4888; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
4889; CHECK-GI-NEXT:    uaddw2 v0.4s, v1.4s, v0.8h
4890; CHECK-GI-NEXT:    addv s0, v0.4s
4891; CHECK-GI-NEXT:    fmov w0, s0
4892; CHECK-GI-NEXT:    ret
4893entry:
4894  %e1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low half of %a
4895  %e2 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; high half of %a
4896  %z1 = zext <4 x i16> %e1 to <4 x i32>
4897  %z2 = zext <4 x i16> %e2 to <4 x i32>
4898  %z4 = add <4 x i32> %z1, %z2
4899  %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
4900  ret i32 %z5
4901}
4902
; extract_hi_hi: take the high half of %a (lanes 4-7), zero-extend it to i32,
; add it to itself and reduce-add the result.
; The CHECK-SD runs expect the high 64 bits to be copied over the low 64 bits
; (mov v0.d[0], v0.d[1]) followed by uaddlv; the CHECK-GI (-global-isel) runs
; expect uaddl2 followed by addv.
; The second shufflevector operand is an unused placeholder (no mask index
; refers to it), so it is spelled poison rather than the deprecated undef.
4903define i32 @extract_hi_hi(<8 x i16> %a) {
4904; CHECK-SD-LABEL: extract_hi_hi:
4905; CHECK-SD:       // %bb.0: // %entry
4906; CHECK-SD-NEXT:    mov v0.d[0], v0.d[1]
4907; CHECK-SD-NEXT:    uaddlv s0, v0.8h
4908; CHECK-SD-NEXT:    fmov w0, s0
4909; CHECK-SD-NEXT:    ret
4910;
4911; CHECK-GI-LABEL: extract_hi_hi:
4912; CHECK-GI:       // %bb.0: // %entry
4913; CHECK-GI-NEXT:    uaddl2 v0.4s, v0.8h, v0.8h
4914; CHECK-GI-NEXT:    addv s0, v0.4s
4915; CHECK-GI-NEXT:    fmov w0, s0
4916; CHECK-GI-NEXT:    ret
4917entry:
4918  %e2 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7> ; high half of %a
4919  %z2 = zext <4 x i16> %e2 to <4 x i32>
4920  %z4 = add <4 x i32> %z2, %z2
4921  %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
4922  ret i32 %z5
4923}
4924
; extract_lo_lo: take the low half of %a (lanes 0-3), zero-extend it to i32,
; add it to itself and reduce-add the result.
; The CHECK-SD runs expect the low 64 bits to be copied over the high 64 bits
; (mov v0.d[1], v0.d[0]) followed by uaddlv; the CHECK-GI (-global-isel) runs
; expect uaddl followed by addv.
; The second shufflevector operand is an unused placeholder (no mask index
; refers to it), so it is spelled poison rather than the deprecated undef.
4925define i32 @extract_lo_lo(<8 x i16> %a) {
4926; CHECK-SD-LABEL: extract_lo_lo:
4927; CHECK-SD:       // %bb.0: // %entry
4928; CHECK-SD-NEXT:    mov v0.d[1], v0.d[0]
4929; CHECK-SD-NEXT:    uaddlv s0, v0.8h
4930; CHECK-SD-NEXT:    fmov w0, s0
4931; CHECK-SD-NEXT:    ret
4932;
4933; CHECK-GI-LABEL: extract_lo_lo:
4934; CHECK-GI:       // %bb.0: // %entry
4935; CHECK-GI-NEXT:    uaddl v0.4s, v0.4h, v0.4h
4936; CHECK-GI-NEXT:    addv s0, v0.4s
4937; CHECK-GI-NEXT:    fmov w0, s0
4938; CHECK-GI-NEXT:    ret
4939entry:
4940  %e1 = shufflevector <8 x i16> %a, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; low half of %a
4941  %z1 = zext <4 x i16> %e1 to <4 x i32>
4942  %z4 = add <4 x i32> %z1, %z1
4943  %z5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z4)
4944  ret i32 %z5
4945}
4946
; Declarations of the llvm.abs and llvm.vector.reduce.add intrinsics, one per
; vector type (i8, i16, i32 and i64 element widths).
; NOTE(review): the llvm.abs declaration references attribute group #1, but no
; `attributes #1 = { ... }` definition is visible in this part of the file -
; confirm it is defined elsewhere or drop the reference.
4947declare <8 x i32> @llvm.abs.v8i32(<8 x i32>, i1 immarg) #1
4948declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
4949declare i16 @llvm.vector.reduce.add.v24i16(<24 x i16>)
4950declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
4951declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
4952declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
4953declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
4954declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
4955declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
4956declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
4957declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
4958declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
4959declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
4960declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
4961declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
4962declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
4963declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
4964declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
4965declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
4966
4966