xref: /llvm-project/llvm/test/CodeGen/AArch64/neon-dotreduce.ll (revision a35640f29e82dffbe87fb75af9b50c6e1312b455)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm    < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD
3; RUN: llc -mtriple aarch64-linux-gnu -mattr=+dotprod,+i8mm -global-isel -global-isel-abort=2 < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
4
5; CHECK-GI:       warning: Instruction selection used fallback path for test_udot_v5i8
6; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v5i8_nomla
7; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v5i8
8; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v5i8_double
9; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v5i8_double_nomla
10; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v25i8
11; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v25i8_nomla
12; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v25i8
13; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v25i8_double
14; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v25i8_double_nomla
15; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v33i8
16; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_udot_v33i8_nomla
17; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v33i8
18; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v33i8_double
19; CHECK-GI-NEXT:  warning: Instruction selection used fallback path for test_sdot_v33i8_double_nomla
20
; Declarations of the integer add-reduction intrinsic for i32 vectors of
; 4, 5, 8, 16, 24, 25, 32, 33, 48 and 64 elements. The functions that follow
; call the matching width; the widest variants are presumably used by tests
; further down the file.
21declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
22declare i32 @llvm.vector.reduce.add.v5i32(<5 x i32>)
23declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
24declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
25declare i32 @llvm.vector.reduce.add.v24i32(<24 x i32>)
26declare i32 @llvm.vector.reduce.add.v25i32(<25 x i32>)
27declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
28declare i32 @llvm.vector.reduce.add.v33i32(<33 x i32>)
29declare i32 @llvm.vector.reduce.add.v48i32(<48 x i32>)
30declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)
31
; Unsigned dot product of two <4 x i8> vectors loaded from %a and %b:
; both are zero-extended to <4 x i32>, multiplied, add-reduced, and %sum is
; added to the result. Neither expected sequence contains a udot instruction:
; SelectionDAG uses umull + addv, GlobalISel uses a plain mul + addv.
32define i32 @test_udot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
33; CHECK-SD-LABEL: test_udot_v4i8:
34; CHECK-SD:       // %bb.0: // %entry
35; CHECK-SD-NEXT:    ldr s0, [x0]
36; CHECK-SD-NEXT:    ldr s1, [x1]
37; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
38; CHECK-SD-NEXT:    ushll v1.8h, v1.8b, #0
39; CHECK-SD-NEXT:    umull v0.4s, v1.4h, v0.4h
40; CHECK-SD-NEXT:    addv s0, v0.4s
41; CHECK-SD-NEXT:    fmov w8, s0
42; CHECK-SD-NEXT:    add w0, w8, w2
43; CHECK-SD-NEXT:    ret
44;
45; CHECK-GI-LABEL: test_udot_v4i8:
46; CHECK-GI:       // %bb.0: // %entry
47; CHECK-GI-NEXT:    ldr w8, [x0]
48; CHECK-GI-NEXT:    ldr w9, [x1]
49; CHECK-GI-NEXT:    fmov s0, w8
50; CHECK-GI-NEXT:    fmov s2, w9
51; CHECK-GI-NEXT:    uxtb w8, w8
52; CHECK-GI-NEXT:    uxtb w9, w9
53; CHECK-GI-NEXT:    mov b1, v0.b[1]
54; CHECK-GI-NEXT:    mov b3, v0.b[2]
55; CHECK-GI-NEXT:    mov b5, v2.b[2]
56; CHECK-GI-NEXT:    mov b4, v0.b[3]
57; CHECK-GI-NEXT:    mov b0, v2.b[1]
58; CHECK-GI-NEXT:    mov b6, v2.b[3]
59; CHECK-GI-NEXT:    fmov s2, w9
60; CHECK-GI-NEXT:    fmov w10, s1
61; CHECK-GI-NEXT:    fmov w11, s3
62; CHECK-GI-NEXT:    fmov s1, w8
63; CHECK-GI-NEXT:    fmov w13, s5
64; CHECK-GI-NEXT:    fmov w8, s4
65; CHECK-GI-NEXT:    fmov w12, s0
66; CHECK-GI-NEXT:    uxtb w10, w10
67; CHECK-GI-NEXT:    uxtb w11, w11
68; CHECK-GI-NEXT:    uxtb w13, w13
69; CHECK-GI-NEXT:    uxtb w8, w8
70; CHECK-GI-NEXT:    uxtb w12, w12
71; CHECK-GI-NEXT:    mov v1.h[1], w10
72; CHECK-GI-NEXT:    fmov w10, s6
73; CHECK-GI-NEXT:    fmov s0, w11
74; CHECK-GI-NEXT:    fmov s3, w13
75; CHECK-GI-NEXT:    mov v2.h[1], w12
76; CHECK-GI-NEXT:    uxtb w10, w10
77; CHECK-GI-NEXT:    mov v0.h[1], w8
78; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
79; CHECK-GI-NEXT:    mov v3.h[1], w10
80; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
81; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
82; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
83; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
84; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
85; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v1.4s
86; CHECK-GI-NEXT:    addv s0, v0.4s
87; CHECK-GI-NEXT:    fmov w8, s0
88; CHECK-GI-NEXT:    add w0, w8, w2
89; CHECK-GI-NEXT:    ret
90entry:
91  %0 = load <4 x i8>, ptr %a
92  %1 = zext <4 x i8> %0 to <4 x i32>
93  %2 = load <4 x i8>, ptr %b
94  %3 = zext <4 x i8> %2 to <4 x i32>
95  %4 = mul nuw nsw <4 x i32> %3, %1
96  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
97  %op.extra = add i32 %5, %sum
98  ret i32 %op.extra
99}
100
; Add-reduction of a zero-extended <4 x i8> load with no multiply and no
; accumulator input. SelectionDAG widens to 4s lanes and uses addv; GlobalISel
; rebuilds the vector one byte at a time, reduces with uaddlv on 4h lanes and
; masks the scalar result to 16 bits.
101define i32 @test_udot_v4i8_nomla(ptr nocapture readonly %a1) {
102; CHECK-SD-LABEL: test_udot_v4i8_nomla:
103; CHECK-SD:       // %bb.0: // %entry
104; CHECK-SD-NEXT:    ldr s0, [x0]
105; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
106; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
107; CHECK-SD-NEXT:    addv s0, v0.4s
108; CHECK-SD-NEXT:    fmov w0, s0
109; CHECK-SD-NEXT:    ret
110;
111; CHECK-GI-LABEL: test_udot_v4i8_nomla:
112; CHECK-GI:       // %bb.0: // %entry
113; CHECK-GI-NEXT:    ldr w8, [x0]
114; CHECK-GI-NEXT:    fmov s0, w8
115; CHECK-GI-NEXT:    mov b1, v0.b[1]
116; CHECK-GI-NEXT:    mov v2.b[0], v0.b[0]
117; CHECK-GI-NEXT:    mov b3, v0.b[2]
118; CHECK-GI-NEXT:    mov b0, v0.b[3]
119; CHECK-GI-NEXT:    mov v2.b[1], v1.b[0]
120; CHECK-GI-NEXT:    mov v2.b[2], v3.b[0]
121; CHECK-GI-NEXT:    mov v2.b[3], v0.b[0]
122; CHECK-GI-NEXT:    ushll v0.8h, v2.8b, #0
123; CHECK-GI-NEXT:    uaddlv s0, v0.4h
124; CHECK-GI-NEXT:    fmov w8, s0
125; CHECK-GI-NEXT:    and w0, w8, #0xffff
126; CHECK-GI-NEXT:    ret
127entry:
128  %0 = load <4 x i8>, ptr %a1
129  %1 = zext <4 x i8> %0 to <4 x i32>
130  %2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %1)
131  ret i32 %2
132}
; Signed dot product of two <4 x i8> vectors loaded from %a and %b:
; both are sign-extended to <4 x i32>, multiplied, add-reduced, and %sum is
; added to the result. Neither expected sequence contains an sdot instruction:
; SelectionDAG uses smull + addv, GlobalISel uses a plain mul + addv.
133define i32 @test_sdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
134; CHECK-SD-LABEL: test_sdot_v4i8:
135; CHECK-SD:       // %bb.0: // %entry
136; CHECK-SD-NEXT:    ldr s0, [x0]
137; CHECK-SD-NEXT:    ldr s1, [x1]
138; CHECK-SD-NEXT:    sshll v0.8h, v0.8b, #0
139; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
140; CHECK-SD-NEXT:    smull v0.4s, v1.4h, v0.4h
141; CHECK-SD-NEXT:    addv s0, v0.4s
142; CHECK-SD-NEXT:    fmov w8, s0
143; CHECK-SD-NEXT:    add w0, w8, w2
144; CHECK-SD-NEXT:    ret
145;
146; CHECK-GI-LABEL: test_sdot_v4i8:
147; CHECK-GI:       // %bb.0: // %entry
148; CHECK-GI-NEXT:    ldr w8, [x0]
149; CHECK-GI-NEXT:    ldr w9, [x1]
150; CHECK-GI-NEXT:    fmov s0, w8
151; CHECK-GI-NEXT:    fmov s2, w9
152; CHECK-GI-NEXT:    sxtb w8, w8
153; CHECK-GI-NEXT:    sxtb w9, w9
154; CHECK-GI-NEXT:    mov b1, v0.b[1]
155; CHECK-GI-NEXT:    mov b3, v0.b[2]
156; CHECK-GI-NEXT:    mov b5, v2.b[2]
157; CHECK-GI-NEXT:    mov b4, v0.b[3]
158; CHECK-GI-NEXT:    mov b0, v2.b[1]
159; CHECK-GI-NEXT:    mov b6, v2.b[3]
160; CHECK-GI-NEXT:    fmov s2, w9
161; CHECK-GI-NEXT:    fmov w10, s1
162; CHECK-GI-NEXT:    fmov w11, s3
163; CHECK-GI-NEXT:    fmov s1, w8
164; CHECK-GI-NEXT:    fmov w13, s5
165; CHECK-GI-NEXT:    fmov w8, s4
166; CHECK-GI-NEXT:    fmov w12, s0
167; CHECK-GI-NEXT:    sxtb w10, w10
168; CHECK-GI-NEXT:    sxtb w11, w11
169; CHECK-GI-NEXT:    sxtb w13, w13
170; CHECK-GI-NEXT:    sxtb w8, w8
171; CHECK-GI-NEXT:    sxtb w12, w12
172; CHECK-GI-NEXT:    mov v1.h[1], w10
173; CHECK-GI-NEXT:    fmov w10, s6
174; CHECK-GI-NEXT:    fmov s0, w11
175; CHECK-GI-NEXT:    fmov s3, w13
176; CHECK-GI-NEXT:    mov v2.h[1], w12
177; CHECK-GI-NEXT:    sxtb w10, w10
178; CHECK-GI-NEXT:    mov v0.h[1], w8
179; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
180; CHECK-GI-NEXT:    mov v3.h[1], w10
181; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
182; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
183; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
184; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
185; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
186; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v1.4s
187; CHECK-GI-NEXT:    addv s0, v0.4s
188; CHECK-GI-NEXT:    fmov w8, s0
189; CHECK-GI-NEXT:    add w0, w8, w2
190; CHECK-GI-NEXT:    ret
191entry:
192  %0 = load <4 x i8>, ptr %a
193  %1 = sext <4 x i8> %0 to <4 x i32>
194  %2 = load <4 x i8>, ptr %b
195  %3 = sext <4 x i8> %2 to <4 x i32>
196  %4 = mul nsw <4 x i32> %3, %1
197  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
198  %op.extra = add nsw i32 %5, %sum
199  ret i32 %op.extra
200}
201
; Sum of two signed <4 x i8> dot products, a*b + c*d, with the vectors passed
; as arguments instead of loaded. Each operand is sign-extended to <4 x i32>
; and each product is add-reduced separately before the scalar add.
; SelectionDAG is expected to merge the two products with mul + mla and do a
; single addv; GlobalISel keeps two mul/addv pairs and adds the scalars.
; NOTE(review): the multiplies carry nuw as well as nsw although the operands
; are sign-extended - confirm this is intentional.
202define i32 @test_sdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
203; CHECK-SD-LABEL: test_sdot_v4i8_double:
204; CHECK-SD:       // %bb.0: // %entry
205; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
206; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
207; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
208; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
209; CHECK-SD-NEXT:    shl v2.4s, v2.4s, #24
210; CHECK-SD-NEXT:    shl v3.4s, v3.4s, #24
211; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
212; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
213; CHECK-SD-NEXT:    sshr v2.4s, v2.4s, #24
214; CHECK-SD-NEXT:    sshr v3.4s, v3.4s, #24
215; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #24
216; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
217; CHECK-SD-NEXT:    mul v2.4s, v2.4s, v3.4s
218; CHECK-SD-NEXT:    mla v2.4s, v0.4s, v1.4s
219; CHECK-SD-NEXT:    addv s0, v2.4s
220; CHECK-SD-NEXT:    fmov w0, s0
221; CHECK-SD-NEXT:    ret
222;
223; CHECK-GI-LABEL: test_sdot_v4i8_double:
224; CHECK-GI:       // %bb.0: // %entry
225; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
226; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
227; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
228; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
229; CHECK-GI-NEXT:    shl v0.4s, v0.4s, #24
230; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
231; CHECK-GI-NEXT:    shl v2.4s, v2.4s, #24
232; CHECK-GI-NEXT:    shl v3.4s, v3.4s, #24
233; CHECK-GI-NEXT:    sshr v0.4s, v0.4s, #24
234; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
235; CHECK-GI-NEXT:    sshr v2.4s, v2.4s, #24
236; CHECK-GI-NEXT:    sshr v3.4s, v3.4s, #24
237; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
238; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v3.4s
239; CHECK-GI-NEXT:    addv s0, v0.4s
240; CHECK-GI-NEXT:    addv s1, v1.4s
241; CHECK-GI-NEXT:    fmov w8, s0
242; CHECK-GI-NEXT:    fmov w9, s1
243; CHECK-GI-NEXT:    add w0, w8, w9
244; CHECK-GI-NEXT:    ret
245entry:
246  %az = sext <4 x i8> %a to <4 x i32>
247  %bz = sext <4 x i8> %b to <4 x i32>
248  %m1 = mul nuw nsw <4 x i32> %az, %bz
249  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1)
250  %cz = sext <4 x i8> %c to <4 x i32>
251  %dz = sext <4 x i8> %d to <4 x i32>
252  %m2 = mul nuw nsw <4 x i32> %cz, %dz
253  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2)
254  %x = add i32 %r1, %r2
255  ret i32 %x
256}
257
; Sum of two add-reductions of sign-extended <4 x i8> arguments, with no
; multiply. Only %a and %c are used; %b and %d are accepted but never read.
; SelectionDAG folds the second sign-extension into ssra and does one addv;
; GlobalISel reduces each input separately with saddlv on 4h lanes.
258define i32 @test_sdot_v4i8_double_nomla(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
259; CHECK-SD-LABEL: test_sdot_v4i8_double_nomla:
260; CHECK-SD:       // %bb.0: // %entry
261; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
262; CHECK-SD-NEXT:    ushll v1.4s, v2.4h, #0
263; CHECK-SD-NEXT:    shl v0.4s, v0.4s, #24
264; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
265; CHECK-SD-NEXT:    sshr v0.4s, v0.4s, #24
266; CHECK-SD-NEXT:    ssra v0.4s, v1.4s, #24
267; CHECK-SD-NEXT:    addv s0, v0.4s
268; CHECK-SD-NEXT:    fmov w0, s0
269; CHECK-SD-NEXT:    ret
270;
271; CHECK-GI-LABEL: test_sdot_v4i8_double_nomla:
272; CHECK-GI:       // %bb.0: // %entry
273; CHECK-GI-NEXT:    shl v1.4h, v2.4h, #8
274; CHECK-GI-NEXT:    shl v0.4h, v0.4h, #8
275; CHECK-GI-NEXT:    sshr v1.4h, v1.4h, #8
276; CHECK-GI-NEXT:    sshr v0.4h, v0.4h, #8
277; CHECK-GI-NEXT:    saddlv s1, v1.4h
278; CHECK-GI-NEXT:    saddlv s0, v0.4h
279; CHECK-GI-NEXT:    fmov w8, s1
280; CHECK-GI-NEXT:    fmov w9, s0
281; CHECK-GI-NEXT:    sxth w8, w8
282; CHECK-GI-NEXT:    add w0, w8, w9, sxth
283; CHECK-GI-NEXT:    ret
284entry:
285  %az = sext <4 x i8> %a to <4 x i32>
286  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %az)
287  %cz = sext <4 x i8> %c to <4 x i32>
288  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %cz)
289  %x = add i32 %r1, %r2
290  ret i32 %x
291}
292
; Mixed-sign dot product of two <4 x i8> loads: the vector from %a is
; zero-extended and the vector from %b is sign-extended to <4 x i32>; the
; product is add-reduced and %sum is added. Neither expected sequence contains
; a usdot instruction: SelectionDAG uses ushll/sshll + smull + addv,
; GlobalISel uses a plain mul + addv.
293define i32 @test_usdot_v4i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
294; CHECK-SD-LABEL: test_usdot_v4i8:
295; CHECK-SD:       // %bb.0: // %entry
296; CHECK-SD-NEXT:    ldr s0, [x0]
297; CHECK-SD-NEXT:    ldr s1, [x1]
298; CHECK-SD-NEXT:    ushll v0.8h, v0.8b, #0
299; CHECK-SD-NEXT:    sshll v1.8h, v1.8b, #0
300; CHECK-SD-NEXT:    smull v0.4s, v1.4h, v0.4h
301; CHECK-SD-NEXT:    addv s0, v0.4s
302; CHECK-SD-NEXT:    fmov w8, s0
303; CHECK-SD-NEXT:    add w0, w8, w2
304; CHECK-SD-NEXT:    ret
305;
306; CHECK-GI-LABEL: test_usdot_v4i8:
307; CHECK-GI:       // %bb.0: // %entry
308; CHECK-GI-NEXT:    ldr w8, [x0]
309; CHECK-GI-NEXT:    ldr w9, [x1]
310; CHECK-GI-NEXT:    fmov s0, w8
311; CHECK-GI-NEXT:    fmov s2, w9
312; CHECK-GI-NEXT:    uxtb w8, w8
313; CHECK-GI-NEXT:    sxtb w9, w9
314; CHECK-GI-NEXT:    mov b1, v0.b[1]
315; CHECK-GI-NEXT:    mov b3, v0.b[2]
316; CHECK-GI-NEXT:    mov b5, v2.b[2]
317; CHECK-GI-NEXT:    mov b4, v0.b[3]
318; CHECK-GI-NEXT:    mov b0, v2.b[1]
319; CHECK-GI-NEXT:    mov b6, v2.b[3]
320; CHECK-GI-NEXT:    fmov s2, w9
321; CHECK-GI-NEXT:    fmov w10, s1
322; CHECK-GI-NEXT:    fmov w11, s3
323; CHECK-GI-NEXT:    fmov s1, w8
324; CHECK-GI-NEXT:    fmov w13, s5
325; CHECK-GI-NEXT:    fmov w8, s4
326; CHECK-GI-NEXT:    fmov w12, s0
327; CHECK-GI-NEXT:    uxtb w10, w10
328; CHECK-GI-NEXT:    uxtb w11, w11
329; CHECK-GI-NEXT:    sxtb w13, w13
330; CHECK-GI-NEXT:    uxtb w8, w8
331; CHECK-GI-NEXT:    sxtb w12, w12
332; CHECK-GI-NEXT:    mov v1.h[1], w10
333; CHECK-GI-NEXT:    fmov w10, s6
334; CHECK-GI-NEXT:    fmov s0, w11
335; CHECK-GI-NEXT:    fmov s3, w13
336; CHECK-GI-NEXT:    mov v2.h[1], w12
337; CHECK-GI-NEXT:    sxtb w10, w10
338; CHECK-GI-NEXT:    mov v0.h[1], w8
339; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
340; CHECK-GI-NEXT:    mov v3.h[1], w10
341; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
342; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
343; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
344; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
345; CHECK-GI-NEXT:    mov v2.d[1], v3.d[0]
346; CHECK-GI-NEXT:    mul v0.4s, v2.4s, v1.4s
347; CHECK-GI-NEXT:    addv s0, v0.4s
348; CHECK-GI-NEXT:    fmov w8, s0
349; CHECK-GI-NEXT:    add w0, w8, w2
350; CHECK-GI-NEXT:    ret
351entry:
352  %0 = load <4 x i8>, ptr %a
353  %1 = zext <4 x i8> %0 to <4 x i32>
354  %2 = load <4 x i8>, ptr %b
355  %3 = sext <4 x i8> %2 to <4 x i32>
356  %4 = mul nsw <4 x i32> %3, %1
357  %5 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %4)
358  %op.extra = add nsw i32 %5, %sum
359  ret i32 %op.extra
360}
361
; Sum of two mixed-sign <4 x i8> dot products, a*b + c*d, on vector arguments:
; %a and %c are zero-extended, %b and %d are sign-extended to <4 x i32>.
; Each product is add-reduced separately before the scalar add.
; SelectionDAG masks the unsigned operands with bic, merges the products with
; mul + mla and does one addv; GlobalISel masks with and, and keeps two
; mul/addv pairs.
362define i32 @test_usdot_v4i8_double(<4 x i8> %a, <4 x i8> %b, <4 x i8> %c, <4 x i8> %d) {
363; CHECK-SD-LABEL: test_usdot_v4i8_double:
364; CHECK-SD:       // %bb.0: // %entry
365; CHECK-SD-NEXT:    ushll v3.4s, v3.4h, #0
366; CHECK-SD-NEXT:    bic v2.4h, #255, lsl #8
367; CHECK-SD-NEXT:    ushll v1.4s, v1.4h, #0
368; CHECK-SD-NEXT:    bic v0.4h, #255, lsl #8
369; CHECK-SD-NEXT:    shl v3.4s, v3.4s, #24
370; CHECK-SD-NEXT:    ushll v2.4s, v2.4h, #0
371; CHECK-SD-NEXT:    shl v1.4s, v1.4s, #24
372; CHECK-SD-NEXT:    ushll v0.4s, v0.4h, #0
373; CHECK-SD-NEXT:    sshr v3.4s, v3.4s, #24
374; CHECK-SD-NEXT:    sshr v1.4s, v1.4s, #24
375; CHECK-SD-NEXT:    mul v2.4s, v2.4s, v3.4s
376; CHECK-SD-NEXT:    mla v2.4s, v0.4s, v1.4s
377; CHECK-SD-NEXT:    addv s0, v2.4s
378; CHECK-SD-NEXT:    fmov w0, s0
379; CHECK-SD-NEXT:    ret
380;
381; CHECK-GI-LABEL: test_usdot_v4i8_double:
382; CHECK-GI:       // %bb.0: // %entry
383; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
384; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
385; CHECK-GI-NEXT:    movi v4.2d, #0x0000ff000000ff
386; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
387; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
388; CHECK-GI-NEXT:    shl v1.4s, v1.4s, #24
389; CHECK-GI-NEXT:    shl v3.4s, v3.4s, #24
390; CHECK-GI-NEXT:    and v0.16b, v0.16b, v4.16b
391; CHECK-GI-NEXT:    and v2.16b, v2.16b, v4.16b
392; CHECK-GI-NEXT:    sshr v1.4s, v1.4s, #24
393; CHECK-GI-NEXT:    sshr v3.4s, v3.4s, #24
394; CHECK-GI-NEXT:    mul v0.4s, v0.4s, v1.4s
395; CHECK-GI-NEXT:    mul v1.4s, v2.4s, v3.4s
396; CHECK-GI-NEXT:    addv s0, v0.4s
397; CHECK-GI-NEXT:    addv s1, v1.4s
398; CHECK-GI-NEXT:    fmov w8, s0
399; CHECK-GI-NEXT:    fmov w9, s1
400; CHECK-GI-NEXT:    add w0, w8, w9
401; CHECK-GI-NEXT:    ret
402entry:
403  %az = zext <4 x i8> %a to <4 x i32>
404  %bz = sext <4 x i8> %b to <4 x i32>
405  %m1 = mul nuw nsw <4 x i32> %az, %bz
406  %r1 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m1)
407  %cz = zext <4 x i8> %c to <4 x i32>
408  %dz = sext <4 x i8> %d to <4 x i32>
409  %m2 = mul nuw nsw <4 x i32> %cz, %dz
410  %r2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %m2)
411  %x = add i32 %r1, %r2
412  ret i32 %x
413}
414
; Unsigned dot product of two <5 x i8> loads (odd element count), plus %sum.
; Both selector runs share one expected sequence; the file header expects a
; GlobalISel fallback warning for this function. The expected code multiplies
; with umull, moves one lane of the widened high half into a zeroed vector,
; accumulates the low four lanes with uaddw, then reduces with addv.
415define i32 @test_udot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
416; CHECK-LABEL: test_udot_v5i8:
417; CHECK:       // %bb.0: // %entry
418; CHECK-NEXT:    ldr d0, [x0]
419; CHECK-NEXT:    ldr d1, [x1]
420; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
421; CHECK-NEXT:    movi v1.2d, #0000000000000000
422; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
423; CHECK-NEXT:    mov v1.s[0], v2.s[0]
424; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
425; CHECK-NEXT:    addv s0, v0.4s
426; CHECK-NEXT:    fmov w8, s0
427; CHECK-NEXT:    add w0, w8, w2
428; CHECK-NEXT:    ret
429entry:
430  %0 = load <5 x i8>, ptr %a
431  %1 = zext <5 x i8> %0 to <5 x i32>
432  %2 = load <5 x i8>, ptr %b
433  %3 = zext <5 x i8> %2 to <5 x i32>
434  %4 = mul nuw nsw <5 x i32> %3, %1
435  %5 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %4)
436  %op.extra = add i32 %5, %sum
437  ret i32 %op.extra
438}
439
; Add-reduction of a zero-extended <5 x i8> load with no multiply and no
; accumulator input. Both selector runs share one expected sequence; the file
; header expects a GlobalISel fallback warning for this function.
440define i32 @test_udot_v5i8_nomla(ptr nocapture readonly %a1) {
441; CHECK-LABEL: test_udot_v5i8_nomla:
442; CHECK:       // %bb.0: // %entry
443; CHECK-NEXT:    ldr d0, [x0]
444; CHECK-NEXT:    movi v1.2d, #0000000000000000
445; CHECK-NEXT:    ushll v0.8h, v0.8b, #0
446; CHECK-NEXT:    ushll2 v2.4s, v0.8h, #0
447; CHECK-NEXT:    mov v1.s[0], v2.s[0]
448; CHECK-NEXT:    uaddw v0.4s, v1.4s, v0.4h
449; CHECK-NEXT:    addv s0, v0.4s
450; CHECK-NEXT:    fmov w0, s0
451; CHECK-NEXT:    ret
452entry:
453  %0 = load <5 x i8>, ptr %a1
454  %1 = zext <5 x i8> %0 to <5 x i32>
455  %2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %1)
456  ret i32 %2
457}
; Signed dot product of two <5 x i8> loads (odd element count), plus %sum.
; Both selector runs share one expected sequence; the file header expects a
; GlobalISel fallback warning for this function. The expected code uses
; smull / sshll2 / saddw followed by addv, with no sdot instruction.
458define i32 @test_sdot_v5i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
459; CHECK-LABEL: test_sdot_v5i8:
460; CHECK:       // %bb.0: // %entry
461; CHECK-NEXT:    ldr d0, [x0]
462; CHECK-NEXT:    ldr d1, [x1]
463; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
464; CHECK-NEXT:    movi v1.2d, #0000000000000000
465; CHECK-NEXT:    sshll2 v2.4s, v0.8h, #0
466; CHECK-NEXT:    mov v1.s[0], v2.s[0]
467; CHECK-NEXT:    saddw v0.4s, v1.4s, v0.4h
468; CHECK-NEXT:    addv s0, v0.4s
469; CHECK-NEXT:    fmov w8, s0
470; CHECK-NEXT:    add w0, w8, w2
471; CHECK-NEXT:    ret
472entry:
473  %0 = load <5 x i8>, ptr %a
474  %1 = sext <5 x i8> %0 to <5 x i32>
475  %2 = load <5 x i8>, ptr %b
476  %3 = sext <5 x i8> %2 to <5 x i32>
477  %4 = mul nsw <5 x i32> %3, %1
478  %5 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %4)
479  %op.extra = add nsw i32 %5, %sum
480  ret i32 %op.extra
481}
482
; Sum of two signed <5 x i8> dot products, a*b + c*d, on vector arguments.
; Each product is add-reduced separately in the IR before the scalar add.
; The shared expected sequence computes both products with smull, widens them
; to 4s lanes, adds the two vectors and performs a single addv.
; NOTE(review): the multiplies carry nuw as well as nsw although the operands
; are sign-extended - confirm this is intentional.
483define i32 @test_sdot_v5i8_double(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
484; CHECK-LABEL: test_sdot_v5i8_double:
485; CHECK:       // %bb.0: // %entry
486; CHECK-NEXT:    smull v2.8h, v2.8b, v3.8b
487; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
488; CHECK-NEXT:    movi v1.2d, #0000000000000000
489; CHECK-NEXT:    movi v3.2d, #0000000000000000
490; CHECK-NEXT:    sshll2 v4.4s, v0.8h, #0
491; CHECK-NEXT:    sshll2 v5.4s, v2.8h, #0
492; CHECK-NEXT:    mov v3.s[0], v4.s[0]
493; CHECK-NEXT:    mov v1.s[0], v5.s[0]
494; CHECK-NEXT:    saddw v0.4s, v3.4s, v0.4h
495; CHECK-NEXT:    saddw v1.4s, v1.4s, v2.4h
496; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
497; CHECK-NEXT:    addv s0, v0.4s
498; CHECK-NEXT:    fmov w0, s0
499; CHECK-NEXT:    ret
500entry:
501  %az = sext <5 x i8> %a to <5 x i32>
502  %bz = sext <5 x i8> %b to <5 x i32>
503  %m1 = mul nuw nsw <5 x i32> %az, %bz
504  %r1 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %m1)
505  %cz = sext <5 x i8> %c to <5 x i32>
506  %dz = sext <5 x i8> %d to <5 x i32>
507  %m2 = mul nuw nsw <5 x i32> %cz, %dz
508  %r2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %m2)
509  %x = add i32 %r1, %r2
510  ret i32 %x
511}
512
; Sum of two add-reductions of sign-extended <5 x i8> arguments, with no
; multiply. Only %a and %c are used; %b and %d are accepted but never read.
; The shared expected sequence widens both inputs, adds the two 4s vectors and
; performs a single addv.
513define i32 @test_sdot_v5i8_double_nomla(<5 x i8> %a, <5 x i8> %b, <5 x i8> %c, <5 x i8> %d) {
514; CHECK-LABEL: test_sdot_v5i8_double_nomla:
515; CHECK:       // %bb.0: // %entry
516; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
517; CHECK-NEXT:    sshll v1.8h, v2.8b, #0
518; CHECK-NEXT:    movi v2.2d, #0000000000000000
519; CHECK-NEXT:    movi v3.2d, #0000000000000000
520; CHECK-NEXT:    sshll2 v4.4s, v0.8h, #0
521; CHECK-NEXT:    sshll2 v5.4s, v1.8h, #0
522; CHECK-NEXT:    mov v3.s[0], v4.s[0]
523; CHECK-NEXT:    mov v2.s[0], v5.s[0]
524; CHECK-NEXT:    saddw v0.4s, v3.4s, v0.4h
525; CHECK-NEXT:    saddw v1.4s, v2.4s, v1.4h
526; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
527; CHECK-NEXT:    addv s0, v0.4s
528; CHECK-NEXT:    fmov w0, s0
529; CHECK-NEXT:    ret
530entry:
531  %az = sext <5 x i8> %a to <5 x i32>
532  %r1 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %az)
533  %cz = sext <5 x i8> %c to <5 x i32>
534  %r2 = call i32 @llvm.vector.reduce.add.v5i32(<5 x i32> %cz)
535  %x = add i32 %r1, %r2
536  ret i32 %x
537}
538
; Unsigned dot product of two <8 x i8> loads with no extra accumulator:
; zext both to <8 x i32>, multiply, add-reduce. Both selectors are expected to
; emit a single udot into a zeroed 2s accumulator followed by a pairwise addp.
539define i32 @test_udot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
540; CHECK-LABEL: test_udot_v8i8:
541; CHECK:       // %bb.0: // %entry
542; CHECK-NEXT:    movi v0.2d, #0000000000000000
543; CHECK-NEXT:    ldr d1, [x0]
544; CHECK-NEXT:    ldr d2, [x1]
545; CHECK-NEXT:    udot v0.2s, v2.8b, v1.8b
546; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
547; CHECK-NEXT:    fmov w0, s0
548; CHECK-NEXT:    ret
549entry:
550  %0 = load <8 x i8>, ptr %a
551  %1 = zext <8 x i8> %0 to <8 x i32>
552  %2 = load <8 x i8>, ptr %b
553  %3 = zext <8 x i8> %2 to <8 x i32>
554  %4 = mul nuw nsw <8 x i32> %3, %1
555  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
556  ret i32 %5
557}
558
; Add-reduction of a zero-extended <8 x i8> load with no multiply. Both
; selectors are expected to express it as udot against a splat of 1 (movi #1)
; into a zeroed accumulator, followed by addp; the two sequences differ only
; in register assignment and the order of the two movi instructions.
559define i32 @test_udot_v8i8_nomla(ptr nocapture readonly %a1) {
560; CHECK-SD-LABEL: test_udot_v8i8_nomla:
561; CHECK-SD:       // %bb.0: // %entry
562; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
563; CHECK-SD-NEXT:    movi v1.8b, #1
564; CHECK-SD-NEXT:    ldr d2, [x0]
565; CHECK-SD-NEXT:    udot v0.2s, v2.8b, v1.8b
566; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
567; CHECK-SD-NEXT:    fmov w0, s0
568; CHECK-SD-NEXT:    ret
569;
570; CHECK-GI-LABEL: test_udot_v8i8_nomla:
571; CHECK-GI:       // %bb.0: // %entry
572; CHECK-GI-NEXT:    movi v0.8b, #1
573; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
574; CHECK-GI-NEXT:    ldr d2, [x0]
575; CHECK-GI-NEXT:    udot v1.2s, v2.8b, v0.8b
576; CHECK-GI-NEXT:    addp v0.2s, v1.2s, v1.2s
577; CHECK-GI-NEXT:    fmov w0, s0
578; CHECK-GI-NEXT:    ret
579entry:
580  %0 = load <8 x i8>, ptr %a1
581  %1 = zext <8 x i8> %0 to <8 x i32>
582  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
583  ret i32 %2
584}
585
; Signed dot product of two <8 x i8> loads with no extra accumulator:
; sext both to <8 x i32>, multiply, add-reduce. Both selectors are expected to
; emit a single sdot into a zeroed 2s accumulator followed by a pairwise addp.
586define i32 @test_sdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
587; CHECK-LABEL: test_sdot_v8i8:
588; CHECK:       // %bb.0: // %entry
589; CHECK-NEXT:    movi v0.2d, #0000000000000000
590; CHECK-NEXT:    ldr d1, [x0]
591; CHECK-NEXT:    ldr d2, [x1]
592; CHECK-NEXT:    sdot v0.2s, v2.8b, v1.8b
593; CHECK-NEXT:    addp v0.2s, v0.2s, v0.2s
594; CHECK-NEXT:    fmov w0, s0
595; CHECK-NEXT:    ret
596entry:
597  %0 = load <8 x i8>, ptr %a
598  %1 = sext <8 x i8> %0 to <8 x i32>
599  %2 = load <8 x i8>, ptr %b
600  %3 = sext <8 x i8> %2 to <8 x i32>
601  %4 = mul nsw <8 x i32> %3, %1
602  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
603  ret i32 %5
604}
605
; Add-reduction of a sign-extended <8 x i8> load with no multiply. Both
; selectors are expected to express it as sdot against a splat of 1 (movi #1)
; into a zeroed accumulator, followed by addp; the two sequences differ only
; in register assignment and the order of the two movi instructions.
606define i32 @test_sdot_v8i8_nomla(ptr nocapture readonly %a1) {
607; CHECK-SD-LABEL: test_sdot_v8i8_nomla:
608; CHECK-SD:       // %bb.0: // %entry
609; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
610; CHECK-SD-NEXT:    movi v1.8b, #1
611; CHECK-SD-NEXT:    ldr d2, [x0]
612; CHECK-SD-NEXT:    sdot v0.2s, v2.8b, v1.8b
613; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
614; CHECK-SD-NEXT:    fmov w0, s0
615; CHECK-SD-NEXT:    ret
616;
617; CHECK-GI-LABEL: test_sdot_v8i8_nomla:
618; CHECK-GI:       // %bb.0: // %entry
619; CHECK-GI-NEXT:    movi v0.8b, #1
620; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
621; CHECK-GI-NEXT:    ldr d2, [x0]
622; CHECK-GI-NEXT:    sdot v1.2s, v2.8b, v0.8b
623; CHECK-GI-NEXT:    addp v0.2s, v1.2s, v1.2s
624; CHECK-GI-NEXT:    fmov w0, s0
625; CHECK-GI-NEXT:    ret
626entry:
627  %0 = load <8 x i8>, ptr %a1
628  %1 = sext <8 x i8> %0 to <8 x i32>
629  %2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %1)
630  ret i32 %2
631}
632
; Mixed-sign dot product of two <8 x i8> loads: %a is zero-extended, %b is
; sign-extended, then multiply and add-reduce. SelectionDAG is expected to
; select usdot (the run lines enable +i8mm) with the unsigned vector as the
; first source operand. GlobalISel is expected to widen both inputs and use
; mul + mla + addv instead.
633define i32 @test_usdot_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
634; CHECK-SD-LABEL: test_usdot_v8i8:
635; CHECK-SD:       // %bb.0: // %entry
636; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
637; CHECK-SD-NEXT:    ldr d1, [x0]
638; CHECK-SD-NEXT:    ldr d2, [x1]
639; CHECK-SD-NEXT:    usdot v0.2s, v1.8b, v2.8b
640; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
641; CHECK-SD-NEXT:    fmov w0, s0
642; CHECK-SD-NEXT:    ret
643;
644; CHECK-GI-LABEL: test_usdot_v8i8:
645; CHECK-GI:       // %bb.0: // %entry
646; CHECK-GI-NEXT:    ldr d0, [x0]
647; CHECK-GI-NEXT:    ldr d1, [x1]
648; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
649; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
650; CHECK-GI-NEXT:    ushll2 v2.4s, v0.8h, #0
651; CHECK-GI-NEXT:    sshll2 v3.4s, v1.8h, #0
652; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
653; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
654; CHECK-GI-NEXT:    mul v2.4s, v3.4s, v2.4s
655; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.4s
656; CHECK-GI-NEXT:    addv s0, v2.4s
657; CHECK-GI-NEXT:    fmov w0, s0
658; CHECK-GI-NEXT:    ret
659entry:
660  %0 = load <8 x i8>, ptr %a
661  %1 = zext <8 x i8> %0 to <8 x i32>
662  %2 = load <8 x i8>, ptr %b
663  %3 = sext <8 x i8> %2 to <8 x i32>
664  %4 = mul nsw <8 x i32> %3, %1
665  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
666  ret i32 %5
667}
668
; Same shape as test_usdot_v8i8 but with the extensions swapped: %a is
; sign-extended and %b is zero-extended. SelectionDAG is still expected to
; select usdot, now with the vector loaded from %b (x1) as the first source
; operand. GlobalISel is expected to widen both inputs and use
; mul + mla + addv.
669define i32 @test_usdot_swapped_operands_v8i8(ptr nocapture readonly %a, ptr nocapture readonly %b) {
670; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8:
671; CHECK-SD:       // %bb.0: // %entry
672; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
673; CHECK-SD-NEXT:    ldr d1, [x0]
674; CHECK-SD-NEXT:    ldr d2, [x1]
675; CHECK-SD-NEXT:    usdot v0.2s, v2.8b, v1.8b
676; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
677; CHECK-SD-NEXT:    fmov w0, s0
678; CHECK-SD-NEXT:    ret
679;
680; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8:
681; CHECK-GI:       // %bb.0: // %entry
682; CHECK-GI-NEXT:    ldr d0, [x0]
683; CHECK-GI-NEXT:    ldr d1, [x1]
684; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
685; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
686; CHECK-GI-NEXT:    sshll2 v2.4s, v0.8h, #0
687; CHECK-GI-NEXT:    ushll2 v3.4s, v1.8h, #0
688; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
689; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
690; CHECK-GI-NEXT:    mul v2.4s, v3.4s, v2.4s
691; CHECK-GI-NEXT:    mla v2.4s, v1.4s, v0.4s
692; CHECK-GI-NEXT:    addv s0, v2.4s
693; CHECK-GI-NEXT:    fmov w0, s0
694; CHECK-GI-NEXT:    ret
695entry:
696  %0 = load <8 x i8>, ptr %a
697  %1 = sext <8 x i8> %0 to <8 x i32>
698  %2 = load <8 x i8>, ptr %b
699  %3 = zext <8 x i8> %2 to <8 x i32>
700  %4 = mul nsw <8 x i32> %3, %1
701  %5 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %4)
702  ret i32 %5
703}
704
; Unsigned dot product of two <16 x i8> loads, plus %sum: zext both to
; <16 x i32>, multiply, add-reduce, add %sum. Both selectors are expected to
; emit a single udot into a zeroed 4s accumulator, reduce with addv, and add
; %sum as a scalar afterwards.
705define i32 @test_udot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
706; CHECK-LABEL: test_udot_v16i8:
707; CHECK:       // %bb.0: // %entry
708; CHECK-NEXT:    movi v0.2d, #0000000000000000
709; CHECK-NEXT:    ldr q1, [x0]
710; CHECK-NEXT:    ldr q2, [x1]
711; CHECK-NEXT:    udot v0.4s, v2.16b, v1.16b
712; CHECK-NEXT:    addv s0, v0.4s
713; CHECK-NEXT:    fmov w8, s0
714; CHECK-NEXT:    add w0, w8, w2
715; CHECK-NEXT:    ret
716entry:
717  %0 = load <16 x i8>, ptr %a
718  %1 = zext <16 x i8> %0 to <16 x i32>
719  %2 = load <16 x i8>, ptr %b
720  %3 = zext <16 x i8> %2 to <16 x i32>
721  %4 = mul nuw nsw <16 x i32> %3, %1
722  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
723  %op.extra = add i32 %5, %sum
724  ret i32 %op.extra
725}
726
; Add-reduction of a zero-extended <16 x i8> load with no multiply. Both
; selectors are expected to express it as udot against a splat of 1 (movi #1)
; into a zeroed 4s accumulator, followed by addv.
727define i32 @test_udot_v16i8_nomla(ptr nocapture readonly %a1) {
728; CHECK-LABEL: test_udot_v16i8_nomla:
729; CHECK:       // %bb.0: // %entry
730; CHECK-NEXT:    movi v0.16b, #1
731; CHECK-NEXT:    movi v1.2d, #0000000000000000
732; CHECK-NEXT:    ldr q2, [x0]
733; CHECK-NEXT:    udot v1.4s, v2.16b, v0.16b
734; CHECK-NEXT:    addv s0, v1.4s
735; CHECK-NEXT:    fmov w0, s0
736; CHECK-NEXT:    ret
737entry:
738  %0 = load <16 x i8>, ptr %a1
739  %1 = zext <16 x i8> %0 to <16 x i32>
740  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
741  ret i32 %2
742}
743
; Signed dot product of two <16 x i8> loads, plus %sum: sext both to
; <16 x i32>, multiply, add-reduce, add %sum. Both selectors are expected to
; emit a single sdot into a zeroed 4s accumulator, reduce with addv, and add
; %sum as a scalar afterwards.
744define i32 @test_sdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
745; CHECK-LABEL: test_sdot_v16i8:
746; CHECK:       // %bb.0: // %entry
747; CHECK-NEXT:    movi v0.2d, #0000000000000000
748; CHECK-NEXT:    ldr q1, [x0]
749; CHECK-NEXT:    ldr q2, [x1]
750; CHECK-NEXT:    sdot v0.4s, v2.16b, v1.16b
751; CHECK-NEXT:    addv s0, v0.4s
752; CHECK-NEXT:    fmov w8, s0
753; CHECK-NEXT:    add w0, w8, w2
754; CHECK-NEXT:    ret
755entry:
756  %0 = load <16 x i8>, ptr %a
757  %1 = sext <16 x i8> %0 to <16 x i32>
758  %2 = load <16 x i8>, ptr %b
759  %3 = sext <16 x i8> %2 to <16 x i32>
760  %4 = mul nsw <16 x i32> %3, %1
761  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
762  %op.extra = add nsw i32 %5, %sum
763  ret i32 %op.extra
764}
765
; Add-reduction of a sign-extended <16 x i8> load with no multiply. Both
; selectors are expected to express it as sdot against a splat of 1 (movi #1)
; into a zeroed 4s accumulator, followed by addv.
766define i32 @test_sdot_v16i8_nomla(ptr nocapture readonly %a1) {
767; CHECK-LABEL: test_sdot_v16i8_nomla:
768; CHECK:       // %bb.0: // %entry
769; CHECK-NEXT:    movi v0.16b, #1
770; CHECK-NEXT:    movi v1.2d, #0000000000000000
771; CHECK-NEXT:    ldr q2, [x0]
772; CHECK-NEXT:    sdot v1.4s, v2.16b, v0.16b
773; CHECK-NEXT:    addv s0, v1.4s
774; CHECK-NEXT:    fmov w0, s0
775; CHECK-NEXT:    ret
776entry:
777  %0 = load <16 x i8>, ptr %a1
778  %1 = sext <16 x i8> %0 to <16 x i32>
779  %2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %1)
780  ret i32 %2
781}
782
; Mixed-sign dot product of two <16 x i8> loads, plus %sum: %a is
; zero-extended, %b is sign-extended, then multiply, add-reduce and add %sum.
; SelectionDAG is expected to select usdot (the run lines enable +i8mm) with
; the unsigned vector as the first source operand. GlobalISel is expected to
; widen both inputs in stages to 4s lanes and use mul + mla, a vector add and
; addv instead.
783define i32 @test_usdot_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
784; CHECK-SD-LABEL: test_usdot_v16i8:
785; CHECK-SD:       // %bb.0: // %entry
786; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
787; CHECK-SD-NEXT:    ldr q1, [x0]
788; CHECK-SD-NEXT:    ldr q2, [x1]
789; CHECK-SD-NEXT:    usdot v0.4s, v1.16b, v2.16b
790; CHECK-SD-NEXT:    addv s0, v0.4s
791; CHECK-SD-NEXT:    fmov w8, s0
792; CHECK-SD-NEXT:    add w0, w8, w2
793; CHECK-SD-NEXT:    ret
794;
795; CHECK-GI-LABEL: test_usdot_v16i8:
796; CHECK-GI:       // %bb.0: // %entry
797; CHECK-GI-NEXT:    ldr q0, [x0]
798; CHECK-GI-NEXT:    ldr q1, [x1]
799; CHECK-GI-NEXT:    ushll v2.8h, v0.8b, #0
800; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
801; CHECK-GI-NEXT:    sshll v3.8h, v1.8b, #0
802; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
803; CHECK-GI-NEXT:    ushll2 v4.4s, v2.8h, #0
804; CHECK-GI-NEXT:    ushll2 v5.4s, v0.8h, #0
805; CHECK-GI-NEXT:    sshll2 v6.4s, v3.8h, #0
806; CHECK-GI-NEXT:    sshll2 v7.4s, v1.8h, #0
807; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
808; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
809; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
810; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
811; CHECK-GI-NEXT:    mul v4.4s, v6.4s, v4.4s
812; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
813; CHECK-GI-NEXT:    mla v4.4s, v3.4s, v2.4s
814; CHECK-GI-NEXT:    mla v5.4s, v1.4s, v0.4s
815; CHECK-GI-NEXT:    add v0.4s, v4.4s, v5.4s
816; CHECK-GI-NEXT:    addv s0, v0.4s
817; CHECK-GI-NEXT:    fmov w8, s0
818; CHECK-GI-NEXT:    add w0, w8, w2
819; CHECK-GI-NEXT:    ret
820entry:
821  %0 = load <16 x i8>, ptr %a
822  %1 = zext <16 x i8> %0 to <16 x i32>
823  %2 = load <16 x i8>, ptr %b
824  %3 = sext <16 x i8> %2 to <16 x i32>
825  %4 = mul nsw <16 x i32> %3, %1
826  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
827  %op.extra = add nsw i32 %5, %sum
828  ret i32 %op.extra
829}
830
; Same shape as the plain mixed-sign v16i8 test, but with the extensions
; swapped: the lanes from %a are sign-extended and the lanes from %b are
; zero-extended.
; The run without -global-isel still expects one usdot, with the source
; registers exchanged so that the data loaded from %b (x1) is the first source
; and the data loaded from %a (x0) is the second.
; The -global-isel run expects the sshll/ushll + mul/mla expansion instead.
831define i32 @test_usdot_swapped_operands_v16i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
832; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8:
833; CHECK-SD:       // %bb.0: // %entry
834; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
835; CHECK-SD-NEXT:    ldr q1, [x0]
836; CHECK-SD-NEXT:    ldr q2, [x1]
837; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v1.16b
838; CHECK-SD-NEXT:    addv s0, v0.4s
839; CHECK-SD-NEXT:    fmov w8, s0
840; CHECK-SD-NEXT:    add w0, w8, w2
841; CHECK-SD-NEXT:    ret
842;
843; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8:
844; CHECK-GI:       // %bb.0: // %entry
845; CHECK-GI-NEXT:    ldr q0, [x0]
846; CHECK-GI-NEXT:    ldr q1, [x1]
847; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
848; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
849; CHECK-GI-NEXT:    ushll v3.8h, v1.8b, #0
850; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
851; CHECK-GI-NEXT:    sshll2 v4.4s, v2.8h, #0
852; CHECK-GI-NEXT:    sshll2 v5.4s, v0.8h, #0
853; CHECK-GI-NEXT:    ushll2 v6.4s, v3.8h, #0
854; CHECK-GI-NEXT:    ushll2 v7.4s, v1.8h, #0
855; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
856; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
857; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
858; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
859; CHECK-GI-NEXT:    mul v4.4s, v6.4s, v4.4s
860; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
861; CHECK-GI-NEXT:    mla v4.4s, v3.4s, v2.4s
862; CHECK-GI-NEXT:    mla v5.4s, v1.4s, v0.4s
863; CHECK-GI-NEXT:    add v0.4s, v4.4s, v5.4s
864; CHECK-GI-NEXT:    addv s0, v0.4s
865; CHECK-GI-NEXT:    fmov w8, s0
866; CHECK-GI-NEXT:    add w0, w8, w2
867; CHECK-GI-NEXT:    ret
868entry:
869  %0 = load <16 x i8>, ptr %a
870  %1 = sext <16 x i8> %0 to <16 x i32>
871  %2 = load <16 x i8>, ptr %b
872  %3 = zext <16 x i8> %2 to <16 x i32>
873  %4 = mul nsw <16 x i32> %3, %1
874  %5 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %4)
875  %op.extra = add nsw i32 %5, %sum
876  ret i32 %op.extra
877}
878
; Two independent unsigned dot products (%a * %b and %c * %d, all operands
; zero-extended from <8 x i8>), each add-reduced, then summed.
; The run without -global-isel expects both udot instructions to accumulate
; into the same register (v4), so only one addp reduction is needed.
; The -global-isel run expects two separate accumulators, two addp reductions
; and a scalar add.
879define i32 @test_udot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
880; CHECK-SD-LABEL: test_udot_v8i8_double:
881; CHECK-SD:       // %bb.0: // %entry
882; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
883; CHECK-SD-NEXT:    udot v4.2s, v2.8b, v3.8b
884; CHECK-SD-NEXT:    udot v4.2s, v0.8b, v1.8b
885; CHECK-SD-NEXT:    addp v0.2s, v4.2s, v4.2s
886; CHECK-SD-NEXT:    fmov w0, s0
887; CHECK-SD-NEXT:    ret
888;
889; CHECK-GI-LABEL: test_udot_v8i8_double:
890; CHECK-GI:       // %bb.0: // %entry
891; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
892; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
893; CHECK-GI-NEXT:    udot v5.2s, v0.8b, v1.8b
894; CHECK-GI-NEXT:    udot v4.2s, v2.8b, v3.8b
895; CHECK-GI-NEXT:    addp v0.2s, v5.2s, v5.2s
896; CHECK-GI-NEXT:    addp v1.2s, v4.2s, v4.2s
897; CHECK-GI-NEXT:    fmov w8, s0
898; CHECK-GI-NEXT:    fmov w9, s1
899; CHECK-GI-NEXT:    add w0, w8, w9
900; CHECK-GI-NEXT:    ret
901entry:
902  %az = zext <8 x i8> %a to <8 x i32>
903  %bz = zext <8 x i8> %b to <8 x i32>
904  %m1 = mul nuw nsw <8 x i32> %az, %bz
905  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
906  %cz = zext <8 x i8> %c to <8 x i32>
907  %dz = zext <8 x i8> %d to <8 x i32>
908  %m2 = mul nuw nsw <8 x i32> %cz, %dz
909  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
910  %x = add i32 %r1, %r2
911  ret i32 %x
912}
913
; Two add-reductions of zero-extended <8 x i8> vectors (%a and %c) with no
; multiply, summed together. %b and %d are not referenced by the body.
; Both runs expect udot against an all-ones byte vector (movi #1). The run
; without -global-isel chains both udot into one accumulator; the -global-isel
; run keeps two accumulators and adds the two scalar results.
914define i32 @test_udot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
915; CHECK-SD-LABEL: test_udot_v8i8_double_nomla:
916; CHECK-SD:       // %bb.0: // %entry
917; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
918; CHECK-SD-NEXT:    movi v3.8b, #1
919; CHECK-SD-NEXT:    udot v1.2s, v2.8b, v3.8b
920; CHECK-SD-NEXT:    udot v1.2s, v0.8b, v3.8b
921; CHECK-SD-NEXT:    addp v0.2s, v1.2s, v1.2s
922; CHECK-SD-NEXT:    fmov w0, s0
923; CHECK-SD-NEXT:    ret
924;
925; CHECK-GI-LABEL: test_udot_v8i8_double_nomla:
926; CHECK-GI:       // %bb.0: // %entry
927; CHECK-GI-NEXT:    movi v1.8b, #1
928; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
929; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
930; CHECK-GI-NEXT:    udot v4.2s, v0.8b, v1.8b
931; CHECK-GI-NEXT:    udot v3.2s, v2.8b, v1.8b
932; CHECK-GI-NEXT:    addp v0.2s, v4.2s, v4.2s
933; CHECK-GI-NEXT:    addp v1.2s, v3.2s, v3.2s
934; CHECK-GI-NEXT:    fmov w8, s0
935; CHECK-GI-NEXT:    fmov w9, s1
936; CHECK-GI-NEXT:    add w0, w8, w9
937; CHECK-GI-NEXT:    ret
938entry:
939  %az = zext <8 x i8> %a to <8 x i32>
940  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %az)
941  %cz = zext <8 x i8> %c to <8 x i32>
942  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %cz)
943  %x = add i32 %r1, %r2
944  ret i32 %x
945}
946
; 128-bit variant of the double unsigned dot product: %a * %b and %c * %d
; (all zero-extended from <16 x i8>) are each add-reduced and then summed.
; The run without -global-isel expects both udot to share accumulator v4 and a
; single addv. The -global-isel run expects two accumulators, two addv
; reductions and a scalar add.
947define i32 @test_udot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
948; CHECK-SD-LABEL: test_udot_v16i8_double:
949; CHECK-SD:       // %bb.0: // %entry
950; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
951; CHECK-SD-NEXT:    udot v4.4s, v2.16b, v3.16b
952; CHECK-SD-NEXT:    udot v4.4s, v0.16b, v1.16b
953; CHECK-SD-NEXT:    addv s0, v4.4s
954; CHECK-SD-NEXT:    fmov w0, s0
955; CHECK-SD-NEXT:    ret
956;
957; CHECK-GI-LABEL: test_udot_v16i8_double:
958; CHECK-GI:       // %bb.0: // %entry
959; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
960; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
961; CHECK-GI-NEXT:    udot v5.4s, v0.16b, v1.16b
962; CHECK-GI-NEXT:    udot v4.4s, v2.16b, v3.16b
963; CHECK-GI-NEXT:    addv s0, v5.4s
964; CHECK-GI-NEXT:    addv s1, v4.4s
965; CHECK-GI-NEXT:    fmov w8, s0
966; CHECK-GI-NEXT:    fmov w9, s1
967; CHECK-GI-NEXT:    add w0, w8, w9
968; CHECK-GI-NEXT:    ret
969entry:
970  %az = zext <16 x i8> %a to <16 x i32>
971  %bz = zext <16 x i8> %b to <16 x i32>
972  %m1 = mul nuw nsw <16 x i32> %az, %bz
973  %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
974  %cz = zext <16 x i8> %c to <16 x i32>
975  %dz = zext <16 x i8> %d to <16 x i32>
976  %m2 = mul nuw nsw <16 x i32> %cz, %dz
977  %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
978  %x = add i32 %r1, %r2
979  ret i32 %x
980}
981
; Two add-reductions of zero-extended <16 x i8> vectors (%a and %c) with no
; multiply, summed together. %b and %d are not referenced by the body.
; Both runs expect udot against an all-ones byte vector (movi #1). The run
; without -global-isel accumulates both into v3 and reduces once; the
; -global-isel run reduces two accumulators separately and adds the scalars.
982define i32 @test_udot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
983; CHECK-SD-LABEL: test_udot_v16i8_double_nomla:
984; CHECK-SD:       // %bb.0: // %entry
985; CHECK-SD-NEXT:    movi v1.16b, #1
986; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
987; CHECK-SD-NEXT:    udot v3.4s, v2.16b, v1.16b
988; CHECK-SD-NEXT:    udot v3.4s, v0.16b, v1.16b
989; CHECK-SD-NEXT:    addv s0, v3.4s
990; CHECK-SD-NEXT:    fmov w0, s0
991; CHECK-SD-NEXT:    ret
992;
993; CHECK-GI-LABEL: test_udot_v16i8_double_nomla:
994; CHECK-GI:       // %bb.0: // %entry
995; CHECK-GI-NEXT:    movi v1.16b, #1
996; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
997; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
998; CHECK-GI-NEXT:    udot v4.4s, v0.16b, v1.16b
999; CHECK-GI-NEXT:    udot v3.4s, v2.16b, v1.16b
1000; CHECK-GI-NEXT:    addv s0, v4.4s
1001; CHECK-GI-NEXT:    addv s1, v3.4s
1002; CHECK-GI-NEXT:    fmov w8, s0
1003; CHECK-GI-NEXT:    fmov w9, s1
1004; CHECK-GI-NEXT:    add w0, w8, w9
1005; CHECK-GI-NEXT:    ret
1006entry:
1007  %az = zext <16 x i8> %a to <16 x i32>
1008  %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)
1009  %cz = zext <16 x i8> %c to <16 x i32>
1010  %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)
1011  %x = add i32 %r1, %r2
1012  ret i32 %x
1013}
1014
; Two independent signed dot products (%a * %b and %c * %d, all operands
; sign-extended from <8 x i8>), each add-reduced, then summed.
; The run without -global-isel expects both sdot to accumulate into v4 with a
; single addp. The -global-isel run expects two accumulators, two addp
; reductions and a scalar add.
1015define i32 @test_sdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1016; CHECK-SD-LABEL: test_sdot_v8i8_double:
1017; CHECK-SD:       // %bb.0: // %entry
1018; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
1019; CHECK-SD-NEXT:    sdot v4.2s, v2.8b, v3.8b
1020; CHECK-SD-NEXT:    sdot v4.2s, v0.8b, v1.8b
1021; CHECK-SD-NEXT:    addp v0.2s, v4.2s, v4.2s
1022; CHECK-SD-NEXT:    fmov w0, s0
1023; CHECK-SD-NEXT:    ret
1024;
1025; CHECK-GI-LABEL: test_sdot_v8i8_double:
1026; CHECK-GI:       // %bb.0: // %entry
1027; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
1028; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
1029; CHECK-GI-NEXT:    sdot v5.2s, v0.8b, v1.8b
1030; CHECK-GI-NEXT:    sdot v4.2s, v2.8b, v3.8b
1031; CHECK-GI-NEXT:    addp v0.2s, v5.2s, v5.2s
1032; CHECK-GI-NEXT:    addp v1.2s, v4.2s, v4.2s
1033; CHECK-GI-NEXT:    fmov w8, s0
1034; CHECK-GI-NEXT:    fmov w9, s1
1035; CHECK-GI-NEXT:    add w0, w8, w9
1036; CHECK-GI-NEXT:    ret
1037entry:
1038  %az = sext <8 x i8> %a to <8 x i32>
1039  %bz = sext <8 x i8> %b to <8 x i32>
1040  %m1 = mul nuw nsw <8 x i32> %az, %bz
1041  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
1042  %cz = sext <8 x i8> %c to <8 x i32>
1043  %dz = sext <8 x i8> %d to <8 x i32>
1044  %m2 = mul nuw nsw <8 x i32> %cz, %dz
1045  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
1046  %x = add i32 %r1, %r2
1047  ret i32 %x
1048}
1049
; Two add-reductions of sign-extended <8 x i8> vectors (%a and %c) with no
; multiply, summed together. %b and %d are not referenced by the body.
; Both runs expect sdot against an all-ones byte vector (movi #1). The run
; without -global-isel chains both sdot into one accumulator; the -global-isel
; run keeps two accumulators and adds the two scalar results.
1050define i32 @test_sdot_v8i8_double_nomla(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1051; CHECK-SD-LABEL: test_sdot_v8i8_double_nomla:
1052; CHECK-SD:       // %bb.0: // %entry
1053; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
1054; CHECK-SD-NEXT:    movi v3.8b, #1
1055; CHECK-SD-NEXT:    sdot v1.2s, v2.8b, v3.8b
1056; CHECK-SD-NEXT:    sdot v1.2s, v0.8b, v3.8b
1057; CHECK-SD-NEXT:    addp v0.2s, v1.2s, v1.2s
1058; CHECK-SD-NEXT:    fmov w0, s0
1059; CHECK-SD-NEXT:    ret
1060;
1061; CHECK-GI-LABEL: test_sdot_v8i8_double_nomla:
1062; CHECK-GI:       // %bb.0: // %entry
1063; CHECK-GI-NEXT:    movi v1.8b, #1
1064; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
1065; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
1066; CHECK-GI-NEXT:    sdot v4.2s, v0.8b, v1.8b
1067; CHECK-GI-NEXT:    sdot v3.2s, v2.8b, v1.8b
1068; CHECK-GI-NEXT:    addp v0.2s, v4.2s, v4.2s
1069; CHECK-GI-NEXT:    addp v1.2s, v3.2s, v3.2s
1070; CHECK-GI-NEXT:    fmov w8, s0
1071; CHECK-GI-NEXT:    fmov w9, s1
1072; CHECK-GI-NEXT:    add w0, w8, w9
1073; CHECK-GI-NEXT:    ret
1074entry:
1075  %az = sext <8 x i8> %a to <8 x i32>
1076  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %az)
1077  %cz = sext <8 x i8> %c to <8 x i32>
1078  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %cz)
1079  %x = add i32 %r1, %r2
1080  ret i32 %x
1081}
1082
; 128-bit variant of the double signed dot product: %a * %b and %c * %d (all
; sign-extended from <16 x i8>) are each add-reduced and then summed.
; The run without -global-isel expects both sdot to share accumulator v4 and a
; single addv. The -global-isel run expects two accumulators, two addv
; reductions and a scalar add.
1083define i32 @test_sdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1084; CHECK-SD-LABEL: test_sdot_v16i8_double:
1085; CHECK-SD:       // %bb.0: // %entry
1086; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
1087; CHECK-SD-NEXT:    sdot v4.4s, v2.16b, v3.16b
1088; CHECK-SD-NEXT:    sdot v4.4s, v0.16b, v1.16b
1089; CHECK-SD-NEXT:    addv s0, v4.4s
1090; CHECK-SD-NEXT:    fmov w0, s0
1091; CHECK-SD-NEXT:    ret
1092;
1093; CHECK-GI-LABEL: test_sdot_v16i8_double:
1094; CHECK-GI:       // %bb.0: // %entry
1095; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
1096; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
1097; CHECK-GI-NEXT:    sdot v5.4s, v0.16b, v1.16b
1098; CHECK-GI-NEXT:    sdot v4.4s, v2.16b, v3.16b
1099; CHECK-GI-NEXT:    addv s0, v5.4s
1100; CHECK-GI-NEXT:    addv s1, v4.4s
1101; CHECK-GI-NEXT:    fmov w8, s0
1102; CHECK-GI-NEXT:    fmov w9, s1
1103; CHECK-GI-NEXT:    add w0, w8, w9
1104; CHECK-GI-NEXT:    ret
1105entry:
1106  %az = sext <16 x i8> %a to <16 x i32>
1107  %bz = sext <16 x i8> %b to <16 x i32>
1108  %m1 = mul nuw nsw <16 x i32> %az, %bz
1109  %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
1110  %cz = sext <16 x i8> %c to <16 x i32>
1111  %dz = sext <16 x i8> %d to <16 x i32>
1112  %m2 = mul nuw nsw <16 x i32> %cz, %dz
1113  %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
1114  %x = add i32 %r1, %r2
1115  ret i32 %x
1116}
1117
; Two add-reductions of sign-extended <16 x i8> vectors (%a and %c) with no
; multiply, summed together. %b and %d are not referenced by the body.
; Both runs expect sdot against an all-ones byte vector (movi #1). The run
; without -global-isel accumulates both into v3 and reduces once; the
; -global-isel run reduces two accumulators separately and adds the scalars.
1118define i32 @test_sdot_v16i8_double_nomla(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1119; CHECK-SD-LABEL: test_sdot_v16i8_double_nomla:
1120; CHECK-SD:       // %bb.0: // %entry
1121; CHECK-SD-NEXT:    movi v1.16b, #1
1122; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
1123; CHECK-SD-NEXT:    sdot v3.4s, v2.16b, v1.16b
1124; CHECK-SD-NEXT:    sdot v3.4s, v0.16b, v1.16b
1125; CHECK-SD-NEXT:    addv s0, v3.4s
1126; CHECK-SD-NEXT:    fmov w0, s0
1127; CHECK-SD-NEXT:    ret
1128;
1129; CHECK-GI-LABEL: test_sdot_v16i8_double_nomla:
1130; CHECK-GI:       // %bb.0: // %entry
1131; CHECK-GI-NEXT:    movi v1.16b, #1
1132; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
1133; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
1134; CHECK-GI-NEXT:    sdot v4.4s, v0.16b, v1.16b
1135; CHECK-GI-NEXT:    sdot v3.4s, v2.16b, v1.16b
1136; CHECK-GI-NEXT:    addv s0, v4.4s
1137; CHECK-GI-NEXT:    addv s1, v3.4s
1138; CHECK-GI-NEXT:    fmov w8, s0
1139; CHECK-GI-NEXT:    fmov w9, s1
1140; CHECK-GI-NEXT:    add w0, w8, w9
1141; CHECK-GI-NEXT:    ret
1142entry:
1143  %az = sext <16 x i8> %a to <16 x i32>
1144  %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %az)
1145  %cz = sext <16 x i8> %c to <16 x i32>
1146  %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %cz)
1147  %x = add i32 %r1, %r2
1148  ret i32 %x
1149}
1150
1151
; Two independent mixed-sign dot products on <8 x i8> arguments: %a and %c are
; zero-extended, %b and %d are sign-extended; each product vector is
; add-reduced and the two results are summed.
; The run without -global-isel expects two usdot into separate accumulators, a
; vector add of the accumulators, and one addp reduction.
; The -global-isel run expects no usdot: it widens with ushll/sshll, uses
; mul/mla, reduces each half with addv and adds the scalars.
1152define i32 @test_usdot_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1153; CHECK-SD-LABEL: test_usdot_v8i8_double:
1154; CHECK-SD:       // %bb.0: // %entry
1155; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
1156; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
1157; CHECK-SD-NEXT:    usdot v5.2s, v0.8b, v1.8b
1158; CHECK-SD-NEXT:    usdot v4.2s, v2.8b, v3.8b
1159; CHECK-SD-NEXT:    add v0.2s, v5.2s, v4.2s
1160; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
1161; CHECK-SD-NEXT:    fmov w0, s0
1162; CHECK-SD-NEXT:    ret
1163;
1164; CHECK-GI-LABEL: test_usdot_v8i8_double:
1165; CHECK-GI:       // %bb.0: // %entry
1166; CHECK-GI-NEXT:    ushll v0.8h, v0.8b, #0
1167; CHECK-GI-NEXT:    sshll v1.8h, v1.8b, #0
1168; CHECK-GI-NEXT:    ushll v2.8h, v2.8b, #0
1169; CHECK-GI-NEXT:    sshll v3.8h, v3.8b, #0
1170; CHECK-GI-NEXT:    ushll2 v4.4s, v0.8h, #0
1171; CHECK-GI-NEXT:    sshll2 v5.4s, v1.8h, #0
1172; CHECK-GI-NEXT:    ushll2 v6.4s, v2.8h, #0
1173; CHECK-GI-NEXT:    sshll2 v7.4s, v3.8h, #0
1174; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
1175; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
1176; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
1177; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
1178; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v5.4s
1179; CHECK-GI-NEXT:    mul v5.4s, v6.4s, v7.4s
1180; CHECK-GI-NEXT:    mla v4.4s, v0.4s, v1.4s
1181; CHECK-GI-NEXT:    mla v5.4s, v2.4s, v3.4s
1182; CHECK-GI-NEXT:    addv s0, v4.4s
1183; CHECK-GI-NEXT:    addv s1, v5.4s
1184; CHECK-GI-NEXT:    fmov w8, s0
1185; CHECK-GI-NEXT:    fmov w9, s1
1186; CHECK-GI-NEXT:    add w0, w8, w9
1187; CHECK-GI-NEXT:    ret
1188entry:
1189  %az = zext <8 x i8> %a to <8 x i32>
1190  %bz = sext <8 x i8> %b to <8 x i32>
1191  %m1 = mul nuw nsw <8 x i32> %az, %bz
1192  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
1193  %cz = zext <8 x i8> %c to <8 x i32>
1194  %dz = sext <8 x i8> %d to <8 x i32>
1195  %m2 = mul nuw nsw <8 x i32> %cz, %dz
1196  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
1197  %x = add i32 %r1, %r2
1198  ret i32 %x
1199}
1200
; Double mixed-sign dot product on <8 x i8> arguments with the extensions
; swapped relative to the non-swapped variant: %a and %c are sign-extended,
; %b and %d are zero-extended.
; The run without -global-isel still expects usdot, with the zero-extended
; argument (v1 / v3) as the first source and the sign-extended argument
; (v0 / v2) as the second, followed by a vector add and addp.
; The -global-isel run expects the sshll/ushll + mul/mla expansion instead.
1201define i32 @test_usdot_swapped_operands_v8i8_double(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
1202; CHECK-SD-LABEL: test_usdot_swapped_operands_v8i8_double:
1203; CHECK-SD:       // %bb.0: // %entry
1204; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
1205; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
1206; CHECK-SD-NEXT:    usdot v5.2s, v1.8b, v0.8b
1207; CHECK-SD-NEXT:    usdot v4.2s, v3.8b, v2.8b
1208; CHECK-SD-NEXT:    add v0.2s, v5.2s, v4.2s
1209; CHECK-SD-NEXT:    addp v0.2s, v0.2s, v0.2s
1210; CHECK-SD-NEXT:    fmov w0, s0
1211; CHECK-SD-NEXT:    ret
1212;
1213; CHECK-GI-LABEL: test_usdot_swapped_operands_v8i8_double:
1214; CHECK-GI:       // %bb.0: // %entry
1215; CHECK-GI-NEXT:    sshll v0.8h, v0.8b, #0
1216; CHECK-GI-NEXT:    ushll v1.8h, v1.8b, #0
1217; CHECK-GI-NEXT:    sshll v2.8h, v2.8b, #0
1218; CHECK-GI-NEXT:    ushll v3.8h, v3.8b, #0
1219; CHECK-GI-NEXT:    sshll2 v4.4s, v0.8h, #0
1220; CHECK-GI-NEXT:    ushll2 v5.4s, v1.8h, #0
1221; CHECK-GI-NEXT:    sshll2 v6.4s, v2.8h, #0
1222; CHECK-GI-NEXT:    ushll2 v7.4s, v3.8h, #0
1223; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
1224; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
1225; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
1226; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
1227; CHECK-GI-NEXT:    mul v4.4s, v4.4s, v5.4s
1228; CHECK-GI-NEXT:    mul v5.4s, v6.4s, v7.4s
1229; CHECK-GI-NEXT:    mla v4.4s, v0.4s, v1.4s
1230; CHECK-GI-NEXT:    mla v5.4s, v2.4s, v3.4s
1231; CHECK-GI-NEXT:    addv s0, v4.4s
1232; CHECK-GI-NEXT:    addv s1, v5.4s
1233; CHECK-GI-NEXT:    fmov w8, s0
1234; CHECK-GI-NEXT:    fmov w9, s1
1235; CHECK-GI-NEXT:    add w0, w8, w9
1236; CHECK-GI-NEXT:    ret
1237entry:
1238  %az = sext <8 x i8> %a to <8 x i32>
1239  %bz = zext <8 x i8> %b to <8 x i32>
1240  %m1 = mul nuw nsw <8 x i32> %az, %bz
1241  %r1 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m1)
1242  %cz = sext <8 x i8> %c to <8 x i32>
1243  %dz = zext <8 x i8> %d to <8 x i32>
1244  %m2 = mul nuw nsw <8 x i32> %cz, %dz
1245  %r2 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %m2)
1246  %x = add i32 %r1, %r2
1247  ret i32 %x
1248}
1249
; 128-bit variant of the double mixed-sign dot product: %a and %c are
; zero-extended, %b and %d are sign-extended (all from <16 x i8>); each
; product vector is add-reduced and the two results are summed.
; The run without -global-isel expects two usdot into separate accumulators, a
; vector add, and one addv.
; The -global-isel run expects the full ushll/sshll widening with mul/mla, two
; addv reductions and a scalar add.
1250define i32 @test_usdot_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1251; CHECK-SD-LABEL: test_usdot_v16i8_double:
1252; CHECK-SD:       // %bb.0: // %entry
1253; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
1254; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
1255; CHECK-SD-NEXT:    usdot v5.4s, v0.16b, v1.16b
1256; CHECK-SD-NEXT:    usdot v4.4s, v2.16b, v3.16b
1257; CHECK-SD-NEXT:    add v0.4s, v5.4s, v4.4s
1258; CHECK-SD-NEXT:    addv s0, v0.4s
1259; CHECK-SD-NEXT:    fmov w0, s0
1260; CHECK-SD-NEXT:    ret
1261;
1262; CHECK-GI-LABEL: test_usdot_v16i8_double:
1263; CHECK-GI:       // %bb.0: // %entry
1264; CHECK-GI-NEXT:    ushll v4.8h, v0.8b, #0
1265; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
1266; CHECK-GI-NEXT:    sshll v5.8h, v1.8b, #0
1267; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
1268; CHECK-GI-NEXT:    ushll v6.8h, v2.8b, #0
1269; CHECK-GI-NEXT:    sshll v7.8h, v3.8b, #0
1270; CHECK-GI-NEXT:    ushll2 v2.8h, v2.16b, #0
1271; CHECK-GI-NEXT:    sshll2 v3.8h, v3.16b, #0
1272; CHECK-GI-NEXT:    ushll2 v16.4s, v4.8h, #0
1273; CHECK-GI-NEXT:    ushll2 v17.4s, v0.8h, #0
1274; CHECK-GI-NEXT:    sshll2 v18.4s, v5.8h, #0
1275; CHECK-GI-NEXT:    sshll2 v19.4s, v1.8h, #0
1276; CHECK-GI-NEXT:    ushll2 v20.4s, v6.8h, #0
1277; CHECK-GI-NEXT:    sshll2 v21.4s, v7.8h, #0
1278; CHECK-GI-NEXT:    ushll2 v22.4s, v2.8h, #0
1279; CHECK-GI-NEXT:    sshll2 v23.4s, v3.8h, #0
1280; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
1281; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v18.4s
1282; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
1283; CHECK-GI-NEXT:    sshll v5.4s, v5.4h, #0
1284; CHECK-GI-NEXT:    mul v17.4s, v17.4s, v19.4s
1285; CHECK-GI-NEXT:    mul v18.4s, v20.4s, v21.4s
1286; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
1287; CHECK-GI-NEXT:    mul v19.4s, v22.4s, v23.4s
1288; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
1289; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
1290; CHECK-GI-NEXT:    sshll v7.4s, v7.4h, #0
1291; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
1292; CHECK-GI-NEXT:    mla v16.4s, v4.4s, v5.4s
1293; CHECK-GI-NEXT:    mla v17.4s, v0.4s, v1.4s
1294; CHECK-GI-NEXT:    mla v18.4s, v6.4s, v7.4s
1295; CHECK-GI-NEXT:    mla v19.4s, v2.4s, v3.4s
1296; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
1297; CHECK-GI-NEXT:    add v1.4s, v18.4s, v19.4s
1298; CHECK-GI-NEXT:    addv s0, v0.4s
1299; CHECK-GI-NEXT:    addv s1, v1.4s
1300; CHECK-GI-NEXT:    fmov w8, s0
1301; CHECK-GI-NEXT:    fmov w9, s1
1302; CHECK-GI-NEXT:    add w0, w8, w9
1303; CHECK-GI-NEXT:    ret
1304entry:
1305  %az = zext <16 x i8> %a to <16 x i32>
1306  %bz = sext <16 x i8> %b to <16 x i32>
1307  %m1 = mul nuw nsw <16 x i32> %az, %bz
1308  %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
1309  %cz = zext <16 x i8> %c to <16 x i32>
1310  %dz = sext <16 x i8> %d to <16 x i32>
1311  %m2 = mul nuw nsw <16 x i32> %cz, %dz
1312  %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
1313  %x = add i32 %r1, %r2
1314  ret i32 %x
1315}
1316
1317
; 128-bit double mixed-sign dot product with the extensions swapped relative
; to the non-swapped variant: %a and %c are sign-extended, %b and %d are
; zero-extended.
; The run without -global-isel still expects usdot, with the zero-extended
; argument (v1 / v3) as the first source and the sign-extended argument
; (v0 / v2) as the second, followed by a vector add and addv.
; The -global-isel run expects the sshll/ushll + mul/mla expansion instead.
1318define i32 @test_usdot_swapped_operands_v16i8_double(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <16 x i8> %d) {
1319; CHECK-SD-LABEL: test_usdot_swapped_operands_v16i8_double:
1320; CHECK-SD:       // %bb.0: // %entry
1321; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
1322; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
1323; CHECK-SD-NEXT:    usdot v5.4s, v1.16b, v0.16b
1324; CHECK-SD-NEXT:    usdot v4.4s, v3.16b, v2.16b
1325; CHECK-SD-NEXT:    add v0.4s, v5.4s, v4.4s
1326; CHECK-SD-NEXT:    addv s0, v0.4s
1327; CHECK-SD-NEXT:    fmov w0, s0
1328; CHECK-SD-NEXT:    ret
1329;
1330; CHECK-GI-LABEL: test_usdot_swapped_operands_v16i8_double:
1331; CHECK-GI:       // %bb.0: // %entry
1332; CHECK-GI-NEXT:    sshll v4.8h, v0.8b, #0
1333; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
1334; CHECK-GI-NEXT:    ushll v5.8h, v1.8b, #0
1335; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
1336; CHECK-GI-NEXT:    sshll v6.8h, v2.8b, #0
1337; CHECK-GI-NEXT:    ushll v7.8h, v3.8b, #0
1338; CHECK-GI-NEXT:    sshll2 v2.8h, v2.16b, #0
1339; CHECK-GI-NEXT:    ushll2 v3.8h, v3.16b, #0
1340; CHECK-GI-NEXT:    sshll2 v16.4s, v4.8h, #0
1341; CHECK-GI-NEXT:    sshll2 v17.4s, v0.8h, #0
1342; CHECK-GI-NEXT:    ushll2 v18.4s, v5.8h, #0
1343; CHECK-GI-NEXT:    ushll2 v19.4s, v1.8h, #0
1344; CHECK-GI-NEXT:    sshll2 v20.4s, v6.8h, #0
1345; CHECK-GI-NEXT:    ushll2 v21.4s, v7.8h, #0
1346; CHECK-GI-NEXT:    sshll2 v22.4s, v2.8h, #0
1347; CHECK-GI-NEXT:    ushll2 v23.4s, v3.8h, #0
1348; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
1349; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v18.4s
1350; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
1351; CHECK-GI-NEXT:    ushll v5.4s, v5.4h, #0
1352; CHECK-GI-NEXT:    mul v17.4s, v17.4s, v19.4s
1353; CHECK-GI-NEXT:    mul v18.4s, v20.4s, v21.4s
1354; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
1355; CHECK-GI-NEXT:    mul v19.4s, v22.4s, v23.4s
1356; CHECK-GI-NEXT:    sshll v6.4s, v6.4h, #0
1357; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
1358; CHECK-GI-NEXT:    ushll v7.4s, v7.4h, #0
1359; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
1360; CHECK-GI-NEXT:    mla v16.4s, v4.4s, v5.4s
1361; CHECK-GI-NEXT:    mla v17.4s, v0.4s, v1.4s
1362; CHECK-GI-NEXT:    mla v18.4s, v6.4s, v7.4s
1363; CHECK-GI-NEXT:    mla v19.4s, v2.4s, v3.4s
1364; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
1365; CHECK-GI-NEXT:    add v1.4s, v18.4s, v19.4s
1366; CHECK-GI-NEXT:    addv s0, v0.4s
1367; CHECK-GI-NEXT:    addv s1, v1.4s
1368; CHECK-GI-NEXT:    fmov w8, s0
1369; CHECK-GI-NEXT:    fmov w9, s1
1370; CHECK-GI-NEXT:    add w0, w8, w9
1371; CHECK-GI-NEXT:    ret
1372entry:
1373  %az = sext <16 x i8> %a to <16 x i32>
1374  %bz = zext <16 x i8> %b to <16 x i32>
1375  %m1 = mul nuw nsw <16 x i32> %az, %bz
1376  %r1 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m1)
1377  %cz = sext <16 x i8> %c to <16 x i32>
1378  %dz = zext <16 x i8> %d to <16 x i32>
1379  %m2 = mul nuw nsw <16 x i32> %cz, %dz
1380  %r2 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %m2)
1381  %x = add i32 %r1, %r2
1382  ret i32 %x
1383}
1384
; Unsigned dot product over a <24 x i8> vector, which is wider than one
; 128-bit register: both loaded vectors are zero-extended, multiplied,
; add-reduced, and %sum is added.
; The run without -global-isel expects the data split into a 16-byte part
; (ldr q) and an 8-byte part (ldr d at offset 16), a .4s udot and a .2s udot,
; separate addv/addp reductions, and scalar adds including w2.
; The -global-isel run expects two .4s udot (the 8-byte tail is still loaded
; with ldr d), a vector add of the accumulators, one addv, and the add of w2.
1385define i32 @test_udot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
1386; CHECK-SD-LABEL: test_udot_v24i8:
1387; CHECK-SD:       // %bb.0: // %entry
1388; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
1389; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
1390; CHECK-SD-NEXT:    ldr q2, [x0]
1391; CHECK-SD-NEXT:    ldr q3, [x1]
1392; CHECK-SD-NEXT:    ldr d4, [x0, #16]
1393; CHECK-SD-NEXT:    ldr d5, [x1, #16]
1394; CHECK-SD-NEXT:    udot v1.2s, v5.8b, v4.8b
1395; CHECK-SD-NEXT:    udot v0.4s, v3.16b, v2.16b
1396; CHECK-SD-NEXT:    addp v1.2s, v1.2s, v1.2s
1397; CHECK-SD-NEXT:    addv s0, v0.4s
1398; CHECK-SD-NEXT:    fmov w8, s1
1399; CHECK-SD-NEXT:    fmov w9, s0
1400; CHECK-SD-NEXT:    add w8, w9, w8
1401; CHECK-SD-NEXT:    add w0, w8, w2
1402; CHECK-SD-NEXT:    ret
1403;
1404; CHECK-GI-LABEL: test_udot_v24i8:
1405; CHECK-GI:       // %bb.0: // %entry
1406; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
1407; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
1408; CHECK-GI-NEXT:    ldr q2, [x0]
1409; CHECK-GI-NEXT:    ldr d3, [x0, #16]
1410; CHECK-GI-NEXT:    ldr q4, [x1]
1411; CHECK-GI-NEXT:    ldr d5, [x1, #16]
1412; CHECK-GI-NEXT:    udot v1.4s, v4.16b, v2.16b
1413; CHECK-GI-NEXT:    udot v0.4s, v5.16b, v3.16b
1414; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
1415; CHECK-GI-NEXT:    addv s0, v0.4s
1416; CHECK-GI-NEXT:    fmov w8, s0
1417; CHECK-GI-NEXT:    add w0, w8, w2
1418; CHECK-GI-NEXT:    ret
1419entry:
1420  %0 = load <24 x i8>, ptr %a
1421  %1 = zext <24 x i8> %0 to <24 x i32>
1422  %2 = load <24 x i8>, ptr %b
1423  %3 = zext <24 x i8> %2 to <24 x i32>
1424  %4 = mul nuw nsw <24 x i32> %3, %1
1425  %5 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %4)
1426  %op.extra = add i32 %5, %sum
1427  ret i32 %op.extra
1428}
1429
; Add-reduction of a zero-extended <24 x i8> load with no multiply in the IR.
; The run without -global-isel expects the load split into 16-byte and 8-byte
; parts, each fed to udot against an all-ones vector (movi 16b / movi 8b), then
; reduced separately (addv / addp) and summed in scalar registers.
; The -global-isel run expects the 16-byte all-ones operand to be assembled
; from two 8-byte movi values (mov v1.d[1], v0.d[0]), two .4s udot, a vector
; add, and one addv.
1430define i32 @test_udot_v24i8_nomla(ptr nocapture readonly %a1) {
1431; CHECK-SD-LABEL: test_udot_v24i8_nomla:
1432; CHECK-SD:       // %bb.0: // %entry
1433; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
1434; CHECK-SD-NEXT:    movi v1.8b, #1
1435; CHECK-SD-NEXT:    ldr q4, [x0]
1436; CHECK-SD-NEXT:    movi v2.2d, #0000000000000000
1437; CHECK-SD-NEXT:    movi v3.16b, #1
1438; CHECK-SD-NEXT:    ldr d5, [x0, #16]
1439; CHECK-SD-NEXT:    udot v2.2s, v5.8b, v1.8b
1440; CHECK-SD-NEXT:    udot v0.4s, v4.16b, v3.16b
1441; CHECK-SD-NEXT:    addp v1.2s, v2.2s, v2.2s
1442; CHECK-SD-NEXT:    addv s0, v0.4s
1443; CHECK-SD-NEXT:    fmov w8, s1
1444; CHECK-SD-NEXT:    fmov w9, s0
1445; CHECK-SD-NEXT:    add w0, w9, w8
1446; CHECK-SD-NEXT:    ret
1447;
1448; CHECK-GI-LABEL: test_udot_v24i8_nomla:
1449; CHECK-GI:       // %bb.0: // %entry
1450; CHECK-GI-NEXT:    movi v0.8b, #1
1451; CHECK-GI-NEXT:    movi v1.8b, #1
1452; CHECK-GI-NEXT:    ldr q4, [x0]
1453; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
1454; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
1455; CHECK-GI-NEXT:    ldr d5, [x0, #16]
1456; CHECK-GI-NEXT:    mov v1.d[1], v0.d[0]
1457; CHECK-GI-NEXT:    udot v2.4s, v5.16b, v0.16b
1458; CHECK-GI-NEXT:    udot v3.4s, v4.16b, v1.16b
1459; CHECK-GI-NEXT:    add v0.4s, v3.4s, v2.4s
1460; CHECK-GI-NEXT:    addv s0, v0.4s
1461; CHECK-GI-NEXT:    fmov w0, s0
1462; CHECK-GI-NEXT:    ret
1463entry:
1464  %0 = load <24 x i8>, ptr %a1
1465  %1 = zext <24 x i8> %0 to <24 x i32>
1466  %2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %1)
1467  ret i32 %2
1468}
; Signed dot product over a <24 x i8> vector: both loaded vectors are
; sign-extended, multiplied, add-reduced, and %sum is added.
; The run without -global-isel expects the data split into a 16-byte part
; (ldr q) and an 8-byte part (ldr d at offset 16), a .4s sdot and a .2s sdot,
; separate addv/addp reductions, and scalar adds including w2.
; The -global-isel run expects two .4s sdot (the 8-byte tail is still loaded
; with ldr d), a vector add of the accumulators, one addv, and the add of w2.
1469define i32 @test_sdot_v24i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
1470; CHECK-SD-LABEL: test_sdot_v24i8:
1471; CHECK-SD:       // %bb.0: // %entry
1472; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
1473; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
1474; CHECK-SD-NEXT:    ldr q2, [x0]
1475; CHECK-SD-NEXT:    ldr q3, [x1]
1476; CHECK-SD-NEXT:    ldr d4, [x0, #16]
1477; CHECK-SD-NEXT:    ldr d5, [x1, #16]
1478; CHECK-SD-NEXT:    sdot v1.2s, v5.8b, v4.8b
1479; CHECK-SD-NEXT:    sdot v0.4s, v3.16b, v2.16b
1480; CHECK-SD-NEXT:    addp v1.2s, v1.2s, v1.2s
1481; CHECK-SD-NEXT:    addv s0, v0.4s
1482; CHECK-SD-NEXT:    fmov w8, s1
1483; CHECK-SD-NEXT:    fmov w9, s0
1484; CHECK-SD-NEXT:    add w8, w9, w8
1485; CHECK-SD-NEXT:    add w0, w8, w2
1486; CHECK-SD-NEXT:    ret
1487;
1488; CHECK-GI-LABEL: test_sdot_v24i8:
1489; CHECK-GI:       // %bb.0: // %entry
1490; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
1491; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
1492; CHECK-GI-NEXT:    ldr q2, [x0]
1493; CHECK-GI-NEXT:    ldr d3, [x0, #16]
1494; CHECK-GI-NEXT:    ldr q4, [x1]
1495; CHECK-GI-NEXT:    ldr d5, [x1, #16]
1496; CHECK-GI-NEXT:    sdot v1.4s, v4.16b, v2.16b
1497; CHECK-GI-NEXT:    sdot v0.4s, v5.16b, v3.16b
1498; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
1499; CHECK-GI-NEXT:    addv s0, v0.4s
1500; CHECK-GI-NEXT:    fmov w8, s0
1501; CHECK-GI-NEXT:    add w0, w8, w2
1502; CHECK-GI-NEXT:    ret
1503entry:
1504  %0 = load <24 x i8>, ptr %a
1505  %1 = sext <24 x i8> %0 to <24 x i32>
1506  %2 = load <24 x i8>, ptr %b
1507  %3 = sext <24 x i8> %2 to <24 x i32>
1508  %4 = mul nsw <24 x i32> %3, %1
1509  %5 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %4)
1510  %op.extra = add nsw i32 %5, %sum
1511  ret i32 %op.extra
1512}
1513
1514define i32 @test_sdot_v24i8_double(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 x i8> %d) {
1515; CHECK-SD-LABEL: test_sdot_v24i8_double:
1516; CHECK-SD:       // %bb.0: // %entry
1517; CHECK-SD-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1518; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
1519; CHECK-SD-NEXT:    .cfi_offset w29, -16
1520; CHECK-SD-NEXT:    fmov s0, w0
1521; CHECK-SD-NEXT:    ldr b1, [sp, #144]
1522; CHECK-SD-NEXT:    add x10, sp, #152
1523; CHECK-SD-NEXT:    add x9, sp, #160
1524; CHECK-SD-NEXT:    add x8, sp, #168
1525; CHECK-SD-NEXT:    ldr b2, [sp, #272]
1526; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x10]
1527; CHECK-SD-NEXT:    add x11, sp, #280
1528; CHECK-SD-NEXT:    ldr b3, [sp, #80]
1529; CHECK-SD-NEXT:    mov v0.b[1], w1
1530; CHECK-SD-NEXT:    ldr b4, [sp, #528]
1531; CHECK-SD-NEXT:    add x10, sp, #88
1532; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x11]
1533; CHECK-SD-NEXT:    add x11, sp, #536
1534; CHECK-SD-NEXT:    ldr b5, [sp, #336]
1535; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x9]
1536; CHECK-SD-NEXT:    ld1 { v3.b }[1], [x10]
1537; CHECK-SD-NEXT:    add x10, sp, #344
1538; CHECK-SD-NEXT:    ld1 { v4.b }[1], [x11]
1539; CHECK-SD-NEXT:    add x11, sp, #176
1540; CHECK-SD-NEXT:    ldr b6, [sp, #656]
1541; CHECK-SD-NEXT:    mov v0.b[2], w2
1542; CHECK-SD-NEXT:    ld1 { v5.b }[1], [x10]
1543; CHECK-SD-NEXT:    ldr b7, [sp, #464]
1544; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x8]
1545; CHECK-SD-NEXT:    add x12, sp, #664
1546; CHECK-SD-NEXT:    add x9, sp, #472
1547; CHECK-SD-NEXT:    ld1 { v6.b }[1], [x12]
1548; CHECK-SD-NEXT:    add x8, sp, #96
1549; CHECK-SD-NEXT:    add x10, sp, #184
1550; CHECK-SD-NEXT:    add x12, sp, #288
1551; CHECK-SD-NEXT:    ld1 { v7.b }[1], [x9]
1552; CHECK-SD-NEXT:    ld1 { v3.b }[2], [x8]
1553; CHECK-SD-NEXT:    mov v0.b[3], w3
1554; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x11]
1555; CHECK-SD-NEXT:    add x8, sp, #352
1556; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x12]
1557; CHECK-SD-NEXT:    add x13, sp, #544
1558; CHECK-SD-NEXT:    ld1 { v5.b }[2], [x8]
1559; CHECK-SD-NEXT:    add x8, sp, #672
1560; CHECK-SD-NEXT:    ld1 { v4.b }[2], [x13]
1561; CHECK-SD-NEXT:    add x9, sp, #192
1562; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x10]
1563; CHECK-SD-NEXT:    ld1 { v6.b }[2], [x8]
1564; CHECK-SD-NEXT:    add x8, sp, #480
1565; CHECK-SD-NEXT:    mov v0.b[4], w4
1566; CHECK-SD-NEXT:    ld1 { v7.b }[2], [x8]
1567; CHECK-SD-NEXT:    add x8, sp, #296
1568; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x8]
1569; CHECK-SD-NEXT:    add x8, sp, #552
1570; CHECK-SD-NEXT:    add x12, sp, #200
1571; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x9]
1572; CHECK-SD-NEXT:    ld1 { v4.b }[3], [x8]
1573; CHECK-SD-NEXT:    add x8, sp, #360
1574; CHECK-SD-NEXT:    ld1 { v5.b }[3], [x8]
1575; CHECK-SD-NEXT:    add x8, sp, #104
1576; CHECK-SD-NEXT:    add x9, sp, #560
1577; CHECK-SD-NEXT:    mov v0.b[5], w5
1578; CHECK-SD-NEXT:    ld1 { v3.b }[3], [x8]
1579; CHECK-SD-NEXT:    add x8, sp, #368
1580; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x12]
1581; CHECK-SD-NEXT:    ld1 { v4.b }[4], [x9]
1582; CHECK-SD-NEXT:    add x13, sp, #208
1583; CHECK-SD-NEXT:    ld1 { v5.b }[4], [x8]
1584; CHECK-SD-NEXT:    add x12, sp, #304
1585; CHECK-SD-NEXT:    add x8, sp, #568
1586; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x12]
1587; CHECK-SD-NEXT:    add x12, sp, #16
1588; CHECK-SD-NEXT:    add x17, sp, #376
1589; CHECK-SD-NEXT:    mov v0.b[6], w6
1590; CHECK-SD-NEXT:    ld1 { v1.b }[8], [x13]
1591; CHECK-SD-NEXT:    ld1 { v4.b }[5], [x8]
1592; CHECK-SD-NEXT:    add x14, sp, #216
1593; CHECK-SD-NEXT:    ld1 { v5.b }[5], [x17]
1594; CHECK-SD-NEXT:    add x13, sp, #576
1595; CHECK-SD-NEXT:    add x11, sp, #224
1596; CHECK-SD-NEXT:    add x10, sp, #232
1597; CHECK-SD-NEXT:    add x15, sp, #240
1598; CHECK-SD-NEXT:    ld1 { v1.b }[9], [x14]
1599; CHECK-SD-NEXT:    ld1 { v4.b }[6], [x13]
1600; CHECK-SD-NEXT:    add x13, sp, #384
1601; CHECK-SD-NEXT:    mov v0.b[7], w7
1602; CHECK-SD-NEXT:    ld1 { v5.b }[6], [x13]
1603; CHECK-SD-NEXT:    add x13, sp, #112
1604; CHECK-SD-NEXT:    ld1 { v3.b }[4], [x13]
1605; CHECK-SD-NEXT:    add x13, sp, #32
1606; CHECK-SD-NEXT:    add x14, sp, #584
1607; CHECK-SD-NEXT:    ld1 { v1.b }[10], [x11]
1608; CHECK-SD-NEXT:    ld1 { v4.b }[7], [x14]
1609; CHECK-SD-NEXT:    add x11, sp, #312
1610; CHECK-SD-NEXT:    add x14, sp, #40
1611; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x11]
1612; CHECK-SD-NEXT:    add x11, sp, #592
1613; CHECK-SD-NEXT:    ld1 { v0.b }[8], [x12]
1614; CHECK-SD-NEXT:    add x12, sp, #24
1615; CHECK-SD-NEXT:    add x16, sp, #248
1616; CHECK-SD-NEXT:    ld1 { v1.b }[11], [x10]
1617; CHECK-SD-NEXT:    ld1 { v4.b }[8], [x11]
1618; CHECK-SD-NEXT:    add x11, sp, #400
1619; CHECK-SD-NEXT:    add x9, sp, #256
1620; CHECK-SD-NEXT:    add x8, sp, #264
1621; CHECK-SD-NEXT:    add x10, sp, #72
1622; CHECK-SD-NEXT:    ld1 { v0.b }[9], [x12]
1623; CHECK-SD-NEXT:    add x12, sp, #392
1624; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
1625; CHECK-SD-NEXT:    ld1 { v5.b }[7], [x12]
1626; CHECK-SD-NEXT:    add x12, sp, #48
1627; CHECK-SD-NEXT:    ld1 { v1.b }[12], [x15]
1628; CHECK-SD-NEXT:    add x15, sp, #120
1629; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
1630; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
1631; CHECK-SD-NEXT:    ld1 { v0.b }[10], [x13]
1632; CHECK-SD-NEXT:    ld1 { v3.b }[5], [x15]
1633; CHECK-SD-NEXT:    add x15, sp, #408
1634; CHECK-SD-NEXT:    ld1 { v5.b }[8], [x11]
1635; CHECK-SD-NEXT:    add x13, sp, #56
1636; CHECK-SD-NEXT:    ld1 { v1.b }[13], [x16]
1637; CHECK-SD-NEXT:    add x11, sp, #64
1638; CHECK-SD-NEXT:    add x16, sp, #616
1639; CHECK-SD-NEXT:    movi v19.2d, #0000000000000000
1640; CHECK-SD-NEXT:    ld1 { v0.b }[11], [x14]
1641; CHECK-SD-NEXT:    add x14, sp, #600
1642; CHECK-SD-NEXT:    ld1 { v4.b }[9], [x14]
1643; CHECK-SD-NEXT:    ld1 { v5.b }[9], [x15]
1644; CHECK-SD-NEXT:    add x15, sp, #608
1645; CHECK-SD-NEXT:    ld1 { v1.b }[14], [x9]
1646; CHECK-SD-NEXT:    add x9, sp, #488
1647; CHECK-SD-NEXT:    add x14, sp, #320
1648; CHECK-SD-NEXT:    ld1 { v0.b }[12], [x12]
1649; CHECK-SD-NEXT:    ld1 { v7.b }[3], [x9]
1650; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x14]
1651; CHECK-SD-NEXT:    ld1 { v4.b }[10], [x15]
1652; CHECK-SD-NEXT:    add x14, sp, #624
1653; CHECK-SD-NEXT:    add x9, sp, #688
1654; CHECK-SD-NEXT:    ld1 { v1.b }[15], [x8]
1655; CHECK-SD-NEXT:    add x8, sp, #432
1656; CHECK-SD-NEXT:    add x12, sp, #328
1657; CHECK-SD-NEXT:    ld1 { v0.b }[13], [x13]
1658; CHECK-SD-NEXT:    add x13, sp, #416
1659; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x12]
1660; CHECK-SD-NEXT:    ld1 { v5.b }[10], [x13]
1661; CHECK-SD-NEXT:    ld1 { v4.b }[11], [x16]
1662; CHECK-SD-NEXT:    add x16, sp, #680
1663; CHECK-SD-NEXT:    ld1 { v6.b }[3], [x16]
1664; CHECK-SD-NEXT:    add x13, sp, #632
1665; CHECK-SD-NEXT:    add x12, sp, #504
1666; CHECK-SD-NEXT:    ld1 { v0.b }[14], [x11]
1667; CHECK-SD-NEXT:    add x11, sp, #424
1668; CHECK-SD-NEXT:    add x15, sp, #128
1669; CHECK-SD-NEXT:    ld1 { v5.b }[11], [x11]
1670; CHECK-SD-NEXT:    ld1 { v4.b }[12], [x14]
1671; CHECK-SD-NEXT:    add x11, sp, #696
1672; CHECK-SD-NEXT:    ld1 { v6.b }[4], [x9]
1673; CHECK-SD-NEXT:    ld1 { v3.b }[6], [x15]
1674; CHECK-SD-NEXT:    add x9, sp, #640
1675; CHECK-SD-NEXT:    ld1 { v0.b }[15], [x10]
1676; CHECK-SD-NEXT:    add x10, sp, #496
1677; CHECK-SD-NEXT:    ld1 { v5.b }[12], [x8]
1678; CHECK-SD-NEXT:    ld1 { v7.b }[4], [x10]
1679; CHECK-SD-NEXT:    ld1 { v4.b }[13], [x13]
1680; CHECK-SD-NEXT:    add x10, sp, #440
1681; CHECK-SD-NEXT:    ld1 { v6.b }[5], [x11]
1682; CHECK-SD-NEXT:    add x11, sp, #512
1683; CHECK-SD-NEXT:    add x8, sp, #136
1684; CHECK-SD-NEXT:    sdot v17.4s, v0.16b, v1.16b
1685; CHECK-SD-NEXT:    ld1 { v5.b }[13], [x10]
1686; CHECK-SD-NEXT:    ld1 { v7.b }[5], [x12]
1687; CHECK-SD-NEXT:    ld1 { v4.b }[14], [x9]
1688; CHECK-SD-NEXT:    add x9, sp, #448
1689; CHECK-SD-NEXT:    add x10, sp, #704
1690; CHECK-SD-NEXT:    ld1 { v3.b }[7], [x8]
1691; CHECK-SD-NEXT:    ld1 { v6.b }[6], [x10]
1692; CHECK-SD-NEXT:    add x8, sp, #648
1693; CHECK-SD-NEXT:    add x10, sp, #520
1694; CHECK-SD-NEXT:    ld1 { v5.b }[14], [x9]
1695; CHECK-SD-NEXT:    ld1 { v7.b }[6], [x11]
1696; CHECK-SD-NEXT:    ld1 { v4.b }[15], [x8]
1697; CHECK-SD-NEXT:    add x8, sp, #456
1698; CHECK-SD-NEXT:    add x9, sp, #712
1699; CHECK-SD-NEXT:    sdot v19.2s, v3.8b, v2.8b
1700; CHECK-SD-NEXT:    ld1 { v6.b }[7], [x9]
1701; CHECK-SD-NEXT:    addv s0, v17.4s
1702; CHECK-SD-NEXT:    ld1 { v5.b }[15], [x8]
1703; CHECK-SD-NEXT:    ld1 { v7.b }[7], [x10]
1704; CHECK-SD-NEXT:    addp v1.2s, v19.2s, v19.2s
1705; CHECK-SD-NEXT:    fmov w8, s0
1706; CHECK-SD-NEXT:    sdot v16.4s, v5.16b, v4.16b
1707; CHECK-SD-NEXT:    sdot v18.2s, v7.8b, v6.8b
1708; CHECK-SD-NEXT:    fmov w9, s1
1709; CHECK-SD-NEXT:    addv s2, v16.4s
1710; CHECK-SD-NEXT:    addp v3.2s, v18.2s, v18.2s
1711; CHECK-SD-NEXT:    add w8, w8, w9
1712; CHECK-SD-NEXT:    fmov w10, s2
1713; CHECK-SD-NEXT:    fmov w11, s3
1714; CHECK-SD-NEXT:    add w9, w10, w11
1715; CHECK-SD-NEXT:    add w0, w8, w9
1716; CHECK-SD-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1717; CHECK-SD-NEXT:    ret
1718;
1719; CHECK-GI-LABEL: test_sdot_v24i8_double:
1720; CHECK-GI:       // %bb.0: // %entry
1721; CHECK-GI-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1722; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
1723; CHECK-GI-NEXT:    .cfi_offset w29, -16
1724; CHECK-GI-NEXT:    ldr w8, [sp, #80]
1725; CHECK-GI-NEXT:    ldr w9, [sp, #88]
1726; CHECK-GI-NEXT:    fmov s1, w0
1727; CHECK-GI-NEXT:    ldr w11, [sp, #336]
1728; CHECK-GI-NEXT:    ldr w10, [sp, #280]
1729; CHECK-GI-NEXT:    movi v16.2d, #0000000000000000
1730; CHECK-GI-NEXT:    fmov s0, w8
1731; CHECK-GI-NEXT:    ldr w8, [sp, #96]
1732; CHECK-GI-NEXT:    ldr w12, [sp, #152]
1733; CHECK-GI-NEXT:    mov v1.b[1], w1
1734; CHECK-GI-NEXT:    fmov s4, w11
1735; CHECK-GI-NEXT:    ldr w11, [sp, #584]
1736; CHECK-GI-NEXT:    movi v17.2d, #0000000000000000
1737; CHECK-GI-NEXT:    movi v18.2d, #0000000000000000
1738; CHECK-GI-NEXT:    movi v19.2d, #0000000000000000
1739; CHECK-GI-NEXT:    mov v0.b[1], w9
1740; CHECK-GI-NEXT:    ldr w9, [sp, #272]
1741; CHECK-GI-NEXT:    fmov s2, w9
1742; CHECK-GI-NEXT:    ldr w9, [sp, #144]
1743; CHECK-GI-NEXT:    mov v1.b[2], w2
1744; CHECK-GI-NEXT:    mov v0.b[2], w8
1745; CHECK-GI-NEXT:    ldr w8, [sp, #528]
1746; CHECK-GI-NEXT:    fmov s3, w9
1747; CHECK-GI-NEXT:    mov v2.b[1], w10
1748; CHECK-GI-NEXT:    ldr w9, [sp, #344]
1749; CHECK-GI-NEXT:    ldr w10, [sp, #536]
1750; CHECK-GI-NEXT:    fmov s5, w8
1751; CHECK-GI-NEXT:    ldr w8, [sp, #288]
1752; CHECK-GI-NEXT:    mov v1.b[3], w3
1753; CHECK-GI-NEXT:    mov v3.b[1], w12
1754; CHECK-GI-NEXT:    mov v4.b[1], w9
1755; CHECK-GI-NEXT:    ldr w9, [sp, #160]
1756; CHECK-GI-NEXT:    mov v5.b[1], w10
1757; CHECK-GI-NEXT:    mov v2.b[2], w8
1758; CHECK-GI-NEXT:    ldr w8, [sp, #104]
1759; CHECK-GI-NEXT:    ldr w10, [sp, #352]
1760; CHECK-GI-NEXT:    mov v1.b[4], w4
1761; CHECK-GI-NEXT:    mov v3.b[2], w9
1762; CHECK-GI-NEXT:    ldr w9, [sp, #544]
1763; CHECK-GI-NEXT:    mov v0.b[3], w8
1764; CHECK-GI-NEXT:    ldr w8, [sp, #296]
1765; CHECK-GI-NEXT:    mov v4.b[2], w10
1766; CHECK-GI-NEXT:    ldr w10, [sp, #360]
1767; CHECK-GI-NEXT:    mov v5.b[2], w9
1768; CHECK-GI-NEXT:    ldr w9, [sp, #168]
1769; CHECK-GI-NEXT:    mov v2.b[3], w8
1770; CHECK-GI-NEXT:    ldr w8, [sp, #112]
1771; CHECK-GI-NEXT:    mov v1.b[5], w5
1772; CHECK-GI-NEXT:    mov v3.b[3], w9
1773; CHECK-GI-NEXT:    ldr w9, [sp, #552]
1774; CHECK-GI-NEXT:    mov v0.b[4], w8
1775; CHECK-GI-NEXT:    ldr w8, [sp, #304]
1776; CHECK-GI-NEXT:    mov v4.b[3], w10
1777; CHECK-GI-NEXT:    mov v5.b[3], w9
1778; CHECK-GI-NEXT:    ldr w9, [sp, #176]
1779; CHECK-GI-NEXT:    ldr w10, [sp, #368]
1780; CHECK-GI-NEXT:    mov v2.b[4], w8
1781; CHECK-GI-NEXT:    ldr w8, [sp, #120]
1782; CHECK-GI-NEXT:    mov v1.b[6], w6
1783; CHECK-GI-NEXT:    mov v3.b[4], w9
1784; CHECK-GI-NEXT:    ldr w9, [sp, #560]
1785; CHECK-GI-NEXT:    mov v0.b[5], w8
1786; CHECK-GI-NEXT:    ldr w8, [sp, #312]
1787; CHECK-GI-NEXT:    mov v4.b[4], w10
1788; CHECK-GI-NEXT:    mov v5.b[4], w9
1789; CHECK-GI-NEXT:    ldr w9, [sp, #184]
1790; CHECK-GI-NEXT:    ldr w10, [sp, #376]
1791; CHECK-GI-NEXT:    mov v2.b[5], w8
1792; CHECK-GI-NEXT:    ldr w8, [sp, #128]
1793; CHECK-GI-NEXT:    mov v1.b[7], w7
1794; CHECK-GI-NEXT:    mov v3.b[5], w9
1795; CHECK-GI-NEXT:    ldr w9, [sp, #568]
1796; CHECK-GI-NEXT:    mov v0.b[6], w8
1797; CHECK-GI-NEXT:    ldr w8, [sp, #320]
1798; CHECK-GI-NEXT:    mov v4.b[5], w10
1799; CHECK-GI-NEXT:    mov v5.b[5], w9
1800; CHECK-GI-NEXT:    ldr w9, [sp, #192]
1801; CHECK-GI-NEXT:    ldr w10, [sp, #384]
1802; CHECK-GI-NEXT:    mov v2.b[6], w8
1803; CHECK-GI-NEXT:    ldr w8, [sp, #136]
1804; CHECK-GI-NEXT:    mov v3.b[6], w9
1805; CHECK-GI-NEXT:    ldr w9, [sp, #576]
1806; CHECK-GI-NEXT:    mov v0.b[7], w8
1807; CHECK-GI-NEXT:    ldr w8, [sp, #328]
1808; CHECK-GI-NEXT:    mov v4.b[6], w10
1809; CHECK-GI-NEXT:    ldr w10, [sp, #200]
1810; CHECK-GI-NEXT:    mov v5.b[6], w9
1811; CHECK-GI-NEXT:    ldr w9, [sp, #392]
1812; CHECK-GI-NEXT:    mov v2.b[7], w8
1813; CHECK-GI-NEXT:    ldr w8, [sp, #464]
1814; CHECK-GI-NEXT:    mov v3.b[7], w10
1815; CHECK-GI-NEXT:    ldr w10, [sp, #16]
1816; CHECK-GI-NEXT:    fmov s6, w8
1817; CHECK-GI-NEXT:    ldr w8, [sp, #208]
1818; CHECK-GI-NEXT:    mov v4.b[7], w9
1819; CHECK-GI-NEXT:    mov v1.b[8], w10
1820; CHECK-GI-NEXT:    ldr w10, [sp, #656]
1821; CHECK-GI-NEXT:    ldr w9, [sp, #472]
1822; CHECK-GI-NEXT:    mov v5.b[7], w11
1823; CHECK-GI-NEXT:    ldr w11, [sp, #400]
1824; CHECK-GI-NEXT:    fmov d0, d0
1825; CHECK-GI-NEXT:    fmov s7, w10
1826; CHECK-GI-NEXT:    mov v6.b[1], w9
1827; CHECK-GI-NEXT:    ldr w9, [sp, #592]
1828; CHECK-GI-NEXT:    mov v3.b[8], w8
1829; CHECK-GI-NEXT:    ldr w10, [sp, #664]
1830; CHECK-GI-NEXT:    ldr w8, [sp, #24]
1831; CHECK-GI-NEXT:    mov v4.b[8], w11
1832; CHECK-GI-NEXT:    ldr w11, [sp, #216]
1833; CHECK-GI-NEXT:    fmov d2, d2
1834; CHECK-GI-NEXT:    mov v5.b[8], w9
1835; CHECK-GI-NEXT:    ldr w9, [sp, #480]
1836; CHECK-GI-NEXT:    mov v7.b[1], w10
1837; CHECK-GI-NEXT:    mov v1.b[9], w8
1838; CHECK-GI-NEXT:    ldr w8, [sp, #408]
1839; CHECK-GI-NEXT:    ldr w10, [sp, #600]
1840; CHECK-GI-NEXT:    mov v3.b[9], w11
1841; CHECK-GI-NEXT:    mov v6.b[2], w9
1842; CHECK-GI-NEXT:    ldr w9, [sp, #672]
1843; CHECK-GI-NEXT:    ldr w11, [sp, #32]
1844; CHECK-GI-NEXT:    mov v4.b[9], w8
1845; CHECK-GI-NEXT:    ldr w8, [sp, #224]
1846; CHECK-GI-NEXT:    mov v5.b[9], w10
1847; CHECK-GI-NEXT:    ldr w10, [sp, #488]
1848; CHECK-GI-NEXT:    mov v7.b[2], w9
1849; CHECK-GI-NEXT:    mov v1.b[10], w11
1850; CHECK-GI-NEXT:    ldr w9, [sp, #416]
1851; CHECK-GI-NEXT:    ldr w11, [sp, #608]
1852; CHECK-GI-NEXT:    mov v3.b[10], w8
1853; CHECK-GI-NEXT:    mov v6.b[3], w10
1854; CHECK-GI-NEXT:    ldr w10, [sp, #680]
1855; CHECK-GI-NEXT:    ldr w8, [sp, #40]
1856; CHECK-GI-NEXT:    mov v4.b[10], w9
1857; CHECK-GI-NEXT:    ldr w9, [sp, #232]
1858; CHECK-GI-NEXT:    mov v5.b[10], w11
1859; CHECK-GI-NEXT:    ldr w11, [sp, #496]
1860; CHECK-GI-NEXT:    mov v7.b[3], w10
1861; CHECK-GI-NEXT:    mov v1.b[11], w8
1862; CHECK-GI-NEXT:    ldr w8, [sp, #424]
1863; CHECK-GI-NEXT:    ldr w10, [sp, #616]
1864; CHECK-GI-NEXT:    mov v3.b[11], w9
1865; CHECK-GI-NEXT:    mov v6.b[4], w11
1866; CHECK-GI-NEXT:    ldr w11, [sp, #688]
1867; CHECK-GI-NEXT:    ldr w9, [sp, #48]
1868; CHECK-GI-NEXT:    mov v4.b[11], w8
1869; CHECK-GI-NEXT:    ldr w8, [sp, #240]
1870; CHECK-GI-NEXT:    mov v5.b[11], w10
1871; CHECK-GI-NEXT:    ldr w10, [sp, #504]
1872; CHECK-GI-NEXT:    mov v7.b[4], w11
1873; CHECK-GI-NEXT:    mov v1.b[12], w9
1874; CHECK-GI-NEXT:    ldr w9, [sp, #432]
1875; CHECK-GI-NEXT:    ldr w11, [sp, #624]
1876; CHECK-GI-NEXT:    mov v3.b[12], w8
1877; CHECK-GI-NEXT:    mov v6.b[5], w10
1878; CHECK-GI-NEXT:    ldr w10, [sp, #696]
1879; CHECK-GI-NEXT:    ldr w8, [sp, #56]
1880; CHECK-GI-NEXT:    mov v4.b[12], w9
1881; CHECK-GI-NEXT:    ldr w9, [sp, #248]
1882; CHECK-GI-NEXT:    mov v5.b[12], w11
1883; CHECK-GI-NEXT:    ldr w11, [sp, #512]
1884; CHECK-GI-NEXT:    mov v7.b[5], w10
1885; CHECK-GI-NEXT:    mov v1.b[13], w8
1886; CHECK-GI-NEXT:    ldr w8, [sp, #440]
1887; CHECK-GI-NEXT:    ldr w10, [sp, #632]
1888; CHECK-GI-NEXT:    mov v3.b[13], w9
1889; CHECK-GI-NEXT:    mov v6.b[6], w11
1890; CHECK-GI-NEXT:    ldr w11, [sp, #704]
1891; CHECK-GI-NEXT:    ldr w9, [sp, #64]
1892; CHECK-GI-NEXT:    mov v4.b[13], w8
1893; CHECK-GI-NEXT:    ldr w8, [sp, #256]
1894; CHECK-GI-NEXT:    mov v5.b[13], w10
1895; CHECK-GI-NEXT:    ldr w10, [sp, #520]
1896; CHECK-GI-NEXT:    mov v7.b[6], w11
1897; CHECK-GI-NEXT:    mov v1.b[14], w9
1898; CHECK-GI-NEXT:    ldr w9, [sp, #448]
1899; CHECK-GI-NEXT:    ldr w11, [sp, #640]
1900; CHECK-GI-NEXT:    mov v3.b[14], w8
1901; CHECK-GI-NEXT:    mov v6.b[7], w10
1902; CHECK-GI-NEXT:    ldr w10, [sp, #712]
1903; CHECK-GI-NEXT:    ldr w8, [sp, #72]
1904; CHECK-GI-NEXT:    mov v4.b[14], w9
1905; CHECK-GI-NEXT:    ldr w9, [sp, #264]
1906; CHECK-GI-NEXT:    mov v5.b[14], w11
1907; CHECK-GI-NEXT:    mov v7.b[7], w10
1908; CHECK-GI-NEXT:    sdot v18.4s, v0.16b, v2.16b
1909; CHECK-GI-NEXT:    mov v1.b[15], w8
1910; CHECK-GI-NEXT:    ldr w8, [sp, #456]
1911; CHECK-GI-NEXT:    mov v3.b[15], w9
1912; CHECK-GI-NEXT:    ldr w9, [sp, #648]
1913; CHECK-GI-NEXT:    fmov d6, d6
1914; CHECK-GI-NEXT:    mov v4.b[15], w8
1915; CHECK-GI-NEXT:    mov v5.b[15], w9
1916; CHECK-GI-NEXT:    fmov d7, d7
1917; CHECK-GI-NEXT:    sdot v17.4s, v1.16b, v3.16b
1918; CHECK-GI-NEXT:    sdot v19.4s, v4.16b, v5.16b
1919; CHECK-GI-NEXT:    sdot v16.4s, v6.16b, v7.16b
1920; CHECK-GI-NEXT:    add v0.4s, v17.4s, v18.4s
1921; CHECK-GI-NEXT:    add v1.4s, v19.4s, v16.4s
1922; CHECK-GI-NEXT:    addv s0, v0.4s
1923; CHECK-GI-NEXT:    addv s1, v1.4s
1924; CHECK-GI-NEXT:    fmov w8, s0
1925; CHECK-GI-NEXT:    fmov w9, s1
1926; CHECK-GI-NEXT:    add w0, w8, w9
1927; CHECK-GI-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
1928; CHECK-GI-NEXT:    ret
1929entry:
1930  %az = sext <24 x i8> %a to <24 x i32>
1931  %bz = sext <24 x i8> %b to <24 x i32>
1932  %m1 = mul nuw nsw <24 x i32> %az, %bz
1933  %r1 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %m1)
1934  %cz = sext <24 x i8> %c to <24 x i32>
1935  %dz = sext <24 x i8> %d to <24 x i32>
1936  %m2 = mul nuw nsw <24 x i32> %cz, %dz
1937  %r2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %m2)
1938  %x = add i32 %r1, %r2
1939  ret i32 %x
1940}
1941
; test_sdot_v24i8_double_nomla: signed add-reduction of two vectors with no
; multiply. %a and %c are each sign-extended to <24 x i32> and summed with
; llvm.vector.reduce.add.v24i32; the two scalar sums are added and returned.
; %b and %d are part of the signature but are never read by the body.
; The SelectionDAG assertions expect sdot against movi #1 operands, using one
; 16-byte and one 8-byte sdot per reduced input. The GlobalISel assertions
; expect four 16-byte sdot instructions whose second operands are built from
; movi #1.
1942define i32 @test_sdot_v24i8_double_nomla(<24 x i8> %a, <24 x i8> %b, <24 x i8> %c, <24 x i8> %d) {
1943; CHECK-SD-LABEL: test_sdot_v24i8_double_nomla:
1944; CHECK-SD:       // %bb.0: // %entry
1945; CHECK-SD-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
1946; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
1947; CHECK-SD-NEXT:    .cfi_offset w29, -16
1948; CHECK-SD-NEXT:    fmov s0, w0
1949; CHECK-SD-NEXT:    ldr b1, [sp, #336]
1950; CHECK-SD-NEXT:    add x8, sp, #344
1951; CHECK-SD-NEXT:    add x9, sp, #400
1952; CHECK-SD-NEXT:    ldr b2, [sp, #80]
1953; CHECK-SD-NEXT:    ldr b3, [sp, #464]
1954; CHECK-SD-NEXT:    ld1 { v1.b }[1], [x8]
1955; CHECK-SD-NEXT:    add x8, sp, #352
1956; CHECK-SD-NEXT:    add x10, sp, #408
1957; CHECK-SD-NEXT:    mov v0.b[1], w1
1958; CHECK-SD-NEXT:    add x11, sp, #472
1959; CHECK-SD-NEXT:    add x12, sp, #480
1960; CHECK-SD-NEXT:    ld1 { v3.b }[1], [x11]
1961; CHECK-SD-NEXT:    add x11, sp, #416
1962; CHECK-SD-NEXT:    add x13, sp, #488
1963; CHECK-SD-NEXT:    ld1 { v1.b }[2], [x8]
1964; CHECK-SD-NEXT:    add x8, sp, #360
1965; CHECK-SD-NEXT:    add x14, sp, #496
1966; CHECK-SD-NEXT:    movi v4.16b, #1
1967; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
1968; CHECK-SD-NEXT:    movi v6.2d, #0000000000000000
1969; CHECK-SD-NEXT:    mov v0.b[2], w2
1970; CHECK-SD-NEXT:    ld1 { v3.b }[2], [x12]
1971; CHECK-SD-NEXT:    add x12, sp, #424
1972; CHECK-SD-NEXT:    ld1 { v1.b }[3], [x8]
1973; CHECK-SD-NEXT:    add x8, sp, #368
1974; CHECK-SD-NEXT:    movi v7.2d, #0000000000000000
1975; CHECK-SD-NEXT:    movi v16.8b, #1
1976; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
1977; CHECK-SD-NEXT:    ld1 { v3.b }[3], [x13]
1978; CHECK-SD-NEXT:    add x13, sp, #432
1979; CHECK-SD-NEXT:    mov v0.b[3], w3
1980; CHECK-SD-NEXT:    ld1 { v1.b }[4], [x8]
1981; CHECK-SD-NEXT:    add x8, sp, #376
1982; CHECK-SD-NEXT:    ld1 { v3.b }[4], [x14]
1983; CHECK-SD-NEXT:    ld1 { v1.b }[5], [x8]
1984; CHECK-SD-NEXT:    add x8, sp, #384
1985; CHECK-SD-NEXT:    mov v0.b[4], w4
1986; CHECK-SD-NEXT:    ld1 { v1.b }[6], [x8]
1987; CHECK-SD-NEXT:    add x8, sp, #392
1988; CHECK-SD-NEXT:    mov v0.b[5], w5
1989; CHECK-SD-NEXT:    ld1 { v1.b }[7], [x8]
1990; CHECK-SD-NEXT:    add x8, sp, #16
1991; CHECK-SD-NEXT:    mov v0.b[6], w6
1992; CHECK-SD-NEXT:    ld1 { v1.b }[8], [x9]
1993; CHECK-SD-NEXT:    add x9, sp, #88
1994; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x9]
1995; CHECK-SD-NEXT:    add x9, sp, #40
1996; CHECK-SD-NEXT:    ld1 { v1.b }[9], [x10]
1997; CHECK-SD-NEXT:    add x10, sp, #96
1998; CHECK-SD-NEXT:    mov v0.b[7], w7
1999; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x10]
2000; CHECK-SD-NEXT:    add x10, sp, #56
2001; CHECK-SD-NEXT:    ld1 { v1.b }[10], [x11]
2002; CHECK-SD-NEXT:    add x11, sp, #104
2003; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x11]
2004; CHECK-SD-NEXT:    add x11, sp, #72
2005; CHECK-SD-NEXT:    ld1 { v0.b }[8], [x8]
2006; CHECK-SD-NEXT:    add x8, sp, #24
2007; CHECK-SD-NEXT:    ld1 { v1.b }[11], [x12]
2008; CHECK-SD-NEXT:    add x12, sp, #112
2009; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x12]
2010; CHECK-SD-NEXT:    add x12, sp, #440
2011; CHECK-SD-NEXT:    ld1 { v0.b }[9], [x8]
2012; CHECK-SD-NEXT:    add x8, sp, #32
2013; CHECK-SD-NEXT:    ld1 { v1.b }[12], [x13]
2014; CHECK-SD-NEXT:    add x13, sp, #504
2015; CHECK-SD-NEXT:    ld1 { v3.b }[5], [x13]
2016; CHECK-SD-NEXT:    add x13, sp, #512
2017; CHECK-SD-NEXT:    ld1 { v0.b }[10], [x8]
2018; CHECK-SD-NEXT:    add x8, sp, #48
2019; CHECK-SD-NEXT:    ld1 { v1.b }[13], [x12]
2020; CHECK-SD-NEXT:    add x12, sp, #448
2021; CHECK-SD-NEXT:    ld1 { v3.b }[6], [x13]
2022; CHECK-SD-NEXT:    ld1 { v0.b }[11], [x9]
2023; CHECK-SD-NEXT:    add x9, sp, #64
2024; CHECK-SD-NEXT:    ld1 { v1.b }[14], [x12]
2025; CHECK-SD-NEXT:    ld1 { v0.b }[12], [x8]
2026; CHECK-SD-NEXT:    add x8, sp, #120
2027; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x8]
2028; CHECK-SD-NEXT:    add x8, sp, #128
2029; CHECK-SD-NEXT:    ld1 { v0.b }[13], [x10]
2030; CHECK-SD-NEXT:    add x10, sp, #136
2031; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x8]
2032; CHECK-SD-NEXT:    add x8, sp, #456
2033; CHECK-SD-NEXT:    ld1 { v1.b }[15], [x8]
2034; CHECK-SD-NEXT:    ld1 { v0.b }[14], [x9]
2035; CHECK-SD-NEXT:    add x9, sp, #520
2036; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x10]
2037; CHECK-SD-NEXT:    ld1 { v3.b }[7], [x9]
2038; CHECK-SD-NEXT:    sdot v5.4s, v1.16b, v4.16b
2039; CHECK-SD-NEXT:    ld1 { v0.b }[15], [x11]
2040; CHECK-SD-NEXT:    sdot v17.2s, v2.8b, v16.8b
2041; CHECK-SD-NEXT:    sdot v7.2s, v3.8b, v16.8b
2042; CHECK-SD-NEXT:    sdot v6.4s, v0.16b, v4.16b
2043; CHECK-SD-NEXT:    addv s3, v5.4s
2044; CHECK-SD-NEXT:    addp v1.2s, v17.2s, v17.2s
2045; CHECK-SD-NEXT:    addp v2.2s, v7.2s, v7.2s
2046; CHECK-SD-NEXT:    fmov w10, s3
2047; CHECK-SD-NEXT:    addv s0, v6.4s
2048; CHECK-SD-NEXT:    fmov w9, s1
2049; CHECK-SD-NEXT:    fmov w11, s2
2050; CHECK-SD-NEXT:    fmov w8, s0
2051; CHECK-SD-NEXT:    add w8, w8, w9
2052; CHECK-SD-NEXT:    add w9, w10, w11
2053; CHECK-SD-NEXT:    add w0, w8, w9
2054; CHECK-SD-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
2055; CHECK-SD-NEXT:    ret
2056;
2057; CHECK-GI-LABEL: test_sdot_v24i8_double_nomla:
2058; CHECK-GI:       // %bb.0: // %entry
2059; CHECK-GI-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
2060; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
2061; CHECK-GI-NEXT:    .cfi_offset w29, -16
2062; CHECK-GI-NEXT:    ldr w9, [sp, #336]
2063; CHECK-GI-NEXT:    ldr w8, [sp, #344]
2064; CHECK-GI-NEXT:    fmov s0, w0
2065; CHECK-GI-NEXT:    ldr w10, [sp, #16]
2066; CHECK-GI-NEXT:    ldr w11, [sp, #88]
2067; CHECK-GI-NEXT:    movi v4.8b, #1
2068; CHECK-GI-NEXT:    fmov s1, w9
2069; CHECK-GI-NEXT:    ldr w9, [sp, #464]
2070; CHECK-GI-NEXT:    ldr w12, [sp, #400]
2071; CHECK-GI-NEXT:    mov v0.b[1], w1
2072; CHECK-GI-NEXT:    movi v5.8b, #1
2073; CHECK-GI-NEXT:    movi v6.8b, #1
2074; CHECK-GI-NEXT:    fmov s2, w9
2075; CHECK-GI-NEXT:    ldr w9, [sp, #96]
2076; CHECK-GI-NEXT:    movi v7.2d, #0000000000000000
2077; CHECK-GI-NEXT:    mov v1.b[1], w8
2078; CHECK-GI-NEXT:    ldr w8, [sp, #352]
2079; CHECK-GI-NEXT:    movi v16.2d, #0000000000000000
2080; CHECK-GI-NEXT:    movi v17.2d, #0000000000000000
2081; CHECK-GI-NEXT:    movi v18.2d, #0000000000000000
2082; CHECK-GI-NEXT:    mov v0.b[2], w2
2083; CHECK-GI-NEXT:    mov v5.d[1], v4.d[0]
2084; CHECK-GI-NEXT:    mov v6.d[1], v4.d[0]
2085; CHECK-GI-NEXT:    mov v1.b[2], w8
2086; CHECK-GI-NEXT:    ldr w8, [sp, #360]
2087; CHECK-GI-NEXT:    mov v0.b[3], w3
2088; CHECK-GI-NEXT:    mov v1.b[3], w8
2089; CHECK-GI-NEXT:    ldr w8, [sp, #368]
2090; CHECK-GI-NEXT:    mov v0.b[4], w4
2091; CHECK-GI-NEXT:    mov v1.b[4], w8
2092; CHECK-GI-NEXT:    ldr w8, [sp, #376]
2093; CHECK-GI-NEXT:    mov v0.b[5], w5
2094; CHECK-GI-NEXT:    mov v1.b[5], w8
2095; CHECK-GI-NEXT:    ldr w8, [sp, #384]
2096; CHECK-GI-NEXT:    mov v0.b[6], w6
2097; CHECK-GI-NEXT:    mov v1.b[6], w8
2098; CHECK-GI-NEXT:    ldr w8, [sp, #392]
2099; CHECK-GI-NEXT:    mov v0.b[7], w7
2100; CHECK-GI-NEXT:    mov v1.b[7], w8
2101; CHECK-GI-NEXT:    ldr w8, [sp, #80]
2102; CHECK-GI-NEXT:    fmov s3, w8
2103; CHECK-GI-NEXT:    ldr w8, [sp, #472]
2104; CHECK-GI-NEXT:    mov v0.b[8], w10
2105; CHECK-GI-NEXT:    ldr w10, [sp, #408]
2106; CHECK-GI-NEXT:    mov v1.b[8], w12
2107; CHECK-GI-NEXT:    mov v2.b[1], w8
2108; CHECK-GI-NEXT:    ldr w8, [sp, #24]
2109; CHECK-GI-NEXT:    mov v3.b[1], w11
2110; CHECK-GI-NEXT:    ldr w11, [sp, #480]
2111; CHECK-GI-NEXT:    mov v0.b[9], w8
2112; CHECK-GI-NEXT:    ldr w8, [sp, #32]
2113; CHECK-GI-NEXT:    mov v1.b[9], w10
2114; CHECK-GI-NEXT:    mov v2.b[2], w11
2115; CHECK-GI-NEXT:    ldr w10, [sp, #416]
2116; CHECK-GI-NEXT:    mov v3.b[2], w9
2117; CHECK-GI-NEXT:    ldr w9, [sp, #104]
2118; CHECK-GI-NEXT:    ldr w11, [sp, #488]
2119; CHECK-GI-NEXT:    mov v0.b[10], w8
2120; CHECK-GI-NEXT:    ldr w8, [sp, #40]
2121; CHECK-GI-NEXT:    mov v1.b[10], w10
2122; CHECK-GI-NEXT:    mov v2.b[3], w11
2123; CHECK-GI-NEXT:    ldr w10, [sp, #424]
2124; CHECK-GI-NEXT:    mov v3.b[3], w9
2125; CHECK-GI-NEXT:    ldr w9, [sp, #112]
2126; CHECK-GI-NEXT:    ldr w11, [sp, #496]
2127; CHECK-GI-NEXT:    mov v0.b[11], w8
2128; CHECK-GI-NEXT:    ldr w8, [sp, #48]
2129; CHECK-GI-NEXT:    mov v1.b[11], w10
2130; CHECK-GI-NEXT:    mov v2.b[4], w11
2131; CHECK-GI-NEXT:    ldr w10, [sp, #432]
2132; CHECK-GI-NEXT:    mov v3.b[4], w9
2133; CHECK-GI-NEXT:    ldr w9, [sp, #120]
2134; CHECK-GI-NEXT:    ldr w11, [sp, #504]
2135; CHECK-GI-NEXT:    mov v0.b[12], w8
2136; CHECK-GI-NEXT:    ldr w8, [sp, #56]
2137; CHECK-GI-NEXT:    mov v1.b[12], w10
2138; CHECK-GI-NEXT:    mov v2.b[5], w11
2139; CHECK-GI-NEXT:    ldr w10, [sp, #440]
2140; CHECK-GI-NEXT:    mov v3.b[5], w9
2141; CHECK-GI-NEXT:    ldr w9, [sp, #128]
2142; CHECK-GI-NEXT:    ldr w11, [sp, #512]
2143; CHECK-GI-NEXT:    mov v0.b[13], w8
2144; CHECK-GI-NEXT:    ldr w8, [sp, #64]
2145; CHECK-GI-NEXT:    mov v1.b[13], w10
2146; CHECK-GI-NEXT:    mov v2.b[6], w11
2147; CHECK-GI-NEXT:    ldr w10, [sp, #448]
2148; CHECK-GI-NEXT:    mov v3.b[6], w9
2149; CHECK-GI-NEXT:    ldr w9, [sp, #136]
2150; CHECK-GI-NEXT:    ldr w11, [sp, #520]
2151; CHECK-GI-NEXT:    mov v0.b[14], w8
2152; CHECK-GI-NEXT:    ldr w8, [sp, #72]
2153; CHECK-GI-NEXT:    mov v1.b[14], w10
2154; CHECK-GI-NEXT:    mov v2.b[7], w11
2155; CHECK-GI-NEXT:    mov v3.b[7], w9
2156; CHECK-GI-NEXT:    ldr w9, [sp, #456]
2157; CHECK-GI-NEXT:    mov v0.b[15], w8
2158; CHECK-GI-NEXT:    mov v1.b[15], w9
2159; CHECK-GI-NEXT:    fmov d2, d2
2160; CHECK-GI-NEXT:    fmov d3, d3
2161; CHECK-GI-NEXT:    sdot v16.4s, v0.16b, v5.16b
2162; CHECK-GI-NEXT:    sdot v18.4s, v1.16b, v6.16b
2163; CHECK-GI-NEXT:    sdot v7.4s, v2.16b, v4.16b
2164; CHECK-GI-NEXT:    sdot v17.4s, v3.16b, v4.16b
2165; CHECK-GI-NEXT:    add v1.4s, v18.4s, v7.4s
2166; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
2167; CHECK-GI-NEXT:    addv s1, v1.4s
2168; CHECK-GI-NEXT:    addv s0, v0.4s
2169; CHECK-GI-NEXT:    fmov w9, s1
2170; CHECK-GI-NEXT:    fmov w8, s0
2171; CHECK-GI-NEXT:    add w0, w8, w9
2172; CHECK-GI-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
2173; CHECK-GI-NEXT:    ret
2174entry:
2175  %az = sext <24 x i8> %a to <24 x i32>
2176  %r1 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %az)
2177  %cz = sext <24 x i8> %c to <24 x i32>
2178  %r2 = call i32 @llvm.vector.reduce.add.v24i32(<24 x i32> %cz)
2179  %x = add i32 %r1, %r2
2180  ret i32 %x
2181}
2182
; test_udot_v25i8: unsigned multiply-and-reduce over an odd-sized vector.
; Loads <25 x i8> from %a and %b, zero-extends both to <25 x i32>, multiplies
; them, reduces with llvm.vector.reduce.add.v25i32 and adds %sum to the result.
; The assertions use the prefix shared by both RUN lines and expect a widening
; umull/umull2 + uaddl/uaddw sequence finished by addv; no udot instruction
; appears in the expected output for this 25-element case.
2183define i32 @test_udot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2184; CHECK-LABEL: test_udot_v25i8:
2185; CHECK:       // %bb.0: // %entry
2186; CHECK-NEXT:    ldp q3, q0, [x1]
2187; CHECK-NEXT:    movi v5.2d, #0000000000000000
2188; CHECK-NEXT:    ldp q2, q1, [x0]
2189; CHECK-NEXT:    umull2 v4.8h, v0.16b, v1.16b
2190; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
2191; CHECK-NEXT:    umull v1.8h, v3.8b, v2.8b
2192; CHECK-NEXT:    umull2 v2.8h, v3.16b, v2.16b
2193; CHECK-NEXT:    ushll v3.4s, v4.4h, #0
2194; CHECK-NEXT:    uaddl2 v4.4s, v1.8h, v0.8h
2195; CHECK-NEXT:    uaddl v0.4s, v1.4h, v0.4h
2196; CHECK-NEXT:    mov v5.s[0], v3.s[0]
2197; CHECK-NEXT:    uaddw2 v1.4s, v4.4s, v2.8h
2198; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
2199; CHECK-NEXT:    uaddw v2.4s, v5.4s, v2.4h
2200; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
2201; CHECK-NEXT:    addv s0, v0.4s
2202; CHECK-NEXT:    fmov w8, s0
2203; CHECK-NEXT:    add w0, w8, w2
2204; CHECK-NEXT:    ret
2205entry:
2206  %0 = load <25 x i8>, ptr %a
2207  %1 = zext <25 x i8> %0 to <25 x i32>
2208  %2 = load <25 x i8>, ptr %b
2209  %3 = zext <25 x i8> %2 to <25 x i32>
2210  %4 = mul nuw nsw <25 x i32> %3, %1
2211  %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4)
2212  %op.extra = add i32 %5, %sum
2213  ret i32 %op.extra
2214}
2215
; test_udot_v25i8_nomla: unsigned add-reduction with no multiply.
; Loads <25 x i8> from %a1, zero-extends it to <25 x i32> and returns the
; llvm.vector.reduce.add.v25i32 of the widened vector.
; The shared assertions expect ushll/ushll2 widening followed by
; uaddl/uaddw accumulation and a final addv; no udot instruction is expected.
2216define i32 @test_udot_v25i8_nomla(ptr nocapture readonly %a1) {
2217; CHECK-LABEL: test_udot_v25i8_nomla:
2218; CHECK:       // %bb.0: // %entry
2219; CHECK-NEXT:    ldp q2, q1, [x0]
2220; CHECK-NEXT:    movi v0.2d, #0000000000000000
2221; CHECK-NEXT:    ushll2 v3.8h, v1.16b, #0
2222; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
2223; CHECK-NEXT:    ushll v4.8h, v2.8b, #0
2224; CHECK-NEXT:    ushll2 v2.8h, v2.16b, #0
2225; CHECK-NEXT:    ushll v3.4s, v3.4h, #0
2226; CHECK-NEXT:    uaddl2 v5.4s, v4.8h, v1.8h
2227; CHECK-NEXT:    uaddl v1.4s, v4.4h, v1.4h
2228; CHECK-NEXT:    mov v0.s[0], v3.s[0]
2229; CHECK-NEXT:    uaddw2 v3.4s, v5.4s, v2.8h
2230; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
2231; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
2232; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
2233; CHECK-NEXT:    addv s0, v0.4s
2234; CHECK-NEXT:    fmov w0, s0
2235; CHECK-NEXT:    ret
2236entry:
2237  %0 = load <25 x i8>, ptr %a1
2238  %1 = zext <25 x i8> %0 to <25 x i32>
2239  %2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %1)
2240  ret i32 %2
2241}
; test_sdot_v25i8: signed counterpart of the 25-element multiply-and-reduce.
; Loads <25 x i8> from %a and %b, sign-extends both to <25 x i32>, multiplies
; them, reduces with llvm.vector.reduce.add.v25i32 and adds %sum to the result.
; The shared assertions expect a widening smull/smull2 + saddl/saddw sequence
; finished by addv; no sdot instruction appears in the expected output.
2242define i32 @test_sdot_v25i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2243; CHECK-LABEL: test_sdot_v25i8:
2244; CHECK:       // %bb.0: // %entry
2245; CHECK-NEXT:    ldp q3, q0, [x1]
2246; CHECK-NEXT:    movi v5.2d, #0000000000000000
2247; CHECK-NEXT:    ldp q2, q1, [x0]
2248; CHECK-NEXT:    smull2 v4.8h, v0.16b, v1.16b
2249; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
2250; CHECK-NEXT:    smull v1.8h, v3.8b, v2.8b
2251; CHECK-NEXT:    smull2 v2.8h, v3.16b, v2.16b
2252; CHECK-NEXT:    sshll v3.4s, v4.4h, #0
2253; CHECK-NEXT:    saddl2 v4.4s, v1.8h, v0.8h
2254; CHECK-NEXT:    saddl v0.4s, v1.4h, v0.4h
2255; CHECK-NEXT:    mov v5.s[0], v3.s[0]
2256; CHECK-NEXT:    saddw2 v1.4s, v4.4s, v2.8h
2257; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
2258; CHECK-NEXT:    saddw v2.4s, v5.4s, v2.4h
2259; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
2260; CHECK-NEXT:    addv s0, v0.4s
2261; CHECK-NEXT:    fmov w8, s0
2262; CHECK-NEXT:    add w0, w8, w2
2263; CHECK-NEXT:    ret
2264entry:
2265  %0 = load <25 x i8>, ptr %a
2266  %1 = sext <25 x i8> %0 to <25 x i32>
2267  %2 = load <25 x i8>, ptr %b
2268  %3 = sext <25 x i8> %2 to <25 x i32>
2269  %4 = mul nsw <25 x i32> %3, %1
2270  %5 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %4)
2271  %op.extra = add nsw i32 %5, %sum
2272  ret i32 %op.extra
2273}
2274
2275define i32 @test_sdot_v25i8_double(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
2276; CHECK-LABEL: test_sdot_v25i8_double:
2277; CHECK:       // %bb.0: // %entry
2278; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
2279; CHECK-NEXT:    .cfi_def_cfa_offset 16
2280; CHECK-NEXT:    .cfi_offset w29, -16
2281; CHECK-NEXT:    ldr b0, [sp, #216]
2282; CHECK-NEXT:    add x8, sp, #224
2283; CHECK-NEXT:    ldr b1, [sp, #16]
2284; CHECK-NEXT:    ldr b2, [sp, #280]
2285; CHECK-NEXT:    add x9, sp, #240
2286; CHECK-NEXT:    ldr b4, [sp, #80]
2287; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
2288; CHECK-NEXT:    add x8, sp, #24
2289; CHECK-NEXT:    add x10, sp, #48
2290; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
2291; CHECK-NEXT:    add x8, sp, #232
2292; CHECK-NEXT:    add x11, sp, #96
2293; CHECK-NEXT:    ldr b5, [sp, #152]
2294; CHECK-NEXT:    add x12, sp, #168
2295; CHECK-NEXT:    ldr b6, [sp, #616]
2296; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
2297; CHECK-NEXT:    add x8, sp, #32
2298; CHECK-NEXT:    fmov s3, w0
2299; CHECK-NEXT:    ld1 { v1.b }[2], [x8]
2300; CHECK-NEXT:    add x8, sp, #288
2301; CHECK-NEXT:    ldr b7, [sp, #416]
2302; CHECK-NEXT:    ld1 { v2.b }[1], [x8]
2303; CHECK-NEXT:    add x8, sp, #40
2304; CHECK-NEXT:    ldr b22, [sp, #744]
2305; CHECK-NEXT:    ld1 { v0.b }[3], [x9]
2306; CHECK-NEXT:    add x9, sp, #248
2307; CHECK-NEXT:    mov v3.b[1], w1
2308; CHECK-NEXT:    ld1 { v1.b }[3], [x8]
2309; CHECK-NEXT:    add x8, sp, #88
2310; CHECK-NEXT:    ldr b23, [sp, #544]
2311; CHECK-NEXT:    ld1 { v4.b }[1], [x8]
2312; CHECK-NEXT:    add x8, sp, #256
2313; CHECK-NEXT:    ldr b19, [sp, #680]
2314; CHECK-NEXT:    ld1 { v0.b }[4], [x9]
2315; CHECK-NEXT:    add x9, sp, #296
2316; CHECK-NEXT:    ldr b20, [sp, #480]
2317; CHECK-NEXT:    ld1 { v1.b }[4], [x10]
2318; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
2319; CHECK-NEXT:    add x10, sp, #160
2320; CHECK-NEXT:    ld1 { v4.b }[2], [x11]
2321; CHECK-NEXT:    add x11, sp, #304
2322; CHECK-NEXT:    ld1 { v5.b }[1], [x10]
2323; CHECK-NEXT:    ld1 { v0.b }[5], [x8]
2324; CHECK-NEXT:    add x8, sp, #56
2325; CHECK-NEXT:    add x10, sp, #264
2326; CHECK-NEXT:    ld1 { v1.b }[5], [x8]
2327; CHECK-NEXT:    add x8, sp, #64
2328; CHECK-NEXT:    ld1 { v2.b }[3], [x11]
2329; CHECK-NEXT:    add x9, sp, #272
2330; CHECK-NEXT:    ld1 { v5.b }[2], [x12]
2331; CHECK-NEXT:    add x11, sp, #72
2332; CHECK-NEXT:    ld1 { v0.b }[6], [x10]
2333; CHECK-NEXT:    add x10, sp, #312
2334; CHECK-NEXT:    mov v3.b[2], w2
2335; CHECK-NEXT:    ld1 { v1.b }[6], [x8]
2336; CHECK-NEXT:    add x8, sp, #104
2337; CHECK-NEXT:    ld1 { v2.b }[4], [x10]
2338; CHECK-NEXT:    ld1 { v4.b }[3], [x8]
2339; CHECK-NEXT:    add x8, sp, #112
2340; CHECK-NEXT:    add x10, sp, #128
2341; CHECK-NEXT:    ld1 { v0.b }[7], [x9]
2342; CHECK-NEXT:    add x9, sp, #320
2343; CHECK-NEXT:    ldr b21, [sp, #552]
2344; CHECK-NEXT:    ld1 { v2.b }[5], [x9]
2345; CHECK-NEXT:    add x9, sp, #176
2346; CHECK-NEXT:    ld1 { v1.b }[7], [x11]
2347; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
2348; CHECK-NEXT:    add x8, sp, #624
2349; CHECK-NEXT:    ld1 { v5.b }[3], [x9]
2350; CHECK-NEXT:    ld1 { v6.b }[1], [x8]
2351; CHECK-NEXT:    add x8, sp, #120
2352; CHECK-NEXT:    add x9, sp, #328
2353; CHECK-NEXT:    ld1 { v2.b }[6], [x9]
2354; CHECK-NEXT:    add x9, sp, #184
2355; CHECK-NEXT:    add x11, sp, #192
2356; CHECK-NEXT:    ld1 { v4.b }[5], [x8]
2357; CHECK-NEXT:    add x8, sp, #632
2358; CHECK-NEXT:    ld1 { v5.b }[4], [x9]
2359; CHECK-NEXT:    ld1 { v6.b }[2], [x8]
2360; CHECK-NEXT:    add x9, sp, #640
2361; CHECK-NEXT:    add x8, sp, #336
2362; CHECK-NEXT:    ld1 { v2.b }[7], [x8]
2363; CHECK-NEXT:    add x8, sp, #656
2364; CHECK-NEXT:    smull v23.8h, v23.8b, v22.8b
2365; CHECK-NEXT:    ld1 { v5.b }[5], [x11]
2366; CHECK-NEXT:    add x11, sp, #648
2367; CHECK-NEXT:    ld1 { v4.b }[6], [x10]
2368; CHECK-NEXT:    ld1 { v6.b }[3], [x9]
2369; CHECK-NEXT:    add x9, sp, #200
2370; CHECK-NEXT:    add x10, sp, #136
2371; CHECK-NEXT:    ldr b22, [sp, #352]
2372; CHECK-NEXT:    add x12, sp, #360
2373; CHECK-NEXT:    mov v3.b[3], w3
2374; CHECK-NEXT:    ld1 { v5.b }[6], [x9]
2375; CHECK-NEXT:    add x9, sp, #208
2376; CHECK-NEXT:    ld1 { v4.b }[7], [x10]
2377; CHECK-NEXT:    ld1 { v6.b }[4], [x11]
2378; CHECK-NEXT:    add x11, sp, #424
2379; CHECK-NEXT:    add x10, sp, #488
2380; CHECK-NEXT:    ld1 { v7.b }[1], [x11]
2381; CHECK-NEXT:    add x11, sp, #560
2382; CHECK-NEXT:    ld1 { v20.b }[1], [x10]
2383; CHECK-NEXT:    ld1 { v5.b }[7], [x9]
2384; CHECK-NEXT:    add x9, sp, #440
2385; CHECK-NEXT:    ld1 { v21.b }[1], [x11]
2386; CHECK-NEXT:    ld1 { v6.b }[5], [x8]
2387; CHECK-NEXT:    add x8, sp, #432
2388; CHECK-NEXT:    ld1 { v22.b }[1], [x12]
2389; CHECK-NEXT:    ld1 { v7.b }[2], [x8]
2390; CHECK-NEXT:    add x11, sp, #496
2391; CHECK-NEXT:    add x12, sp, #568
2392; CHECK-NEXT:    add x13, sp, #368
2393; CHECK-NEXT:    ld1 { v20.b }[2], [x11]
2394; CHECK-NEXT:    ld1 { v21.b }[2], [x12]
2395; CHECK-NEXT:    ld1 { v22.b }[2], [x13]
2396; CHECK-NEXT:    add x10, sp, #448
2397; CHECK-NEXT:    mov v3.b[4], w4
2398; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
2399; CHECK-NEXT:    add x9, sp, #688
2400; CHECK-NEXT:    add x11, sp, #576
2401; CHECK-NEXT:    ld1 { v19.b }[1], [x9]
2402; CHECK-NEXT:    add x9, sp, #696
2403; CHECK-NEXT:    add x12, sp, #376
2404; CHECK-NEXT:    ld1 { v21.b }[3], [x11]
2405; CHECK-NEXT:    ld1 { v22.b }[3], [x12]
2406; CHECK-NEXT:    add x11, sp, #512
2407; CHECK-NEXT:    ld1 { v7.b }[4], [x10]
2408; CHECK-NEXT:    add x10, sp, #504
2409; CHECK-NEXT:    add x12, sp, #584
2410; CHECK-NEXT:    ld1 { v19.b }[2], [x9]
2411; CHECK-NEXT:    add x9, sp, #704
2412; CHECK-NEXT:    ld1 { v20.b }[3], [x10]
2413; CHECK-NEXT:    add x13, sp, #384
2414; CHECK-NEXT:    mov v3.b[5], w5
2415; CHECK-NEXT:    ld1 { v21.b }[4], [x12]
2416; CHECK-NEXT:    ld1 { v22.b }[4], [x13]
2417; CHECK-NEXT:    add x10, sp, #456
2418; CHECK-NEXT:    ldr b16, [sp, #344]
2419; CHECK-NEXT:    ld1 { v19.b }[3], [x9]
2420; CHECK-NEXT:    add x9, sp, #712
2421; CHECK-NEXT:    ld1 { v20.b }[4], [x11]
2422; CHECK-NEXT:    ldr b17, [sp, #144]
2423; CHECK-NEXT:    ld1 { v7.b }[5], [x10]
2424; CHECK-NEXT:    add x10, sp, #520
2425; CHECK-NEXT:    add x11, sp, #592
2426; CHECK-NEXT:    add x12, sp, #392
2427; CHECK-NEXT:    mov v3.b[6], w6
2428; CHECK-NEXT:    ld1 { v19.b }[4], [x9]
2429; CHECK-NEXT:    add x9, sp, #720
2430; CHECK-NEXT:    ld1 { v20.b }[5], [x10]
2431; CHECK-NEXT:    ld1 { v21.b }[5], [x11]
2432; CHECK-NEXT:    ld1 { v22.b }[5], [x12]
2433; CHECK-NEXT:    smull v16.8h, v17.8b, v16.8b
2434; CHECK-NEXT:    add x8, sp, #664
2435; CHECK-NEXT:    add x10, sp, #464
2436; CHECK-NEXT:    add x11, sp, #528
2437; CHECK-NEXT:    ld1 { v19.b }[5], [x9]
2438; CHECK-NEXT:    add x9, sp, #728
2439; CHECK-NEXT:    add x12, sp, #600
2440; CHECK-NEXT:    add x13, sp, #400
2441; CHECK-NEXT:    ld1 { v6.b }[6], [x8]
2442; CHECK-NEXT:    ld1 { v20.b }[6], [x11]
2443; CHECK-NEXT:    ld1 { v21.b }[6], [x12]
2444; CHECK-NEXT:    ld1 { v22.b }[6], [x13]
2445; CHECK-NEXT:    ld1 { v7.b }[6], [x10]
2446; CHECK-NEXT:    ld1 { v19.b }[6], [x9]
2447; CHECK-NEXT:    add x9, sp, #736
2448; CHECK-NEXT:    mov v3.b[7], w7
2449; CHECK-NEXT:    sshll v18.4s, v16.4h, #0
2450; CHECK-NEXT:    movi v16.2d, #0000000000000000
2451; CHECK-NEXT:    movi v17.2d, #0000000000000000
2452; CHECK-NEXT:    add x8, sp, #672
2453; CHECK-NEXT:    add x10, sp, #472
2454; CHECK-NEXT:    add x11, sp, #608
2455; CHECK-NEXT:    ld1 { v19.b }[7], [x9]
2456; CHECK-NEXT:    add x9, sp, #536
2457; CHECK-NEXT:    add x12, sp, #408
2458; CHECK-NEXT:    ld1 { v20.b }[7], [x9]
2459; CHECK-NEXT:    ld1 { v21.b }[7], [x11]
2460; CHECK-NEXT:    ld1 { v22.b }[7], [x12]
2461; CHECK-NEXT:    ld1 { v6.b }[7], [x8]
2462; CHECK-NEXT:    ld1 { v7.b }[7], [x10]
2463; CHECK-NEXT:    sshll v23.4s, v23.4h, #0
2464; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
2465; CHECK-NEXT:    smull v1.8h, v4.8b, v2.8b
2466; CHECK-NEXT:    smull v2.8h, v3.8b, v5.8b
2467; CHECK-NEXT:    smull v3.8h, v20.8b, v19.8b
2468; CHECK-NEXT:    smull v4.8h, v22.8b, v21.8b
2469; CHECK-NEXT:    mov v17.s[0], v18.s[0]
2470; CHECK-NEXT:    smull v5.8h, v7.8b, v6.8b
2471; CHECK-NEXT:    mov v16.s[0], v23.s[0]
2472; CHECK-NEXT:    saddl2 v6.4s, v2.8h, v1.8h
2473; CHECK-NEXT:    saddl v1.4s, v2.4h, v1.4h
2474; CHECK-NEXT:    saddl2 v2.4s, v4.8h, v3.8h
2475; CHECK-NEXT:    saddl v3.4s, v4.4h, v3.4h
2476; CHECK-NEXT:    saddw v4.4s, v17.4s, v0.4h
2477; CHECK-NEXT:    saddw v7.4s, v16.4s, v5.4h
2478; CHECK-NEXT:    saddw2 v0.4s, v6.4s, v0.8h
2479; CHECK-NEXT:    add v1.4s, v1.4s, v4.4s
2480; CHECK-NEXT:    saddw2 v2.4s, v2.4s, v5.8h
2481; CHECK-NEXT:    add v3.4s, v3.4s, v7.4s
2482; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
2483; CHECK-NEXT:    add v1.4s, v3.4s, v2.4s
2484; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
2485; CHECK-NEXT:    addv s0, v0.4s
2486; CHECK-NEXT:    fmov w0, s0
2487; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
2488; CHECK-NEXT:    ret
2489entry:
2490  %az = sext <25 x i8> %a to <25 x i32>
2491  %bz = sext <25 x i8> %b to <25 x i32>
2492  %m1 = mul nuw nsw <25 x i32> %az, %bz
2493  %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m1)
2494  %cz = sext <25 x i8> %c to <25 x i32>
2495  %dz = sext <25 x i8> %d to <25 x i32>
2496  %m2 = mul nuw nsw <25 x i32> %cz, %dz
2497  %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %m2)
2498  %x = add i32 %r1, %r2
2499  ret i32 %x
2500}
2501
; Signed reduction of two <25 x i8> vectors with no multiply: %a and %c are
; each sign-extended to <25 x i32>, reduced with vector.reduce.add, and the two
; scalar results are added. %b and %d are not referenced by the body.
; One shared set of assertions covers both llc runs; the file header expects a
; GlobalISel fallback warning for this function. The expected code contains no
; sdot: the vectors are assembled lane by lane (w0-w7 inserts plus byte loads
; from the stack), widened with sshll, and summed with saddl/saddw/add + addv.
2502define i32 @test_sdot_v25i8_double_nomla(<25 x i8> %a, <25 x i8> %b, <25 x i8> %c, <25 x i8> %d) {
2503; CHECK-LABEL: test_sdot_v25i8_double_nomla:
2504; CHECK:       // %bb.0: // %entry
2505; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
2506; CHECK-NEXT:    .cfi_def_cfa_offset 16
2507; CHECK-NEXT:    .cfi_offset w29, -16
2508; CHECK-NEXT:    fmov s0, w0
2509; CHECK-NEXT:    ldr b1, [sp, #80]
2510; CHECK-NEXT:    add x10, sp, #88
2511; CHECK-NEXT:    ldr b2, [sp, #16]
2512; CHECK-NEXT:    add x9, sp, #96
2513; CHECK-NEXT:    ldr b3, [sp, #480]
2514; CHECK-NEXT:    ld1 { v1.b }[1], [x10]
2515; CHECK-NEXT:    add x10, sp, #24
2516; CHECK-NEXT:    ldr b4, [sp, #352]
2517; CHECK-NEXT:    mov v0.b[1], w1
2518; CHECK-NEXT:    ld1 { v2.b }[1], [x10]
2519; CHECK-NEXT:    add x11, sp, #488
2520; CHECK-NEXT:    add x10, sp, #360
2521; CHECK-NEXT:    ldr b5, [sp, #416]
2522; CHECK-NEXT:    add x8, sp, #104
2523; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
2524; CHECK-NEXT:    add x9, sp, #32
2525; CHECK-NEXT:    ld1 { v3.b }[1], [x11]
2526; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
2527; CHECK-NEXT:    add x11, sp, #424
2528; CHECK-NEXT:    ld1 { v4.b }[1], [x10]
2529; CHECK-NEXT:    mov v0.b[2], w2
2530; CHECK-NEXT:    ld1 { v5.b }[1], [x11]
2531; CHECK-NEXT:    add x9, sp, #368
2532; CHECK-NEXT:    ld1 { v1.b }[3], [x8]
2533; CHECK-NEXT:    add x8, sp, #40
2534; CHECK-NEXT:    add x12, sp, #496
2535; CHECK-NEXT:    ld1 { v2.b }[3], [x8]
2536; CHECK-NEXT:    ld1 { v4.b }[2], [x9]
2537; CHECK-NEXT:    add x8, sp, #432
2538; CHECK-NEXT:    ld1 { v3.b }[2], [x12]
2539; CHECK-NEXT:    add x13, sp, #48
2540; CHECK-NEXT:    ld1 { v5.b }[2], [x8]
2541; CHECK-NEXT:    mov v0.b[3], w3
2542; CHECK-NEXT:    add x10, sp, #112
2543; CHECK-NEXT:    add x8, sp, #504
2544; CHECK-NEXT:    ld1 { v2.b }[4], [x13]
2545; CHECK-NEXT:    add x13, sp, #376
2546; CHECK-NEXT:    ld1 { v1.b }[4], [x10]
2547; CHECK-NEXT:    ld1 { v4.b }[3], [x13]
2548; CHECK-NEXT:    add x13, sp, #440
2549; CHECK-NEXT:    ld1 { v3.b }[3], [x8]
2550; CHECK-NEXT:    ld1 { v5.b }[3], [x13]
2551; CHECK-NEXT:    add x11, sp, #120
2552; CHECK-NEXT:    add x8, sp, #56
2553; CHECK-NEXT:    mov v0.b[4], w4
2554; CHECK-NEXT:    add x13, sp, #512
2555; CHECK-NEXT:    ld1 { v1.b }[5], [x11]
2556; CHECK-NEXT:    ld1 { v2.b }[5], [x8]
2557; CHECK-NEXT:    add x8, sp, #384
2558; CHECK-NEXT:    add x11, sp, #448
2559; CHECK-NEXT:    ld1 { v3.b }[4], [x13]
2560; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
2561; CHECK-NEXT:    ld1 { v5.b }[4], [x11]
2562; CHECK-NEXT:    add x12, sp, #128
2563; CHECK-NEXT:    add x10, sp, #64
2564; CHECK-NEXT:    add x8, sp, #520
2565; CHECK-NEXT:    mov v0.b[5], w5
2566; CHECK-NEXT:    ld1 { v1.b }[6], [x12]
2567; CHECK-NEXT:    ld1 { v2.b }[6], [x10]
2568; CHECK-NEXT:    add x10, sp, #392
2569; CHECK-NEXT:    add x11, sp, #456
2570; CHECK-NEXT:    ldr b6, [sp, #144]
2571; CHECK-NEXT:    ldr b7, [sp, #544]
2572; CHECK-NEXT:    ld1 { v3.b }[5], [x8]
2573; CHECK-NEXT:    ld1 { v4.b }[5], [x10]
2574; CHECK-NEXT:    ld1 { v5.b }[5], [x11]
2575; CHECK-NEXT:    add x9, sp, #136
2576; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
2577; CHECK-NEXT:    mov v0.b[6], w6
2578; CHECK-NEXT:    ld1 { v1.b }[7], [x9]
2579; CHECK-NEXT:    add x8, sp, #528
2580; CHECK-NEXT:    add x9, sp, #400
2581; CHECK-NEXT:    add x10, sp, #464
2582; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
2583; CHECK-NEXT:    ld1 { v3.b }[6], [x8]
2584; CHECK-NEXT:    ld1 { v4.b }[6], [x9]
2585; CHECK-NEXT:    ld1 { v5.b }[6], [x10]
2586; CHECK-NEXT:    movi v16.2d, #0000000000000000
2587; CHECK-NEXT:    movi v17.2d, #0000000000000000
2588; CHECK-NEXT:    add x14, sp, #72
2589; CHECK-NEXT:    mov v0.b[7], w7
2590; CHECK-NEXT:    sshll v6.4s, v6.4h, #0
2591; CHECK-NEXT:    add x8, sp, #536
2592; CHECK-NEXT:    add x9, sp, #408
2593; CHECK-NEXT:    add x10, sp, #472
2594; CHECK-NEXT:    sshll v7.4s, v7.4h, #0
2595; CHECK-NEXT:    ld1 { v2.b }[7], [x14]
2596; CHECK-NEXT:    ld1 { v3.b }[7], [x8]
2597; CHECK-NEXT:    ld1 { v4.b }[7], [x9]
2598; CHECK-NEXT:    ld1 { v5.b }[7], [x10]
2599; CHECK-NEXT:    mov v16.s[0], v6.s[0]
2600; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
2601; CHECK-NEXT:    mov v17.s[0], v7.s[0]
2602; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
2603; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
2604; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
2605; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
2606; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
2607; CHECK-NEXT:    saddl v7.4s, v0.4h, v1.4h
2608; CHECK-NEXT:    saddl2 v0.4s, v0.8h, v1.8h
2609; CHECK-NEXT:    saddw v6.4s, v16.4s, v2.4h
2610; CHECK-NEXT:    saddl v1.4s, v4.4h, v3.4h
2611; CHECK-NEXT:    saddl2 v3.4s, v4.8h, v3.8h
2612; CHECK-NEXT:    saddw v4.4s, v17.4s, v5.4h
2613; CHECK-NEXT:    saddw2 v0.4s, v0.4s, v2.8h
2614; CHECK-NEXT:    add v6.4s, v7.4s, v6.4s
2615; CHECK-NEXT:    saddw2 v2.4s, v3.4s, v5.8h
2616; CHECK-NEXT:    add v1.4s, v1.4s, v4.4s
2617; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
2618; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
2619; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
2620; CHECK-NEXT:    addv s0, v0.4s
2621; CHECK-NEXT:    fmov w0, s0
2622; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
2623; CHECK-NEXT:    ret
2624entry:
2625  %az = sext <25 x i8> %a to <25 x i32>
2626  %r1 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %az)
2627  %cz = sext <25 x i8> %c to <25 x i32>
2628  %r2 = call i32 @llvm.vector.reduce.add.v25i32(<25 x i32> %cz)
2629  %x = add i32 %r1, %r2
2630  ret i32 %x
2631}
2632
; Unsigned dot product of two <32 x i8> vectors loaded from %a and %b: both are
; zero-extended to <32 x i32>, multiplied element-wise, reduced with
; vector.reduce.add, and %sum is added to the scalar result.
; SelectionDAG expectation: one zeroed accumulator fed by two chained udot
; instructions, then addv and a scalar add of w2.
; GlobalISel expectation: two zeroed accumulators, one udot each, combined with
; a vector add before addv and the scalar add of w2.
2633define i32 @test_udot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2634; CHECK-SD-LABEL: test_udot_v32i8:
2635; CHECK-SD:       // %bb.0: // %entry
2636; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
2637; CHECK-SD-NEXT:    ldp q1, q3, [x0]
2638; CHECK-SD-NEXT:    ldp q2, q4, [x1]
2639; CHECK-SD-NEXT:    udot v0.4s, v4.16b, v3.16b
2640; CHECK-SD-NEXT:    udot v0.4s, v2.16b, v1.16b
2641; CHECK-SD-NEXT:    addv s0, v0.4s
2642; CHECK-SD-NEXT:    fmov w8, s0
2643; CHECK-SD-NEXT:    add w0, w8, w2
2644; CHECK-SD-NEXT:    ret
2645;
2646; CHECK-GI-LABEL: test_udot_v32i8:
2647; CHECK-GI:       // %bb.0: // %entry
2648; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
2649; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
2650; CHECK-GI-NEXT:    ldp q2, q3, [x0]
2651; CHECK-GI-NEXT:    ldp q4, q5, [x1]
2652; CHECK-GI-NEXT:    udot v1.4s, v4.16b, v2.16b
2653; CHECK-GI-NEXT:    udot v0.4s, v5.16b, v3.16b
2654; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
2655; CHECK-GI-NEXT:    addv s0, v0.4s
2656; CHECK-GI-NEXT:    fmov w8, s0
2657; CHECK-GI-NEXT:    add w0, w8, w2
2658; CHECK-GI-NEXT:    ret
2659entry:
2660  %0 = load <32 x i8>, ptr %a
2661  %1 = zext <32 x i8> %0 to <32 x i32>
2662  %2 = load <32 x i8>, ptr %b
2663  %3 = zext <32 x i8> %2 to <32 x i32>
2664  %4 = mul nuw nsw <32 x i32> %3, %1
2665  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
2666  %op.extra = add i32 %5, %sum
2667  ret i32 %op.extra
2668}
2669
; Unsigned add-reduction of a <32 x i8> vector loaded from %a1 with no
; multiply: the bytes are zero-extended to <32 x i32> and summed with
; vector.reduce.add.
; Both expectations still use udot, with an all-ones byte vector
; (movi v0.16b, #1) as the second multiplicand. SelectionDAG chains both udot
; instructions into one accumulator; GlobalISel uses two accumulators and a
; vector add before addv.
2670define i32 @test_udot_v32i8_nomla(ptr nocapture readonly %a1) {
2671; CHECK-SD-LABEL: test_udot_v32i8_nomla:
2672; CHECK-SD:       // %bb.0: // %entry
2673; CHECK-SD-NEXT:    movi v0.16b, #1
2674; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
2675; CHECK-SD-NEXT:    ldp q2, q3, [x0]
2676; CHECK-SD-NEXT:    udot v1.4s, v3.16b, v0.16b
2677; CHECK-SD-NEXT:    udot v1.4s, v2.16b, v0.16b
2678; CHECK-SD-NEXT:    addv s0, v1.4s
2679; CHECK-SD-NEXT:    fmov w0, s0
2680; CHECK-SD-NEXT:    ret
2681;
2682; CHECK-GI-LABEL: test_udot_v32i8_nomla:
2683; CHECK-GI:       // %bb.0: // %entry
2684; CHECK-GI-NEXT:    movi v0.16b, #1
2685; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
2686; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
2687; CHECK-GI-NEXT:    ldp q3, q4, [x0]
2688; CHECK-GI-NEXT:    udot v2.4s, v3.16b, v0.16b
2689; CHECK-GI-NEXT:    udot v1.4s, v4.16b, v0.16b
2690; CHECK-GI-NEXT:    add v0.4s, v2.4s, v1.4s
2691; CHECK-GI-NEXT:    addv s0, v0.4s
2692; CHECK-GI-NEXT:    fmov w0, s0
2693; CHECK-GI-NEXT:    ret
2694entry:
2695  %0 = load <32 x i8>, ptr %a1
2696  %1 = zext <32 x i8> %0 to <32 x i32>
2697  %2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %1)
2698  ret i32 %2
2699}
; Signed dot product of two <32 x i8> vectors loaded from %a and %b: both are
; sign-extended to <32 x i32>, multiplied element-wise, reduced with
; vector.reduce.add, and %sum is added to the scalar result.
; SelectionDAG expectation: one zeroed accumulator fed by two chained sdot
; instructions. GlobalISel expectation: two accumulators, one sdot each, joined
; by a vector add. Both finish with addv and a scalar add of w2.
2700define i32 @test_sdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2701; CHECK-SD-LABEL: test_sdot_v32i8:
2702; CHECK-SD:       // %bb.0: // %entry
2703; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
2704; CHECK-SD-NEXT:    ldp q1, q3, [x0]
2705; CHECK-SD-NEXT:    ldp q2, q4, [x1]
2706; CHECK-SD-NEXT:    sdot v0.4s, v4.16b, v3.16b
2707; CHECK-SD-NEXT:    sdot v0.4s, v2.16b, v1.16b
2708; CHECK-SD-NEXT:    addv s0, v0.4s
2709; CHECK-SD-NEXT:    fmov w8, s0
2710; CHECK-SD-NEXT:    add w0, w8, w2
2711; CHECK-SD-NEXT:    ret
2712;
2713; CHECK-GI-LABEL: test_sdot_v32i8:
2714; CHECK-GI:       // %bb.0: // %entry
2715; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
2716; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
2717; CHECK-GI-NEXT:    ldp q2, q3, [x0]
2718; CHECK-GI-NEXT:    ldp q4, q5, [x1]
2719; CHECK-GI-NEXT:    sdot v1.4s, v4.16b, v2.16b
2720; CHECK-GI-NEXT:    sdot v0.4s, v5.16b, v3.16b
2721; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
2722; CHECK-GI-NEXT:    addv s0, v0.4s
2723; CHECK-GI-NEXT:    fmov w8, s0
2724; CHECK-GI-NEXT:    add w0, w8, w2
2725; CHECK-GI-NEXT:    ret
2726entry:
2727  %0 = load <32 x i8>, ptr %a
2728  %1 = sext <32 x i8> %0 to <32 x i32>
2729  %2 = load <32 x i8>, ptr %b
2730  %3 = sext <32 x i8> %2 to <32 x i32>
2731  %4 = mul nsw <32 x i32> %3, %1
2732  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
2733  %op.extra = add nsw i32 %5, %sum
2734  ret i32 %op.extra
2735}
2736
; Sum of two signed dot products over <32 x i8> vectors passed by value:
; reduce.add(sext(%a) * sext(%b)) + reduce.add(sext(%c) * sext(%d)).
; The expected code reads its operands directly from v0-v7.
; SelectionDAG expectation: two accumulators, each fed by two chained sdot
; instructions, one vector add, one addv.
; GlobalISel expectation: four accumulators with one sdot each, two vector
; adds, two addv reductions, and a final scalar add.
; NOTE(review): both multiplies carry nuw although their operands are
; sign-extended — confirm the flag is intended for this test.
2737define i32 @test_sdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
2738; CHECK-SD-LABEL: test_sdot_v32i8_double:
2739; CHECK-SD:       // %bb.0: // %entry
2740; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
2741; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
2742; CHECK-SD-NEXT:    sdot v17.4s, v1.16b, v3.16b
2743; CHECK-SD-NEXT:    sdot v16.4s, v5.16b, v7.16b
2744; CHECK-SD-NEXT:    sdot v17.4s, v0.16b, v2.16b
2745; CHECK-SD-NEXT:    sdot v16.4s, v4.16b, v6.16b
2746; CHECK-SD-NEXT:    add v0.4s, v17.4s, v16.4s
2747; CHECK-SD-NEXT:    addv s0, v0.4s
2748; CHECK-SD-NEXT:    fmov w0, s0
2749; CHECK-SD-NEXT:    ret
2750;
2751; CHECK-GI-LABEL: test_sdot_v32i8_double:
2752; CHECK-GI:       // %bb.0: // %entry
2753; CHECK-GI-NEXT:    movi v16.2d, #0000000000000000
2754; CHECK-GI-NEXT:    movi v17.2d, #0000000000000000
2755; CHECK-GI-NEXT:    movi v18.2d, #0000000000000000
2756; CHECK-GI-NEXT:    movi v19.2d, #0000000000000000
2757; CHECK-GI-NEXT:    sdot v16.4s, v0.16b, v2.16b
2758; CHECK-GI-NEXT:    sdot v18.4s, v1.16b, v3.16b
2759; CHECK-GI-NEXT:    sdot v17.4s, v5.16b, v7.16b
2760; CHECK-GI-NEXT:    sdot v19.4s, v4.16b, v6.16b
2761; CHECK-GI-NEXT:    add v0.4s, v16.4s, v18.4s
2762; CHECK-GI-NEXT:    add v1.4s, v19.4s, v17.4s
2763; CHECK-GI-NEXT:    addv s0, v0.4s
2764; CHECK-GI-NEXT:    addv s1, v1.4s
2765; CHECK-GI-NEXT:    fmov w8, s0
2766; CHECK-GI-NEXT:    fmov w9, s1
2767; CHECK-GI-NEXT:    add w0, w8, w9
2768; CHECK-GI-NEXT:    ret
2769entry:
2770  %az = sext <32 x i8> %a to <32 x i32>
2771  %bz = sext <32 x i8> %b to <32 x i32>
2772  %m1 = mul nuw nsw <32 x i32> %az, %bz
2773  %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1)
2774  %cz = sext <32 x i8> %c to <32 x i32>
2775  %dz = sext <32 x i8> %d to <32 x i32>
2776  %m2 = mul nuw nsw <32 x i32> %cz, %dz
2777  %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2)
2778  %x = add i32 %r1, %r2
2779  ret i32 %x
2780}
2781
; Sum of two signed add-reductions over <32 x i8> vectors with no multiply:
; reduce.add(sext(%a)) + reduce.add(sext(%c)). %b and %d are not referenced by
; the body.
; Both expectations use sdot against an all-ones byte vector (movi v2.16b, #1).
; SelectionDAG expectation: two accumulators with two chained sdot each, one
; vector add, one addv.
; GlobalISel expectation: four accumulators with one sdot each, two vector
; adds, two addv reductions, and a final scalar add.
2782define i32 @test_sdot_v32i8_double_nomla(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
2783; CHECK-SD-LABEL: test_sdot_v32i8_double_nomla:
2784; CHECK-SD:       // %bb.0: // %entry
2785; CHECK-SD-NEXT:    movi v2.16b, #1
2786; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
2787; CHECK-SD-NEXT:    movi v6.2d, #0000000000000000
2788; CHECK-SD-NEXT:    sdot v6.4s, v1.16b, v2.16b
2789; CHECK-SD-NEXT:    sdot v3.4s, v5.16b, v2.16b
2790; CHECK-SD-NEXT:    sdot v6.4s, v0.16b, v2.16b
2791; CHECK-SD-NEXT:    sdot v3.4s, v4.16b, v2.16b
2792; CHECK-SD-NEXT:    add v0.4s, v6.4s, v3.4s
2793; CHECK-SD-NEXT:    addv s0, v0.4s
2794; CHECK-SD-NEXT:    fmov w0, s0
2795; CHECK-SD-NEXT:    ret
2796;
2797; CHECK-GI-LABEL: test_sdot_v32i8_double_nomla:
2798; CHECK-GI:       // %bb.0: // %entry
2799; CHECK-GI-NEXT:    movi v2.16b, #1
2800; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
2801; CHECK-GI-NEXT:    movi v6.2d, #0000000000000000
2802; CHECK-GI-NEXT:    movi v7.2d, #0000000000000000
2803; CHECK-GI-NEXT:    movi v16.2d, #0000000000000000
2804; CHECK-GI-NEXT:    sdot v3.4s, v0.16b, v2.16b
2805; CHECK-GI-NEXT:    sdot v6.4s, v5.16b, v2.16b
2806; CHECK-GI-NEXT:    sdot v7.4s, v1.16b, v2.16b
2807; CHECK-GI-NEXT:    sdot v16.4s, v4.16b, v2.16b
2808; CHECK-GI-NEXT:    add v0.4s, v3.4s, v7.4s
2809; CHECK-GI-NEXT:    add v1.4s, v16.4s, v6.4s
2810; CHECK-GI-NEXT:    addv s0, v0.4s
2811; CHECK-GI-NEXT:    addv s1, v1.4s
2812; CHECK-GI-NEXT:    fmov w8, s0
2813; CHECK-GI-NEXT:    fmov w9, s1
2814; CHECK-GI-NEXT:    add w0, w8, w9
2815; CHECK-GI-NEXT:    ret
2816entry:
2817  %az = sext <32 x i8> %a to <32 x i32>
2818  %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %az)
2819  %cz = sext <32 x i8> %c to <32 x i32>
2820  %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %cz)
2821  %x = add i32 %r1, %r2
2822  ret i32 %x
2823}
2824
; Mixed-sign dot product of two <32 x i8> vectors: the vector loaded from %a is
; zero-extended, the one loaded from %b is sign-extended, the products are
; reduced with vector.reduce.add, and %sum is added to the scalar result.
; SelectionDAG expectation: two usdot instructions into separate accumulators
; (both llc runs enable +i8mm), a vector add, addv, then a scalar add of w2.
; GlobalISel expectation: no usdot; the bytes are widened with ushll/sshll to
; 32-bit lanes and combined with mul/mla and vector adds before addv.
2825define i32 @test_usdot_v32i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
2826; CHECK-SD-LABEL: test_usdot_v32i8:
2827; CHECK-SD:       // %bb.0: // %entry
2828; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
2829; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
2830; CHECK-SD-NEXT:    ldp q2, q3, [x0]
2831; CHECK-SD-NEXT:    ldp q4, q5, [x1]
2832; CHECK-SD-NEXT:    usdot v1.4s, v3.16b, v5.16b
2833; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v4.16b
2834; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
2835; CHECK-SD-NEXT:    addv s0, v0.4s
2836; CHECK-SD-NEXT:    fmov w8, s0
2837; CHECK-SD-NEXT:    add w0, w8, w2
2838; CHECK-SD-NEXT:    ret
2839;
2840; CHECK-GI-LABEL: test_usdot_v32i8:
2841; CHECK-GI:       // %bb.0: // %entry
2842; CHECK-GI-NEXT:    ldp q0, q1, [x1]
2843; CHECK-GI-NEXT:    ldp q2, q3, [x0]
2844; CHECK-GI-NEXT:    sshll v4.8h, v0.8b, #0
2845; CHECK-GI-NEXT:    sshll2 v0.8h, v0.16b, #0
2846; CHECK-GI-NEXT:    sshll v5.8h, v1.8b, #0
2847; CHECK-GI-NEXT:    sshll2 v1.8h, v1.16b, #0
2848; CHECK-GI-NEXT:    ushll v6.8h, v2.8b, #0
2849; CHECK-GI-NEXT:    ushll2 v2.8h, v2.16b, #0
2850; CHECK-GI-NEXT:    ushll v7.8h, v3.8b, #0
2851; CHECK-GI-NEXT:    ushll2 v3.8h, v3.16b, #0
2852; CHECK-GI-NEXT:    sshll2 v16.4s, v4.8h, #0
2853; CHECK-GI-NEXT:    sshll2 v17.4s, v0.8h, #0
2854; CHECK-GI-NEXT:    sshll2 v18.4s, v5.8h, #0
2855; CHECK-GI-NEXT:    sshll2 v19.4s, v1.8h, #0
2856; CHECK-GI-NEXT:    ushll2 v20.4s, v6.8h, #0
2857; CHECK-GI-NEXT:    ushll2 v21.4s, v2.8h, #0
2858; CHECK-GI-NEXT:    ushll2 v22.4s, v7.8h, #0
2859; CHECK-GI-NEXT:    ushll2 v23.4s, v3.8h, #0
2860; CHECK-GI-NEXT:    sshll v4.4s, v4.4h, #0
2861; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
2862; CHECK-GI-NEXT:    sshll v5.4s, v5.4h, #0
2863; CHECK-GI-NEXT:    sshll v1.4s, v1.4h, #0
2864; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v20.4s
2865; CHECK-GI-NEXT:    mul v17.4s, v17.4s, v21.4s
2866; CHECK-GI-NEXT:    ushll v6.4s, v6.4h, #0
2867; CHECK-GI-NEXT:    mul v18.4s, v18.4s, v22.4s
2868; CHECK-GI-NEXT:    mul v19.4s, v19.4s, v23.4s
2869; CHECK-GI-NEXT:    ushll v2.4s, v2.4h, #0
2870; CHECK-GI-NEXT:    ushll v7.4s, v7.4h, #0
2871; CHECK-GI-NEXT:    ushll v3.4s, v3.4h, #0
2872; CHECK-GI-NEXT:    mla v16.4s, v4.4s, v6.4s
2873; CHECK-GI-NEXT:    mla v17.4s, v0.4s, v2.4s
2874; CHECK-GI-NEXT:    mla v18.4s, v5.4s, v7.4s
2875; CHECK-GI-NEXT:    mla v19.4s, v1.4s, v3.4s
2876; CHECK-GI-NEXT:    add v0.4s, v16.4s, v17.4s
2877; CHECK-GI-NEXT:    add v1.4s, v18.4s, v19.4s
2878; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
2879; CHECK-GI-NEXT:    addv s0, v0.4s
2880; CHECK-GI-NEXT:    fmov w8, s0
2881; CHECK-GI-NEXT:    add w0, w8, w2
2882; CHECK-GI-NEXT:    ret
2883entry:
2884  %0 = load <32 x i8>, ptr %a
2885  %1 = zext <32 x i8> %0 to <32 x i32>
2886  %2 = load <32 x i8>, ptr %b
2887  %3 = sext <32 x i8> %2 to <32 x i32>
2888  %4 = mul nsw <32 x i32> %3, %1
2889  %5 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %4)
2890  %op.extra = add nsw i32 %5, %sum
2891  ret i32 %op.extra
2892}
2893
; Sum of two mixed-sign dot products over <32 x i8> vectors passed by value:
; reduce.add(zext(%a) * sext(%b)) + reduce.add(zext(%c) * sext(%d)).
; SelectionDAG expectation: four usdot instructions into four accumulators,
; three vector adds, one addv.
; GlobalISel expectation: no usdot; every operand is widened with ushll/sshll
; to 32-bit lanes and combined with mul/mla. The sequence spills and reloads
; d8-d15 around the computation and ends with two addv reductions and a scalar
; add.
; NOTE(review): both multiplies carry nuw although one operand of each is
; sign-extended — confirm the flag is intended for this test.
2894define i32 @test_usdot_v32i8_double(<32 x i8> %a, <32 x i8> %b, <32 x i8> %c, <32 x i8> %d) {
2895; CHECK-SD-LABEL: test_usdot_v32i8_double:
2896; CHECK-SD:       // %bb.0: // %entry
2897; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
2898; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
2899; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
2900; CHECK-SD-NEXT:    movi v19.2d, #0000000000000000
2901; CHECK-SD-NEXT:    usdot v16.4s, v1.16b, v3.16b
2902; CHECK-SD-NEXT:    usdot v18.4s, v0.16b, v2.16b
2903; CHECK-SD-NEXT:    usdot v17.4s, v4.16b, v6.16b
2904; CHECK-SD-NEXT:    usdot v19.4s, v5.16b, v7.16b
2905; CHECK-SD-NEXT:    add v0.4s, v18.4s, v16.4s
2906; CHECK-SD-NEXT:    add v1.4s, v17.4s, v19.4s
2907; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
2908; CHECK-SD-NEXT:    addv s0, v0.4s
2909; CHECK-SD-NEXT:    fmov w0, s0
2910; CHECK-SD-NEXT:    ret
2911;
2912; CHECK-GI-LABEL: test_usdot_v32i8_double:
2913; CHECK-GI:       // %bb.0: // %entry
2914; CHECK-GI-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
2915; CHECK-GI-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
2916; CHECK-GI-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
2917; CHECK-GI-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
2918; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
2919; CHECK-GI-NEXT:    .cfi_offset b8, -8
2920; CHECK-GI-NEXT:    .cfi_offset b9, -16
2921; CHECK-GI-NEXT:    .cfi_offset b10, -24
2922; CHECK-GI-NEXT:    .cfi_offset b11, -32
2923; CHECK-GI-NEXT:    .cfi_offset b12, -40
2924; CHECK-GI-NEXT:    .cfi_offset b13, -48
2925; CHECK-GI-NEXT:    .cfi_offset b14, -56
2926; CHECK-GI-NEXT:    .cfi_offset b15, -64
2927; CHECK-GI-NEXT:    ushll v16.8h, v0.8b, #0
2928; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
2929; CHECK-GI-NEXT:    ushll v17.8h, v1.8b, #0
2930; CHECK-GI-NEXT:    ushll2 v1.8h, v1.16b, #0
2931; CHECK-GI-NEXT:    sshll v18.8h, v2.8b, #0
2932; CHECK-GI-NEXT:    sshll2 v2.8h, v2.16b, #0
2933; CHECK-GI-NEXT:    sshll v19.8h, v3.8b, #0
2934; CHECK-GI-NEXT:    sshll2 v3.8h, v3.16b, #0
2935; CHECK-GI-NEXT:    ushll v27.8h, v4.8b, #0
2936; CHECK-GI-NEXT:    ushll2 v4.8h, v4.16b, #0
2937; CHECK-GI-NEXT:    ushll v28.8h, v5.8b, #0
2938; CHECK-GI-NEXT:    sshll v29.8h, v6.8b, #0
2939; CHECK-GI-NEXT:    sshll2 v6.8h, v6.16b, #0
2940; CHECK-GI-NEXT:    ushll2 v5.8h, v5.16b, #0
2941; CHECK-GI-NEXT:    sshll v30.8h, v7.8b, #0
2942; CHECK-GI-NEXT:    sshll2 v7.8h, v7.16b, #0
2943; CHECK-GI-NEXT:    ushll2 v20.4s, v16.8h, #0
2944; CHECK-GI-NEXT:    ushll2 v21.4s, v0.8h, #0
2945; CHECK-GI-NEXT:    ushll2 v22.4s, v17.8h, #0
2946; CHECK-GI-NEXT:    ushll2 v23.4s, v1.8h, #0
2947; CHECK-GI-NEXT:    sshll2 v24.4s, v18.8h, #0
2948; CHECK-GI-NEXT:    sshll2 v25.4s, v2.8h, #0
2949; CHECK-GI-NEXT:    sshll2 v26.4s, v19.8h, #0
2950; CHECK-GI-NEXT:    sshll2 v31.4s, v3.8h, #0
2951; CHECK-GI-NEXT:    ushll2 v8.4s, v27.8h, #0
2952; CHECK-GI-NEXT:    ushll2 v9.4s, v4.8h, #0
2953; CHECK-GI-NEXT:    ushll2 v10.4s, v28.8h, #0
2954; CHECK-GI-NEXT:    sshll2 v11.4s, v29.8h, #0
2955; CHECK-GI-NEXT:    sshll2 v12.4s, v6.8h, #0
2956; CHECK-GI-NEXT:    ushll2 v13.4s, v5.8h, #0
2957; CHECK-GI-NEXT:    sshll2 v14.4s, v30.8h, #0
2958; CHECK-GI-NEXT:    sshll2 v15.4s, v7.8h, #0
2959; CHECK-GI-NEXT:    mul v20.4s, v20.4s, v24.4s
2960; CHECK-GI-NEXT:    mul v21.4s, v21.4s, v25.4s
2961; CHECK-GI-NEXT:    mul v22.4s, v22.4s, v26.4s
2962; CHECK-GI-NEXT:    mul v23.4s, v23.4s, v31.4s
2963; CHECK-GI-NEXT:    mul v24.4s, v8.4s, v11.4s
2964; CHECK-GI-NEXT:    mul v25.4s, v9.4s, v12.4s
2965; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
2966; CHECK-GI-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
2967; CHECK-GI-NEXT:    mul v26.4s, v10.4s, v14.4s
2968; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
2969; CHECK-GI-NEXT:    mul v31.4s, v13.4s, v15.4s
2970; CHECK-GI-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
2971; CHECK-GI-NEXT:    ushll v0.4s, v0.4h, #0
2972; CHECK-GI-NEXT:    ushll v17.4s, v17.4h, #0
2973; CHECK-GI-NEXT:    ushll v1.4s, v1.4h, #0
2974; CHECK-GI-NEXT:    sshll v18.4s, v18.4h, #0
2975; CHECK-GI-NEXT:    sshll v2.4s, v2.4h, #0
2976; CHECK-GI-NEXT:    sshll v19.4s, v19.4h, #0
2977; CHECK-GI-NEXT:    sshll v3.4s, v3.4h, #0
2978; CHECK-GI-NEXT:    ushll v27.4s, v27.4h, #0
2979; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
2980; CHECK-GI-NEXT:    ushll v28.4s, v28.4h, #0
2981; CHECK-GI-NEXT:    ushll v5.4s, v5.4h, #0
2982; CHECK-GI-NEXT:    sshll v29.4s, v29.4h, #0
2983; CHECK-GI-NEXT:    sshll v6.4s, v6.4h, #0
2984; CHECK-GI-NEXT:    sshll v30.4s, v30.4h, #0
2985; CHECK-GI-NEXT:    sshll v7.4s, v7.4h, #0
2986; CHECK-GI-NEXT:    mla v20.4s, v16.4s, v18.4s
2987; CHECK-GI-NEXT:    mla v21.4s, v0.4s, v2.4s
2988; CHECK-GI-NEXT:    mla v22.4s, v17.4s, v19.4s
2989; CHECK-GI-NEXT:    mla v23.4s, v1.4s, v3.4s
2990; CHECK-GI-NEXT:    mla v24.4s, v27.4s, v29.4s
2991; CHECK-GI-NEXT:    mla v25.4s, v4.4s, v6.4s
2992; CHECK-GI-NEXT:    mla v26.4s, v28.4s, v30.4s
2993; CHECK-GI-NEXT:    mla v31.4s, v5.4s, v7.4s
2994; CHECK-GI-NEXT:    add v0.4s, v20.4s, v21.4s
2995; CHECK-GI-NEXT:    add v1.4s, v22.4s, v23.4s
2996; CHECK-GI-NEXT:    add v2.4s, v24.4s, v25.4s
2997; CHECK-GI-NEXT:    add v3.4s, v26.4s, v31.4s
2998; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
2999; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
3000; CHECK-GI-NEXT:    addv s0, v0.4s
3001; CHECK-GI-NEXT:    addv s1, v1.4s
3002; CHECK-GI-NEXT:    fmov w8, s0
3003; CHECK-GI-NEXT:    fmov w9, s1
3004; CHECK-GI-NEXT:    add w0, w8, w9
3005; CHECK-GI-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
3006; CHECK-GI-NEXT:    ret
3007entry:
3008  %az = zext <32 x i8> %a to <32 x i32>
3009  %bz = sext <32 x i8> %b to <32 x i32>
3010  %m1 = mul nuw nsw <32 x i32> %az, %bz
3011  %r1 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m1)
3012  %cz = zext <32 x i8> %c to <32 x i32>
3013  %dz = sext <32 x i8> %d to <32 x i32>
3014  %m2 = mul nuw nsw <32 x i32> %cz, %dz
3015  %r2 = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %m2)
3016  %x = add i32 %r1, %r2
3017  ret i32 %x
3018}
3019
3020
; Unsigned dot product of two <33 x i8> vectors loaded from %a and %b: both are
; zero-extended to <33 x i32>, multiplied, reduced with vector.reduce.add, and
; %sum is added to the scalar result.
; One shared set of assertions covers both llc runs; the file header expects a
; GlobalISel fallback warning for this function. The expected code contains no
; udot: the first 32 bytes are loaded with ldp and the byte at offset 32 with
; ldr b, products are formed with umull/umull2, and the sums use
; uaddl/uaddw/add before addv.
3021define i32 @test_udot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
3022; CHECK-LABEL: test_udot_v33i8:
3023; CHECK:       // %bb.0: // %entry
3024; CHECK-NEXT:    ldr b0, [x0, #32]
3025; CHECK-NEXT:    ldr b1, [x1, #32]
3026; CHECK-NEXT:    movi v7.2d, #0000000000000000
3027; CHECK-NEXT:    ldp q3, q4, [x1]
3028; CHECK-NEXT:    umull v0.8h, v1.8b, v0.8b
3029; CHECK-NEXT:    ldp q1, q2, [x0]
3030; CHECK-NEXT:    umull v5.8h, v4.8b, v2.8b
3031; CHECK-NEXT:    umull v6.8h, v3.8b, v1.8b
3032; CHECK-NEXT:    umull2 v2.8h, v4.16b, v2.16b
3033; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
3034; CHECK-NEXT:    umull2 v1.8h, v3.16b, v1.16b
3035; CHECK-NEXT:    mov v7.s[0], v0.s[0]
3036; CHECK-NEXT:    uaddl2 v3.4s, v6.8h, v5.8h
3037; CHECK-NEXT:    uaddl2 v0.4s, v1.8h, v2.8h
3038; CHECK-NEXT:    uaddl v1.4s, v1.4h, v2.4h
3039; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
3040; CHECK-NEXT:    uaddw v2.4s, v7.4s, v6.4h
3041; CHECK-NEXT:    uaddw v2.4s, v2.4s, v5.4h
3042; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
3043; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
3044; CHECK-NEXT:    addv s0, v0.4s
3045; CHECK-NEXT:    fmov w8, s0
3046; CHECK-NEXT:    add w0, w8, w2
3047; CHECK-NEXT:    ret
3048entry:
3049  %0 = load <33 x i8>, ptr %a
3050  %1 = zext <33 x i8> %0 to <33 x i32>
3051  %2 = load <33 x i8>, ptr %b
3052  %3 = zext <33 x i8> %2 to <33 x i32>
3053  %4 = mul nuw nsw <33 x i32> %3, %1
3054  %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
3055  %op.extra = add i32 %5, %sum
3056  ret i32 %op.extra
3057}
3058
; Unsigned add-reduction of a <33 x i8> vector loaded from %a1 with no
; multiply: the bytes are zero-extended to <33 x i32> and summed with
; vector.reduce.add.
; One shared set of assertions covers both llc runs; the file header expects a
; GlobalISel fallback warning for this function. The expected code contains no
; udot: the bytes are widened with ushll/ushll2 and summed with
; uaddl/uaddw/add before addv.
3059define i32 @test_udot_v33i8_nomla(ptr nocapture readonly %a1) {
3060; CHECK-LABEL: test_udot_v33i8_nomla:
3061; CHECK:       // %bb.0: // %entry
3062; CHECK-NEXT:    ldr b1, [x0, #32]
3063; CHECK-NEXT:    ldp q3, q2, [x0]
3064; CHECK-NEXT:    movi v0.2d, #0000000000000000
3065; CHECK-NEXT:    ushll v1.8h, v1.8b, #0
3066; CHECK-NEXT:    ushll v4.8h, v2.8b, #0
3067; CHECK-NEXT:    ushll v5.8h, v3.8b, #0
3068; CHECK-NEXT:    ushll2 v2.8h, v2.16b, #0
3069; CHECK-NEXT:    ushll2 v3.8h, v3.16b, #0
3070; CHECK-NEXT:    ushll v1.4s, v1.4h, #0
3071; CHECK-NEXT:    uaddl2 v6.4s, v5.8h, v4.8h
3072; CHECK-NEXT:    mov v0.s[0], v1.s[0]
3073; CHECK-NEXT:    uaddl2 v1.4s, v3.8h, v2.8h
3074; CHECK-NEXT:    uaddl v2.4s, v3.4h, v2.4h
3075; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
3076; CHECK-NEXT:    uaddw v0.4s, v0.4s, v5.4h
3077; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
3078; CHECK-NEXT:    uaddw v0.4s, v0.4s, v4.4h
3079; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
3080; CHECK-NEXT:    addv s0, v0.4s
3081; CHECK-NEXT:    fmov w0, s0
3082; CHECK-NEXT:    ret
3083entry:
3084  %0 = load <33 x i8>, ptr %a1
3085  %1 = zext <33 x i8> %0 to <33 x i32>
3086  %2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %1)
3087  ret i32 %2
3088}
; Signed dot product of two <33 x i8> loads, added to the scalar %sum.
; The expected code is checked under the shared CHECK prefix and contains no
; sdot: products are formed with smull/smull2 and accumulated with
; saddl/saddl2/saddw. The product of byte 32 is computed separately and
; inserted into lane 0 of a zeroed vector. After addv, %sum (w2) is added
; with a scalar add.
3089define i32 @test_sdot_v33i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
3090; CHECK-LABEL: test_sdot_v33i8:
3091; CHECK:       // %bb.0: // %entry
3092; CHECK-NEXT:    ldr b0, [x0, #32]
3093; CHECK-NEXT:    ldr b1, [x1, #32]
3094; CHECK-NEXT:    movi v7.2d, #0000000000000000
3095; CHECK-NEXT:    ldp q3, q4, [x1]
3096; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
3097; CHECK-NEXT:    ldp q1, q2, [x0]
3098; CHECK-NEXT:    smull v5.8h, v4.8b, v2.8b
3099; CHECK-NEXT:    smull v6.8h, v3.8b, v1.8b
3100; CHECK-NEXT:    smull2 v2.8h, v4.16b, v2.16b
3101; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
3102; CHECK-NEXT:    smull2 v1.8h, v3.16b, v1.16b
3103; CHECK-NEXT:    mov v7.s[0], v0.s[0]
3104; CHECK-NEXT:    saddl2 v3.4s, v6.8h, v5.8h
3105; CHECK-NEXT:    saddl2 v0.4s, v1.8h, v2.8h
3106; CHECK-NEXT:    saddl v1.4s, v1.4h, v2.4h
3107; CHECK-NEXT:    add v0.4s, v3.4s, v0.4s
3108; CHECK-NEXT:    saddw v2.4s, v7.4s, v6.4h
3109; CHECK-NEXT:    saddw v2.4s, v2.4s, v5.4h
3110; CHECK-NEXT:    add v0.4s, v1.4s, v0.4s
3111; CHECK-NEXT:    add v0.4s, v2.4s, v0.4s
3112; CHECK-NEXT:    addv s0, v0.4s
3113; CHECK-NEXT:    fmov w8, s0
3114; CHECK-NEXT:    add w0, w8, w2
3115; CHECK-NEXT:    ret
3116entry:
; Sign-extend both byte vectors to i32, multiply lane-wise, reduce, then add
; the incoming accumulator %sum.
3117  %0 = load <33 x i8>, ptr %a
3118  %1 = sext <33 x i8> %0 to <33 x i32>
3119  %2 = load <33 x i8>, ptr %b
3120  %3 = sext <33 x i8> %2 to <33 x i32>
3121  %4 = mul nsw <33 x i32> %3, %1
3122  %5 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %4)
3123  %op.extra = add nsw i32 %5, %sum
3124  ret i32 %op.extra
3125}
3126
; Sum of two signed dot products, (%a . %b) + (%c . %d), where all four
; <33 x i8> operands are passed by value rather than loaded through pointers.
; The expected code is checked under the shared CHECK prefix. It rebuilds the
; vectors one byte at a time: the first eight bytes of %a come from w0-w7
; (fmov s3, w0 / mov v3.b[N], wN) and the remaining bytes are read from the
; stack with ldr b / ld1 lane loads. Products use smull, partial sums use
; saddl/saddl2/saddw, and a single addv produces the result; no sdot appears.
3127define i32 @test_sdot_v33i8_double(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) {
3128; CHECK-LABEL: test_sdot_v33i8_double:
3129; CHECK:       // %bb.0: // %entry
3130; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
3131; CHECK-NEXT:    .cfi_def_cfa_offset 16
3132; CHECK-NEXT:    .cfi_offset w29, -16
3133; CHECK-NEXT:    ldr b0, [sp, #344]
3134; CHECK-NEXT:    add x8, sp, #352
3135; CHECK-NEXT:    ldr b1, [sp, #80]
3136; CHECK-NEXT:    ldr b2, [sp, #216]
3137; CHECK-NEXT:    add x9, sp, #96
3138; CHECK-NEXT:    add x10, sp, #104
3139; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
3140; CHECK-NEXT:    add x8, sp, #88
3141; CHECK-NEXT:    ldr b4, [sp, #408]
3142; CHECK-NEXT:    ld1 { v1.b }[1], [x8]
3143; CHECK-NEXT:    add x8, sp, #360
3144; CHECK-NEXT:    add x12, sp, #248
3145; CHECK-NEXT:    add x13, sp, #432
3146; CHECK-NEXT:    add x11, sp, #384
3147; CHECK-NEXT:    ldr b5, [sp, #144]
3148; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
3149; CHECK-NEXT:    add x8, sp, #224
3150; CHECK-NEXT:    ldr b6, [sp, #280]
3151; CHECK-NEXT:    ld1 { v2.b }[1], [x8]
3152; CHECK-NEXT:    ld1 { v1.b }[2], [x9]
3153; CHECK-NEXT:    add x8, sp, #368
3154; CHECK-NEXT:    add x9, sp, #232
3155; CHECK-NEXT:    ldr b16, [sp, #744]
3156; CHECK-NEXT:    ldr b17, [sp, #480]
3157; CHECK-NEXT:    ld1 { v0.b }[3], [x8]
3158; CHECK-NEXT:    add x8, sp, #376
3159; CHECK-NEXT:    ldr b18, [sp, #936]
3160; CHECK-NEXT:    ld1 { v2.b }[2], [x9]
3161; CHECK-NEXT:    ld1 { v1.b }[3], [x10]
3162; CHECK-NEXT:    add x9, sp, #240
3163; CHECK-NEXT:    add x10, sp, #392
3164; CHECK-NEXT:    ldr b19, [sp, #672]
3165; CHECK-NEXT:    ldr b7, [sp, #16]
3166; CHECK-NEXT:    ld1 { v0.b }[4], [x8]
3167; CHECK-NEXT:    add x8, sp, #112
3168; CHECK-NEXT:    ldr b21, [sp, #1000]
3169; CHECK-NEXT:    ld1 { v2.b }[3], [x9]
3170; CHECK-NEXT:    ld1 { v1.b }[4], [x8]
3171; CHECK-NEXT:    add x8, sp, #416
3172; CHECK-NEXT:    ld1 { v4.b }[1], [x8]
3173; CHECK-NEXT:    add x8, sp, #120
3174; CHECK-NEXT:    add x9, sp, #400
3175; CHECK-NEXT:    ld1 { v0.b }[5], [x11]
3176; CHECK-NEXT:    add x11, sp, #128
3177; CHECK-NEXT:    ldr b22, [sp, #736]
3178; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
3179; CHECK-NEXT:    add x12, sp, #424
3180; CHECK-NEXT:    ld1 { v1.b }[5], [x8]
3181; CHECK-NEXT:    ld1 { v4.b }[2], [x12]
3182; CHECK-NEXT:    add x12, sp, #152
3183; CHECK-NEXT:    add x8, sp, #136
3184; CHECK-NEXT:    ld1 { v5.b }[1], [x12]
3185; CHECK-NEXT:    add x12, sp, #440
3186; CHECK-NEXT:    ld1 { v0.b }[6], [x10]
3187; CHECK-NEXT:    ld1 { v1.b }[6], [x11]
3188; CHECK-NEXT:    add x11, sp, #288
3189; CHECK-NEXT:    add x10, sp, #256
3190; CHECK-NEXT:    ld1 { v4.b }[3], [x13]
3191; CHECK-NEXT:    ld1 { v6.b }[1], [x11]
3192; CHECK-NEXT:    add x11, sp, #296
3193; CHECK-NEXT:    ld1 { v0.b }[7], [x9]
3194; CHECK-NEXT:    add x9, sp, #160
3195; CHECK-NEXT:    ld1 { v2.b }[5], [x10]
3196; CHECK-NEXT:    ld1 { v5.b }[2], [x9]
3197; CHECK-NEXT:    add x10, sp, #168
3198; CHECK-NEXT:    ld1 { v1.b }[7], [x8]
3199; CHECK-NEXT:    ld1 { v4.b }[4], [x12]
3200; CHECK-NEXT:    add x12, sp, #448
3201; CHECK-NEXT:    ld1 { v6.b }[2], [x11]
3202; CHECK-NEXT:    add x11, sp, #304
3203; CHECK-NEXT:    add x8, sp, #464
3204; CHECK-NEXT:    add x13, sp, #768
3205; CHECK-NEXT:    ld1 { v5.b }[3], [x10]
3206; CHECK-NEXT:    add x10, sp, #176
3207; CHECK-NEXT:    add x9, sp, #264
3208; CHECK-NEXT:    ld1 { v4.b }[5], [x12]
3209; CHECK-NEXT:    add x12, sp, #456
3210; CHECK-NEXT:    ld1 { v6.b }[3], [x11]
3211; CHECK-NEXT:    add x11, sp, #760
3212; CHECK-NEXT:    ld1 { v2.b }[6], [x9]
3213; CHECK-NEXT:    add x9, sp, #272
3214; CHECK-NEXT:    ld1 { v5.b }[4], [x10]
3215; CHECK-NEXT:    add x10, sp, #312
3216; CHECK-NEXT:    fmov s3, w0
3217; CHECK-NEXT:    ld1 { v4.b }[6], [x12]
3218; CHECK-NEXT:    ld1 { v6.b }[4], [x10]
3219; CHECK-NEXT:    add x10, sp, #320
3220; CHECK-NEXT:    add x12, sp, #680
3221; CHECK-NEXT:    ld1 { v2.b }[7], [x9]
3222; CHECK-NEXT:    add x9, sp, #184
3223; CHECK-NEXT:    ld1 { v19.b }[1], [x12]
3224; CHECK-NEXT:    add x12, sp, #776
3225; CHECK-NEXT:    ld1 { v5.b }[5], [x9]
3226; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
3227; CHECK-NEXT:    add x8, sp, #752
3228; CHECK-NEXT:    ld1 { v6.b }[5], [x10]
3229; CHECK-NEXT:    ld1 { v16.b }[1], [x8]
3230; CHECK-NEXT:    add x10, sp, #24
3231; CHECK-NEXT:    smull v22.8h, v22.8b, v21.8b
3232; CHECK-NEXT:    ld1 { v7.b }[1], [x10]
3233; CHECK-NEXT:    add x10, sp, #496
3234; CHECK-NEXT:    mov v3.b[1], w1
3235; CHECK-NEXT:    add x9, sp, #192
3236; CHECK-NEXT:    ldr b20, [sp, #472]
3237; CHECK-NEXT:    ldr b23, [sp, #208]
3238; CHECK-NEXT:    ld1 { v16.b }[2], [x11]
3239; CHECK-NEXT:    add x11, sp, #488
3240; CHECK-NEXT:    ld1 { v5.b }[6], [x9]
3241; CHECK-NEXT:    ld1 { v17.b }[1], [x11]
3242; CHECK-NEXT:    add x11, sp, #944
3243; CHECK-NEXT:    add x9, sp, #328
3244; CHECK-NEXT:    ld1 { v18.b }[1], [x11]
3245; CHECK-NEXT:    add x11, sp, #688
3246; CHECK-NEXT:    ld1 { v6.b }[6], [x9]
3247; CHECK-NEXT:    ld1 { v16.b }[3], [x13]
3248; CHECK-NEXT:    ld1 { v19.b }[2], [x11]
3249; CHECK-NEXT:    add x11, sp, #504
3250; CHECK-NEXT:    ld1 { v17.b }[2], [x10]
3251; CHECK-NEXT:    add x10, sp, #952
3252; CHECK-NEXT:    add x13, sp, #784
3253; CHECK-NEXT:    ld1 { v18.b }[2], [x10]
3254; CHECK-NEXT:    add x10, sp, #32
3255; CHECK-NEXT:    add x9, sp, #40
3256; CHECK-NEXT:    ld1 { v16.b }[4], [x12]
3257; CHECK-NEXT:    add x12, sp, #696
3258; CHECK-NEXT:    ld1 { v7.b }[2], [x10]
3259; CHECK-NEXT:    ld1 { v17.b }[3], [x11]
3260; CHECK-NEXT:    add x11, sp, #960
3261; CHECK-NEXT:    ld1 { v19.b }[3], [x12]
3262; CHECK-NEXT:    ld1 { v18.b }[3], [x11]
3263; CHECK-NEXT:    add x10, sp, #512
3264; CHECK-NEXT:    add x11, sp, #704
3265; CHECK-NEXT:    ld1 { v16.b }[5], [x13]
3266; CHECK-NEXT:    add x12, sp, #792
3267; CHECK-NEXT:    sshll v24.4s, v22.4h, #0
3268; CHECK-NEXT:    ld1 { v17.b }[4], [x10]
3269; CHECK-NEXT:    add x10, sp, #968
3270; CHECK-NEXT:    ld1 { v19.b }[4], [x11]
3271; CHECK-NEXT:    ld1 { v18.b }[4], [x10]
3272; CHECK-NEXT:    add x10, sp, #520
3273; CHECK-NEXT:    add x11, sp, #976
3274; CHECK-NEXT:    ld1 { v16.b }[6], [x12]
3275; CHECK-NEXT:    add x12, sp, #712
3276; CHECK-NEXT:    smull v20.8h, v23.8b, v20.8b
3277; CHECK-NEXT:    ld1 { v17.b }[5], [x10]
3278; CHECK-NEXT:    ld1 { v19.b }[5], [x12]
3279; CHECK-NEXT:    add x12, sp, #720
3280; CHECK-NEXT:    ld1 { v18.b }[5], [x11]
3281; CHECK-NEXT:    add x11, sp, #528
3282; CHECK-NEXT:    add x10, sp, #800
3283; CHECK-NEXT:    ld1 { v16.b }[7], [x10]
3284; CHECK-NEXT:    add x10, sp, #536
3285; CHECK-NEXT:    ldr b22, [sp, #872]
3286; CHECK-NEXT:    ld1 { v17.b }[6], [x11]
3287; CHECK-NEXT:    add x11, sp, #984
3288; CHECK-NEXT:    ld1 { v19.b }[6], [x12]
3289; CHECK-NEXT:    ld1 { v18.b }[6], [x11]
3290; CHECK-NEXT:    add x11, sp, #992
3291; CHECK-NEXT:    add x12, sp, #728
3292; CHECK-NEXT:    ldr b23, [sp, #608]
3293; CHECK-NEXT:    ld1 { v7.b }[3], [x9]
3294; CHECK-NEXT:    add x9, sp, #880
3295; CHECK-NEXT:    ld1 { v17.b }[7], [x10]
3296; CHECK-NEXT:    ld1 { v19.b }[7], [x12]
3297; CHECK-NEXT:    add x10, sp, #816
3298; CHECK-NEXT:    ld1 { v18.b }[7], [x11]
3299; CHECK-NEXT:    add x11, sp, #552
3300; CHECK-NEXT:    add x12, sp, #616
3301; CHECK-NEXT:    mov v3.b[2], w2
3302; CHECK-NEXT:    ld1 { v22.b }[1], [x9]
3303; CHECK-NEXT:    ld1 { v23.b }[1], [x12]
3304; CHECK-NEXT:    smull v16.8h, v17.8b, v16.8b
3305; CHECK-NEXT:    add x12, sp, #560
3306; CHECK-NEXT:    add x9, sp, #888
3307; CHECK-NEXT:    smull v17.8h, v19.8b, v18.8b
3308; CHECK-NEXT:    ldr b18, [sp, #808]
3309; CHECK-NEXT:    ldr b19, [sp, #544]
3310; CHECK-NEXT:    add x13, sp, #624
3311; CHECK-NEXT:    ld1 { v22.b }[2], [x9]
3312; CHECK-NEXT:    add x9, sp, #896
3313; CHECK-NEXT:    ld1 { v18.b }[1], [x10]
3314; CHECK-NEXT:    ld1 { v19.b }[1], [x11]
3315; CHECK-NEXT:    add x11, sp, #824
3316; CHECK-NEXT:    add x10, sp, #48
3317; CHECK-NEXT:    ld1 { v23.b }[2], [x13]
3318; CHECK-NEXT:    mov v3.b[3], w3
3319; CHECK-NEXT:    ld1 { v7.b }[4], [x10]
3320; CHECK-NEXT:    add x10, sp, #832
3321; CHECK-NEXT:    ld1 { v22.b }[3], [x9]
3322; CHECK-NEXT:    ld1 { v18.b }[2], [x11]
3323; CHECK-NEXT:    ld1 { v19.b }[2], [x12]
3324; CHECK-NEXT:    add x11, sp, #568
3325; CHECK-NEXT:    add x12, sp, #632
3326; CHECK-NEXT:    add x9, sp, #904
3327; CHECK-NEXT:    add x13, sp, #640
3328; CHECK-NEXT:    ld1 { v23.b }[3], [x12]
3329; CHECK-NEXT:    add x12, sp, #576
3330; CHECK-NEXT:    mov v3.b[4], w4
3331; CHECK-NEXT:    ld1 { v18.b }[3], [x10]
3332; CHECK-NEXT:    ld1 { v19.b }[3], [x11]
3333; CHECK-NEXT:    add x11, sp, #840
3334; CHECK-NEXT:    add x10, sp, #56
3335; CHECK-NEXT:    ld1 { v22.b }[4], [x9]
3336; CHECK-NEXT:    add x9, sp, #912
3337; CHECK-NEXT:    ld1 { v23.b }[4], [x13]
3338; CHECK-NEXT:    ld1 { v7.b }[5], [x10]
3339; CHECK-NEXT:    add x10, sp, #848
3340; CHECK-NEXT:    ld1 { v18.b }[4], [x11]
3341; CHECK-NEXT:    ld1 { v19.b }[4], [x12]
3342; CHECK-NEXT:    add x11, sp, #584
3343; CHECK-NEXT:    add x12, sp, #648
3344; CHECK-NEXT:    mov v3.b[5], w5
3345; CHECK-NEXT:    ld1 { v22.b }[5], [x9]
3346; CHECK-NEXT:    ld1 { v23.b }[5], [x12]
3347; CHECK-NEXT:    add x12, sp, #592
3348; CHECK-NEXT:    movi v21.2d, #0000000000000000
3349; CHECK-NEXT:    ld1 { v18.b }[5], [x10]
3350; CHECK-NEXT:    ld1 { v19.b }[5], [x11]
3351; CHECK-NEXT:    add x11, sp, #856
3352; CHECK-NEXT:    add x9, sp, #920
3353; CHECK-NEXT:    add x13, sp, #656
3354; CHECK-NEXT:    add x10, sp, #64
3355; CHECK-NEXT:    ld1 { v22.b }[6], [x9]
3356; CHECK-NEXT:    ld1 { v23.b }[6], [x13]
3357; CHECK-NEXT:    mov v3.b[6], w6
3358; CHECK-NEXT:    ld1 { v18.b }[6], [x11]
3359; CHECK-NEXT:    ld1 { v19.b }[6], [x12]
3360; CHECK-NEXT:    ld1 { v7.b }[6], [x10]
3361; CHECK-NEXT:    add x10, sp, #864
3362; CHECK-NEXT:    add x11, sp, #600
3363; CHECK-NEXT:    add x9, sp, #928
3364; CHECK-NEXT:    add x12, sp, #664
3365; CHECK-NEXT:    mov v21.s[0], v24.s[0]
3366; CHECK-NEXT:    ld1 { v22.b }[7], [x9]
3367; CHECK-NEXT:    ld1 { v18.b }[7], [x10]
3368; CHECK-NEXT:    ld1 { v19.b }[7], [x11]
3369; CHECK-NEXT:    ld1 { v23.b }[7], [x12]
3370; CHECK-NEXT:    add x8, sp, #200
3371; CHECK-NEXT:    mov v3.b[7], w7
3372; CHECK-NEXT:    add x10, sp, #336
3373; CHECK-NEXT:    ld1 { v5.b }[7], [x8]
3374; CHECK-NEXT:    add x8, sp, #72
3375; CHECK-NEXT:    ld1 { v6.b }[7], [x10]
3376; CHECK-NEXT:    smull v18.8h, v19.8b, v18.8b
3377; CHECK-NEXT:    movi v19.2d, #0000000000000000
3378; CHECK-NEXT:    ld1 { v7.b }[7], [x8]
3379; CHECK-NEXT:    smull v22.8h, v23.8b, v22.8b
3380; CHECK-NEXT:    sshll v20.4s, v20.4h, #0
3381; CHECK-NEXT:    smull v0.8h, v1.8b, v0.8b
3382; CHECK-NEXT:    saddw v1.4s, v21.4s, v16.4h
3383; CHECK-NEXT:    smull v2.8h, v3.8b, v2.8b
3384; CHECK-NEXT:    smull v3.8h, v5.8b, v4.8b
3385; CHECK-NEXT:    smull v4.8h, v7.8b, v6.8b
3386; CHECK-NEXT:    mov v19.s[0], v20.s[0]
3387; CHECK-NEXT:    saddl2 v5.4s, v18.8h, v17.8h
3388; CHECK-NEXT:    saddl v7.4s, v18.4h, v17.4h
3389; CHECK-NEXT:    saddl2 v6.4s, v16.8h, v22.8h
3390; CHECK-NEXT:    saddw v1.4s, v1.4s, v22.4h
3391; CHECK-NEXT:    saddl2 v17.4s, v2.8h, v0.8h
3392; CHECK-NEXT:    saddl2 v16.4s, v4.8h, v3.8h
3393; CHECK-NEXT:    saddl v3.4s, v4.4h, v3.4h
3394; CHECK-NEXT:    saddw v2.4s, v19.4s, v2.4h
3395; CHECK-NEXT:    add v5.4s, v6.4s, v5.4s
3396; CHECK-NEXT:    add v1.4s, v1.4s, v7.4s
3397; CHECK-NEXT:    add v6.4s, v17.4s, v16.4s
3398; CHECK-NEXT:    saddw v0.4s, v2.4s, v0.4h
3399; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
3400; CHECK-NEXT:    add v0.4s, v0.4s, v3.4s
3401; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
3402; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
3403; CHECK-NEXT:    addv s0, v0.4s
3404; CHECK-NEXT:    fmov w0, s0
3405; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
3406; CHECK-NEXT:    ret
3407entry:
; First dot product: sext(%a) * sext(%b), reduced to a scalar.
; NOTE(review): both multiplies carry nuw although their operands are
; sign-extended; confirm the flag is intentional before reusing this IR
; elsewhere. Changing it here would require regenerating the CHECK lines.
3408  %az = sext <33 x i8> %a to <33 x i32>
3409  %bz = sext <33 x i8> %b to <33 x i32>
3410  %m1 = mul nuw nsw <33 x i32> %az, %bz
3411  %r1 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %m1)
; Second dot product: sext(%c) * sext(%d), reduced to a scalar.
3412  %cz = sext <33 x i8> %c to <33 x i32>
3413  %dz = sext <33 x i8> %d to <33 x i32>
3414  %m2 = mul nuw nsw <33 x i32> %cz, %dz
3415  %r2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %m2)
; The result is the sum of the two reductions.
3416  %x = add i32 %r1, %r2
3417  ret i32 %x
3418}
3419
; Sum of two signed add-reductions with no multiply: reduce(sext %a) +
; reduce(sext %c). The signature matches the "double" variant, but %b and %d
; are never referenced in the body.
; The expected code is checked under the shared CHECK prefix. It rebuilds the
; byte vectors from w0-w7 and stack slots with ldr b / ld1 lane loads, widens
; them with sshll, accumulates with saddl/saddl2/saddw, and finishes with a
; single addv; no sdot appears.
3420define i32 @test_sdot_v33i8_double_nomla(<33 x i8> %a, <33 x i8> %b, <33 x i8> %c, <33 x i8> %d) {
3421; CHECK-LABEL: test_sdot_v33i8_double_nomla:
3422; CHECK:       // %bb.0: // %entry
3423; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
3424; CHECK-NEXT:    .cfi_def_cfa_offset 16
3425; CHECK-NEXT:    .cfi_offset w29, -16
3426; CHECK-NEXT:    ldr b0, [sp, #80]
3427; CHECK-NEXT:    add x8, sp, #88
3428; CHECK-NEXT:    ldr b2, [sp, #144]
3429; CHECK-NEXT:    add x9, sp, #152
3430; CHECK-NEXT:    ldr b3, [sp, #16]
3431; CHECK-NEXT:    add x11, sp, #104
3432; CHECK-NEXT:    ld1 { v0.b }[1], [x8]
3433; CHECK-NEXT:    ld1 { v2.b }[1], [x9]
3434; CHECK-NEXT:    add x9, sp, #24
3435; CHECK-NEXT:    add x8, sp, #96
3436; CHECK-NEXT:    ld1 { v3.b }[1], [x9]
3437; CHECK-NEXT:    ldr b5, [sp, #480]
3438; CHECK-NEXT:    fmov s1, w0
3439; CHECK-NEXT:    add x10, sp, #112
3440; CHECK-NEXT:    add x12, sp, #168
3441; CHECK-NEXT:    ld1 { v0.b }[2], [x8]
3442; CHECK-NEXT:    add x8, sp, #160
3443; CHECK-NEXT:    ldr b4, [sp, #608]
3444; CHECK-NEXT:    ld1 { v2.b }[2], [x8]
3445; CHECK-NEXT:    add x8, sp, #32
3446; CHECK-NEXT:    add x13, sp, #496
3447; CHECK-NEXT:    ld1 { v3.b }[2], [x8]
3448; CHECK-NEXT:    mov v1.b[1], w1
3449; CHECK-NEXT:    ldr b6, [sp, #672]
3450; CHECK-NEXT:    ld1 { v0.b }[3], [x11]
3451; CHECK-NEXT:    add x11, sp, #488
3452; CHECK-NEXT:    add x9, sp, #120
3453; CHECK-NEXT:    ld1 { v5.b }[1], [x11]
3454; CHECK-NEXT:    add x11, sp, #40
3455; CHECK-NEXT:    ld1 { v2.b }[3], [x12]
3456; CHECK-NEXT:    ld1 { v3.b }[3], [x11]
3457; CHECK-NEXT:    add x12, sp, #616
3458; CHECK-NEXT:    ldr b16, [sp, #544]
3459; CHECK-NEXT:    ld1 { v0.b }[4], [x10]
3460; CHECK-NEXT:    add x10, sp, #48
3461; CHECK-NEXT:    ld1 { v4.b }[1], [x12]
3462; CHECK-NEXT:    add x12, sp, #176
3463; CHECK-NEXT:    ld1 { v5.b }[2], [x13]
3464; CHECK-NEXT:    add x13, sp, #680
3465; CHECK-NEXT:    ld1 { v3.b }[4], [x10]
3466; CHECK-NEXT:    ld1 { v2.b }[4], [x12]
3467; CHECK-NEXT:    ld1 { v6.b }[1], [x13]
3468; CHECK-NEXT:    add x13, sp, #56
3469; CHECK-NEXT:    ld1 { v0.b }[5], [x9]
3470; CHECK-NEXT:    mov v1.b[2], w2
3471; CHECK-NEXT:    add x8, sp, #128
3472; CHECK-NEXT:    add x14, sp, #184
3473; CHECK-NEXT:    add x11, sp, #136
3474; CHECK-NEXT:    ld1 { v3.b }[5], [x13]
3475; CHECK-NEXT:    add x13, sp, #552
3476; CHECK-NEXT:    ld1 { v2.b }[5], [x14]
3477; CHECK-NEXT:    ld1 { v16.b }[1], [x13]
3478; CHECK-NEXT:    add x14, sp, #624
3479; CHECK-NEXT:    ld1 { v0.b }[6], [x8]
3480; CHECK-NEXT:    add x8, sp, #688
3481; CHECK-NEXT:    add x13, sp, #504
3482; CHECK-NEXT:    ld1 { v4.b }[2], [x14]
3483; CHECK-NEXT:    ld1 { v6.b }[2], [x8]
3484; CHECK-NEXT:    add x8, sp, #560
3485; CHECK-NEXT:    ld1 { v5.b }[3], [x13]
3486; CHECK-NEXT:    ld1 { v16.b }[2], [x8]
3487; CHECK-NEXT:    mov v1.b[3], w3
3488; CHECK-NEXT:    add x9, sp, #64
3489; CHECK-NEXT:    add x15, sp, #632
3490; CHECK-NEXT:    ld1 { v3.b }[6], [x9]
3491; CHECK-NEXT:    ld1 { v0.b }[7], [x11]
3492; CHECK-NEXT:    ld1 { v4.b }[3], [x15]
3493; CHECK-NEXT:    add x8, sp, #696
3494; CHECK-NEXT:    add x9, sp, #568
3495; CHECK-NEXT:    add x11, sp, #512
3496; CHECK-NEXT:    ld1 { v6.b }[3], [x8]
3497; CHECK-NEXT:    ld1 { v16.b }[3], [x9]
3498; CHECK-NEXT:    ld1 { v5.b }[4], [x11]
3499; CHECK-NEXT:    add x8, sp, #640
3500; CHECK-NEXT:    mov v1.b[4], w4
3501; CHECK-NEXT:    ld1 { v4.b }[4], [x8]
3502; CHECK-NEXT:    add x8, sp, #704
3503; CHECK-NEXT:    add x9, sp, #576
3504; CHECK-NEXT:    add x11, sp, #520
3505; CHECK-NEXT:    ld1 { v6.b }[4], [x8]
3506; CHECK-NEXT:    ld1 { v16.b }[4], [x9]
3507; CHECK-NEXT:    ld1 { v5.b }[5], [x11]
3508; CHECK-NEXT:    ldr b18, [sp, #736]
3509; CHECK-NEXT:    add x12, sp, #192
3510; CHECK-NEXT:    ld1 { v2.b }[6], [x12]
3511; CHECK-NEXT:    add x8, sp, #648
3512; CHECK-NEXT:    add x9, sp, #528
3513; CHECK-NEXT:    add x11, sp, #712
3514; CHECK-NEXT:    add x12, sp, #584
3515; CHECK-NEXT:    sshll v18.8h, v18.8b, #0
3516; CHECK-NEXT:    mov v1.b[5], w5
3517; CHECK-NEXT:    ld1 { v6.b }[5], [x11]
3518; CHECK-NEXT:    ld1 { v16.b }[5], [x12]
3519; CHECK-NEXT:    ld1 { v4.b }[5], [x8]
3520; CHECK-NEXT:    ld1 { v5.b }[6], [x9]
3521; CHECK-NEXT:    movi v17.2d, #0000000000000000
3522; CHECK-NEXT:    add x8, sp, #656
3523; CHECK-NEXT:    add x9, sp, #536
3524; CHECK-NEXT:    add x11, sp, #720
3525; CHECK-NEXT:    add x12, sp, #592
3526; CHECK-NEXT:    sshll v18.4s, v18.4h, #0
3527; CHECK-NEXT:    ldr b7, [sp, #208]
3528; CHECK-NEXT:    ld1 { v6.b }[6], [x11]
3529; CHECK-NEXT:    ld1 { v16.b }[6], [x12]
3530; CHECK-NEXT:    ld1 { v4.b }[6], [x8]
3531; CHECK-NEXT:    ld1 { v5.b }[7], [x9]
3532; CHECK-NEXT:    mov v1.b[6], w6
3533; CHECK-NEXT:    sshll v7.8h, v7.8b, #0
3534; CHECK-NEXT:    add x8, sp, #664
3535; CHECK-NEXT:    add x9, sp, #728
3536; CHECK-NEXT:    add x11, sp, #600
3537; CHECK-NEXT:    mov v17.s[0], v18.s[0]
3538; CHECK-NEXT:    ld1 { v6.b }[7], [x9]
3539; CHECK-NEXT:    ld1 { v16.b }[7], [x11]
3540; CHECK-NEXT:    ld1 { v4.b }[7], [x8]
3541; CHECK-NEXT:    sshll v5.8h, v5.8b, #0
3542; CHECK-NEXT:    movi v18.2d, #0000000000000000
3543; CHECK-NEXT:    add x10, sp, #200
3544; CHECK-NEXT:    mov v1.b[7], w7
3545; CHECK-NEXT:    add x9, sp, #72
3546; CHECK-NEXT:    sshll v7.4s, v7.4h, #0
3547; CHECK-NEXT:    ld1 { v2.b }[7], [x10]
3548; CHECK-NEXT:    ld1 { v3.b }[7], [x9]
3549; CHECK-NEXT:    sshll v6.8h, v6.8b, #0
3550; CHECK-NEXT:    sshll v16.8h, v16.8b, #0
3551; CHECK-NEXT:    sshll v4.8h, v4.8b, #0
3552; CHECK-NEXT:    saddw v17.4s, v17.4s, v5.4h
3553; CHECK-NEXT:    sshll v0.8h, v0.8b, #0
3554; CHECK-NEXT:    mov v18.s[0], v7.s[0]
3555; CHECK-NEXT:    sshll v1.8h, v1.8b, #0
3556; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
3557; CHECK-NEXT:    sshll v3.8h, v3.8b, #0
3558; CHECK-NEXT:    saddl2 v7.4s, v16.8h, v6.8h
3559; CHECK-NEXT:    saddl2 v5.4s, v5.8h, v4.8h
3560; CHECK-NEXT:    saddl v6.4s, v16.4h, v6.4h
3561; CHECK-NEXT:    saddw v4.4s, v17.4s, v4.4h
3562; CHECK-NEXT:    saddl2 v17.4s, v1.8h, v0.8h
3563; CHECK-NEXT:    saddl2 v16.4s, v3.8h, v2.8h
3564; CHECK-NEXT:    saddw v1.4s, v18.4s, v1.4h
3565; CHECK-NEXT:    add v5.4s, v5.4s, v7.4s
3566; CHECK-NEXT:    add v4.4s, v4.4s, v6.4s
3567; CHECK-NEXT:    saddl v2.4s, v3.4h, v2.4h
3568; CHECK-NEXT:    add v6.4s, v17.4s, v16.4s
3569; CHECK-NEXT:    saddw v0.4s, v1.4s, v0.4h
3570; CHECK-NEXT:    add v1.4s, v4.4s, v5.4s
3571; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
3572; CHECK-NEXT:    add v1.4s, v6.4s, v1.4s
3573; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
3574; CHECK-NEXT:    addv s0, v0.4s
3575; CHECK-NEXT:    fmov w0, s0
3576; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
3577; CHECK-NEXT:    ret
3578entry:
; Reduce sext(%a) and sext(%c) independently, then add the two scalars.
3579  %az = sext <33 x i8> %a to <33 x i32>
3580  %r1 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %az)
3581  %cz = sext <33 x i8> %c to <33 x i32>
3582  %r2 = call i32 @llvm.vector.reduce.add.v33i32(<33 x i32> %cz)
3583  %x = add i32 %r1, %r2
3584  ret i32 %x
3585}
3586
; Unsigned dot product of two <48 x i8> loads (three 16-byte chunks), added
; to the scalar %sum. The two RUN configurations have separate expectations:
;   CHECK-SD: three udot instructions accumulate into a single zeroed
;             register (v0), followed by one addv and the add of %sum (w2).
;   CHECK-GI: each udot uses its own zeroed accumulator (v0, v1, v2); each is
;             reduced with its own addv and the partial sums are combined
;             with scalar adds before %sum is added.
3587define i32 @test_udot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
3588; CHECK-SD-LABEL: test_udot_v48i8:
3589; CHECK-SD:       // %bb.0: // %entry
3590; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
3591; CHECK-SD-NEXT:    ldr q1, [x0, #32]
3592; CHECK-SD-NEXT:    ldr q2, [x1, #32]
3593; CHECK-SD-NEXT:    udot v0.4s, v2.16b, v1.16b
3594; CHECK-SD-NEXT:    ldp q3, q1, [x0]
3595; CHECK-SD-NEXT:    ldp q4, q2, [x1]
3596; CHECK-SD-NEXT:    udot v0.4s, v4.16b, v3.16b
3597; CHECK-SD-NEXT:    udot v0.4s, v2.16b, v1.16b
3598; CHECK-SD-NEXT:    addv s0, v0.4s
3599; CHECK-SD-NEXT:    fmov w8, s0
3600; CHECK-SD-NEXT:    add w0, w8, w2
3601; CHECK-SD-NEXT:    ret
3602;
3603; CHECK-GI-LABEL: test_udot_v48i8:
3604; CHECK-GI:       // %bb.0: // %entry
3605; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
3606; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
3607; CHECK-GI-NEXT:    ldr q7, [x0, #32]
3608; CHECK-GI-NEXT:    ldp q3, q4, [x0]
3609; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
3610; CHECK-GI-NEXT:    ldp q5, q6, [x1]
3611; CHECK-GI-NEXT:    ldr q16, [x1, #32]
3612; CHECK-GI-NEXT:    udot v0.4s, v5.16b, v3.16b
3613; CHECK-GI-NEXT:    udot v1.4s, v6.16b, v4.16b
3614; CHECK-GI-NEXT:    udot v2.4s, v16.16b, v7.16b
3615; CHECK-GI-NEXT:    addv s0, v0.4s
3616; CHECK-GI-NEXT:    addv s1, v1.4s
3617; CHECK-GI-NEXT:    addv s2, v2.4s
3618; CHECK-GI-NEXT:    fmov w8, s0
3619; CHECK-GI-NEXT:    fmov w9, s1
3620; CHECK-GI-NEXT:    add w8, w8, w9
3621; CHECK-GI-NEXT:    fmov w9, s2
3622; CHECK-GI-NEXT:    add w8, w8, w9
3623; CHECK-GI-NEXT:    add w0, w8, w2
3624; CHECK-GI-NEXT:    ret
3625entry:
; Zero-extend both byte vectors to i32, multiply lane-wise, reduce, then add
; the incoming accumulator %sum.
3626  %0 = load <48 x i8>, ptr %a
3627  %1 = zext <48 x i8> %0 to <48 x i32>
3628  %2 = load <48 x i8>, ptr %b
3629  %3 = zext <48 x i8> %2 to <48 x i32>
3630  %4 = mul nuw nsw <48 x i32> %3, %1
3631  %5 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %4)
3632  %op.extra = add i32 %5, %sum
3633  ret i32 %op.extra
3634}
3635
; Unsigned add-reduction of a <48 x i8> load with no multiply ("nomla").
; Both expectations still use udot, with a splat of 1 (movi v0.16b, #1) as
; the second operand:
;   CHECK-SD: three udot instructions accumulate into one zeroed register
;             (v1), followed by a single addv.
;   CHECK-GI: three zeroed accumulators (v1, v2, v3), one addv each, and
;             scalar adds to combine the partial sums.
3636define i32 @test_udot_v48i8_nomla(ptr nocapture readonly %a1) {
3637; CHECK-SD-LABEL: test_udot_v48i8_nomla:
3638; CHECK-SD:       // %bb.0: // %entry
3639; CHECK-SD-NEXT:    movi v0.16b, #1
3640; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
3641; CHECK-SD-NEXT:    ldr q2, [x0, #32]
3642; CHECK-SD-NEXT:    udot v1.4s, v2.16b, v0.16b
3643; CHECK-SD-NEXT:    ldp q3, q2, [x0]
3644; CHECK-SD-NEXT:    udot v1.4s, v3.16b, v0.16b
3645; CHECK-SD-NEXT:    udot v1.4s, v2.16b, v0.16b
3646; CHECK-SD-NEXT:    addv s0, v1.4s
3647; CHECK-SD-NEXT:    fmov w0, s0
3648; CHECK-SD-NEXT:    ret
3649;
3650; CHECK-GI-LABEL: test_udot_v48i8_nomla:
3651; CHECK-GI:       // %bb.0: // %entry
3652; CHECK-GI-NEXT:    movi v0.16b, #1
3653; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
3654; CHECK-GI-NEXT:    ldr q6, [x0, #32]
3655; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
3656; CHECK-GI-NEXT:    ldp q4, q5, [x0]
3657; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
3658; CHECK-GI-NEXT:    udot v1.4s, v4.16b, v0.16b
3659; CHECK-GI-NEXT:    udot v2.4s, v5.16b, v0.16b
3660; CHECK-GI-NEXT:    udot v3.4s, v6.16b, v0.16b
3661; CHECK-GI-NEXT:    addv s0, v1.4s
3662; CHECK-GI-NEXT:    addv s1, v2.4s
3663; CHECK-GI-NEXT:    addv s2, v3.4s
3664; CHECK-GI-NEXT:    fmov w8, s0
3665; CHECK-GI-NEXT:    fmov w9, s1
3666; CHECK-GI-NEXT:    add w8, w8, w9
3667; CHECK-GI-NEXT:    fmov w9, s2
3668; CHECK-GI-NEXT:    add w0, w8, w9
3669; CHECK-GI-NEXT:    ret
3670entry:
; Load 48 bytes, zero-extend each to i32, and sum all 48 lanes.
3671  %0 = load <48 x i8>, ptr %a1
3672  %1 = zext <48 x i8> %0 to <48 x i32>
3673  %2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %1)
3674  ret i32 %2
3675}
; Signed dot product of two <48 x i8> loads (three 16-byte chunks), added to
; the scalar %sum. The two RUN configurations have separate expectations:
;   CHECK-SD: three sdot instructions accumulate into a single zeroed
;             register (v0), followed by one addv and the add of %sum (w2).
;   CHECK-GI: each sdot uses its own zeroed accumulator (v0, v1, v2); each is
;             reduced with its own addv and the partial sums are combined
;             with scalar adds before %sum is added.
3676define i32 @test_sdot_v48i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
3677; CHECK-SD-LABEL: test_sdot_v48i8:
3678; CHECK-SD:       // %bb.0: // %entry
3679; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
3680; CHECK-SD-NEXT:    ldr q1, [x0, #32]
3681; CHECK-SD-NEXT:    ldr q2, [x1, #32]
3682; CHECK-SD-NEXT:    sdot v0.4s, v2.16b, v1.16b
3683; CHECK-SD-NEXT:    ldp q3, q1, [x0]
3684; CHECK-SD-NEXT:    ldp q4, q2, [x1]
3685; CHECK-SD-NEXT:    sdot v0.4s, v4.16b, v3.16b
3686; CHECK-SD-NEXT:    sdot v0.4s, v2.16b, v1.16b
3687; CHECK-SD-NEXT:    addv s0, v0.4s
3688; CHECK-SD-NEXT:    fmov w8, s0
3689; CHECK-SD-NEXT:    add w0, w8, w2
3690; CHECK-SD-NEXT:    ret
3691;
3692; CHECK-GI-LABEL: test_sdot_v48i8:
3693; CHECK-GI:       // %bb.0: // %entry
3694; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
3695; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
3696; CHECK-GI-NEXT:    ldr q7, [x0, #32]
3697; CHECK-GI-NEXT:    ldp q3, q4, [x0]
3698; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
3699; CHECK-GI-NEXT:    ldp q5, q6, [x1]
3700; CHECK-GI-NEXT:    ldr q16, [x1, #32]
3701; CHECK-GI-NEXT:    sdot v0.4s, v5.16b, v3.16b
3702; CHECK-GI-NEXT:    sdot v1.4s, v6.16b, v4.16b
3703; CHECK-GI-NEXT:    sdot v2.4s, v16.16b, v7.16b
3704; CHECK-GI-NEXT:    addv s0, v0.4s
3705; CHECK-GI-NEXT:    addv s1, v1.4s
3706; CHECK-GI-NEXT:    addv s2, v2.4s
3707; CHECK-GI-NEXT:    fmov w8, s0
3708; CHECK-GI-NEXT:    fmov w9, s1
3709; CHECK-GI-NEXT:    add w8, w8, w9
3710; CHECK-GI-NEXT:    fmov w9, s2
3711; CHECK-GI-NEXT:    add w8, w8, w9
3712; CHECK-GI-NEXT:    add w0, w8, w2
3713; CHECK-GI-NEXT:    ret
3714entry:
; Sign-extend both byte vectors to i32, multiply lane-wise, reduce, then add
; the incoming accumulator %sum.
3715  %0 = load <48 x i8>, ptr %a
3716  %1 = sext <48 x i8> %0 to <48 x i32>
3717  %2 = load <48 x i8>, ptr %b
3718  %3 = sext <48 x i8> %2 to <48 x i32>
3719  %4 = mul nsw <48 x i32> %3, %1
3720  %5 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %4)
3721  %op.extra = add nsw i32 %5, %sum
3722  ret i32 %op.extra
3723}
3724
3725define i32 @test_sdot_v48i8_double(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 x i8> %d) {
3726; CHECK-SD-LABEL: test_sdot_v48i8_double:
3727; CHECK-SD:       // %bb.0: // %entry
3728; CHECK-SD-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
3729; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
3730; CHECK-SD-NEXT:    .cfi_offset w29, -16
3731; CHECK-SD-NEXT:    ldr b3, [sp, #592]
3732; CHECK-SD-NEXT:    add x8, sp, #600
3733; CHECK-SD-NEXT:    ldr b6, [sp, #208]
3734; CHECK-SD-NEXT:    ldr b0, [sp, #336]
3735; CHECK-SD-NEXT:    add x9, sp, #344
3736; CHECK-SD-NEXT:    ldr b2, [sp, #464]
3737; CHECK-SD-NEXT:    ld1 { v3.b }[1], [x8]
3738; CHECK-SD-NEXT:    add x8, sp, #216
3739; CHECK-SD-NEXT:    add x10, sp, #624
3740; CHECK-SD-NEXT:    ld1 { v6.b }[1], [x8]
3741; CHECK-SD-NEXT:    add x8, sp, #608
3742; CHECK-SD-NEXT:    ld1 { v0.b }[1], [x9]
3743; CHECK-SD-NEXT:    add x9, sp, #232
3744; CHECK-SD-NEXT:    fmov s1, w0
3745; CHECK-SD-NEXT:    ldr b7, [sp, #1360]
3746; CHECK-SD-NEXT:    ld1 { v3.b }[2], [x8]
3747; CHECK-SD-NEXT:    add x8, sp, #224
3748; CHECK-SD-NEXT:    add x11, sp, #648
3749; CHECK-SD-NEXT:    ld1 { v6.b }[2], [x8]
3750; CHECK-SD-NEXT:    add x8, sp, #616
3751; CHECK-SD-NEXT:    add x12, sp, #376
3752; CHECK-SD-NEXT:    mov v1.b[1], w1
3753; CHECK-SD-NEXT:    ldr b16, [sp, #976]
3754; CHECK-SD-NEXT:    add x14, sp, #288
3755; CHECK-SD-NEXT:    ld1 { v3.b }[3], [x8]
3756; CHECK-SD-NEXT:    add x8, sp, #632
3757; CHECK-SD-NEXT:    add x15, sp, #408
3758; CHECK-SD-NEXT:    ld1 { v6.b }[3], [x9]
3759; CHECK-SD-NEXT:    add x9, sp, #472
3760; CHECK-SD-NEXT:    add x13, sp, #696
3761; CHECK-SD-NEXT:    ld1 { v2.b }[1], [x9]
3762; CHECK-SD-NEXT:    add x9, sp, #240
3763; CHECK-SD-NEXT:    add x16, sp, #448
3764; CHECK-SD-NEXT:    ld1 { v3.b }[4], [x10]
3765; CHECK-SD-NEXT:    add x10, sp, #352
3766; CHECK-SD-NEXT:    mov v1.b[2], w2
3767; CHECK-SD-NEXT:    ld1 { v6.b }[4], [x9]
3768; CHECK-SD-NEXT:    ld1 { v0.b }[2], [x10]
3769; CHECK-SD-NEXT:    add x10, sp, #1368
3770; CHECK-SD-NEXT:    ld1 { v7.b }[1], [x10]
3771; CHECK-SD-NEXT:    add x10, sp, #248
3772; CHECK-SD-NEXT:    add x9, sp, #640
3773; CHECK-SD-NEXT:    ld1 { v3.b }[5], [x8]
3774; CHECK-SD-NEXT:    add x8, sp, #656
3775; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
3776; CHECK-SD-NEXT:    ld1 { v6.b }[5], [x10]
3777; CHECK-SD-NEXT:    add x10, sp, #360
3778; CHECK-SD-NEXT:    mov v1.b[3], w3
3779; CHECK-SD-NEXT:    ld1 { v0.b }[3], [x10]
3780; CHECK-SD-NEXT:    add x10, sp, #256
3781; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
3782; CHECK-SD-NEXT:    ld1 { v3.b }[6], [x9]
3783; CHECK-SD-NEXT:    add x9, sp, #368
3784; CHECK-SD-NEXT:    ldr b17, [sp, #720]
3785; CHECK-SD-NEXT:    ld1 { v6.b }[6], [x10]
3786; CHECK-SD-NEXT:    add x10, sp, #984
3787; CHECK-SD-NEXT:    ld1 { v0.b }[4], [x9]
3788; CHECK-SD-NEXT:    ld1 { v16.b }[1], [x10]
3789; CHECK-SD-NEXT:    add x10, sp, #664
3790; CHECK-SD-NEXT:    ld1 { v3.b }[7], [x11]
3791; CHECK-SD-NEXT:    add x11, sp, #264
3792; CHECK-SD-NEXT:    mov v1.b[4], w4
3793; CHECK-SD-NEXT:    ld1 { v6.b }[7], [x11]
3794; CHECK-SD-NEXT:    add x9, sp, #672
3795; CHECK-SD-NEXT:    add x11, sp, #680
3796; CHECK-SD-NEXT:    ld1 { v0.b }[5], [x12]
3797; CHECK-SD-NEXT:    add x12, sp, #480
3798; CHECK-SD-NEXT:    ld1 { v2.b }[2], [x12]
3799; CHECK-SD-NEXT:    add x12, sp, #272
3800; CHECK-SD-NEXT:    ld1 { v3.b }[8], [x8]
3801; CHECK-SD-NEXT:    ld1 { v6.b }[8], [x12]
3802; CHECK-SD-NEXT:    add x12, sp, #384
3803; CHECK-SD-NEXT:    mov v1.b[5], w5
3804; CHECK-SD-NEXT:    ld1 { v0.b }[6], [x12]
3805; CHECK-SD-NEXT:    add x12, sp, #280
3806; CHECK-SD-NEXT:    add x8, sp, #688
3807; CHECK-SD-NEXT:    ld1 { v3.b }[9], [x10]
3808; CHECK-SD-NEXT:    add x10, sp, #1376
3809; CHECK-SD-NEXT:    ld1 { v7.b }[2], [x10]
3810; CHECK-SD-NEXT:    add x10, sp, #392
3811; CHECK-SD-NEXT:    ld1 { v6.b }[9], [x12]
3812; CHECK-SD-NEXT:    ld1 { v0.b }[7], [x10]
3813; CHECK-SD-NEXT:    mov v1.b[6], w6
3814; CHECK-SD-NEXT:    add x12, sp, #704
3815; CHECK-SD-NEXT:    ld1 { v3.b }[10], [x9]
3816; CHECK-SD-NEXT:    add x9, sp, #400
3817; CHECK-SD-NEXT:    add x10, sp, #712
3818; CHECK-SD-NEXT:    ld1 { v6.b }[10], [x14]
3819; CHECK-SD-NEXT:    add x14, sp, #992
3820; CHECK-SD-NEXT:    ld1 { v0.b }[8], [x9]
3821; CHECK-SD-NEXT:    ld1 { v16.b }[2], [x14]
3822; CHECK-SD-NEXT:    add x14, sp, #296
3823; CHECK-SD-NEXT:    ld1 { v3.b }[11], [x11]
3824; CHECK-SD-NEXT:    add x9, sp, #304
3825; CHECK-SD-NEXT:    add x11, sp, #312
3826; CHECK-SD-NEXT:    ld1 { v6.b }[11], [x14]
3827; CHECK-SD-NEXT:    mov v1.b[7], w7
3828; CHECK-SD-NEXT:    add x14, sp, #320
3829; CHECK-SD-NEXT:    ld1 { v0.b }[9], [x15]
3830; CHECK-SD-NEXT:    add x15, sp, #328
3831; CHECK-SD-NEXT:    ld1 { v3.b }[12], [x8]
3832; CHECK-SD-NEXT:    add x8, sp, #416
3833; CHECK-SD-NEXT:    ld1 { v6.b }[12], [x9]
3834; CHECK-SD-NEXT:    add x9, sp, #1384
3835; CHECK-SD-NEXT:    ld1 { v0.b }[10], [x8]
3836; CHECK-SD-NEXT:    ld1 { v7.b }[3], [x9]
3837; CHECK-SD-NEXT:    add x9, sp, #424
3838; CHECK-SD-NEXT:    ld1 { v3.b }[13], [x13]
3839; CHECK-SD-NEXT:    add x8, sp, #432
3840; CHECK-SD-NEXT:    add x13, sp, #440
3841; CHECK-SD-NEXT:    ld1 { v6.b }[13], [x11]
3842; CHECK-SD-NEXT:    add x11, sp, #16
3843; CHECK-SD-NEXT:    ld1 { v0.b }[11], [x9]
3844; CHECK-SD-NEXT:    add x9, sp, #1000
3845; CHECK-SD-NEXT:    ld1 { v1.b }[8], [x11]
3846; CHECK-SD-NEXT:    ld1 { v16.b }[3], [x9]
3847; CHECK-SD-NEXT:    ld1 { v3.b }[14], [x12]
3848; CHECK-SD-NEXT:    add x12, sp, #488
3849; CHECK-SD-NEXT:    ld1 { v6.b }[14], [x14]
3850; CHECK-SD-NEXT:    add x14, sp, #1392
3851; CHECK-SD-NEXT:    ld1 { v2.b }[3], [x12]
3852; CHECK-SD-NEXT:    ld1 { v7.b }[4], [x14]
3853; CHECK-SD-NEXT:    add x11, sp, #1008
3854; CHECK-SD-NEXT:    ld1 { v0.b }[12], [x8]
3855; CHECK-SD-NEXT:    ld1 { v16.b }[4], [x11]
3856; CHECK-SD-NEXT:    add x8, sp, #1400
3857; CHECK-SD-NEXT:    ld1 { v3.b }[15], [x10]
3858; CHECK-SD-NEXT:    add x10, sp, #496
3859; CHECK-SD-NEXT:    add x9, sp, #24
3860; CHECK-SD-NEXT:    ld1 { v6.b }[15], [x15]
3861; CHECK-SD-NEXT:    ld1 { v7.b }[5], [x8]
3862; CHECK-SD-NEXT:    ld1 { v2.b }[4], [x10]
3863; CHECK-SD-NEXT:    add x10, sp, #1016
3864; CHECK-SD-NEXT:    ld1 { v16.b }[5], [x10]
3865; CHECK-SD-NEXT:    ld1 { v0.b }[13], [x13]
3866; CHECK-SD-NEXT:    add x8, sp, #1408
3867; CHECK-SD-NEXT:    ld1 { v1.b }[9], [x9]
3868; CHECK-SD-NEXT:    add x9, sp, #504
3869; CHECK-SD-NEXT:    add x10, sp, #512
3870; CHECK-SD-NEXT:    ld1 { v7.b }[6], [x8]
3871; CHECK-SD-NEXT:    ld1 { v2.b }[5], [x9]
3872; CHECK-SD-NEXT:    add x9, sp, #1024
3873; CHECK-SD-NEXT:    add x8, sp, #32
3874; CHECK-SD-NEXT:    ld1 { v16.b }[6], [x9]
3875; CHECK-SD-NEXT:    ld1 { v0.b }[14], [x16]
3876; CHECK-SD-NEXT:    ld1 { v1.b }[10], [x8]
3877; CHECK-SD-NEXT:    add x8, sp, #1416
3878; CHECK-SD-NEXT:    add x9, sp, #456
3879; CHECK-SD-NEXT:    ld1 { v7.b }[7], [x8]
3880; CHECK-SD-NEXT:    ld1 { v2.b }[6], [x10]
3881; CHECK-SD-NEXT:    add x10, sp, #1032
3882; CHECK-SD-NEXT:    add x8, sp, #40
3883; CHECK-SD-NEXT:    ld1 { v16.b }[7], [x10]
3884; CHECK-SD-NEXT:    ld1 { v0.b }[15], [x9]
3885; CHECK-SD-NEXT:    ld1 { v1.b }[11], [x8]
3886; CHECK-SD-NEXT:    add x8, sp, #1424
3887; CHECK-SD-NEXT:    add x9, sp, #520
3888; CHECK-SD-NEXT:    ld1 { v7.b }[8], [x8]
3889; CHECK-SD-NEXT:    ld1 { v2.b }[7], [x9]
3890; CHECK-SD-NEXT:    add x9, sp, #1040
3891; CHECK-SD-NEXT:    add x8, sp, #48
3892; CHECK-SD-NEXT:    ld1 { v16.b }[8], [x9]
3893; CHECK-SD-NEXT:    add x10, sp, #528
3894; CHECK-SD-NEXT:    ld1 { v1.b }[12], [x8]
3895; CHECK-SD-NEXT:    add x8, sp, #1432
3896; CHECK-SD-NEXT:    sdot v5.4s, v6.16b, v3.16b
3897; CHECK-SD-NEXT:    ld1 { v7.b }[9], [x8]
3898; CHECK-SD-NEXT:    ld1 { v2.b }[8], [x10]
3899; CHECK-SD-NEXT:    add x8, sp, #1048
3900; CHECK-SD-NEXT:    ldr b3, [sp, #80]
3901; CHECK-SD-NEXT:    ld1 { v16.b }[9], [x8]
3902; CHECK-SD-NEXT:    add x10, sp, #88
3903; CHECK-SD-NEXT:    add x8, sp, #536
3904; CHECK-SD-NEXT:    add x11, sp, #1440
3905; CHECK-SD-NEXT:    add x9, sp, #56
3906; CHECK-SD-NEXT:    ld1 { v3.b }[1], [x10]
3907; CHECK-SD-NEXT:    ld1 { v2.b }[9], [x8]
3908; CHECK-SD-NEXT:    add x8, sp, #1056
3909; CHECK-SD-NEXT:    ld1 { v7.b }[10], [x11]
3910; CHECK-SD-NEXT:    ld1 { v16.b }[10], [x8]
3911; CHECK-SD-NEXT:    ld1 { v1.b }[13], [x9]
3912; CHECK-SD-NEXT:    add x9, sp, #96
3913; CHECK-SD-NEXT:    add x8, sp, #544
3914; CHECK-SD-NEXT:    add x10, sp, #1448
3915; CHECK-SD-NEXT:    ld1 { v3.b }[2], [x9]
3916; CHECK-SD-NEXT:    ld1 { v2.b }[10], [x8]
3917; CHECK-SD-NEXT:    add x8, sp, #1064
3918; CHECK-SD-NEXT:    ld1 { v7.b }[11], [x10]
3919; CHECK-SD-NEXT:    ld1 { v16.b }[11], [x8]
3920; CHECK-SD-NEXT:    add x10, sp, #104
3921; CHECK-SD-NEXT:    add x8, sp, #552
3922; CHECK-SD-NEXT:    add x11, sp, #1456
3923; CHECK-SD-NEXT:    add x9, sp, #64
3924; CHECK-SD-NEXT:    ld1 { v3.b }[3], [x10]
3925; CHECK-SD-NEXT:    ld1 { v2.b }[11], [x8]
3926; CHECK-SD-NEXT:    add x8, sp, #1072
3927; CHECK-SD-NEXT:    ld1 { v7.b }[12], [x11]
3928; CHECK-SD-NEXT:    ld1 { v16.b }[12], [x8]
3929; CHECK-SD-NEXT:    ld1 { v1.b }[14], [x9]
3930; CHECK-SD-NEXT:    add x9, sp, #112
3931; CHECK-SD-NEXT:    add x8, sp, #560
3932; CHECK-SD-NEXT:    add x10, sp, #1464
3933; CHECK-SD-NEXT:    ld1 { v3.b }[4], [x9]
3934; CHECK-SD-NEXT:    ld1 { v2.b }[12], [x8]
3935; CHECK-SD-NEXT:    add x8, sp, #1080
3936; CHECK-SD-NEXT:    ld1 { v7.b }[13], [x10]
3937; CHECK-SD-NEXT:    ld1 { v16.b }[13], [x8]
3938; CHECK-SD-NEXT:    add x10, sp, #120
3939; CHECK-SD-NEXT:    add x8, sp, #568
3940; CHECK-SD-NEXT:    add x11, sp, #1472
3941; CHECK-SD-NEXT:    add x9, sp, #72
3942; CHECK-SD-NEXT:    ld1 { v3.b }[5], [x10]
3943; CHECK-SD-NEXT:    ld1 { v2.b }[13], [x8]
3944; CHECK-SD-NEXT:    add x8, sp, #1088
3945; CHECK-SD-NEXT:    ld1 { v7.b }[14], [x11]
3946; CHECK-SD-NEXT:    ld1 { v16.b }[14], [x8]
3947; CHECK-SD-NEXT:    ld1 { v1.b }[15], [x9]
3948; CHECK-SD-NEXT:    add x9, sp, #128
3949; CHECK-SD-NEXT:    ldr b6, [sp, #1104]
3950; CHECK-SD-NEXT:    add x10, sp, #1480
3951; CHECK-SD-NEXT:    ld1 { v3.b }[6], [x9]
3952; CHECK-SD-NEXT:    add x8, sp, #1096
3953; CHECK-SD-NEXT:    add x9, sp, #1112
3954; CHECK-SD-NEXT:    ld1 { v7.b }[15], [x10]
3955; CHECK-SD-NEXT:    ld1 { v16.b }[15], [x8]
3956; CHECK-SD-NEXT:    ld1 { v6.b }[1], [x9]
3957; CHECK-SD-NEXT:    add x8, sp, #728
3958; CHECK-SD-NEXT:    add x9, sp, #576
3959; CHECK-SD-NEXT:    add x10, sp, #136
3960; CHECK-SD-NEXT:    ld1 { v17.b }[1], [x8]
3961; CHECK-SD-NEXT:    add x8, sp, #1120
3962; CHECK-SD-NEXT:    ld1 { v2.b }[14], [x9]
3963; CHECK-SD-NEXT:    sdot v4.4s, v16.16b, v7.16b
3964; CHECK-SD-NEXT:    ld1 { v6.b }[2], [x8]
3965; CHECK-SD-NEXT:    add x8, sp, #736
3966; CHECK-SD-NEXT:    ldr b7, [sp, #1232]
3967; CHECK-SD-NEXT:    ldr b16, [sp, #848]
3968; CHECK-SD-NEXT:    ld1 { v3.b }[7], [x10]
3969; CHECK-SD-NEXT:    ld1 { v17.b }[2], [x8]
3970; CHECK-SD-NEXT:    add x9, sp, #1240
3971; CHECK-SD-NEXT:    add x10, sp, #856
3972; CHECK-SD-NEXT:    ld1 { v7.b }[1], [x9]
3973; CHECK-SD-NEXT:    ld1 { v16.b }[1], [x10]
3974; CHECK-SD-NEXT:    add x8, sp, #1128
3975; CHECK-SD-NEXT:    add x11, sp, #744
3976; CHECK-SD-NEXT:    ld1 { v6.b }[3], [x8]
3977; CHECK-SD-NEXT:    add x10, sp, #1248
3978; CHECK-SD-NEXT:    ld1 { v17.b }[3], [x11]
3979; CHECK-SD-NEXT:    add x11, sp, #864
3980; CHECK-SD-NEXT:    add x9, sp, #144
3981; CHECK-SD-NEXT:    ld1 { v7.b }[2], [x10]
3982; CHECK-SD-NEXT:    ld1 { v16.b }[2], [x11]
3983; CHECK-SD-NEXT:    add x8, sp, #1136
3984; CHECK-SD-NEXT:    add x12, sp, #752
3985; CHECK-SD-NEXT:    ld1 { v3.b }[8], [x9]
3986; CHECK-SD-NEXT:    ld1 { v6.b }[4], [x8]
3987; CHECK-SD-NEXT:    ld1 { v17.b }[4], [x12]
3988; CHECK-SD-NEXT:    add x9, sp, #1256
3989; CHECK-SD-NEXT:    add x10, sp, #872
3990; CHECK-SD-NEXT:    ld1 { v7.b }[3], [x9]
3991; CHECK-SD-NEXT:    ld1 { v16.b }[3], [x10]
3992; CHECK-SD-NEXT:    add x8, sp, #1144
3993; CHECK-SD-NEXT:    add x11, sp, #760
3994; CHECK-SD-NEXT:    ld1 { v6.b }[5], [x8]
3995; CHECK-SD-NEXT:    add x10, sp, #1264
3996; CHECK-SD-NEXT:    ld1 { v17.b }[5], [x11]
3997; CHECK-SD-NEXT:    add x11, sp, #880
3998; CHECK-SD-NEXT:    add x9, sp, #152
3999; CHECK-SD-NEXT:    ld1 { v7.b }[4], [x10]
4000; CHECK-SD-NEXT:    ld1 { v16.b }[4], [x11]
4001; CHECK-SD-NEXT:    add x8, sp, #1152
4002; CHECK-SD-NEXT:    add x12, sp, #768
4003; CHECK-SD-NEXT:    ld1 { v3.b }[9], [x9]
4004; CHECK-SD-NEXT:    ld1 { v6.b }[6], [x8]
4005; CHECK-SD-NEXT:    ld1 { v17.b }[6], [x12]
4006; CHECK-SD-NEXT:    add x9, sp, #1272
4007; CHECK-SD-NEXT:    add x10, sp, #888
4008; CHECK-SD-NEXT:    ld1 { v7.b }[5], [x9]
4009; CHECK-SD-NEXT:    ld1 { v16.b }[5], [x10]
4010; CHECK-SD-NEXT:    add x8, sp, #1160
4011; CHECK-SD-NEXT:    add x11, sp, #776
4012; CHECK-SD-NEXT:    ld1 { v6.b }[7], [x8]
4013; CHECK-SD-NEXT:    add x10, sp, #1280
4014; CHECK-SD-NEXT:    ld1 { v17.b }[7], [x11]
4015; CHECK-SD-NEXT:    add x11, sp, #896
4016; CHECK-SD-NEXT:    add x9, sp, #160
4017; CHECK-SD-NEXT:    ld1 { v7.b }[6], [x10]
4018; CHECK-SD-NEXT:    ld1 { v16.b }[6], [x11]
4019; CHECK-SD-NEXT:    add x8, sp, #1168
4020; CHECK-SD-NEXT:    add x12, sp, #784
4021; CHECK-SD-NEXT:    ld1 { v3.b }[10], [x9]
4022; CHECK-SD-NEXT:    ld1 { v6.b }[8], [x8]
4023; CHECK-SD-NEXT:    ld1 { v17.b }[8], [x12]
4024; CHECK-SD-NEXT:    add x9, sp, #1288
4025; CHECK-SD-NEXT:    add x10, sp, #904
4026; CHECK-SD-NEXT:    ld1 { v7.b }[7], [x9]
4027; CHECK-SD-NEXT:    ld1 { v16.b }[7], [x10]
4028; CHECK-SD-NEXT:    add x8, sp, #1176
4029; CHECK-SD-NEXT:    add x11, sp, #792
4030; CHECK-SD-NEXT:    ld1 { v6.b }[9], [x8]
4031; CHECK-SD-NEXT:    add x10, sp, #1296
4032; CHECK-SD-NEXT:    ld1 { v17.b }[9], [x11]
4033; CHECK-SD-NEXT:    add x11, sp, #912
4034; CHECK-SD-NEXT:    add x9, sp, #168
4035; CHECK-SD-NEXT:    ld1 { v7.b }[8], [x10]
4036; CHECK-SD-NEXT:    ld1 { v16.b }[8], [x11]
4037; CHECK-SD-NEXT:    add x8, sp, #1184
4038; CHECK-SD-NEXT:    add x12, sp, #800
4039; CHECK-SD-NEXT:    ld1 { v3.b }[11], [x9]
4040; CHECK-SD-NEXT:    ld1 { v6.b }[10], [x8]
4041; CHECK-SD-NEXT:    ld1 { v17.b }[10], [x12]
4042; CHECK-SD-NEXT:    add x9, sp, #1304
4043; CHECK-SD-NEXT:    add x10, sp, #920
4044; CHECK-SD-NEXT:    ld1 { v7.b }[9], [x9]
4045; CHECK-SD-NEXT:    ld1 { v16.b }[9], [x10]
4046; CHECK-SD-NEXT:    add x8, sp, #1192
4047; CHECK-SD-NEXT:    add x11, sp, #808
4048; CHECK-SD-NEXT:    ld1 { v6.b }[11], [x8]
4049; CHECK-SD-NEXT:    add x10, sp, #1312
4050; CHECK-SD-NEXT:    ld1 { v17.b }[11], [x11]
4051; CHECK-SD-NEXT:    add x11, sp, #928
4052; CHECK-SD-NEXT:    add x9, sp, #176
4053; CHECK-SD-NEXT:    ld1 { v7.b }[10], [x10]
4054; CHECK-SD-NEXT:    ld1 { v16.b }[10], [x11]
4055; CHECK-SD-NEXT:    add x8, sp, #1200
4056; CHECK-SD-NEXT:    add x12, sp, #816
4057; CHECK-SD-NEXT:    ld1 { v3.b }[12], [x9]
4058; CHECK-SD-NEXT:    ld1 { v6.b }[12], [x8]
4059; CHECK-SD-NEXT:    ld1 { v17.b }[12], [x12]
4060; CHECK-SD-NEXT:    add x9, sp, #1320
4061; CHECK-SD-NEXT:    add x10, sp, #936
4062; CHECK-SD-NEXT:    ld1 { v7.b }[11], [x9]
4063; CHECK-SD-NEXT:    ld1 { v16.b }[11], [x10]
4064; CHECK-SD-NEXT:    add x8, sp, #1208
4065; CHECK-SD-NEXT:    add x11, sp, #824
4066; CHECK-SD-NEXT:    ld1 { v6.b }[13], [x8]
4067; CHECK-SD-NEXT:    add x10, sp, #1328
4068; CHECK-SD-NEXT:    ld1 { v17.b }[13], [x11]
4069; CHECK-SD-NEXT:    add x11, sp, #944
4070; CHECK-SD-NEXT:    add x9, sp, #184
4071; CHECK-SD-NEXT:    ld1 { v7.b }[12], [x10]
4072; CHECK-SD-NEXT:    ld1 { v16.b }[12], [x11]
4073; CHECK-SD-NEXT:    add x8, sp, #1216
4074; CHECK-SD-NEXT:    add x12, sp, #832
4075; CHECK-SD-NEXT:    ld1 { v3.b }[13], [x9]
4076; CHECK-SD-NEXT:    ld1 { v6.b }[14], [x8]
4077; CHECK-SD-NEXT:    ld1 { v17.b }[14], [x12]
4078; CHECK-SD-NEXT:    add x9, sp, #1336
4079; CHECK-SD-NEXT:    add x10, sp, #952
4080; CHECK-SD-NEXT:    ld1 { v7.b }[13], [x9]
4081; CHECK-SD-NEXT:    ld1 { v16.b }[13], [x10]
4082; CHECK-SD-NEXT:    add x8, sp, #1224
4083; CHECK-SD-NEXT:    add x11, sp, #840
4084; CHECK-SD-NEXT:    ld1 { v6.b }[15], [x8]
4085; CHECK-SD-NEXT:    add x8, sp, #192
4086; CHECK-SD-NEXT:    ld1 { v17.b }[15], [x11]
4087; CHECK-SD-NEXT:    add x10, sp, #1344
4088; CHECK-SD-NEXT:    add x11, sp, #960
4089; CHECK-SD-NEXT:    ld1 { v3.b }[14], [x8]
4090; CHECK-SD-NEXT:    ld1 { v7.b }[14], [x10]
4091; CHECK-SD-NEXT:    ld1 { v16.b }[14], [x11]
4092; CHECK-SD-NEXT:    add x9, sp, #584
4093; CHECK-SD-NEXT:    sdot v5.4s, v1.16b, v0.16b
4094; CHECK-SD-NEXT:    add x8, sp, #200
4095; CHECK-SD-NEXT:    sdot v4.4s, v17.16b, v6.16b
4096; CHECK-SD-NEXT:    ld1 { v2.b }[15], [x9]
4097; CHECK-SD-NEXT:    add x9, sp, #1352
4098; CHECK-SD-NEXT:    add x10, sp, #968
4099; CHECK-SD-NEXT:    ld1 { v3.b }[15], [x8]
4100; CHECK-SD-NEXT:    ld1 { v7.b }[15], [x9]
4101; CHECK-SD-NEXT:    ld1 { v16.b }[15], [x10]
4102; CHECK-SD-NEXT:    sdot v5.4s, v3.16b, v2.16b
4103; CHECK-SD-NEXT:    sdot v4.4s, v16.16b, v7.16b
4104; CHECK-SD-NEXT:    add v0.4s, v5.4s, v4.4s
4105; CHECK-SD-NEXT:    addv s0, v0.4s
4106; CHECK-SD-NEXT:    fmov w0, s0
4107; CHECK-SD-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
4108; CHECK-SD-NEXT:    ret
4109;
4110; CHECK-GI-LABEL: test_sdot_v48i8_double:
4111; CHECK-GI:       // %bb.0: // %entry
4112; CHECK-GI-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
4113; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
4114; CHECK-GI-NEXT:    .cfi_offset w29, -16
4115; CHECK-GI-NEXT:    ldr w11, [sp, #80]
4116; CHECK-GI-NEXT:    ldr w10, [sp, #208]
4117; CHECK-GI-NEXT:    fmov s0, w0
4118; CHECK-GI-NEXT:    ldr w8, [sp, #88]
4119; CHECK-GI-NEXT:    ldr w12, [sp, #344]
4120; CHECK-GI-NEXT:    movi v20.2d, #0000000000000000
4121; CHECK-GI-NEXT:    fmov s1, w11
4122; CHECK-GI-NEXT:    ldr w11, [sp, #336]
4123; CHECK-GI-NEXT:    fmov s2, w10
4124; CHECK-GI-NEXT:    ldr w10, [sp, #464]
4125; CHECK-GI-NEXT:    ldr w9, [sp, #216]
4126; CHECK-GI-NEXT:    mov v0.b[1], w1
4127; CHECK-GI-NEXT:    fmov s3, w11
4128; CHECK-GI-NEXT:    ldr w11, [sp, #600]
4129; CHECK-GI-NEXT:    movi v21.2d, #0000000000000000
4130; CHECK-GI-NEXT:    mov v1.b[1], w8
4131; CHECK-GI-NEXT:    ldr w8, [sp, #592]
4132; CHECK-GI-NEXT:    fmov s4, w10
4133; CHECK-GI-NEXT:    mov v2.b[1], w9
4134; CHECK-GI-NEXT:    ldr w9, [sp, #472]
4135; CHECK-GI-NEXT:    ldr w10, [sp, #608]
4136; CHECK-GI-NEXT:    mov v3.b[1], w12
4137; CHECK-GI-NEXT:    fmov s5, w8
4138; CHECK-GI-NEXT:    ldr w8, [sp, #96]
4139; CHECK-GI-NEXT:    mov v4.b[1], w9
4140; CHECK-GI-NEXT:    ldr w9, [sp, #224]
4141; CHECK-GI-NEXT:    mov v0.b[2], w2
4142; CHECK-GI-NEXT:    mov v1.b[2], w8
4143; CHECK-GI-NEXT:    ldr w8, [sp, #352]
4144; CHECK-GI-NEXT:    ldr w12, [sp, #848]
4145; CHECK-GI-NEXT:    mov v2.b[2], w9
4146; CHECK-GI-NEXT:    ldr w9, [sp, #480]
4147; CHECK-GI-NEXT:    mov v5.b[1], w11
4148; CHECK-GI-NEXT:    mov v3.b[2], w8
4149; CHECK-GI-NEXT:    ldr w8, [sp, #104]
4150; CHECK-GI-NEXT:    ldr w11, [sp, #16]
4151; CHECK-GI-NEXT:    mov v4.b[2], w9
4152; CHECK-GI-NEXT:    ldr w9, [sp, #232]
4153; CHECK-GI-NEXT:    mov v0.b[3], w3
4154; CHECK-GI-NEXT:    mov v1.b[3], w8
4155; CHECK-GI-NEXT:    ldr w8, [sp, #360]
4156; CHECK-GI-NEXT:    fmov s7, w12
4157; CHECK-GI-NEXT:    mov v2.b[3], w9
4158; CHECK-GI-NEXT:    ldr w9, [sp, #488]
4159; CHECK-GI-NEXT:    mov v5.b[2], w10
4160; CHECK-GI-NEXT:    mov v3.b[3], w8
4161; CHECK-GI-NEXT:    ldr w8, [sp, #112]
4162; CHECK-GI-NEXT:    ldr w10, [sp, #616]
4163; CHECK-GI-NEXT:    mov v4.b[3], w9
4164; CHECK-GI-NEXT:    ldr w9, [sp, #240]
4165; CHECK-GI-NEXT:    mov v0.b[4], w4
4166; CHECK-GI-NEXT:    mov v1.b[4], w8
4167; CHECK-GI-NEXT:    ldr w8, [sp, #368]
4168; CHECK-GI-NEXT:    ldr w12, [sp, #1112]
4169; CHECK-GI-NEXT:    mov v2.b[4], w9
4170; CHECK-GI-NEXT:    ldr w9, [sp, #496]
4171; CHECK-GI-NEXT:    mov v5.b[3], w10
4172; CHECK-GI-NEXT:    mov v3.b[4], w8
4173; CHECK-GI-NEXT:    ldr w8, [sp, #120]
4174; CHECK-GI-NEXT:    ldr w10, [sp, #624]
4175; CHECK-GI-NEXT:    mov v4.b[4], w9
4176; CHECK-GI-NEXT:    ldr w9, [sp, #248]
4177; CHECK-GI-NEXT:    mov v0.b[5], w5
4178; CHECK-GI-NEXT:    mov v1.b[5], w8
4179; CHECK-GI-NEXT:    ldr w8, [sp, #376]
4180; CHECK-GI-NEXT:    movi v22.2d, #0000000000000000
4181; CHECK-GI-NEXT:    mov v2.b[5], w9
4182; CHECK-GI-NEXT:    ldr w9, [sp, #504]
4183; CHECK-GI-NEXT:    mov v5.b[4], w10
4184; CHECK-GI-NEXT:    mov v3.b[5], w8
4185; CHECK-GI-NEXT:    ldr w8, [sp, #128]
4186; CHECK-GI-NEXT:    ldr w10, [sp, #632]
4187; CHECK-GI-NEXT:    mov v4.b[5], w9
4188; CHECK-GI-NEXT:    ldr w9, [sp, #256]
4189; CHECK-GI-NEXT:    mov v0.b[6], w6
4190; CHECK-GI-NEXT:    mov v1.b[6], w8
4191; CHECK-GI-NEXT:    ldr w8, [sp, #384]
4192; CHECK-GI-NEXT:    movi v23.2d, #0000000000000000
4193; CHECK-GI-NEXT:    mov v2.b[6], w9
4194; CHECK-GI-NEXT:    ldr w9, [sp, #512]
4195; CHECK-GI-NEXT:    mov v5.b[5], w10
4196; CHECK-GI-NEXT:    mov v3.b[6], w8
4197; CHECK-GI-NEXT:    ldr w8, [sp, #136]
4198; CHECK-GI-NEXT:    ldr w10, [sp, #640]
4199; CHECK-GI-NEXT:    mov v4.b[6], w9
4200; CHECK-GI-NEXT:    ldr w9, [sp, #264]
4201; CHECK-GI-NEXT:    mov v0.b[7], w7
4202; CHECK-GI-NEXT:    mov v1.b[7], w8
4203; CHECK-GI-NEXT:    ldr w8, [sp, #392]
4204; CHECK-GI-NEXT:    movi v24.2d, #0000000000000000
4205; CHECK-GI-NEXT:    mov v2.b[7], w9
4206; CHECK-GI-NEXT:    ldr w9, [sp, #520]
4207; CHECK-GI-NEXT:    mov v5.b[6], w10
4208; CHECK-GI-NEXT:    mov v3.b[7], w8
4209; CHECK-GI-NEXT:    ldr w8, [sp, #144]
4210; CHECK-GI-NEXT:    ldr w10, [sp, #648]
4211; CHECK-GI-NEXT:    mov v4.b[7], w9
4212; CHECK-GI-NEXT:    ldr w9, [sp, #272]
4213; CHECK-GI-NEXT:    mov v0.b[8], w11
4214; CHECK-GI-NEXT:    mov v1.b[8], w8
4215; CHECK-GI-NEXT:    ldr w8, [sp, #400]
4216; CHECK-GI-NEXT:    ldr w11, [sp, #24]
4217; CHECK-GI-NEXT:    mov v2.b[8], w9
4218; CHECK-GI-NEXT:    ldr w9, [sp, #528]
4219; CHECK-GI-NEXT:    mov v5.b[7], w10
4220; CHECK-GI-NEXT:    mov v3.b[8], w8
4221; CHECK-GI-NEXT:    ldr w8, [sp, #152]
4222; CHECK-GI-NEXT:    ldr w10, [sp, #656]
4223; CHECK-GI-NEXT:    mov v4.b[8], w9
4224; CHECK-GI-NEXT:    ldr w9, [sp, #280]
4225; CHECK-GI-NEXT:    mov v0.b[9], w11
4226; CHECK-GI-NEXT:    mov v1.b[9], w8
4227; CHECK-GI-NEXT:    ldr w8, [sp, #408]
4228; CHECK-GI-NEXT:    ldr w11, [sp, #32]
4229; CHECK-GI-NEXT:    mov v2.b[9], w9
4230; CHECK-GI-NEXT:    ldr w9, [sp, #536]
4231; CHECK-GI-NEXT:    mov v5.b[8], w10
4232; CHECK-GI-NEXT:    mov v3.b[9], w8
4233; CHECK-GI-NEXT:    ldr w8, [sp, #160]
4234; CHECK-GI-NEXT:    ldr w10, [sp, #664]
4235; CHECK-GI-NEXT:    mov v4.b[9], w9
4236; CHECK-GI-NEXT:    ldr w9, [sp, #288]
4237; CHECK-GI-NEXT:    mov v0.b[10], w11
4238; CHECK-GI-NEXT:    mov v1.b[10], w8
4239; CHECK-GI-NEXT:    ldr w8, [sp, #416]
4240; CHECK-GI-NEXT:    ldr w11, [sp, #40]
4241; CHECK-GI-NEXT:    mov v2.b[10], w9
4242; CHECK-GI-NEXT:    ldr w9, [sp, #544]
4243; CHECK-GI-NEXT:    mov v5.b[9], w10
4244; CHECK-GI-NEXT:    mov v3.b[10], w8
4245; CHECK-GI-NEXT:    ldr w8, [sp, #168]
4246; CHECK-GI-NEXT:    ldr w10, [sp, #672]
4247; CHECK-GI-NEXT:    mov v4.b[10], w9
4248; CHECK-GI-NEXT:    ldr w9, [sp, #296]
4249; CHECK-GI-NEXT:    mov v0.b[11], w11
4250; CHECK-GI-NEXT:    mov v1.b[11], w8
4251; CHECK-GI-NEXT:    ldr w8, [sp, #424]
4252; CHECK-GI-NEXT:    ldr w11, [sp, #48]
4253; CHECK-GI-NEXT:    mov v2.b[11], w9
4254; CHECK-GI-NEXT:    ldr w9, [sp, #552]
4255; CHECK-GI-NEXT:    mov v5.b[10], w10
4256; CHECK-GI-NEXT:    mov v3.b[11], w8
4257; CHECK-GI-NEXT:    ldr w8, [sp, #176]
4258; CHECK-GI-NEXT:    ldr w10, [sp, #680]
4259; CHECK-GI-NEXT:    mov v4.b[11], w9
4260; CHECK-GI-NEXT:    ldr w9, [sp, #304]
4261; CHECK-GI-NEXT:    mov v0.b[12], w11
4262; CHECK-GI-NEXT:    mov v1.b[12], w8
4263; CHECK-GI-NEXT:    ldr w8, [sp, #432]
4264; CHECK-GI-NEXT:    ldr w11, [sp, #56]
4265; CHECK-GI-NEXT:    mov v2.b[12], w9
4266; CHECK-GI-NEXT:    ldr w9, [sp, #560]
4267; CHECK-GI-NEXT:    mov v5.b[11], w10
4268; CHECK-GI-NEXT:    mov v3.b[12], w8
4269; CHECK-GI-NEXT:    ldr w8, [sp, #184]
4270; CHECK-GI-NEXT:    ldr w10, [sp, #688]
4271; CHECK-GI-NEXT:    mov v4.b[12], w9
4272; CHECK-GI-NEXT:    ldr w9, [sp, #312]
4273; CHECK-GI-NEXT:    mov v0.b[13], w11
4274; CHECK-GI-NEXT:    mov v1.b[13], w8
4275; CHECK-GI-NEXT:    ldr w8, [sp, #440]
4276; CHECK-GI-NEXT:    ldr w11, [sp, #64]
4277; CHECK-GI-NEXT:    mov v2.b[13], w9
4278; CHECK-GI-NEXT:    ldr w9, [sp, #568]
4279; CHECK-GI-NEXT:    mov v5.b[12], w10
4280; CHECK-GI-NEXT:    mov v3.b[13], w8
4281; CHECK-GI-NEXT:    ldr w8, [sp, #192]
4282; CHECK-GI-NEXT:    ldr w10, [sp, #696]
4283; CHECK-GI-NEXT:    mov v4.b[13], w9
4284; CHECK-GI-NEXT:    ldr w9, [sp, #320]
4285; CHECK-GI-NEXT:    mov v0.b[14], w11
4286; CHECK-GI-NEXT:    mov v1.b[14], w8
4287; CHECK-GI-NEXT:    ldr w8, [sp, #448]
4288; CHECK-GI-NEXT:    ldr w11, [sp, #72]
4289; CHECK-GI-NEXT:    mov v2.b[14], w9
4290; CHECK-GI-NEXT:    ldr w9, [sp, #576]
4291; CHECK-GI-NEXT:    mov v5.b[13], w10
4292; CHECK-GI-NEXT:    mov v3.b[14], w8
4293; CHECK-GI-NEXT:    ldr w8, [sp, #720]
4294; CHECK-GI-NEXT:    ldr w10, [sp, #704]
4295; CHECK-GI-NEXT:    mov v4.b[14], w9
4296; CHECK-GI-NEXT:    ldr w9, [sp, #728]
4297; CHECK-GI-NEXT:    mov v0.b[15], w11
4298; CHECK-GI-NEXT:    fmov s6, w8
4299; CHECK-GI-NEXT:    ldr w8, [sp, #328]
4300; CHECK-GI-NEXT:    ldr w11, [sp, #456]
4301; CHECK-GI-NEXT:    mov v5.b[14], w10
4302; CHECK-GI-NEXT:    ldr w10, [sp, #200]
4303; CHECK-GI-NEXT:    movi v25.2d, #0000000000000000
4304; CHECK-GI-NEXT:    mov v2.b[15], w8
4305; CHECK-GI-NEXT:    mov v3.b[15], w11
4306; CHECK-GI-NEXT:    ldr w11, [sp, #736]
4307; CHECK-GI-NEXT:    mov v6.b[1], w9
4308; CHECK-GI-NEXT:    ldr w9, [sp, #584]
4309; CHECK-GI-NEXT:    ldr w8, [sp, #856]
4310; CHECK-GI-NEXT:    mov v1.b[15], w10
4311; CHECK-GI-NEXT:    ldr w10, [sp, #712]
4312; CHECK-GI-NEXT:    mov v4.b[15], w9
4313; CHECK-GI-NEXT:    ldr w9, [sp, #976]
4314; CHECK-GI-NEXT:    mov v7.b[1], w8
4315; CHECK-GI-NEXT:    ldr w8, [sp, #1232]
4316; CHECK-GI-NEXT:    mov v5.b[15], w10
4317; CHECK-GI-NEXT:    ldr w10, [sp, #984]
4318; CHECK-GI-NEXT:    mov v6.b[2], w11
4319; CHECK-GI-NEXT:    ldr w11, [sp, #1104]
4320; CHECK-GI-NEXT:    fmov s16, w9
4321; CHECK-GI-NEXT:    ldr w9, [sp, #1360]
4322; CHECK-GI-NEXT:    fmov s18, w8
4323; CHECK-GI-NEXT:    ldr w8, [sp, #1368]
4324; CHECK-GI-NEXT:    fmov s17, w11
4325; CHECK-GI-NEXT:    ldr w11, [sp, #1240]
4326; CHECK-GI-NEXT:    sdot v20.4s, v0.16b, v3.16b
4327; CHECK-GI-NEXT:    mov v16.b[1], w10
4328; CHECK-GI-NEXT:    fmov s19, w9
4329; CHECK-GI-NEXT:    ldr w10, [sp, #864]
4330; CHECK-GI-NEXT:    mov v18.b[1], w11
4331; CHECK-GI-NEXT:    ldr w11, [sp, #992]
4332; CHECK-GI-NEXT:    ldr w9, [sp, #1120]
4333; CHECK-GI-NEXT:    mov v17.b[1], w12
4334; CHECK-GI-NEXT:    mov v7.b[2], w10
4335; CHECK-GI-NEXT:    ldr w10, [sp, #1248]
4336; CHECK-GI-NEXT:    mov v19.b[1], w8
4337; CHECK-GI-NEXT:    ldr w8, [sp, #744]
4338; CHECK-GI-NEXT:    sdot v21.4s, v1.16b, v4.16b
4339; CHECK-GI-NEXT:    mov v16.b[2], w11
4340; CHECK-GI-NEXT:    ldr w11, [sp, #872]
4341; CHECK-GI-NEXT:    addv s0, v20.4s
4342; CHECK-GI-NEXT:    mov v6.b[3], w8
4343; CHECK-GI-NEXT:    ldr w8, [sp, #1000]
4344; CHECK-GI-NEXT:    mov v18.b[2], w10
4345; CHECK-GI-NEXT:    mov v17.b[2], w9
4346; CHECK-GI-NEXT:    ldr w9, [sp, #1376]
4347; CHECK-GI-NEXT:    ldr w10, [sp, #1128]
4348; CHECK-GI-NEXT:    mov v7.b[3], w11
4349; CHECK-GI-NEXT:    ldr w11, [sp, #880]
4350; CHECK-GI-NEXT:    addv s1, v21.4s
4351; CHECK-GI-NEXT:    mov v19.b[2], w9
4352; CHECK-GI-NEXT:    ldr w9, [sp, #752]
4353; CHECK-GI-NEXT:    mov v16.b[3], w8
4354; CHECK-GI-NEXT:    ldr w8, [sp, #1256]
4355; CHECK-GI-NEXT:    sdot v25.4s, v2.16b, v5.16b
4356; CHECK-GI-NEXT:    mov v17.b[3], w10
4357; CHECK-GI-NEXT:    ldr w10, [sp, #1384]
4358; CHECK-GI-NEXT:    mov v6.b[4], w9
4359; CHECK-GI-NEXT:    ldr w9, [sp, #1008]
4360; CHECK-GI-NEXT:    mov v18.b[3], w8
4361; CHECK-GI-NEXT:    ldr w8, [sp, #1136]
4362; CHECK-GI-NEXT:    mov v19.b[3], w10
4363; CHECK-GI-NEXT:    ldr w10, [sp, #760]
4364; CHECK-GI-NEXT:    mov v7.b[4], w11
4365; CHECK-GI-NEXT:    mov v16.b[4], w9
4366; CHECK-GI-NEXT:    ldr w9, [sp, #1264]
4367; CHECK-GI-NEXT:    ldr w11, [sp, #888]
4368; CHECK-GI-NEXT:    mov v17.b[4], w8
4369; CHECK-GI-NEXT:    ldr w8, [sp, #1392]
4370; CHECK-GI-NEXT:    mov v6.b[5], w10
4371; CHECK-GI-NEXT:    ldr w10, [sp, #1016]
4372; CHECK-GI-NEXT:    mov v18.b[4], w9
4373; CHECK-GI-NEXT:    ldr w9, [sp, #1144]
4374; CHECK-GI-NEXT:    mov v19.b[4], w8
4375; CHECK-GI-NEXT:    ldr w8, [sp, #768]
4376; CHECK-GI-NEXT:    mov v7.b[5], w11
4377; CHECK-GI-NEXT:    mov v16.b[5], w10
4378; CHECK-GI-NEXT:    ldr w10, [sp, #1272]
4379; CHECK-GI-NEXT:    ldr w11, [sp, #896]
4380; CHECK-GI-NEXT:    mov v17.b[5], w9
4381; CHECK-GI-NEXT:    ldr w9, [sp, #1400]
4382; CHECK-GI-NEXT:    mov v6.b[6], w8
4383; CHECK-GI-NEXT:    ldr w8, [sp, #1024]
4384; CHECK-GI-NEXT:    mov v18.b[5], w10
4385; CHECK-GI-NEXT:    ldr w10, [sp, #1152]
4386; CHECK-GI-NEXT:    mov v19.b[5], w9
4387; CHECK-GI-NEXT:    ldr w9, [sp, #776]
4388; CHECK-GI-NEXT:    mov v7.b[6], w11
4389; CHECK-GI-NEXT:    mov v16.b[6], w8
4390; CHECK-GI-NEXT:    ldr w8, [sp, #1280]
4391; CHECK-GI-NEXT:    ldr w11, [sp, #904]
4392; CHECK-GI-NEXT:    mov v17.b[6], w10
4393; CHECK-GI-NEXT:    ldr w10, [sp, #1408]
4394; CHECK-GI-NEXT:    mov v6.b[7], w9
4395; CHECK-GI-NEXT:    ldr w9, [sp, #1032]
4396; CHECK-GI-NEXT:    mov v18.b[6], w8
4397; CHECK-GI-NEXT:    ldr w8, [sp, #1160]
4398; CHECK-GI-NEXT:    mov v19.b[6], w10
4399; CHECK-GI-NEXT:    ldr w10, [sp, #784]
4400; CHECK-GI-NEXT:    mov v7.b[7], w11
4401; CHECK-GI-NEXT:    mov v16.b[7], w9
4402; CHECK-GI-NEXT:    ldr w9, [sp, #1288]
4403; CHECK-GI-NEXT:    ldr w11, [sp, #912]
4404; CHECK-GI-NEXT:    mov v17.b[7], w8
4405; CHECK-GI-NEXT:    ldr w8, [sp, #1416]
4406; CHECK-GI-NEXT:    mov v6.b[8], w10
4407; CHECK-GI-NEXT:    ldr w10, [sp, #1040]
4408; CHECK-GI-NEXT:    mov v18.b[7], w9
4409; CHECK-GI-NEXT:    ldr w9, [sp, #1168]
4410; CHECK-GI-NEXT:    mov v19.b[7], w8
4411; CHECK-GI-NEXT:    ldr w8, [sp, #792]
4412; CHECK-GI-NEXT:    mov v7.b[8], w11
4413; CHECK-GI-NEXT:    mov v16.b[8], w10
4414; CHECK-GI-NEXT:    ldr w10, [sp, #1296]
4415; CHECK-GI-NEXT:    ldr w11, [sp, #920]
4416; CHECK-GI-NEXT:    mov v17.b[8], w9
4417; CHECK-GI-NEXT:    ldr w9, [sp, #1424]
4418; CHECK-GI-NEXT:    mov v6.b[9], w8
4419; CHECK-GI-NEXT:    ldr w8, [sp, #1048]
4420; CHECK-GI-NEXT:    mov v18.b[8], w10
4421; CHECK-GI-NEXT:    ldr w10, [sp, #1176]
4422; CHECK-GI-NEXT:    mov v19.b[8], w9
4423; CHECK-GI-NEXT:    ldr w9, [sp, #800]
4424; CHECK-GI-NEXT:    mov v7.b[9], w11
4425; CHECK-GI-NEXT:    mov v16.b[9], w8
4426; CHECK-GI-NEXT:    ldr w8, [sp, #1304]
4427; CHECK-GI-NEXT:    ldr w11, [sp, #928]
4428; CHECK-GI-NEXT:    mov v17.b[9], w10
4429; CHECK-GI-NEXT:    ldr w10, [sp, #1432]
4430; CHECK-GI-NEXT:    mov v6.b[10], w9
4431; CHECK-GI-NEXT:    ldr w9, [sp, #1056]
4432; CHECK-GI-NEXT:    mov v18.b[9], w8
4433; CHECK-GI-NEXT:    ldr w8, [sp, #1184]
4434; CHECK-GI-NEXT:    mov v19.b[9], w10
4435; CHECK-GI-NEXT:    ldr w10, [sp, #808]
4436; CHECK-GI-NEXT:    mov v7.b[10], w11
4437; CHECK-GI-NEXT:    mov v16.b[10], w9
4438; CHECK-GI-NEXT:    ldr w9, [sp, #1312]
4439; CHECK-GI-NEXT:    ldr w11, [sp, #936]
4440; CHECK-GI-NEXT:    mov v17.b[10], w8
4441; CHECK-GI-NEXT:    ldr w8, [sp, #1440]
4442; CHECK-GI-NEXT:    mov v6.b[11], w10
4443; CHECK-GI-NEXT:    ldr w10, [sp, #1064]
4444; CHECK-GI-NEXT:    mov v18.b[10], w9
4445; CHECK-GI-NEXT:    ldr w9, [sp, #1192]
4446; CHECK-GI-NEXT:    mov v19.b[10], w8
4447; CHECK-GI-NEXT:    ldr w8, [sp, #816]
4448; CHECK-GI-NEXT:    mov v7.b[11], w11
4449; CHECK-GI-NEXT:    mov v16.b[11], w10
4450; CHECK-GI-NEXT:    ldr w10, [sp, #1320]
4451; CHECK-GI-NEXT:    ldr w11, [sp, #944]
4452; CHECK-GI-NEXT:    mov v17.b[11], w9
4453; CHECK-GI-NEXT:    ldr w9, [sp, #1448]
4454; CHECK-GI-NEXT:    mov v6.b[12], w8
4455; CHECK-GI-NEXT:    ldr w8, [sp, #1072]
4456; CHECK-GI-NEXT:    mov v18.b[11], w10
4457; CHECK-GI-NEXT:    ldr w10, [sp, #1200]
4458; CHECK-GI-NEXT:    mov v19.b[11], w9
4459; CHECK-GI-NEXT:    ldr w9, [sp, #824]
4460; CHECK-GI-NEXT:    mov v7.b[12], w11
4461; CHECK-GI-NEXT:    mov v16.b[12], w8
4462; CHECK-GI-NEXT:    ldr w8, [sp, #1328]
4463; CHECK-GI-NEXT:    ldr w11, [sp, #952]
4464; CHECK-GI-NEXT:    mov v17.b[12], w10
4465; CHECK-GI-NEXT:    ldr w10, [sp, #1456]
4466; CHECK-GI-NEXT:    mov v6.b[13], w9
4467; CHECK-GI-NEXT:    ldr w9, [sp, #1080]
4468; CHECK-GI-NEXT:    mov v18.b[12], w8
4469; CHECK-GI-NEXT:    ldr w8, [sp, #1208]
4470; CHECK-GI-NEXT:    mov v19.b[12], w10
4471; CHECK-GI-NEXT:    ldr w10, [sp, #832]
4472; CHECK-GI-NEXT:    mov v7.b[13], w11
4473; CHECK-GI-NEXT:    mov v16.b[13], w9
4474; CHECK-GI-NEXT:    ldr w9, [sp, #1336]
4475; CHECK-GI-NEXT:    ldr w11, [sp, #960]
4476; CHECK-GI-NEXT:    mov v17.b[13], w8
4477; CHECK-GI-NEXT:    ldr w8, [sp, #1464]
4478; CHECK-GI-NEXT:    mov v6.b[14], w10
4479; CHECK-GI-NEXT:    ldr w10, [sp, #1088]
4480; CHECK-GI-NEXT:    mov v18.b[13], w9
4481; CHECK-GI-NEXT:    ldr w9, [sp, #1216]
4482; CHECK-GI-NEXT:    mov v19.b[13], w8
4483; CHECK-GI-NEXT:    ldr w8, [sp, #840]
4484; CHECK-GI-NEXT:    mov v7.b[14], w11
4485; CHECK-GI-NEXT:    mov v16.b[14], w10
4486; CHECK-GI-NEXT:    ldr w10, [sp, #1344]
4487; CHECK-GI-NEXT:    ldr w11, [sp, #968]
4488; CHECK-GI-NEXT:    mov v17.b[14], w9
4489; CHECK-GI-NEXT:    mov v6.b[15], w8
4490; CHECK-GI-NEXT:    ldr w8, [sp, #1096]
4491; CHECK-GI-NEXT:    mov v18.b[14], w10
4492; CHECK-GI-NEXT:    ldr w9, [sp, #1472]
4493; CHECK-GI-NEXT:    ldr w10, [sp, #1224]
4494; CHECK-GI-NEXT:    mov v7.b[15], w11
4495; CHECK-GI-NEXT:    addv s4, v25.4s
4496; CHECK-GI-NEXT:    mov v16.b[15], w8
4497; CHECK-GI-NEXT:    ldr w8, [sp, #1352]
4498; CHECK-GI-NEXT:    mov v19.b[14], w9
4499; CHECK-GI-NEXT:    mov v17.b[15], w10
4500; CHECK-GI-NEXT:    ldr w9, [sp, #1480]
4501; CHECK-GI-NEXT:    mov v18.b[15], w8
4502; CHECK-GI-NEXT:    fmov w8, s0
4503; CHECK-GI-NEXT:    fmov w11, s4
4504; CHECK-GI-NEXT:    mov v19.b[15], w9
4505; CHECK-GI-NEXT:    fmov w9, s1
4506; CHECK-GI-NEXT:    sdot v22.4s, v6.16b, v17.16b
4507; CHECK-GI-NEXT:    sdot v23.4s, v7.16b, v18.16b
4508; CHECK-GI-NEXT:    add w8, w8, w9
4509; CHECK-GI-NEXT:    sdot v24.4s, v16.16b, v19.16b
4510; CHECK-GI-NEXT:    add w8, w8, w11
4511; CHECK-GI-NEXT:    addv s2, v22.4s
4512; CHECK-GI-NEXT:    addv s3, v23.4s
4513; CHECK-GI-NEXT:    addv s5, v24.4s
4514; CHECK-GI-NEXT:    fmov w9, s2
4515; CHECK-GI-NEXT:    fmov w10, s3
4516; CHECK-GI-NEXT:    add w9, w9, w10
4517; CHECK-GI-NEXT:    fmov w10, s5
4518; CHECK-GI-NEXT:    add w9, w9, w10
4519; CHECK-GI-NEXT:    add w0, w8, w9
4520; CHECK-GI-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
4521; CHECK-GI-NEXT:    ret
4522entry:
4523  %az = sext <48 x i8> %a to <48 x i32>
4524  %bz = sext <48 x i8> %b to <48 x i32>
4525  %m1 = mul nuw nsw <48 x i32> %az, %bz
4526  %r1 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %m1)
4527  %cz = sext <48 x i8> %c to <48 x i32>
4528  %dz = sext <48 x i8> %d to <48 x i32>
4529  %m2 = mul nuw nsw <48 x i32> %cz, %dz
4530  %r2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %m2)
4531  %x = add i32 %r1, %r2
4532  ret i32 %x
4533}
4534
4535define i32 @test_sdot_v48i8_double_nomla(<48 x i8> %a, <48 x i8> %b, <48 x i8> %c, <48 x i8> %d) {
4536; CHECK-SD-LABEL: test_sdot_v48i8_double_nomla:
4537; CHECK-SD:       // %bb.0: // %entry
4538; CHECK-SD-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
4539; CHECK-SD-NEXT:    .cfi_def_cfa_offset 16
4540; CHECK-SD-NEXT:    .cfi_offset w29, -16
4541; CHECK-SD-NEXT:    ldr b5, [sp, #208]
4542; CHECK-SD-NEXT:    add x8, sp, #216
4543; CHECK-SD-NEXT:    fmov s0, w0
4544; CHECK-SD-NEXT:    ldr b4, [sp, #976]
4545; CHECK-SD-NEXT:    add x9, sp, #984
4546; CHECK-SD-NEXT:    add x12, sp, #328
4547; CHECK-SD-NEXT:    ld1 { v5.b }[1], [x8]
4548; CHECK-SD-NEXT:    add x8, sp, #224
4549; CHECK-SD-NEXT:    movi v1.16b, #1
4550; CHECK-SD-NEXT:    mov v0.b[1], w1
4551; CHECK-SD-NEXT:    ld1 { v4.b }[1], [x9]
4552; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
4553; CHECK-SD-NEXT:    add x11, sp, #992
4554; CHECK-SD-NEXT:    ldr b6, [sp, #720]
4555; CHECK-SD-NEXT:    ldr b7, [sp, #80]
4556; CHECK-SD-NEXT:    ld1 { v5.b }[2], [x8]
4557; CHECK-SD-NEXT:    add x8, sp, #232
4558; CHECK-SD-NEXT:    add x13, sp, #88
4559; CHECK-SD-NEXT:    ld1 { v4.b }[2], [x11]
4560; CHECK-SD-NEXT:    ld1 { v7.b }[1], [x13]
4561; CHECK-SD-NEXT:    add x13, sp, #856
4562; CHECK-SD-NEXT:    mov v0.b[2], w2
4563; CHECK-SD-NEXT:    add x14, sp, #1008
4564; CHECK-SD-NEXT:    add x15, sp, #872
4565; CHECK-SD-NEXT:    ld1 { v5.b }[3], [x8]
4566; CHECK-SD-NEXT:    add x8, sp, #240
4567; CHECK-SD-NEXT:    add x16, sp, #888
4568; CHECK-SD-NEXT:    add x10, sp, #16
4569; CHECK-SD-NEXT:    add x9, sp, #24
4570; CHECK-SD-NEXT:    add x11, sp, #40
4571; CHECK-SD-NEXT:    movi v2.2d, #0000000000000000
4572; CHECK-SD-NEXT:    ld1 { v5.b }[4], [x8]
4573; CHECK-SD-NEXT:    add x8, sp, #248
4574; CHECK-SD-NEXT:    mov v0.b[3], w3
4575; CHECK-SD-NEXT:    ld1 { v5.b }[5], [x8]
4576; CHECK-SD-NEXT:    add x8, sp, #256
4577; CHECK-SD-NEXT:    mov v0.b[4], w4
4578; CHECK-SD-NEXT:    ld1 { v5.b }[6], [x8]
4579; CHECK-SD-NEXT:    add x8, sp, #264
4580; CHECK-SD-NEXT:    mov v0.b[5], w5
4581; CHECK-SD-NEXT:    ld1 { v5.b }[7], [x8]
4582; CHECK-SD-NEXT:    add x8, sp, #272
4583; CHECK-SD-NEXT:    ld1 { v5.b }[8], [x8]
4584; CHECK-SD-NEXT:    add x8, sp, #280
4585; CHECK-SD-NEXT:    mov v0.b[6], w6
4586; CHECK-SD-NEXT:    ld1 { v5.b }[9], [x8]
4587; CHECK-SD-NEXT:    add x8, sp, #288
4588; CHECK-SD-NEXT:    mov v0.b[7], w7
4589; CHECK-SD-NEXT:    ld1 { v5.b }[10], [x8]
4590; CHECK-SD-NEXT:    add x8, sp, #296
4591; CHECK-SD-NEXT:    ld1 { v0.b }[8], [x10]
4592; CHECK-SD-NEXT:    add x10, sp, #128
4593; CHECK-SD-NEXT:    ld1 { v5.b }[11], [x8]
4594; CHECK-SD-NEXT:    add x8, sp, #304
4595; CHECK-SD-NEXT:    ld1 { v0.b }[9], [x9]
4596; CHECK-SD-NEXT:    add x9, sp, #136
4597; CHECK-SD-NEXT:    ld1 { v5.b }[12], [x8]
4598; CHECK-SD-NEXT:    add x8, sp, #312
4599; CHECK-SD-NEXT:    ld1 { v5.b }[13], [x8]
4600; CHECK-SD-NEXT:    add x8, sp, #320
4601; CHECK-SD-NEXT:    ld1 { v5.b }[14], [x8]
4602; CHECK-SD-NEXT:    add x8, sp, #32
4603; CHECK-SD-NEXT:    ld1 { v0.b }[10], [x8]
4604; CHECK-SD-NEXT:    add x8, sp, #144
4605; CHECK-SD-NEXT:    ld1 { v5.b }[15], [x12]
4606; CHECK-SD-NEXT:    add x12, sp, #728
4607; CHECK-SD-NEXT:    ld1 { v6.b }[1], [x12]
4608; CHECK-SD-NEXT:    add x12, sp, #1000
4609; CHECK-SD-NEXT:    ld1 { v0.b }[11], [x11]
4610; CHECK-SD-NEXT:    ld1 { v4.b }[3], [x12]
4611; CHECK-SD-NEXT:    add x12, sp, #736
4612; CHECK-SD-NEXT:    add x11, sp, #920
4613; CHECK-SD-NEXT:    sdot v3.4s, v5.16b, v1.16b
4614; CHECK-SD-NEXT:    ldr b5, [sp, #848]
4615; CHECK-SD-NEXT:    ld1 { v6.b }[2], [x12]
4616; CHECK-SD-NEXT:    add x12, sp, #48
4617; CHECK-SD-NEXT:    ld1 { v5.b }[1], [x13]
4618; CHECK-SD-NEXT:    add x13, sp, #744
4619; CHECK-SD-NEXT:    ld1 { v4.b }[4], [x14]
4620; CHECK-SD-NEXT:    add x14, sp, #96
4621; CHECK-SD-NEXT:    ld1 { v0.b }[12], [x12]
4622; CHECK-SD-NEXT:    ld1 { v6.b }[3], [x13]
4623; CHECK-SD-NEXT:    add x13, sp, #864
4624; CHECK-SD-NEXT:    ld1 { v7.b }[2], [x14]
4625; CHECK-SD-NEXT:    add x14, sp, #1016
4626; CHECK-SD-NEXT:    ld1 { v5.b }[2], [x13]
4627; CHECK-SD-NEXT:    add x13, sp, #752
4628; CHECK-SD-NEXT:    ld1 { v4.b }[5], [x14]
4629; CHECK-SD-NEXT:    add x14, sp, #104
4630; CHECK-SD-NEXT:    ld1 { v6.b }[4], [x13]
4631; CHECK-SD-NEXT:    add x13, sp, #1024
4632; CHECK-SD-NEXT:    ld1 { v7.b }[3], [x14]
4633; CHECK-SD-NEXT:    ld1 { v5.b }[3], [x15]
4634; CHECK-SD-NEXT:    add x15, sp, #760
4635; CHECK-SD-NEXT:    add x14, sp, #112
4636; CHECK-SD-NEXT:    ld1 { v4.b }[6], [x13]
4637; CHECK-SD-NEXT:    add x13, sp, #880
4638; CHECK-SD-NEXT:    ld1 { v6.b }[5], [x15]
4639; CHECK-SD-NEXT:    add x15, sp, #1032
4640; CHECK-SD-NEXT:    ld1 { v7.b }[4], [x14]
4641; CHECK-SD-NEXT:    ld1 { v5.b }[4], [x13]
4642; CHECK-SD-NEXT:    add x14, sp, #768
4643; CHECK-SD-NEXT:    add x13, sp, #120
4644; CHECK-SD-NEXT:    ld1 { v4.b }[7], [x15]
4645; CHECK-SD-NEXT:    add x15, sp, #1040
4646; CHECK-SD-NEXT:    ld1 { v6.b }[6], [x14]
4647; CHECK-SD-NEXT:    ld1 { v7.b }[5], [x13]
4648; CHECK-SD-NEXT:    add x13, sp, #776
4649; CHECK-SD-NEXT:    ld1 { v5.b }[5], [x16]
4650; CHECK-SD-NEXT:    add x14, sp, #1048
4651; CHECK-SD-NEXT:    ld1 { v4.b }[8], [x15]
4652; CHECK-SD-NEXT:    add x15, sp, #896
4653; CHECK-SD-NEXT:    ld1 { v6.b }[7], [x13]
4654; CHECK-SD-NEXT:    ld1 { v7.b }[6], [x10]
4655; CHECK-SD-NEXT:    add x10, sp, #784
4656; CHECK-SD-NEXT:    ld1 { v5.b }[6], [x15]
4657; CHECK-SD-NEXT:    add x13, sp, #1056
4658; CHECK-SD-NEXT:    ld1 { v4.b }[9], [x14]
4659; CHECK-SD-NEXT:    add x14, sp, #904
4660; CHECK-SD-NEXT:    ld1 { v6.b }[8], [x10]
4661; CHECK-SD-NEXT:    ld1 { v7.b }[7], [x9]
4662; CHECK-SD-NEXT:    add x9, sp, #792
4663; CHECK-SD-NEXT:    ld1 { v5.b }[7], [x14]
4664; CHECK-SD-NEXT:    add x10, sp, #1064
4665; CHECK-SD-NEXT:    ld1 { v4.b }[10], [x13]
4666; CHECK-SD-NEXT:    add x13, sp, #912
4667; CHECK-SD-NEXT:    ld1 { v6.b }[9], [x9]
4668; CHECK-SD-NEXT:    ld1 { v7.b }[8], [x8]
4669; CHECK-SD-NEXT:    add x9, sp, #800
4670; CHECK-SD-NEXT:    ld1 { v5.b }[8], [x13]
4671; CHECK-SD-NEXT:    add x8, sp, #152
4672; CHECK-SD-NEXT:    ld1 { v4.b }[11], [x10]
4673; CHECK-SD-NEXT:    add x10, sp, #1072
4674; CHECK-SD-NEXT:    ld1 { v6.b }[10], [x9]
4675; CHECK-SD-NEXT:    ld1 { v7.b }[9], [x8]
4676; CHECK-SD-NEXT:    add x9, sp, #808
4677; CHECK-SD-NEXT:    ld1 { v5.b }[9], [x11]
4678; CHECK-SD-NEXT:    add x8, sp, #56
4679; CHECK-SD-NEXT:    ld1 { v4.b }[12], [x10]
4680; CHECK-SD-NEXT:    add x10, sp, #160
4681; CHECK-SD-NEXT:    ld1 { v0.b }[13], [x8]
4682; CHECK-SD-NEXT:    ld1 { v6.b }[11], [x9]
4683; CHECK-SD-NEXT:    add x9, sp, #928
4684; CHECK-SD-NEXT:    ld1 { v7.b }[10], [x10]
4685; CHECK-SD-NEXT:    add x10, sp, #1080
4686; CHECK-SD-NEXT:    ld1 { v5.b }[10], [x9]
4687; CHECK-SD-NEXT:    add x8, sp, #816
4688; CHECK-SD-NEXT:    ld1 { v4.b }[13], [x10]
4689; CHECK-SD-NEXT:    add x9, sp, #168
4690; CHECK-SD-NEXT:    add x10, sp, #176
4691; CHECK-SD-NEXT:    ld1 { v6.b }[12], [x8]
4692; CHECK-SD-NEXT:    add x8, sp, #936
4693; CHECK-SD-NEXT:    ld1 { v7.b }[11], [x9]
4694; CHECK-SD-NEXT:    add x9, sp, #1088
4695; CHECK-SD-NEXT:    ld1 { v5.b }[11], [x8]
4696; CHECK-SD-NEXT:    add x8, sp, #64
4697; CHECK-SD-NEXT:    ld1 { v4.b }[14], [x9]
4698; CHECK-SD-NEXT:    add x9, sp, #824
4699; CHECK-SD-NEXT:    ld1 { v0.b }[14], [x8]
4700; CHECK-SD-NEXT:    ld1 { v6.b }[13], [x9]
4701; CHECK-SD-NEXT:    add x9, sp, #944
4702; CHECK-SD-NEXT:    ld1 { v7.b }[12], [x10]
4703; CHECK-SD-NEXT:    add x10, sp, #1096
4704; CHECK-SD-NEXT:    ld1 { v5.b }[12], [x9]
4705; CHECK-SD-NEXT:    add x8, sp, #832
4706; CHECK-SD-NEXT:    ld1 { v4.b }[15], [x10]
4707; CHECK-SD-NEXT:    add x9, sp, #184
4708; CHECK-SD-NEXT:    add x10, sp, #72
4709; CHECK-SD-NEXT:    ld1 { v6.b }[14], [x8]
4710; CHECK-SD-NEXT:    add x8, sp, #952
4711; CHECK-SD-NEXT:    ld1 { v7.b }[13], [x9]
4712; CHECK-SD-NEXT:    ld1 { v5.b }[13], [x8]
4713; CHECK-SD-NEXT:    add x8, sp, #840
4714; CHECK-SD-NEXT:    ld1 { v0.b }[15], [x10]
4715; CHECK-SD-NEXT:    sdot v2.4s, v4.16b, v1.16b
4716; CHECK-SD-NEXT:    add x9, sp, #192
4717; CHECK-SD-NEXT:    ld1 { v6.b }[15], [x8]
4718; CHECK-SD-NEXT:    add x8, sp, #960
4719; CHECK-SD-NEXT:    ld1 { v7.b }[14], [x9]
4720; CHECK-SD-NEXT:    ld1 { v5.b }[14], [x8]
4721; CHECK-SD-NEXT:    sdot v3.4s, v0.16b, v1.16b
4722; CHECK-SD-NEXT:    add x8, sp, #200
4723; CHECK-SD-NEXT:    add x9, sp, #968
4724; CHECK-SD-NEXT:    sdot v2.4s, v6.16b, v1.16b
4725; CHECK-SD-NEXT:    ld1 { v7.b }[15], [x8]
4726; CHECK-SD-NEXT:    ld1 { v5.b }[15], [x9]
4727; CHECK-SD-NEXT:    sdot v3.4s, v7.16b, v1.16b
4728; CHECK-SD-NEXT:    sdot v2.4s, v5.16b, v1.16b
4729; CHECK-SD-NEXT:    add v0.4s, v3.4s, v2.4s
4730; CHECK-SD-NEXT:    addv s0, v0.4s
4731; CHECK-SD-NEXT:    fmov w0, s0
4732; CHECK-SD-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
4733; CHECK-SD-NEXT:    ret
4734;
4735; CHECK-GI-LABEL: test_sdot_v48i8_double_nomla:
4736; CHECK-GI:       // %bb.0: // %entry
4737; CHECK-GI-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
4738; CHECK-GI-NEXT:    .cfi_def_cfa_offset 16
4739; CHECK-GI-NEXT:    .cfi_offset w29, -16
4740; CHECK-GI-NEXT:    ldr w10, [sp, #80]
4741; CHECK-GI-NEXT:    ldr w11, [sp, #208]
4742; CHECK-GI-NEXT:    fmov s0, w0
4743; CHECK-GI-NEXT:    ldr w9, [sp, #88]
4744; CHECK-GI-NEXT:    ldr w12, [sp, #728]
4745; CHECK-GI-NEXT:    movi v6.16b, #1
4746; CHECK-GI-NEXT:    fmov s1, w10
4747; CHECK-GI-NEXT:    fmov s2, w11
4748; CHECK-GI-NEXT:    ldr w11, [sp, #720]
4749; CHECK-GI-NEXT:    ldr w10, [sp, #216]
4750; CHECK-GI-NEXT:    mov v0.b[1], w1
4751; CHECK-GI-NEXT:    ldr w13, [sp, #856]
4752; CHECK-GI-NEXT:    fmov s3, w11
4753; CHECK-GI-NEXT:    ldr w8, [sp, #96]
4754; CHECK-GI-NEXT:    ldr w11, [sp, #224]
4755; CHECK-GI-NEXT:    mov v1.b[1], w9
4756; CHECK-GI-NEXT:    ldr w9, [sp, #848]
4757; CHECK-GI-NEXT:    mov v2.b[1], w10
4758; CHECK-GI-NEXT:    ldr w10, [sp, #976]
4759; CHECK-GI-NEXT:    movi v7.2d, #0000000000000000
4760; CHECK-GI-NEXT:    movi v16.2d, #0000000000000000
4761; CHECK-GI-NEXT:    fmov s4, w9
4762; CHECK-GI-NEXT:    mov v3.b[1], w12
4763; CHECK-GI-NEXT:    ldr w9, [sp, #984]
4764; CHECK-GI-NEXT:    fmov s5, w10
4765; CHECK-GI-NEXT:    mov v0.b[2], w2
4766; CHECK-GI-NEXT:    ldr w10, [sp, #736]
4767; CHECK-GI-NEXT:    mov v1.b[2], w8
4768; CHECK-GI-NEXT:    ldr w8, [sp, #864]
4769; CHECK-GI-NEXT:    mov v2.b[2], w11
4770; CHECK-GI-NEXT:    mov v4.b[1], w13
4771; CHECK-GI-NEXT:    ldr w11, [sp, #992]
4772; CHECK-GI-NEXT:    ldr w12, [sp, #776]
4773; CHECK-GI-NEXT:    mov v5.b[1], w9
4774; CHECK-GI-NEXT:    mov v3.b[2], w10
4775; CHECK-GI-NEXT:    ldr w9, [sp, #104]
4776; CHECK-GI-NEXT:    ldr w10, [sp, #232]
4777; CHECK-GI-NEXT:    mov v0.b[3], w3
4778; CHECK-GI-NEXT:    movi v17.2d, #0000000000000000
4779; CHECK-GI-NEXT:    mov v1.b[3], w9
4780; CHECK-GI-NEXT:    ldr w9, [sp, #872]
4781; CHECK-GI-NEXT:    movi v18.2d, #0000000000000000
4782; CHECK-GI-NEXT:    mov v4.b[2], w8
4783; CHECK-GI-NEXT:    ldr w8, [sp, #744]
4784; CHECK-GI-NEXT:    mov v2.b[3], w10
4785; CHECK-GI-NEXT:    mov v5.b[2], w11
4786; CHECK-GI-NEXT:    ldr w11, [sp, #1000]
4787; CHECK-GI-NEXT:    ldr w10, [sp, #240]
4788; CHECK-GI-NEXT:    mov v3.b[3], w8
4789; CHECK-GI-NEXT:    ldr w8, [sp, #112]
4790; CHECK-GI-NEXT:    mov v0.b[4], w4
4791; CHECK-GI-NEXT:    movi v19.2d, #0000000000000000
4792; CHECK-GI-NEXT:    movi v20.2d, #0000000000000000
4793; CHECK-GI-NEXT:    mov v4.b[3], w9
4794; CHECK-GI-NEXT:    ldr w9, [sp, #752]
4795; CHECK-GI-NEXT:    mov v1.b[4], w8
4796; CHECK-GI-NEXT:    ldr w8, [sp, #880]
4797; CHECK-GI-NEXT:    mov v5.b[3], w11
4798; CHECK-GI-NEXT:    mov v2.b[4], w10
4799; CHECK-GI-NEXT:    mov v3.b[4], w9
4800; CHECK-GI-NEXT:    ldr w9, [sp, #120]
4801; CHECK-GI-NEXT:    ldr w11, [sp, #1008]
4802; CHECK-GI-NEXT:    ldr w10, [sp, #248]
4803; CHECK-GI-NEXT:    mov v0.b[5], w5
4804; CHECK-GI-NEXT:    mov v4.b[4], w8
4805; CHECK-GI-NEXT:    ldr w8, [sp, #760]
4806; CHECK-GI-NEXT:    mov v1.b[5], w9
4807; CHECK-GI-NEXT:    ldr w9, [sp, #888]
4808; CHECK-GI-NEXT:    mov v5.b[4], w11
4809; CHECK-GI-NEXT:    mov v2.b[5], w10
4810; CHECK-GI-NEXT:    mov v3.b[5], w8
4811; CHECK-GI-NEXT:    ldr w8, [sp, #128]
4812; CHECK-GI-NEXT:    ldr w11, [sp, #1016]
4813; CHECK-GI-NEXT:    ldr w10, [sp, #256]
4814; CHECK-GI-NEXT:    mov v0.b[6], w6
4815; CHECK-GI-NEXT:    mov v4.b[5], w9
4816; CHECK-GI-NEXT:    ldr w9, [sp, #768]
4817; CHECK-GI-NEXT:    mov v1.b[6], w8
4818; CHECK-GI-NEXT:    ldr w8, [sp, #896]
4819; CHECK-GI-NEXT:    mov v5.b[5], w11
4820; CHECK-GI-NEXT:    mov v2.b[6], w10
4821; CHECK-GI-NEXT:    mov v3.b[6], w9
4822; CHECK-GI-NEXT:    ldr w9, [sp, #136]
4823; CHECK-GI-NEXT:    ldr w11, [sp, #1024]
4824; CHECK-GI-NEXT:    ldr w10, [sp, #264]
4825; CHECK-GI-NEXT:    mov v0.b[7], w7
4826; CHECK-GI-NEXT:    mov v4.b[6], w8
4827; CHECK-GI-NEXT:    mov v1.b[7], w9
4828; CHECK-GI-NEXT:    ldr w9, [sp, #904]
4829; CHECK-GI-NEXT:    mov v5.b[6], w11
4830; CHECK-GI-NEXT:    mov v2.b[7], w10
4831; CHECK-GI-NEXT:    ldr w8, [sp, #16]
4832; CHECK-GI-NEXT:    mov v3.b[7], w12
4833; CHECK-GI-NEXT:    ldr w10, [sp, #144]
4834; CHECK-GI-NEXT:    ldr w12, [sp, #1032]
4835; CHECK-GI-NEXT:    mov v0.b[8], w8
4836; CHECK-GI-NEXT:    ldr w8, [sp, #784]
4837; CHECK-GI-NEXT:    ldr w11, [sp, #272]
4838; CHECK-GI-NEXT:    mov v4.b[7], w9
4839; CHECK-GI-NEXT:    mov v1.b[8], w10
4840; CHECK-GI-NEXT:    ldr w10, [sp, #912]
4841; CHECK-GI-NEXT:    mov v5.b[7], w12
4842; CHECK-GI-NEXT:    ldr w9, [sp, #24]
4843; CHECK-GI-NEXT:    ldr w12, [sp, #1040]
4844; CHECK-GI-NEXT:    mov v3.b[8], w8
4845; CHECK-GI-NEXT:    ldr w8, [sp, #152]
4846; CHECK-GI-NEXT:    mov v2.b[8], w11
4847; CHECK-GI-NEXT:    mov v0.b[9], w9
4848; CHECK-GI-NEXT:    ldr w9, [sp, #792]
4849; CHECK-GI-NEXT:    ldr w11, [sp, #280]
4850; CHECK-GI-NEXT:    mov v4.b[8], w10
4851; CHECK-GI-NEXT:    mov v1.b[9], w8
4852; CHECK-GI-NEXT:    ldr w10, [sp, #920]
4853; CHECK-GI-NEXT:    mov v5.b[8], w12
4854; CHECK-GI-NEXT:    ldr w8, [sp, #32]
4855; CHECK-GI-NEXT:    ldr w12, [sp, #1048]
4856; CHECK-GI-NEXT:    mov v3.b[9], w9
4857; CHECK-GI-NEXT:    ldr w9, [sp, #160]
4858; CHECK-GI-NEXT:    mov v2.b[9], w11
4859; CHECK-GI-NEXT:    mov v0.b[10], w8
4860; CHECK-GI-NEXT:    ldr w8, [sp, #800]
4861; CHECK-GI-NEXT:    ldr w11, [sp, #288]
4862; CHECK-GI-NEXT:    mov v4.b[9], w10
4863; CHECK-GI-NEXT:    mov v1.b[10], w9
4864; CHECK-GI-NEXT:    ldr w10, [sp, #928]
4865; CHECK-GI-NEXT:    mov v5.b[9], w12
4866; CHECK-GI-NEXT:    ldr w9, [sp, #40]
4867; CHECK-GI-NEXT:    ldr w12, [sp, #1056]
4868; CHECK-GI-NEXT:    mov v3.b[10], w8
4869; CHECK-GI-NEXT:    ldr w8, [sp, #168]
4870; CHECK-GI-NEXT:    mov v2.b[10], w11
4871; CHECK-GI-NEXT:    mov v0.b[11], w9
4872; CHECK-GI-NEXT:    ldr w9, [sp, #808]
4873; CHECK-GI-NEXT:    ldr w11, [sp, #296]
4874; CHECK-GI-NEXT:    mov v4.b[10], w10
4875; CHECK-GI-NEXT:    mov v1.b[11], w8
4876; CHECK-GI-NEXT:    ldr w10, [sp, #936]
4877; CHECK-GI-NEXT:    mov v5.b[10], w12
4878; CHECK-GI-NEXT:    ldr w8, [sp, #48]
4879; CHECK-GI-NEXT:    ldr w12, [sp, #1064]
4880; CHECK-GI-NEXT:    mov v3.b[11], w9
4881; CHECK-GI-NEXT:    ldr w9, [sp, #176]
4882; CHECK-GI-NEXT:    mov v2.b[11], w11
4883; CHECK-GI-NEXT:    mov v0.b[12], w8
4884; CHECK-GI-NEXT:    ldr w8, [sp, #816]
4885; CHECK-GI-NEXT:    ldr w11, [sp, #304]
4886; CHECK-GI-NEXT:    mov v4.b[11], w10
4887; CHECK-GI-NEXT:    mov v1.b[12], w9
4888; CHECK-GI-NEXT:    ldr w10, [sp, #944]
4889; CHECK-GI-NEXT:    mov v5.b[11], w12
4890; CHECK-GI-NEXT:    ldr w9, [sp, #56]
4891; CHECK-GI-NEXT:    ldr w12, [sp, #1072]
4892; CHECK-GI-NEXT:    mov v3.b[12], w8
4893; CHECK-GI-NEXT:    ldr w8, [sp, #184]
4894; CHECK-GI-NEXT:    mov v2.b[12], w11
4895; CHECK-GI-NEXT:    mov v0.b[13], w9
4896; CHECK-GI-NEXT:    ldr w9, [sp, #824]
4897; CHECK-GI-NEXT:    ldr w11, [sp, #312]
4898; CHECK-GI-NEXT:    mov v4.b[12], w10
4899; CHECK-GI-NEXT:    mov v1.b[13], w8
4900; CHECK-GI-NEXT:    ldr w10, [sp, #952]
4901; CHECK-GI-NEXT:    mov v5.b[12], w12
4902; CHECK-GI-NEXT:    ldr w8, [sp, #64]
4903; CHECK-GI-NEXT:    ldr w12, [sp, #1080]
4904; CHECK-GI-NEXT:    mov v3.b[13], w9
4905; CHECK-GI-NEXT:    ldr w9, [sp, #192]
4906; CHECK-GI-NEXT:    mov v2.b[13], w11
4907; CHECK-GI-NEXT:    mov v0.b[14], w8
4908; CHECK-GI-NEXT:    ldr w8, [sp, #832]
4909; CHECK-GI-NEXT:    ldr w11, [sp, #320]
4910; CHECK-GI-NEXT:    mov v4.b[13], w10
4911; CHECK-GI-NEXT:    mov v1.b[14], w9
4912; CHECK-GI-NEXT:    ldr w10, [sp, #960]
4913; CHECK-GI-NEXT:    mov v5.b[13], w12
4914; CHECK-GI-NEXT:    ldr w9, [sp, #72]
4915; CHECK-GI-NEXT:    ldr w12, [sp, #1088]
4916; CHECK-GI-NEXT:    mov v3.b[14], w8
4917; CHECK-GI-NEXT:    ldr w8, [sp, #200]
4918; CHECK-GI-NEXT:    mov v2.b[14], w11
4919; CHECK-GI-NEXT:    mov v0.b[15], w9
4920; CHECK-GI-NEXT:    ldr w9, [sp, #840]
4921; CHECK-GI-NEXT:    ldr w11, [sp, #328]
4922; CHECK-GI-NEXT:    mov v4.b[14], w10
4923; CHECK-GI-NEXT:    mov v1.b[15], w8
4924; CHECK-GI-NEXT:    ldr w8, [sp, #968]
4925; CHECK-GI-NEXT:    mov v5.b[14], w12
4926; CHECK-GI-NEXT:    ldr w10, [sp, #1096]
4927; CHECK-GI-NEXT:    mov v3.b[15], w9
4928; CHECK-GI-NEXT:    mov v2.b[15], w11
4929; CHECK-GI-NEXT:    sdot v7.4s, v0.16b, v6.16b
4930; CHECK-GI-NEXT:    mov v4.b[15], w8
4931; CHECK-GI-NEXT:    sdot v16.4s, v1.16b, v6.16b
4932; CHECK-GI-NEXT:    mov v5.b[15], w10
4933; CHECK-GI-NEXT:    sdot v17.4s, v3.16b, v6.16b
4934; CHECK-GI-NEXT:    sdot v20.4s, v2.16b, v6.16b
4935; CHECK-GI-NEXT:    addv s0, v7.4s
4936; CHECK-GI-NEXT:    sdot v18.4s, v4.16b, v6.16b
4937; CHECK-GI-NEXT:    addv s1, v16.4s
4938; CHECK-GI-NEXT:    sdot v19.4s, v5.16b, v6.16b
4939; CHECK-GI-NEXT:    addv s2, v17.4s
4940; CHECK-GI-NEXT:    addv s4, v20.4s
4941; CHECK-GI-NEXT:    fmov w8, s0
4942; CHECK-GI-NEXT:    fmov w9, s1
4943; CHECK-GI-NEXT:    addv s3, v18.4s
4944; CHECK-GI-NEXT:    addv s5, v19.4s
4945; CHECK-GI-NEXT:    fmov w10, s2
4946; CHECK-GI-NEXT:    add w8, w8, w9
4947; CHECK-GI-NEXT:    fmov w9, s4
4948; CHECK-GI-NEXT:    fmov w11, s3
4949; CHECK-GI-NEXT:    add w8, w8, w9
4950; CHECK-GI-NEXT:    add w10, w10, w11
4951; CHECK-GI-NEXT:    fmov w11, s5
4952; CHECK-GI-NEXT:    add w9, w10, w11
4953; CHECK-GI-NEXT:    add w0, w8, w9
4954; CHECK-GI-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
4955; CHECK-GI-NEXT:    ret
4956entry:
4957  %az = sext <48 x i8> %a to <48 x i32>
4958  %r1 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %az)
4959  %cz = sext <48 x i8> %c to <48 x i32>
4960  %r2 = call i32 @llvm.vector.reduce.add.v48i32(<48 x i32> %cz)
4961  %x = add i32 %r1, %r2
4962  ret i32 %x
4963}
4964
; Unsigned 64 x i8 multiply-accumulate reduction, operands loaded from memory.
; Loads <64 x i8> from %a and %b, zero-extends both to i32, multiplies them,
; sums all 64 lanes with llvm.vector.reduce.add and finally adds %sum.
; Both instruction selectors are expected to emit four UDOT instructions and
; fold %sum in with a scalar add after the ADDV; the SelectionDAG expectations
; chain them into two accumulators, the GlobalISel expectations use four.
4965define i32 @test_udot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
4966; CHECK-SD-LABEL: test_udot_v64i8:
4967; CHECK-SD:       // %bb.0: // %entry
4968; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
4969; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
4970; CHECK-SD-NEXT:    ldp q2, q3, [x0, #32]
4971; CHECK-SD-NEXT:    ldp q4, q5, [x1, #32]
4972; CHECK-SD-NEXT:    udot v1.4s, v5.16b, v3.16b
4973; CHECK-SD-NEXT:    udot v0.4s, v4.16b, v2.16b
4974; CHECK-SD-NEXT:    ldp q2, q3, [x0]
4975; CHECK-SD-NEXT:    ldp q4, q5, [x1]
4976; CHECK-SD-NEXT:    udot v1.4s, v5.16b, v3.16b
4977; CHECK-SD-NEXT:    udot v0.4s, v4.16b, v2.16b
4978; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
4979; CHECK-SD-NEXT:    addv s0, v0.4s
4980; CHECK-SD-NEXT:    fmov w8, s0
4981; CHECK-SD-NEXT:    add w0, w8, w2
4982; CHECK-SD-NEXT:    ret
4983;
4984; CHECK-GI-LABEL: test_udot_v64i8:
4985; CHECK-GI:       // %bb.0: // %entry
4986; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
4987; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
4988; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
4989; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
4990; CHECK-GI-NEXT:    ldp q1, q2, [x0]
4991; CHECK-GI-NEXT:    ldp q6, q7, [x0, #32]
4992; CHECK-GI-NEXT:    ldp q16, q17, [x1]
4993; CHECK-GI-NEXT:    ldp q18, q19, [x1, #32]
4994; CHECK-GI-NEXT:    udot v0.4s, v16.16b, v1.16b
4995; CHECK-GI-NEXT:    udot v4.4s, v17.16b, v2.16b
4996; CHECK-GI-NEXT:    udot v5.4s, v18.16b, v6.16b
4997; CHECK-GI-NEXT:    udot v3.4s, v19.16b, v7.16b
4998; CHECK-GI-NEXT:    add v0.4s, v0.4s, v4.4s
4999; CHECK-GI-NEXT:    add v1.4s, v5.4s, v3.4s
5000; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
5001; CHECK-GI-NEXT:    addv s0, v0.4s
5002; CHECK-GI-NEXT:    fmov w8, s0
5003; CHECK-GI-NEXT:    add w0, w8, w2
5004; CHECK-GI-NEXT:    ret
5005entry:
; Widen both byte vectors with zext so the multiply + reduce below forms the
; unsigned dot-product pattern.
5006  %0 = load <64 x i8>, ptr %a
5007  %1 = zext <64 x i8> %0 to <64 x i32>
5008  %2 = load <64 x i8>, ptr %b
5009  %3 = zext <64 x i8> %2 to <64 x i32>
5010  %4 = mul nuw nsw <64 x i32> %3, %1
5011  %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
; Extra scalar accumulation applied after the reduction.
5012  %op.extra = add i32 %5, %sum
5013  ret i32 %op.extra
5014}
5015
; Unsigned 64 x i8 add-reduction with no multiply ("nomla").
; Loads <64 x i8> from %a1, zero-extends to i32 and sums all lanes with
; llvm.vector.reduce.add. The expected code still uses UDOT, with an all-ones
; byte vector (movi ..., #1) as the second operand.
5016define i32 @test_udot_v64i8_nomla(ptr nocapture readonly %a1) {
5017; CHECK-SD-LABEL: test_udot_v64i8_nomla:
5018; CHECK-SD:       // %bb.0: // %entry
5019; CHECK-SD-NEXT:    movi v0.16b, #1
5020; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
5021; CHECK-SD-NEXT:    movi v2.2d, #0000000000000000
5022; CHECK-SD-NEXT:    ldp q3, q4, [x0, #32]
5023; CHECK-SD-NEXT:    udot v2.4s, v4.16b, v0.16b
5024; CHECK-SD-NEXT:    udot v1.4s, v3.16b, v0.16b
5025; CHECK-SD-NEXT:    ldp q3, q4, [x0]
5026; CHECK-SD-NEXT:    udot v2.4s, v4.16b, v0.16b
5027; CHECK-SD-NEXT:    udot v1.4s, v3.16b, v0.16b
5028; CHECK-SD-NEXT:    add v0.4s, v1.4s, v2.4s
5029; CHECK-SD-NEXT:    addv s0, v0.4s
5030; CHECK-SD-NEXT:    fmov w0, s0
5031; CHECK-SD-NEXT:    ret
5032;
5033; CHECK-GI-LABEL: test_udot_v64i8_nomla:
5034; CHECK-GI:       // %bb.0: // %entry
5035; CHECK-GI-NEXT:    movi v0.16b, #1
5036; CHECK-GI-NEXT:    movi v1.2d, #0000000000000000
5037; CHECK-GI-NEXT:    movi v2.2d, #0000000000000000
5038; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
5039; CHECK-GI-NEXT:    ldp q5, q6, [x0]
5040; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
5041; CHECK-GI-NEXT:    ldp q7, q16, [x0, #32]
5042; CHECK-GI-NEXT:    udot v1.4s, v5.16b, v0.16b
5043; CHECK-GI-NEXT:    udot v3.4s, v6.16b, v0.16b
5044; CHECK-GI-NEXT:    udot v2.4s, v16.16b, v0.16b
5045; CHECK-GI-NEXT:    udot v4.4s, v7.16b, v0.16b
5046; CHECK-GI-NEXT:    add v0.4s, v1.4s, v3.4s
5047; CHECK-GI-NEXT:    add v1.4s, v4.4s, v2.4s
5048; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
5049; CHECK-GI-NEXT:    addv s0, v0.4s
5050; CHECK-GI-NEXT:    fmov w0, s0
5051; CHECK-GI-NEXT:    ret
5052entry:
; Plain widening sum: zext feeds the reduction directly, with no mul between.
5053  %0 = load <64 x i8>, ptr %a1
5054  %1 = zext <64 x i8> %0 to <64 x i32>
5055  %2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %1)
5056  ret i32 %2
5057}
; Signed 64 x i8 multiply-accumulate reduction, operands loaded from memory.
; Loads <64 x i8> from %a and %b, sign-extends both to i32, multiplies them,
; sums all 64 lanes with llvm.vector.reduce.add and finally adds %sum.
; Both instruction selectors are expected to emit four SDOT instructions and
; fold %sum in with a scalar add after the ADDV; the SelectionDAG expectations
; chain them into two accumulators, the GlobalISel expectations use four.
5058define i32 @test_sdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
5059; CHECK-SD-LABEL: test_sdot_v64i8:
5060; CHECK-SD:       // %bb.0: // %entry
5061; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
5062; CHECK-SD-NEXT:    movi v1.2d, #0000000000000000
5063; CHECK-SD-NEXT:    ldp q2, q3, [x0, #32]
5064; CHECK-SD-NEXT:    ldp q4, q5, [x1, #32]
5065; CHECK-SD-NEXT:    sdot v1.4s, v5.16b, v3.16b
5066; CHECK-SD-NEXT:    sdot v0.4s, v4.16b, v2.16b
5067; CHECK-SD-NEXT:    ldp q2, q3, [x0]
5068; CHECK-SD-NEXT:    ldp q4, q5, [x1]
5069; CHECK-SD-NEXT:    sdot v1.4s, v5.16b, v3.16b
5070; CHECK-SD-NEXT:    sdot v0.4s, v4.16b, v2.16b
5071; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
5072; CHECK-SD-NEXT:    addv s0, v0.4s
5073; CHECK-SD-NEXT:    fmov w8, s0
5074; CHECK-SD-NEXT:    add w0, w8, w2
5075; CHECK-SD-NEXT:    ret
5076;
5077; CHECK-GI-LABEL: test_sdot_v64i8:
5078; CHECK-GI:       // %bb.0: // %entry
5079; CHECK-GI-NEXT:    movi v0.2d, #0000000000000000
5080; CHECK-GI-NEXT:    movi v3.2d, #0000000000000000
5081; CHECK-GI-NEXT:    movi v4.2d, #0000000000000000
5082; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
5083; CHECK-GI-NEXT:    ldp q1, q2, [x0]
5084; CHECK-GI-NEXT:    ldp q6, q7, [x0, #32]
5085; CHECK-GI-NEXT:    ldp q16, q17, [x1]
5086; CHECK-GI-NEXT:    ldp q18, q19, [x1, #32]
5087; CHECK-GI-NEXT:    sdot v0.4s, v16.16b, v1.16b
5088; CHECK-GI-NEXT:    sdot v4.4s, v17.16b, v2.16b
5089; CHECK-GI-NEXT:    sdot v5.4s, v18.16b, v6.16b
5090; CHECK-GI-NEXT:    sdot v3.4s, v19.16b, v7.16b
5091; CHECK-GI-NEXT:    add v0.4s, v0.4s, v4.4s
5092; CHECK-GI-NEXT:    add v1.4s, v5.4s, v3.4s
5093; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
5094; CHECK-GI-NEXT:    addv s0, v0.4s
5095; CHECK-GI-NEXT:    fmov w8, s0
5096; CHECK-GI-NEXT:    add w0, w8, w2
5097; CHECK-GI-NEXT:    ret
5098entry:
; Widen both byte vectors with sext so the multiply + reduce below forms the
; signed dot-product pattern.
5099  %0 = load <64 x i8>, ptr %a
5100  %1 = sext <64 x i8> %0 to <64 x i32>
5101  %2 = load <64 x i8>, ptr %b
5102  %3 = sext <64 x i8> %2 to <64 x i32>
5103  %4 = mul nsw <64 x i32> %3, %1
5104  %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
; Extra scalar accumulation applied after the reduction.
5105  %op.extra = add nsw i32 %5, %sum
5106  ret i32 %op.extra
5107}
5108
; Two independent signed 64 x i8 multiply-reductions whose results are added.
; All four <64 x i8> operands are passed by value; the expected code takes part
; of them from v0-v7 and loads the rest from the stack ([sp] .. [sp, #96]).
; Computes reduce.add(sext(%a) * sext(%b)) + reduce.add(sext(%c) * sext(%d)).
; Eight SDOT instructions are expected from both selectors. The SelectionDAG
; expectations merge everything with vector adds before a single ADDV, while
; the GlobalISel expectations perform two ADDVs and a final scalar add.
; NOTE(review): the multiplies carry `nuw` although their operands are
; sign-extended; a product of two negative lanes wraps as unsigned, which would
; make the result poison. Presumably the flag is only there to mirror the
; unsigned tests -- confirm before relying on these semantics.
5109define i32 @test_sdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
5110; CHECK-SD-LABEL: test_sdot_v64i8_double:
5111; CHECK-SD:       // %bb.0: // %entry
5112; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
5113; CHECK-SD-NEXT:    movi v17.2d, #0000000000000000
5114; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
5115; CHECK-SD-NEXT:    movi v19.2d, #0000000000000000
5116; CHECK-SD-NEXT:    ldp q20, q21, [sp, #96]
5117; CHECK-SD-NEXT:    ldp q22, q23, [sp, #32]
5118; CHECK-SD-NEXT:    sdot v16.4s, v3.16b, v7.16b
5119; CHECK-SD-NEXT:    sdot v18.4s, v2.16b, v6.16b
5120; CHECK-SD-NEXT:    sdot v19.4s, v23.16b, v21.16b
5121; CHECK-SD-NEXT:    sdot v17.4s, v22.16b, v20.16b
5122; CHECK-SD-NEXT:    ldp q2, q3, [sp, #64]
5123; CHECK-SD-NEXT:    ldp q6, q7, [sp]
5124; CHECK-SD-NEXT:    sdot v16.4s, v1.16b, v5.16b
5125; CHECK-SD-NEXT:    sdot v18.4s, v0.16b, v4.16b
5126; CHECK-SD-NEXT:    sdot v19.4s, v7.16b, v3.16b
5127; CHECK-SD-NEXT:    sdot v17.4s, v6.16b, v2.16b
5128; CHECK-SD-NEXT:    add v0.4s, v18.4s, v16.4s
5129; CHECK-SD-NEXT:    add v1.4s, v17.4s, v19.4s
5130; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
5131; CHECK-SD-NEXT:    addv s0, v0.4s
5132; CHECK-SD-NEXT:    fmov w0, s0
5133; CHECK-SD-NEXT:    ret
5134;
5135; CHECK-GI-LABEL: test_sdot_v64i8_double:
5136; CHECK-GI:       // %bb.0: // %entry
5137; CHECK-GI-NEXT:    movi v18.2d, #0000000000000000
5138; CHECK-GI-NEXT:    movi v21.2d, #0000000000000000
5139; CHECK-GI-NEXT:    movi v22.2d, #0000000000000000
5140; CHECK-GI-NEXT:    movi v23.2d, #0000000000000000
5141; CHECK-GI-NEXT:    ldp q16, q17, [sp]
5142; CHECK-GI-NEXT:    movi v24.2d, #0000000000000000
5143; CHECK-GI-NEXT:    movi v25.2d, #0000000000000000
5144; CHECK-GI-NEXT:    movi v26.2d, #0000000000000000
5145; CHECK-GI-NEXT:    movi v27.2d, #0000000000000000
5146; CHECK-GI-NEXT:    ldp q19, q20, [sp, #32]
5147; CHECK-GI-NEXT:    sdot v18.4s, v0.16b, v4.16b
5148; CHECK-GI-NEXT:    ldp q0, q4, [sp, #64]
5149; CHECK-GI-NEXT:    sdot v21.4s, v1.16b, v5.16b
5150; CHECK-GI-NEXT:    ldp q1, q5, [sp, #96]
5151; CHECK-GI-NEXT:    sdot v22.4s, v2.16b, v6.16b
5152; CHECK-GI-NEXT:    sdot v23.4s, v3.16b, v7.16b
5153; CHECK-GI-NEXT:    sdot v24.4s, v16.16b, v0.16b
5154; CHECK-GI-NEXT:    sdot v26.4s, v17.16b, v4.16b
5155; CHECK-GI-NEXT:    sdot v27.4s, v19.16b, v1.16b
5156; CHECK-GI-NEXT:    sdot v25.4s, v20.16b, v5.16b
5157; CHECK-GI-NEXT:    add v0.4s, v18.4s, v21.4s
5158; CHECK-GI-NEXT:    add v1.4s, v22.4s, v23.4s
5159; CHECK-GI-NEXT:    add v2.4s, v24.4s, v26.4s
5160; CHECK-GI-NEXT:    add v3.4s, v27.4s, v25.4s
5161; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
5162; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
5163; CHECK-GI-NEXT:    addv s0, v0.4s
5164; CHECK-GI-NEXT:    addv s1, v1.4s
5165; CHECK-GI-NEXT:    fmov w8, s0
5166; CHECK-GI-NEXT:    fmov w9, s1
5167; CHECK-GI-NEXT:    add w0, w8, w9
5168; CHECK-GI-NEXT:    ret
5169entry:
; First dot product: %a . %b
5170  %az = sext <64 x i8> %a to <64 x i32>
5171  %bz = sext <64 x i8> %b to <64 x i32>
5172  %m1 = mul nuw nsw <64 x i32> %az, %bz
5173  %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1)
; Second dot product: %c . %d
5174  %cz = sext <64 x i8> %c to <64 x i32>
5175  %dz = sext <64 x i8> %d to <64 x i32>
5176  %m2 = mul nuw nsw <64 x i32> %cz, %dz
5177  %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2)
; Combine the two scalar reductions.
5178  %x = add i32 %r1, %r2
5179  ret i32 %x
5180}
5181
; Two independent signed 64 x i8 add-reductions (no multiply) whose results are
; added: reduce.add(sext(%a)) + reduce.add(sext(%c)).
; %b and %d are declared in the signature but never used in the body.
; The expected code sums the bytes with SDOT against an all-ones byte vector
; (movi v4.16b, #1); part of the input is loaded from the stack.
; The SelectionDAG expectations merge everything with vector adds before a
; single ADDV, while the GlobalISel expectations perform two ADDVs and a final
; scalar add.
5182define i32 @test_sdot_v64i8_double_nomla(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
5183; CHECK-SD-LABEL: test_sdot_v64i8_double_nomla:
5184; CHECK-SD:       // %bb.0: // %entry
5185; CHECK-SD-NEXT:    movi v4.16b, #1
5186; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
5187; CHECK-SD-NEXT:    movi v6.2d, #0000000000000000
5188; CHECK-SD-NEXT:    movi v7.2d, #0000000000000000
5189; CHECK-SD-NEXT:    ldp q17, q18, [sp, #32]
5190; CHECK-SD-NEXT:    movi v16.2d, #0000000000000000
5191; CHECK-SD-NEXT:    sdot v5.4s, v3.16b, v4.16b
5192; CHECK-SD-NEXT:    sdot v6.4s, v17.16b, v4.16b
5193; CHECK-SD-NEXT:    sdot v7.4s, v2.16b, v4.16b
5194; CHECK-SD-NEXT:    ldp q2, q3, [sp]
5195; CHECK-SD-NEXT:    sdot v16.4s, v18.16b, v4.16b
5196; CHECK-SD-NEXT:    sdot v5.4s, v1.16b, v4.16b
5197; CHECK-SD-NEXT:    sdot v6.4s, v2.16b, v4.16b
5198; CHECK-SD-NEXT:    sdot v7.4s, v0.16b, v4.16b
5199; CHECK-SD-NEXT:    sdot v16.4s, v3.16b, v4.16b
5200; CHECK-SD-NEXT:    add v0.4s, v7.4s, v5.4s
5201; CHECK-SD-NEXT:    add v1.4s, v6.4s, v16.4s
5202; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
5203; CHECK-SD-NEXT:    addv s0, v0.4s
5204; CHECK-SD-NEXT:    fmov w0, s0
5205; CHECK-SD-NEXT:    ret
5206;
5207; CHECK-GI-LABEL: test_sdot_v64i8_double_nomla:
5208; CHECK-GI:       // %bb.0: // %entry
5209; CHECK-GI-NEXT:    movi v4.16b, #1
5210; CHECK-GI-NEXT:    movi v5.2d, #0000000000000000
5211; CHECK-GI-NEXT:    movi v6.2d, #0000000000000000
5212; CHECK-GI-NEXT:    movi v7.2d, #0000000000000000
5213; CHECK-GI-NEXT:    ldp q21, q22, [sp]
5214; CHECK-GI-NEXT:    movi v16.2d, #0000000000000000
5215; CHECK-GI-NEXT:    movi v17.2d, #0000000000000000
5216; CHECK-GI-NEXT:    movi v18.2d, #0000000000000000
5217; CHECK-GI-NEXT:    movi v19.2d, #0000000000000000
5218; CHECK-GI-NEXT:    movi v20.2d, #0000000000000000
5219; CHECK-GI-NEXT:    sdot v5.4s, v0.16b, v4.16b
5220; CHECK-GI-NEXT:    sdot v6.4s, v1.16b, v4.16b
5221; CHECK-GI-NEXT:    ldp q0, q1, [sp, #32]
5222; CHECK-GI-NEXT:    sdot v7.4s, v2.16b, v4.16b
5223; CHECK-GI-NEXT:    sdot v16.4s, v3.16b, v4.16b
5224; CHECK-GI-NEXT:    sdot v17.4s, v21.16b, v4.16b
5225; CHECK-GI-NEXT:    sdot v19.4s, v22.16b, v4.16b
5226; CHECK-GI-NEXT:    sdot v20.4s, v0.16b, v4.16b
5227; CHECK-GI-NEXT:    sdot v18.4s, v1.16b, v4.16b
5228; CHECK-GI-NEXT:    add v0.4s, v5.4s, v6.4s
5229; CHECK-GI-NEXT:    add v1.4s, v7.4s, v16.4s
5230; CHECK-GI-NEXT:    add v2.4s, v17.4s, v19.4s
5231; CHECK-GI-NEXT:    add v3.4s, v20.4s, v18.4s
5232; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
5233; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
5234; CHECK-GI-NEXT:    addv s0, v0.4s
5235; CHECK-GI-NEXT:    addv s1, v1.4s
5236; CHECK-GI-NEXT:    fmov w8, s0
5237; CHECK-GI-NEXT:    fmov w9, s1
5238; CHECK-GI-NEXT:    add w0, w8, w9
5239; CHECK-GI-NEXT:    ret
5240entry:
; Widening sum of %a.
5241  %az = sext <64 x i8> %a to <64 x i32>
5242  %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %az)
; Widening sum of %c.
5243  %cz = sext <64 x i8> %c to <64 x i32>
5244  %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %cz)
; Combine the two scalar reductions.
5245  %x = add i32 %r1, %r2
5246  ret i32 %x
5247}
5248
5249define i32 @test_usdot_v64i8(ptr nocapture readonly %a, ptr nocapture readonly %b, i32 %sum) {
5250; CHECK-SD-LABEL: test_usdot_v64i8:
5251; CHECK-SD:       // %bb.0: // %entry
5252; CHECK-SD-NEXT:    movi v0.2d, #0000000000000000
5253; CHECK-SD-NEXT:    movi v3.2d, #0000000000000000
5254; CHECK-SD-NEXT:    movi v4.2d, #0000000000000000
5255; CHECK-SD-NEXT:    movi v5.2d, #0000000000000000
5256; CHECK-SD-NEXT:    ldp q1, q2, [x0, #32]
5257; CHECK-SD-NEXT:    ldp q6, q7, [x1, #32]
5258; CHECK-SD-NEXT:    ldp q16, q17, [x0]
5259; CHECK-SD-NEXT:    ldp q18, q19, [x1]
5260; CHECK-SD-NEXT:    usdot v0.4s, v2.16b, v7.16b
5261; CHECK-SD-NEXT:    usdot v5.4s, v1.16b, v6.16b
5262; CHECK-SD-NEXT:    usdot v4.4s, v17.16b, v19.16b
5263; CHECK-SD-NEXT:    usdot v3.4s, v16.16b, v18.16b
5264; CHECK-SD-NEXT:    add v0.4s, v4.4s, v0.4s
5265; CHECK-SD-NEXT:    add v1.4s, v3.4s, v5.4s
5266; CHECK-SD-NEXT:    add v0.4s, v1.4s, v0.4s
5267; CHECK-SD-NEXT:    addv s0, v0.4s
5268; CHECK-SD-NEXT:    fmov w8, s0
5269; CHECK-SD-NEXT:    add w0, w8, w2
5270; CHECK-SD-NEXT:    ret
5271;
5272; CHECK-GI-LABEL: test_usdot_v64i8:
5273; CHECK-GI:       // %bb.0: // %entry
5274; CHECK-GI-NEXT:    stp d15, d14, [sp, #-64]! // 16-byte Folded Spill
5275; CHECK-GI-NEXT:    stp d13, d12, [sp, #16] // 16-byte Folded Spill
5276; CHECK-GI-NEXT:    stp d11, d10, [sp, #32] // 16-byte Folded Spill
5277; CHECK-GI-NEXT:    stp d9, d8, [sp, #48] // 16-byte Folded Spill
5278; CHECK-GI-NEXT:    .cfi_def_cfa_offset 64
5279; CHECK-GI-NEXT:    .cfi_offset b8, -8
5280; CHECK-GI-NEXT:    .cfi_offset b9, -16
5281; CHECK-GI-NEXT:    .cfi_offset b10, -24
5282; CHECK-GI-NEXT:    .cfi_offset b11, -32
5283; CHECK-GI-NEXT:    .cfi_offset b12, -40
5284; CHECK-GI-NEXT:    .cfi_offset b13, -48
5285; CHECK-GI-NEXT:    .cfi_offset b14, -56
5286; CHECK-GI-NEXT:    .cfi_offset b15, -64
5287; CHECK-GI-NEXT:    ldp q0, q1, [x1]
5288; CHECK-GI-NEXT:    ldp q21, q17, [x0]
5289; CHECK-GI-NEXT:    ldp q3, q19, [x1, #32]
5290; CHECK-GI-NEXT:    ldp q18, q4, [x0, #32]
5291; CHECK-GI-NEXT:    sshll v2.8h, v0.8b, #0
5292; CHECK-GI-NEXT:    sshll2 v5.8h, v0.16b, #0
5293; CHECK-GI-NEXT:    sshll v7.8h, v1.8b, #0
5294; CHECK-GI-NEXT:    sshll2 v22.8h, v1.16b, #0
5295; CHECK-GI-NEXT:    sshll v23.8h, v3.8b, #0
5296; CHECK-GI-NEXT:    sshll2 v24.8h, v3.16b, #0
5297; CHECK-GI-NEXT:    sshll v25.8h, v19.8b, #0
5298; CHECK-GI-NEXT:    sshll2 v26.8h, v19.16b, #0
5299; CHECK-GI-NEXT:    ushll v27.8h, v21.8b, #0
5300; CHECK-GI-NEXT:    ushll2 v28.8h, v21.16b, #0
5301; CHECK-GI-NEXT:    ushll v30.8h, v17.8b, #0
5302; CHECK-GI-NEXT:    ushll2 v17.8h, v17.16b, #0
5303; CHECK-GI-NEXT:    ushll v8.8h, v18.8b, #0
5304; CHECK-GI-NEXT:    ushll2 v18.8h, v18.16b, #0
5305; CHECK-GI-NEXT:    ushll v9.8h, v4.8b, #0
5306; CHECK-GI-NEXT:    ushll2 v4.8h, v4.16b, #0
5307; CHECK-GI-NEXT:    sshll v0.4s, v2.4h, #0
5308; CHECK-GI-NEXT:    sshll2 v6.4s, v2.8h, #0
5309; CHECK-GI-NEXT:    sshll v1.4s, v5.4h, #0
5310; CHECK-GI-NEXT:    sshll2 v16.4s, v5.8h, #0
5311; CHECK-GI-NEXT:    sshll v2.4s, v7.4h, #0
5312; CHECK-GI-NEXT:    sshll2 v20.4s, v7.8h, #0
5313; CHECK-GI-NEXT:    sshll v3.4s, v22.4h, #0
5314; CHECK-GI-NEXT:    sshll2 v22.4s, v22.8h, #0
5315; CHECK-GI-NEXT:    sshll v5.4s, v23.4h, #0
5316; CHECK-GI-NEXT:    sshll2 v23.4s, v23.8h, #0
5317; CHECK-GI-NEXT:    sshll v7.4s, v24.4h, #0
5318; CHECK-GI-NEXT:    sshll2 v24.4s, v24.8h, #0
5319; CHECK-GI-NEXT:    sshll v19.4s, v25.4h, #0
5320; CHECK-GI-NEXT:    sshll2 v25.4s, v25.8h, #0
5321; CHECK-GI-NEXT:    sshll v21.4s, v26.4h, #0
5322; CHECK-GI-NEXT:    sshll2 v26.4s, v26.8h, #0
5323; CHECK-GI-NEXT:    ushll v29.4s, v27.4h, #0
5324; CHECK-GI-NEXT:    ushll2 v27.4s, v27.8h, #0
5325; CHECK-GI-NEXT:    ushll v31.4s, v28.4h, #0
5326; CHECK-GI-NEXT:    ushll2 v28.4s, v28.8h, #0
5327; CHECK-GI-NEXT:    ushll v10.4s, v30.4h, #0
5328; CHECK-GI-NEXT:    ushll2 v30.4s, v30.8h, #0
5329; CHECK-GI-NEXT:    ushll v11.4s, v17.4h, #0
5330; CHECK-GI-NEXT:    ushll2 v17.4s, v17.8h, #0
5331; CHECK-GI-NEXT:    ushll2 v12.4s, v8.8h, #0
5332; CHECK-GI-NEXT:    ushll2 v13.4s, v18.8h, #0
5333; CHECK-GI-NEXT:    ushll2 v14.4s, v9.8h, #0
5334; CHECK-GI-NEXT:    ushll2 v15.4s, v4.8h, #0
5335; CHECK-GI-NEXT:    mul v6.4s, v6.4s, v27.4s
5336; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v28.4s
5337; CHECK-GI-NEXT:    mul v20.4s, v20.4s, v30.4s
5338; CHECK-GI-NEXT:    mul v17.4s, v22.4s, v17.4s
5339; CHECK-GI-NEXT:    ushll v8.4s, v8.4h, #0
5340; CHECK-GI-NEXT:    mul v22.4s, v23.4s, v12.4s
5341; CHECK-GI-NEXT:    mul v23.4s, v24.4s, v13.4s
5342; CHECK-GI-NEXT:    mul v24.4s, v25.4s, v14.4s
5343; CHECK-GI-NEXT:    ldp d13, d12, [sp, #16] // 16-byte Folded Reload
5344; CHECK-GI-NEXT:    mul v25.4s, v26.4s, v15.4s
5345; CHECK-GI-NEXT:    ushll v18.4s, v18.4h, #0
5346; CHECK-GI-NEXT:    ushll v26.4s, v9.4h, #0
5347; CHECK-GI-NEXT:    ushll v4.4s, v4.4h, #0
5348; CHECK-GI-NEXT:    mla v6.4s, v0.4s, v29.4s
5349; CHECK-GI-NEXT:    mla v16.4s, v1.4s, v31.4s
5350; CHECK-GI-NEXT:    mla v20.4s, v2.4s, v10.4s
5351; CHECK-GI-NEXT:    mla v17.4s, v3.4s, v11.4s
5352; CHECK-GI-NEXT:    ldp d11, d10, [sp, #32] // 16-byte Folded Reload
5353; CHECK-GI-NEXT:    mla v22.4s, v5.4s, v8.4s
5354; CHECK-GI-NEXT:    ldp d9, d8, [sp, #48] // 16-byte Folded Reload
5355; CHECK-GI-NEXT:    mla v23.4s, v7.4s, v18.4s
5356; CHECK-GI-NEXT:    mla v24.4s, v19.4s, v26.4s
5357; CHECK-GI-NEXT:    mla v25.4s, v21.4s, v4.4s
5358; CHECK-GI-NEXT:    add v0.4s, v6.4s, v16.4s
5359; CHECK-GI-NEXT:    add v1.4s, v20.4s, v17.4s
5360; CHECK-GI-NEXT:    add v2.4s, v22.4s, v23.4s
5361; CHECK-GI-NEXT:    add v3.4s, v24.4s, v25.4s
5362; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
5363; CHECK-GI-NEXT:    add v1.4s, v2.4s, v3.4s
5364; CHECK-GI-NEXT:    add v0.4s, v0.4s, v1.4s
5365; CHECK-GI-NEXT:    addv s0, v0.4s
5366; CHECK-GI-NEXT:    fmov w8, s0
5367; CHECK-GI-NEXT:    add w0, w8, w2
5368; CHECK-GI-NEXT:    ldp d15, d14, [sp], #64 // 16-byte Folded Reload
5369; CHECK-GI-NEXT:    ret
5370entry:
5371  %0 = load <64 x i8>, ptr %a
5372  %1 = zext <64 x i8> %0 to <64 x i32>
5373  %2 = load <64 x i8>, ptr %b
5374  %3 = sext <64 x i8> %2 to <64 x i32>
5375  %4 = mul nsw <64 x i32> %3, %1
5376  %5 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %4)
5377  %op.extra = add nsw i32 %5, %sum
5378  ret i32 %op.extra
5379}
5380
; Sum of two independent mixed-sign dot products over <64 x i8> operands:
;   reduce.add(zext(%a) * sext(%b)) + reduce.add(zext(%c) * sext(%d)).
; The assertions for the run without -global-isel expect eight usdot
; instructions; the assertions for the -global-isel run expect an
; ushll/sshll widening followed by mul/mla and no usdot.
5381define i32 @test_usdot_v64i8_double(<64 x i8> %a, <64 x i8> %b, <64 x i8> %c, <64 x i8> %d) {
5382; CHECK-SD-LABEL: test_usdot_v64i8_double:
5383; CHECK-SD:       // %bb.0: // %entry
5384; CHECK-SD-NEXT:    movi v18.2d, #0000000000000000
5385; CHECK-SD-NEXT:    movi v21.2d, #0000000000000000
5386; CHECK-SD-NEXT:    movi v22.2d, #0000000000000000
5387; CHECK-SD-NEXT:    movi v23.2d, #0000000000000000
5388; CHECK-SD-NEXT:    ldp q16, q17, [sp, #64]
5389; CHECK-SD-NEXT:    movi v24.2d, #0000000000000000
5390; CHECK-SD-NEXT:    movi v25.2d, #0000000000000000
5391; CHECK-SD-NEXT:    movi v26.2d, #0000000000000000
5392; CHECK-SD-NEXT:    movi v27.2d, #0000000000000000
5393; CHECK-SD-NEXT:    ldp q19, q20, [sp, #96]
5394; CHECK-SD-NEXT:    usdot v18.4s, v3.16b, v7.16b
5395; CHECK-SD-NEXT:    ldp q3, q7, [sp, #32]
5396; CHECK-SD-NEXT:    usdot v21.4s, v1.16b, v5.16b
5397; CHECK-SD-NEXT:    ldp q1, q5, [sp]
5398; CHECK-SD-NEXT:    usdot v22.4s, v2.16b, v6.16b
5399; CHECK-SD-NEXT:    usdot v23.4s, v0.16b, v4.16b
5400; CHECK-SD-NEXT:    usdot v24.4s, v7.16b, v20.16b
5401; CHECK-SD-NEXT:    usdot v27.4s, v3.16b, v19.16b
5402; CHECK-SD-NEXT:    usdot v26.4s, v5.16b, v17.16b
5403; CHECK-SD-NEXT:    usdot v25.4s, v1.16b, v16.16b
5404; CHECK-SD-NEXT:    add v0.4s, v21.4s, v18.4s
5405; CHECK-SD-NEXT:    add v1.4s, v23.4s, v22.4s
5406; CHECK-SD-NEXT:    add v2.4s, v26.4s, v24.4s
5407; CHECK-SD-NEXT:    add v3.4s, v25.4s, v27.4s
5408; CHECK-SD-NEXT:    add v0.4s, v1.4s, v0.4s
5409; CHECK-SD-NEXT:    add v1.4s, v3.4s, v2.4s
5410; CHECK-SD-NEXT:    add v0.4s, v0.4s, v1.4s
5411; CHECK-SD-NEXT:    addv s0, v0.4s
5412; CHECK-SD-NEXT:    fmov w0, s0
5413; CHECK-SD-NEXT:    ret
5414;
5415; CHECK-GI-LABEL: test_usdot_v64i8_double:
5416; CHECK-GI:       // %bb.0: // %entry
5417; CHECK-GI-NEXT:    sub sp, sp, #304
5418; CHECK-GI-NEXT:    stp d15, d14, [sp, #224] // 16-byte Folded Spill
5419; CHECK-GI-NEXT:    stp d13, d12, [sp, #240] // 16-byte Folded Spill
5420; CHECK-GI-NEXT:    stp d11, d10, [sp, #256] // 16-byte Folded Spill
5421; CHECK-GI-NEXT:    stp d9, d8, [sp, #272] // 16-byte Folded Spill
5422; CHECK-GI-NEXT:    str x29, [sp, #288] // 8-byte Folded Spill
5423; CHECK-GI-NEXT:    .cfi_def_cfa_offset 304
5424; CHECK-GI-NEXT:    .cfi_offset w29, -16
5425; CHECK-GI-NEXT:    .cfi_offset b8, -24
5426; CHECK-GI-NEXT:    .cfi_offset b9, -32
5427; CHECK-GI-NEXT:    .cfi_offset b10, -40
5428; CHECK-GI-NEXT:    .cfi_offset b11, -48
5429; CHECK-GI-NEXT:    .cfi_offset b12, -56
5430; CHECK-GI-NEXT:    .cfi_offset b13, -64
5431; CHECK-GI-NEXT:    .cfi_offset b14, -72
5432; CHECK-GI-NEXT:    .cfi_offset b15, -80
5433; CHECK-GI-NEXT:    ushll v17.8h, v0.8b, #0
5434; CHECK-GI-NEXT:    ushll2 v0.8h, v0.16b, #0
5435; CHECK-GI-NEXT:    ldr x29, [sp, #288] // 8-byte Folded Reload
5436; CHECK-GI-NEXT:    mov v20.16b, v3.16b
5437; CHECK-GI-NEXT:    ushll v16.8h, v1.8b, #0
5438; CHECK-GI-NEXT:    ushll2 v18.8h, v1.16b, #0
5439; CHECK-GI-NEXT:    ushll v26.8h, v2.8b, #0
5440; CHECK-GI-NEXT:    ldp q27, q28, [sp, #304]
5441; CHECK-GI-NEXT:    ushll2 v29.8h, v2.16b, #0
5442; CHECK-GI-NEXT:    ushll v2.4s, v17.4h, #0
5443; CHECK-GI-NEXT:    ushll v1.4s, v0.4h, #0
5444; CHECK-GI-NEXT:    sshll v8.8h, v4.8b, #0
5445; CHECK-GI-NEXT:    ldp q23, q21, [sp, #368]
5446; CHECK-GI-NEXT:    sshll2 v9.8h, v4.16b, #0
5447; CHECK-GI-NEXT:    sshll2 v11.8h, v5.16b, #0
5448; CHECK-GI-NEXT:    mov v25.16b, v7.16b
5449; CHECK-GI-NEXT:    ushll2 v19.4s, v17.8h, #0
5450; CHECK-GI-NEXT:    stp q1, q2, [sp, #192] // 32-byte Folded Spill
5451; CHECK-GI-NEXT:    ushll2 v3.4s, v0.8h, #0
5452; CHECK-GI-NEXT:    ushll2 v17.4s, v18.8h, #0
5453; CHECK-GI-NEXT:    ldp q24, q22, [sp, #336]
5454; CHECK-GI-NEXT:    sshll v10.8h, v5.8b, #0
5455; CHECK-GI-NEXT:    sshll v12.8h, v6.8b, #0
5456; CHECK-GI-NEXT:    sshll2 v13.8h, v6.16b, #0
5457; CHECK-GI-NEXT:    mov v2.16b, v20.16b
5458; CHECK-GI-NEXT:    sshll2 v0.4s, v8.8h, #0
5459; CHECK-GI-NEXT:    sshll2 v4.4s, v9.8h, #0
5460; CHECK-GI-NEXT:    sshll2 v6.4s, v11.8h, #0
5461; CHECK-GI-NEXT:    ushll2 v7.4s, v16.8h, #0
5462; CHECK-GI-NEXT:    ushll2 v31.4s, v29.8h, #0
5463; CHECK-GI-NEXT:    sshll2 v5.4s, v10.8h, #0
5464; CHECK-GI-NEXT:    sshll2 v1.4s, v13.8h, #0
5465; CHECK-GI-NEXT:    ushll2 v30.4s, v26.8h, #0
5466; CHECK-GI-NEXT:    ushll v14.8h, v2.8b, #0
5467; CHECK-GI-NEXT:    mul v20.4s, v19.4s, v0.4s
5468; CHECK-GI-NEXT:    mul v19.4s, v3.4s, v4.4s
5469; CHECK-GI-NEXT:    sshll v0.8h, v25.8b, #0
5470; CHECK-GI-NEXT:    mul v4.4s, v17.4s, v6.4s
5471; CHECK-GI-NEXT:    sshll2 v15.4s, v12.8h, #0
5472; CHECK-GI-NEXT:    ldp q17, q3, [sp, #400]
5473; CHECK-GI-NEXT:    mul v5.4s, v7.4s, v5.4s
5474; CHECK-GI-NEXT:    mul v7.4s, v31.4s, v1.4s
5475; CHECK-GI-NEXT:    ushll2 v31.8h, v2.16b, #0
5476; CHECK-GI-NEXT:    sshll2 v25.8h, v25.16b, #0
5477; CHECK-GI-NEXT:    sshll2 v1.4s, v0.8h, #0
5478; CHECK-GI-NEXT:    ushll v2.4s, v14.4h, #0
5479; CHECK-GI-NEXT:    sshll v0.4s, v0.4h, #0
5480; CHECK-GI-NEXT:    str q3, [sp, #96] // 16-byte Folded Spill
5481; CHECK-GI-NEXT:    ushll2 v3.4s, v14.8h, #0
5482; CHECK-GI-NEXT:    mul v6.4s, v30.4s, v15.4s
5483; CHECK-GI-NEXT:    str q31, [sp, #160] // 16-byte Folded Spill
5484; CHECK-GI-NEXT:    ushll v30.4s, v26.4h, #0
5485; CHECK-GI-NEXT:    sshll v26.4s, v8.4h, #0
5486; CHECK-GI-NEXT:    ushll v14.8h, v27.8b, #0
5487; CHECK-GI-NEXT:    ushll v15.4s, v29.4h, #0
5488; CHECK-GI-NEXT:    sshll v29.4s, v9.4h, #0
5489; CHECK-GI-NEXT:    mul v1.4s, v3.4s, v1.4s
5490; CHECK-GI-NEXT:    ushll2 v3.4s, v31.8h, #0
5491; CHECK-GI-NEXT:    ushll v31.8h, v28.8b, #0
5492; CHECK-GI-NEXT:    ushll v16.4s, v16.4h, #0
5493; CHECK-GI-NEXT:    sshll v8.4s, v10.4h, #0
5494; CHECK-GI-NEXT:    sshll v9.4s, v11.4h, #0
5495; CHECK-GI-NEXT:    sshll v10.4s, v12.4h, #0
5496; CHECK-GI-NEXT:    sshll v11.4s, v13.4h, #0
5497; CHECK-GI-NEXT:    ushll v18.4s, v18.4h, #0
5498; CHECK-GI-NEXT:    stp q3, q25, [sp, #112] // 32-byte Folded Spill
5499; CHECK-GI-NEXT:    ldr q3, [sp, #208] // 16-byte Folded Reload
5500; CHECK-GI-NEXT:    ushll2 v28.8h, v28.16b, #0
5501; CHECK-GI-NEXT:    mla v1.4s, v2.4s, v0.4s
5502; CHECK-GI-NEXT:    ushll2 v0.4s, v31.8h, #0
5503; CHECK-GI-NEXT:    mla v5.4s, v16.4s, v8.4s
5504; CHECK-GI-NEXT:    mla v20.4s, v3.4s, v26.4s
5505; CHECK-GI-NEXT:    sshll2 v3.4s, v25.8h, #0
5506; CHECK-GI-NEXT:    mla v6.4s, v30.4s, v10.4s
5507; CHECK-GI-NEXT:    mla v7.4s, v15.4s, v11.4s
5508; CHECK-GI-NEXT:    sshll v25.8h, v23.8b, #0
5509; CHECK-GI-NEXT:    mla v4.4s, v18.4s, v9.4s
5510; CHECK-GI-NEXT:    ushll v30.8h, v22.8b, #0
5511; CHECK-GI-NEXT:    ushll2 v26.8h, v22.16b, #0
5512; CHECK-GI-NEXT:    sshll v22.8h, v21.8b, #0
5513; CHECK-GI-NEXT:    str q3, [sp, #32] // 16-byte Folded Spill
5514; CHECK-GI-NEXT:    ldr q3, [sp, #192] // 16-byte Folded Reload
5515; CHECK-GI-NEXT:    ushll2 v8.8h, v27.16b, #0
5516; CHECK-GI-NEXT:    str q1, [sp, #48] // 16-byte Folded Spill
5517; CHECK-GI-NEXT:    ldr q9, [sp, #32] // 16-byte Folded Reload
5518; CHECK-GI-NEXT:    ushll2 v1.4s, v14.8h, #0
5519; CHECK-GI-NEXT:    stp q7, q6, [sp, #64] // 32-byte Folded Spill
5520; CHECK-GI-NEXT:    mla v19.4s, v3.4s, v29.4s
5521; CHECK-GI-NEXT:    sshll2 v7.4s, v25.8h, #0
5522; CHECK-GI-NEXT:    str q5, [sp, #176] // 16-byte Folded Spill
5523; CHECK-GI-NEXT:    ushll v29.8h, v24.8b, #0
5524; CHECK-GI-NEXT:    ushll2 v27.8h, v24.16b, #0
5525; CHECK-GI-NEXT:    stp q0, q1, [sp] // 32-byte Folded Spill
5526; CHECK-GI-NEXT:    ldp q0, q16, [sp, #96] // 32-byte Folded Reload
5527; CHECK-GI-NEXT:    str q4, [sp, #144] // 16-byte Folded Spill
5528; CHECK-GI-NEXT:    sshll2 v24.8h, v23.16b, #0
5529; CHECK-GI-NEXT:    ushll2 v18.4s, v26.8h, #0
5530; CHECK-GI-NEXT:    stp q19, q20, [sp, #192] // 32-byte Folded Spill
5531; CHECK-GI-NEXT:    sshll2 v20.8h, v21.16b, #0
5532; CHECK-GI-NEXT:    sshll v21.8h, v17.8b, #0
5533; CHECK-GI-NEXT:    sshll2 v19.8h, v17.16b, #0
5534; CHECK-GI-NEXT:    sshll2 v17.8h, v0.16b, #0
5535; CHECK-GI-NEXT:    mul v16.4s, v16.4s, v9.4s
5536; CHECK-GI-NEXT:    ldr q9, [sp, #16] // 16-byte Folded Reload
5537; CHECK-GI-NEXT:    sshll v23.8h, v0.8b, #0
5538; CHECK-GI-NEXT:    sshll2 v2.4s, v22.8h, #0
5539; CHECK-GI-NEXT:    ushll2 v12.4s, v27.8h, #0
5540; CHECK-GI-NEXT:    ushll v26.4s, v26.4h, #0
5541; CHECK-GI-NEXT:    ushll2 v10.4s, v28.8h, #0
5542; CHECK-GI-NEXT:    sshll2 v0.4s, v17.8h, #0
5543; CHECK-GI-NEXT:    mul v7.4s, v9.4s, v7.4s
5544; CHECK-GI-NEXT:    ldr q9, [sp] // 16-byte Folded Reload
5545; CHECK-GI-NEXT:    sshll2 v5.4s, v19.8h, #0
5546; CHECK-GI-NEXT:    sshll v17.4s, v17.4h, #0
5547; CHECK-GI-NEXT:    sshll2 v3.4s, v20.8h, #0
5548; CHECK-GI-NEXT:    mul v2.4s, v9.4s, v2.4s
5549; CHECK-GI-NEXT:    ldr q9, [sp, #128] // 16-byte Folded Reload
5550; CHECK-GI-NEXT:    ushll2 v15.4s, v8.8h, #0
5551; CHECK-GI-NEXT:    mul v0.4s, v18.4s, v0.4s
5552; CHECK-GI-NEXT:    ldr q18, [sp, #160] // 16-byte Folded Reload
5553; CHECK-GI-NEXT:    ushll2 v11.4s, v29.8h, #0
5554; CHECK-GI-NEXT:    sshll v9.4s, v9.4h, #0
5555; CHECK-GI-NEXT:    ushll2 v13.4s, v30.8h, #0
5556; CHECK-GI-NEXT:    sshll2 v1.4s, v24.8h, #0
5557; CHECK-GI-NEXT:    ushll v18.4s, v18.4h, #0
5558; CHECK-GI-NEXT:    sshll2 v4.4s, v21.8h, #0
5559; CHECK-GI-NEXT:    sshll2 v6.4s, v23.8h, #0
5560; CHECK-GI-NEXT:    mul v5.4s, v12.4s, v5.4s
5561; CHECK-GI-NEXT:    ushll v27.4s, v27.4h, #0
5562; CHECK-GI-NEXT:    sshll v19.4s, v19.4h, #0
5563; CHECK-GI-NEXT:    mla v0.4s, v26.4s, v17.4s
5564; CHECK-GI-NEXT:    mul v3.4s, v10.4s, v3.4s
5565; CHECK-GI-NEXT:    mul v1.4s, v15.4s, v1.4s
5566; CHECK-GI-NEXT:    mla v16.4s, v18.4s, v9.4s
5567; CHECK-GI-NEXT:    ldp q18, q17, [sp, #192] // 32-byte Folded Reload
5568; CHECK-GI-NEXT:    mul v4.4s, v11.4s, v4.4s
5569; CHECK-GI-NEXT:    mul v6.4s, v13.4s, v6.4s
5570; CHECK-GI-NEXT:    ushll v28.4s, v28.4h, #0
5571; CHECK-GI-NEXT:    ldp d13, d12, [sp, #240] // 16-byte Folded Reload
5572; CHECK-GI-NEXT:    sshll v20.4s, v20.4h, #0
5573; CHECK-GI-NEXT:    ushll v10.4s, v14.4h, #0
5574; CHECK-GI-NEXT:    ldp d15, d14, [sp, #224] // 16-byte Folded Reload
5575; CHECK-GI-NEXT:    ushll v8.4s, v8.4h, #0
5576; CHECK-GI-NEXT:    ushll v31.4s, v31.4h, #0
5577; CHECK-GI-NEXT:    ushll v29.4s, v29.4h, #0
5578; CHECK-GI-NEXT:    ushll v30.4s, v30.4h, #0
5579; CHECK-GI-NEXT:    sshll v25.4s, v25.4h, #0
5580; CHECK-GI-NEXT:    sshll v24.4s, v24.4h, #0
5581; CHECK-GI-NEXT:    sshll v22.4s, v22.4h, #0
5582; CHECK-GI-NEXT:    sshll v21.4s, v21.4h, #0
5583; CHECK-GI-NEXT:    sshll v23.4s, v23.4h, #0
5584; CHECK-GI-NEXT:    mla v5.4s, v27.4s, v19.4s
5585; CHECK-GI-NEXT:    ldr q19, [sp, #144] // 16-byte Folded Reload
5586; CHECK-GI-NEXT:    add v17.4s, v17.4s, v18.4s
5587; CHECK-GI-NEXT:    ldr q18, [sp, #176] // 16-byte Folded Reload
5588; CHECK-GI-NEXT:    mla v3.4s, v28.4s, v20.4s
5589; CHECK-GI-NEXT:    mla v7.4s, v10.4s, v25.4s
5590; CHECK-GI-NEXT:    ldp d11, d10, [sp, #256] // 16-byte Folded Reload
5591; CHECK-GI-NEXT:    mla v1.4s, v8.4s, v24.4s
5592; CHECK-GI-NEXT:    ldp d9, d8, [sp, #272] // 16-byte Folded Reload
5593; CHECK-GI-NEXT:    add v18.4s, v18.4s, v19.4s
5594; CHECK-GI-NEXT:    ldp q20, q19, [sp, #64] // 32-byte Folded Reload
5595; CHECK-GI-NEXT:    mla v2.4s, v31.4s, v22.4s
5596; CHECK-GI-NEXT:    mla v4.4s, v29.4s, v21.4s
5597; CHECK-GI-NEXT:    mla v6.4s, v30.4s, v23.4s
5598; CHECK-GI-NEXT:    add v1.4s, v7.4s, v1.4s
5599; CHECK-GI-NEXT:    add v19.4s, v19.4s, v20.4s
5600; CHECK-GI-NEXT:    ldr q20, [sp, #48] // 16-byte Folded Reload
5601; CHECK-GI-NEXT:    add v2.4s, v2.4s, v3.4s
5602; CHECK-GI-NEXT:    add v16.4s, v20.4s, v16.4s
5603; CHECK-GI-NEXT:    add v3.4s, v4.4s, v5.4s
5604; CHECK-GI-NEXT:    add v0.4s, v6.4s, v0.4s
5605; CHECK-GI-NEXT:    add v4.4s, v17.4s, v18.4s
5606; CHECK-GI-NEXT:    add v1.4s, v1.4s, v2.4s
5607; CHECK-GI-NEXT:    add v5.4s, v19.4s, v16.4s
5608; CHECK-GI-NEXT:    add v0.4s, v3.4s, v0.4s
5609; CHECK-GI-NEXT:    add v2.4s, v4.4s, v5.4s
5610; CHECK-GI-NEXT:    add v0.4s, v1.4s, v0.4s
5611; CHECK-GI-NEXT:    addv s1, v2.4s
5612; CHECK-GI-NEXT:    addv s0, v0.4s
5613; CHECK-GI-NEXT:    fmov w8, s1
5614; CHECK-GI-NEXT:    fmov w9, s0
5615; CHECK-GI-NEXT:    add w0, w8, w9
5616; CHECK-GI-NEXT:    add sp, sp, #304
5617; CHECK-GI-NEXT:    ret
5618entry:
  ; First dot product: %a is zero-extended and %b is sign-extended before the
  ; element-wise multiply. Despite the "z" suffix, %bz (and %dz below) hold
  ; sext results, not zext results.
  ; NOTE(review): nuw on a zext*sext product is violated whenever a lane of the
  ; sext operand is negative and the matching zext lane is nonzero - presumably
  ; the flags are only here to exercise the pattern match; confirm.
5619  %az = zext <64 x i8> %a to <64 x i32>
5620  %bz = sext <64 x i8> %b to <64 x i32>
5621  %m1 = mul nuw nsw <64 x i32> %az, %bz
5622  %r1 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m1)
  ; Second dot product, same zext-by-sext shape, on %c and %d.
5623  %cz = zext <64 x i8> %c to <64 x i32>
5624  %dz = sext <64 x i8> %d to <64 x i32>
5625  %m2 = mul nuw nsw <64 x i32> %cz, %dz
5626  %r2 = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %m2)
  ; The two scalar reductions are combined with a plain (flag-free) add.
5627  %x = add i32 %r1, %r2
5628  ret i32 %x
5629}
5630