; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=aarch64 -mattr=+sve2,+i8mm %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-I8MM
; RUN: llc -mtriple=aarch64 -mattr=+sve2 %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM

define <vscale x 4 x i32> @udot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: udot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @udot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: udot_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    udot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 4 x i32> @sdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: sdot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @sdot_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: sdot_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 4 x i32> @usdot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: usdot:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    usdot z0.s, z1.b, z2.b
; CHECK-I8MM-NEXT:    ret
;
; CHECK-NOI8MM-LABEL: usdot:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    uunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT:    sunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT:    uunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT:    sunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT:    ptrue p0.s
; CHECK-NOI8MM-NEXT:    uunpklo z5.s, z3.h
; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z1.h
; CHECK-NOI8MM-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
; CHECK-NOI8MM-NEXT:    mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
; CHECK-NOI8MM-NEXT:    movprfx z1, z3
; CHECK-NOI8MM-NEXT:    mla z1.s, p0/m, z7.s, z24.s
; CHECK-NOI8MM-NEXT:    add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT:    ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @sudot(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: sudot:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    usdot z0.s, z2.b, z1.b
; CHECK-I8MM-NEXT:    ret
;
; CHECK-NOI8MM-LABEL: sudot:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    sunpklo z3.h, z1.b
; CHECK-NOI8MM-NEXT:    uunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT:    sunpkhi z1.h, z1.b
; CHECK-NOI8MM-NEXT:    uunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT:    ptrue p0.s
; CHECK-NOI8MM-NEXT:    sunpklo z5.s, z3.h
; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z1.h
; CHECK-NOI8MM-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z5.s, z6.s
; CHECK-NOI8MM-NEXT:    mul z3.s, z3.s, z4.s
; CHECK-NOI8MM-NEXT:    mla z0.s, p0/m, z1.s, z2.s
; CHECK-NOI8MM-NEXT:    movprfx z1, z3
; CHECK-NOI8MM-NEXT:    mla z1.s, p0/m, z7.s, z24.s
; CHECK-NOI8MM-NEXT:    add z0.s, z1.s, z0.s
; CHECK-NOI8MM-NEXT:    ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mult = mul nuw nsw <vscale x 16 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i64> @udot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-LABEL: udot_8to64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z4.s, #0 // =0x0
; CHECK-NEXT:    udot z4.s, z2.b, z3.b
; CHECK-NEXT:    sunpklo z2.d, z4.s
; CHECK-NEXT:    sunpkhi z3.d, z4.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z3.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
; CHECK-LABEL: sdot_8to64:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z4.s, #0 // =0x0
; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
; CHECK-NEXT:    sunpklo z2.d, z4.s
; CHECK-NEXT:    sunpkhi z3.d, z4.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z3.d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @usdot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b){
; CHECK-I8MM-LABEL: usdot_8to64:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    mov z4.s, #0 // =0x0
; CHECK-I8MM-NEXT:    usdot z4.s, z2.b, z3.b
; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
; CHECK-I8MM-NEXT:    ret
;
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NOI8MM-NEXT:    addvl sp, sp, #-2
; CHECK-NOI8MM-NEXT:    str z9, [sp] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NOI8MM-NEXT:    .cfi_offset w29, -16
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NOI8MM-NEXT:    uunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT:    sunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT:    uunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT:    sunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT:    ptrue p0.d
; CHECK-NOI8MM-NEXT:    uunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT:    uunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT:    sunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT:    sunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT:    uunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT:    sunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT:    sunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT:    uunpkhi z26.d, z6.s
; CHECK-NOI8MM-NEXT:    uunpklo z6.d, z6.s
; CHECK-NOI8MM-NEXT:    uunpklo z27.d, z4.s
; CHECK-NOI8MM-NEXT:    sunpklo z28.d, z7.s
; CHECK-NOI8MM-NEXT:    sunpklo z29.d, z5.s
; CHECK-NOI8MM-NEXT:    uunpkhi z4.d, z4.s
; CHECK-NOI8MM-NEXT:    sunpkhi z7.d, z7.s
; CHECK-NOI8MM-NEXT:    sunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT:    uunpkhi z30.d, z24.s
; CHECK-NOI8MM-NEXT:    uunpkhi z31.d, z2.s
; CHECK-NOI8MM-NEXT:    uunpklo z24.d, z24.s
; CHECK-NOI8MM-NEXT:    uunpklo z2.d, z2.s
; CHECK-NOI8MM-NEXT:    sunpkhi z8.d, z25.s
; CHECK-NOI8MM-NEXT:    sunpklo z25.d, z25.s
; CHECK-NOI8MM-NEXT:    sunpklo z9.d, z3.s
; CHECK-NOI8MM-NEXT:    mul z27.d, z27.d, z29.d
; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT:    sunpkhi z3.d, z3.s
; CHECK-NOI8MM-NEXT:    mul z4.d, z4.d, z5.d
; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z7.d
; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z9.d
; CHECK-NOI8MM-NEXT:    movprfx z2, z27
; CHECK-NOI8MM-NEXT:    mla z2.d, p0/m, z24.d, z25.d
; CHECK-NOI8MM-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z31.d, z3.d
; CHECK-NOI8MM-NEXT:    movprfx z3, z4
; CHECK-NOI8MM-NEXT:    mla z3.d, p0/m, z30.d, z8.d
; CHECK-NOI8MM-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT:    add z0.d, z2.d, z0.d
; CHECK-NOI8MM-NEXT:    add z1.d, z3.d, z1.d
; CHECK-NOI8MM-NEXT:    addvl sp, sp, #2
; CHECK-NOI8MM-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NOI8MM-NEXT:    ret
entry:
  %a.wide = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = sext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sudot_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
; CHECK-I8MM-LABEL: sudot_8to64:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    mov z4.s, #0 // =0x0
; CHECK-I8MM-NEXT:    usdot z4.s, z3.b, z2.b
; CHECK-I8MM-NEXT:    sunpklo z2.d, z4.s
; CHECK-I8MM-NEXT:    sunpkhi z3.d, z4.s
; CHECK-I8MM-NEXT:    add z0.d, z0.d, z2.d
; CHECK-I8MM-NEXT:    add z1.d, z1.d, z3.d
; CHECK-I8MM-NEXT:    ret
;
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NOI8MM-NEXT:    addvl sp, sp, #-2
; CHECK-NOI8MM-NEXT:    str z9, [sp] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT:    str z8, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
; CHECK-NOI8MM-NEXT:    .cfi_offset w29, -16
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NOI8MM-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NOI8MM-NEXT:    sunpklo z4.h, z2.b
; CHECK-NOI8MM-NEXT:    uunpklo z5.h, z3.b
; CHECK-NOI8MM-NEXT:    sunpkhi z2.h, z2.b
; CHECK-NOI8MM-NEXT:    uunpkhi z3.h, z3.b
; CHECK-NOI8MM-NEXT:    ptrue p0.d
; CHECK-NOI8MM-NEXT:    sunpklo z6.s, z4.h
; CHECK-NOI8MM-NEXT:    sunpkhi z4.s, z4.h
; CHECK-NOI8MM-NEXT:    uunpklo z7.s, z5.h
; CHECK-NOI8MM-NEXT:    uunpkhi z5.s, z5.h
; CHECK-NOI8MM-NEXT:    sunpklo z24.s, z2.h
; CHECK-NOI8MM-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NOI8MM-NEXT:    uunpklo z25.s, z3.h
; CHECK-NOI8MM-NEXT:    uunpkhi z3.s, z3.h
; CHECK-NOI8MM-NEXT:    sunpkhi z26.d, z6.s
; CHECK-NOI8MM-NEXT:    sunpklo z6.d, z6.s
; CHECK-NOI8MM-NEXT:    sunpklo z27.d, z4.s
; CHECK-NOI8MM-NEXT:    uunpklo z28.d, z7.s
; CHECK-NOI8MM-NEXT:    uunpklo z29.d, z5.s
; CHECK-NOI8MM-NEXT:    sunpkhi z4.d, z4.s
; CHECK-NOI8MM-NEXT:    uunpkhi z7.d, z7.s
; CHECK-NOI8MM-NEXT:    uunpkhi z5.d, z5.s
; CHECK-NOI8MM-NEXT:    sunpkhi z30.d, z24.s
; CHECK-NOI8MM-NEXT:    sunpkhi z31.d, z2.s
; CHECK-NOI8MM-NEXT:    sunpklo z24.d, z24.s
; CHECK-NOI8MM-NEXT:    sunpklo z2.d, z2.s
; CHECK-NOI8MM-NEXT:    uunpkhi z8.d, z25.s
; CHECK-NOI8MM-NEXT:    uunpklo z25.d, z25.s
; CHECK-NOI8MM-NEXT:    uunpklo z9.d, z3.s
; CHECK-NOI8MM-NEXT:    mul z27.d, z27.d, z29.d
; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z6.d, z28.d
; CHECK-NOI8MM-NEXT:    uunpkhi z3.d, z3.s
; CHECK-NOI8MM-NEXT:    mul z4.d, z4.d, z5.d
; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z26.d, z7.d
; CHECK-NOI8MM-NEXT:    mla z0.d, p0/m, z2.d, z9.d
; CHECK-NOI8MM-NEXT:    movprfx z2, z27
; CHECK-NOI8MM-NEXT:    mla z2.d, p0/m, z24.d, z25.d
; CHECK-NOI8MM-NEXT:    ldr z9, [sp] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT:    mla z1.d, p0/m, z31.d, z3.d
; CHECK-NOI8MM-NEXT:    movprfx z3, z4
; CHECK-NOI8MM-NEXT:    mla z3.d, p0/m, z30.d, z8.d
; CHECK-NOI8MM-NEXT:    ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NOI8MM-NEXT:    add z0.d, z2.d, z0.d
; CHECK-NOI8MM-NEXT:    add z1.d, z3.d, z1.d
; CHECK-NOI8MM-NEXT:    addvl sp, sp, #2
; CHECK-NOI8MM-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NOI8MM-NEXT:    ret
entry:
  %a.wide = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %b.wide = zext <vscale x 16 x i8> %b to <vscale x 16 x i64>
  %mult = mul nuw nsw <vscale x 16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(
  <vscale x 4 x i64> %acc, <vscale x 16 x i64> %mult)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i32> @udot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
; CHECK-LABEL: udot_no_bin_op:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.b, #1 // =0x1
; CHECK-NEXT:    udot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 4 x i32> @sdot_no_bin_op(<vscale x 4 x i32> %acc, <vscale x 16 x i8> %a){
; CHECK-LABEL: sdot_no_bin_op:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z2.b, #1 // =0x1
; CHECK-NEXT:    sdot z0.s, z1.b, z2.b
; CHECK-NEXT:    ret
  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> %acc, <vscale x 16 x i32> %a.ext)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @udot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
; CHECK-LABEL: udot_no_bin_op_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z2.h, #1 // =0x1
; CHECK-NEXT:    udot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @sdot_no_bin_op_wide(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b){
; CHECK-LABEL: sdot_no_bin_op_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    mov z2.h, #1 // =0x1
; CHECK-NEXT:    sdot z0.d, z1.h, z2.h
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %a.wide)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 4 x i64> @udot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
; CHECK-LABEL: udot_no_bin_op_8to64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z3.b, #1 // =0x1
; CHECK-NEXT:    mov z4.s, #0 // =0x0
; CHECK-NEXT:    udot z4.s, z2.b, z3.b
; CHECK-NEXT:    sunpklo z2.d, z4.s
; CHECK-NEXT:    sunpkhi z3.d, z4.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z3.d
; CHECK-NEXT:    ret
  %a.ext = zext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i64> @sdot_no_bin_op_8to64(<vscale x 4 x i64> %acc, <vscale x 16 x i8> %a){
; CHECK-LABEL: sdot_no_bin_op_8to64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z3.b, #1 // =0x1
; CHECK-NEXT:    mov z4.s, #0 // =0x0
; CHECK-NEXT:    sdot z4.s, z2.b, z3.b
; CHECK-NEXT:    sunpklo z2.d, z4.s
; CHECK-NEXT:    sunpkhi z3.d, z4.s
; CHECK-NEXT:    add z0.d, z0.d, z2.d
; CHECK-NEXT:    add z1.d, z1.d, z3.d
; CHECK-NEXT:    ret
  %a.ext = sext <vscale x 16 x i8> %a to <vscale x 16 x i64>
  %partial.reduce = tail call <vscale x 4 x i64> @llvm.experimental.vector.partial.reduce.add.nxv4i64.nxv16i64(<vscale x 4 x i64> %acc, <vscale x 16 x i64> %a.ext)
  ret <vscale x 4 x i64> %partial.reduce
}

define <vscale x 4 x i32> @not_udot(<vscale x 4 x i32> %acc, <vscale x 8 x i8> %a, <vscale x 8 x i8> %b) {
; CHECK-LABEL: not_udot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z1.h, z1.h, #0xff
; CHECK-NEXT:    and z2.h, z2.h, #0xff
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NEXT:    mla z0.s, p0/m, z3.s, z4.s
; CHECK-NEXT:    mla z0.s, p0/m, z1.s, z2.s
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i8> %a to <vscale x 8 x i32>
  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i32>
  %mult = mul nuw nsw <vscale x 8 x i32> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 4 x i32> @llvm.experimental.vector.partial.reduce.add.nxv4i32.nxv8i32(<vscale x 4 x i32> %acc, <vscale x 8 x i32> %mult)
  ret <vscale x 4 x i32> %partial.reduce
}

define <vscale x 2 x i64> @not_udot_wide(<vscale x 2 x i64> %acc, <vscale x 4 x i16> %a, <vscale x 4 x i16> %b) {
; CHECK-LABEL: not_udot_wide:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z1.s, z1.s, #0xffff
; CHECK-NEXT:    and z2.s, z2.s, #0xffff
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z3.d, z1.s
; CHECK-NEXT:    uunpklo z4.d, z2.s
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    uunpkhi z2.d, z2.s
; CHECK-NEXT:    mla z0.d, p0/m, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 4 x i16> %a to <vscale x 4 x i64>
  %b.wide = zext <vscale x 4 x i16> %b to <vscale x 4 x i64>
  %mult = mul nuw nsw <vscale x 4 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv4i64(<vscale x 2 x i64> %acc, <vscale x 4 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @not_usdot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: not_usdot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z5.d, z3.s
; CHECK-NEXT:    uunpkhi z3.d, z3.s
; CHECK-NEXT:    sunpklo z6.d, z4.s
; CHECK-NEXT:    sunpkhi z4.d, z4.s
; CHECK-NEXT:    uunpklo z7.d, z1.s
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    sunpklo z24.d, z2.s
; CHECK-NEXT:    sunpkhi z2.d, z2.s
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @not_sudot(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
; CHECK-LABEL: not_sudot:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sunpklo z3.s, z1.h
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    sunpklo z5.d, z3.s
; CHECK-NEXT:    sunpkhi z3.d, z3.s
; CHECK-NEXT:    uunpklo z6.d, z4.s
; CHECK-NEXT:    uunpkhi z4.d, z4.s
; CHECK-NEXT:    sunpklo z7.d, z1.s
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    uunpklo z24.d, z2.s
; CHECK-NEXT:    uunpkhi z2.d, z2.s
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i16> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @udot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
; CHECK-LABEL: udot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z2.h, z2.h, #0xff
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NEXT:    uunpklo z5.d, z3.s
; CHECK-NEXT:    uunpkhi z3.d, z3.s
; CHECK-NEXT:    uunpklo z7.d, z1.s
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    uunpklo z6.d, z4.s
; CHECK-NEXT:    uunpkhi z4.d, z4.s
; CHECK-NEXT:    uunpklo z24.d, z2.s
; CHECK-NEXT:    uunpkhi z2.d, z2.s
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @sdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
; CHECK-LABEL: sdot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    sunpklo z3.s, z1.h
; CHECK-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    sunpklo z5.d, z3.s
; CHECK-NEXT:    sunpkhi z3.d, z3.s
; CHECK-NEXT:    sunpklo z7.d, z1.s
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    sunpklo z6.d, z4.s
; CHECK-NEXT:    sunpkhi z4.d, z4.s
; CHECK-NEXT:    sunpklo z24.d, z2.s
; CHECK-NEXT:    sunpkhi z2.d, z2.s
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @usdot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
; CHECK-LABEL: usdot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    uunpklo z3.s, z1.h
; CHECK-NEXT:    uunpkhi z1.s, z1.h
; CHECK-NEXT:    sxtb z2.h, p0/m, z2.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z5.d, z3.s
; CHECK-NEXT:    uunpkhi z3.d, z3.s
; CHECK-NEXT:    uunpklo z7.d, z1.s
; CHECK-NEXT:    sunpklo z4.s, z2.h
; CHECK-NEXT:    sunpkhi z2.s, z2.h
; CHECK-NEXT:    uunpkhi z1.d, z1.s
; CHECK-NEXT:    sunpklo z6.d, z4.s
; CHECK-NEXT:    sunpkhi z4.d, z4.s
; CHECK-NEXT:    sunpklo z24.d, z2.s
; CHECK-NEXT:    sunpkhi z2.d, z2.s
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = sext <vscale x 8 x i8> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}

define <vscale x 2 x i64> @sudot_different_types(<vscale x 2 x i64> %acc, <vscale x 8 x i16> %a, <vscale x 8 x i8> %b){
; CHECK-LABEL: sudot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    and z2.h, z2.h, #0xff
; CHECK-NEXT:    sunpklo z3.s, z1.h
; CHECK-NEXT:    sunpkhi z1.s, z1.h
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    uunpklo z4.s, z2.h
; CHECK-NEXT:    uunpkhi z2.s, z2.h
; CHECK-NEXT:    sunpklo z5.d, z3.s
; CHECK-NEXT:    sunpkhi z3.d, z3.s
; CHECK-NEXT:    sunpklo z7.d, z1.s
; CHECK-NEXT:    sunpkhi z1.d, z1.s
; CHECK-NEXT:    uunpklo z6.d, z4.s
; CHECK-NEXT:    uunpkhi z4.d, z4.s
; CHECK-NEXT:    uunpklo z24.d, z2.s
; CHECK-NEXT:    uunpkhi z2.d, z2.s
; CHECK-NEXT:    mul z3.d, z3.d, z4.d
; CHECK-NEXT:    mla z0.d, p0/m, z5.d, z6.d
; CHECK-NEXT:    mla z0.d, p0/m, z1.d, z2.d
; CHECK-NEXT:    movprfx z1, z3
; CHECK-NEXT:    mla z1.d, p0/m, z7.d, z24.d
; CHECK-NEXT:    add z0.d, z1.d, z0.d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
  %b.wide = zext <vscale x 8 x i8> %b to <vscale x 8 x i64>
  %mult = mul nuw nsw <vscale x 8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <vscale x 2 x i64> @llvm.experimental.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> %acc, <vscale x 8 x i64> %mult)
  ret <vscale x 2 x i64> %partial.reduce
}