; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-NOI8MM
; RUN: llc -mtriple aarch64 -mattr=+neon < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOI8MM,CHECK-NODOT
; RUN: llc -mtriple aarch64 -mattr=+neon,+dotprod,+i8mm < %s | FileCheck %s --check-prefixes=CHECK,CHECK-DOT,CHECK-I8MM

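; Test lowering of llvm.experimental.vector.partial.reduce.add with i8 inputs
; that are widened (and optionally multiplied) to i32 or i64: with +dotprod the
; reductions should select udot/sdot, with +i8mm additionally usdot, and
; without those features they fall back to widening multiply/add sequences.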
define <4 x i32> @udot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-DOT-LABEL: udot:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    udot v0.4s, v2.16b, v1.16b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    umull v3.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT:    umull2 v1.8h, v2.16b, v1.16b
; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v3.4h
; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v2.4s, v3.8h
; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT:    ret
  %u.wide = zext <16 x i8> %u to <16 x i32>
  %s.wide = zext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

define <2 x i32> @udot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-DOT-LABEL: udot_narrow:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    udot v0.2s, v2.8b, v1.8b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_narrow:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    umull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT:    uaddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT:    ret
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

define <4 x i32> @sdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-DOT-LABEL: sdot:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    sdot v0.4s, v2.16b, v1.16b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    smull v3.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT:    smull2 v1.8h, v2.16b, v1.16b
; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v3.4h
; CHECK-NODOT-NEXT:    saddw2 v2.4s, v2.4s, v3.8h
; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT:    ret
  %u.wide = sext <16 x i8> %u to <16 x i32>
  %s.wide = sext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

define <2 x i32> @sdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) {
; CHECK-DOT-LABEL: sdot_narrow:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    sdot v0.2s, v2.8b, v1.8b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_narrow:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    smull v1.8h, v2.8b, v1.8b
; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT:    saddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT:    ret
  %u.wide = sext <8 x i8> %u to <8 x i32>
  %s.wide = sext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

define <4 x i32> @usdot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) {
; CHECK-NOI8MM-LABEL: usdot:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    ushll v3.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
; CHECK-NOI8MM-NEXT:    smull v5.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    smlal2 v5.4s, v4.8h, v3.8h
; CHECK-NOI8MM-NEXT:    add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: usdot:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.4s, v1.16b, v2.16b
; CHECK-I8MM-NEXT:    ret
  %u.wide = zext <16 x i8> %u to <16 x i32>
  %s.wide = sext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

define <2 x i32> @usdot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0 {
; CHECK-NOI8MM-LABEL: usdot_narrow:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    sshll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smull2 v4.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NOI8MM-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v3.4s, v6.4h, v5.4h
; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: usdot_narrow:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.2s, v1.8b, v2.8b
; CHECK-I8MM-NEXT:    ret
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = sext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

define <4 x i32> @sudot(<4 x i32> %acc, <16 x i8> %u, <16 x i8> %s) #0 {
; CHECK-NOI8MM-LABEL: sudot:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    sshll v3.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v1.8h, v1.16b, #0
; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v4.4h, v3.4h
; CHECK-NOI8MM-NEXT:    smull v5.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smlal2 v0.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    smlal2 v5.4s, v4.8h, v3.8h
; CHECK-NOI8MM-NEXT:    add v0.4s, v5.4s, v0.4s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: sudot:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.4s, v2.16b, v1.16b
; CHECK-I8MM-NEXT:    ret
  %u.wide = sext <16 x i8> %u to <16 x i32>
  %s.wide = zext <16 x i8> %s to <16 x i32>
  %mult = mul nuw nsw <16 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

define <2 x i32> @sudot_narrow(<2 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0 {
; CHECK-NOI8MM-LABEL: sudot_narrow:
; CHECK-NOI8MM:       // %bb.0:
; CHECK-NOI8MM-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-NOI8MM-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NOI8MM-NEXT:    smull v3.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    smull2 v4.4s, v2.8h, v1.8h
; CHECK-NOI8MM-NEXT:    ext v5.16b, v1.16b, v1.16b, #8
; CHECK-NOI8MM-NEXT:    ext v6.16b, v2.16b, v2.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v0.4s, v2.4h, v1.4h
; CHECK-NOI8MM-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NOI8MM-NEXT:    ext v1.16b, v4.16b, v4.16b, #8
; CHECK-NOI8MM-NEXT:    smlal v3.4s, v6.4h, v5.4h
; CHECK-NOI8MM-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NOI8MM-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: sudot_narrow:
; CHECK-I8MM:       // %bb.0:
; CHECK-I8MM-NEXT:    usdot v0.2s, v2.8b, v1.8b
; CHECK-I8MM-NEXT:    ret
  %u.wide = sext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

define <4 x i64> @udot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-DOT-LABEL: udot_8to64:
; CHECK-DOT:       // %bb.0: // %entry
; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_8to64:
; CHECK-NODOT:       // %bb.0: // %entry
; CHECK-NODOT-NEXT:    umull v4.8h, v2.8b, v3.8b
; CHECK-NODOT-NEXT:    umull2 v2.8h, v2.16b, v3.16b
; CHECK-NODOT-NEXT:    ushll v3.4s, v4.4h, #0
; CHECK-NODOT-NEXT:    ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT:    ushll2 v4.4s, v4.8h, #0
; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v3.4s
; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v3.2s
; CHECK-NODOT-NEXT:    uaddl2 v3.2d, v4.4s, v5.4s
; CHECK-NODOT-NEXT:    uaddl v4.2d, v4.2s, v5.2s
; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT:    add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT:    add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT:    ret
entry:
  %a.wide = zext <16 x i8> %a to <16 x i64>
  %b.wide = zext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
}

define <4 x i64> @sdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-DOT-LABEL: sdot_8to64:
; CHECK-DOT:       // %bb.0: // %entry
; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_8to64:
; CHECK-NODOT:       // %bb.0: // %entry
; CHECK-NODOT-NEXT:    smull v4.8h, v2.8b, v3.8b
; CHECK-NODOT-NEXT:    smull2 v2.8h, v2.16b, v3.16b
; CHECK-NODOT-NEXT:    sshll v3.4s, v4.4h, #0
; CHECK-NODOT-NEXT:    sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT:    sshll2 v4.4s, v4.8h, #0
; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v3.4s
; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v3.2s
; CHECK-NODOT-NEXT:    saddl2 v3.2d, v4.4s, v5.4s
; CHECK-NODOT-NEXT:    saddl v4.2d, v4.2s, v5.2s
; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT:    add v1.2d, v3.2d, v1.2d
; CHECK-NODOT-NEXT:    add v0.2d, v4.2d, v0.2d
; CHECK-NODOT-NEXT:    ret
entry:
  %a.wide = sext <16 x i8> %a to <16 x i64>
  %b.wide = sext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
}

define <4 x i64> @usdot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-NOI8MM-LABEL: usdot_8to64:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    ushll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    sshll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT:    ushll v6.4s, v4.4h, #0
; CHECK-NOI8MM-NEXT:    sshll v7.4s, v5.4h, #0
; CHECK-NOI8MM-NEXT:    ushll2 v4.4s, v4.8h, #0
; CHECK-NOI8MM-NEXT:    sshll2 v5.4s, v5.8h, #0
; CHECK-NOI8MM-NEXT:    ushll2 v16.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT:    sshll2 v17.4s, v3.8h, #0
; CHECK-NOI8MM-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-NOI8MM-NEXT:    sshll v3.4s, v3.4h, #0
; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v6.4s, v7.4s
; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v7.2s
; CHECK-NOI8MM-NEXT:    smull v18.2d, v4.2s, v5.2s
; CHECK-NOI8MM-NEXT:    smull2 v4.2d, v4.4s, v5.4s
; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v16.4s, v17.4s
; CHECK-NOI8MM-NEXT:    smlal v0.2d, v16.2s, v17.2s
; CHECK-NOI8MM-NEXT:    smlal2 v4.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT:    smlal v18.2d, v2.2s, v3.2s
; CHECK-NOI8MM-NEXT:    add v1.2d, v4.2d, v1.2d
; CHECK-NOI8MM-NEXT:    add v0.2d, v18.2d, v0.2d
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: usdot_8to64:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
; CHECK-I8MM-NEXT:    usdot v4.4s, v2.16b, v3.16b
; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-I8MM-NEXT:    ret
entry:
  %a.wide = zext <16 x i8> %a to <16 x i64>
  %b.wide = sext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
}

define <4 x i64> @sudot_8to64(<4 x i64> %acc, <16 x i8> %a, <16 x i8> %b) {
; CHECK-NOI8MM-LABEL: sudot_8to64:
; CHECK-NOI8MM:       // %bb.0: // %entry
; CHECK-NOI8MM-NEXT:    sshll v4.8h, v2.8b, #0
; CHECK-NOI8MM-NEXT:    ushll v5.8h, v3.8b, #0
; CHECK-NOI8MM-NEXT:    sshll2 v2.8h, v2.16b, #0
; CHECK-NOI8MM-NEXT:    ushll2 v3.8h, v3.16b, #0
; CHECK-NOI8MM-NEXT:    sshll v6.4s, v4.4h, #0
; CHECK-NOI8MM-NEXT:    ushll v7.4s, v5.4h, #0
; CHECK-NOI8MM-NEXT:    sshll2 v4.4s, v4.8h, #0
; CHECK-NOI8MM-NEXT:    ushll2 v5.4s, v5.8h, #0
; CHECK-NOI8MM-NEXT:    sshll2 v16.4s, v2.8h, #0
; CHECK-NOI8MM-NEXT:    ushll2 v17.4s, v3.8h, #0
; CHECK-NOI8MM-NEXT:    sshll v2.4s, v2.4h, #0
; CHECK-NOI8MM-NEXT:    ushll v3.4s, v3.4h, #0
; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v6.4s, v7.4s
; CHECK-NOI8MM-NEXT:    smlal v0.2d, v6.2s, v7.2s
; CHECK-NOI8MM-NEXT:    smull v18.2d, v4.2s, v5.2s
; CHECK-NOI8MM-NEXT:    smull2 v4.2d, v4.4s, v5.4s
; CHECK-NOI8MM-NEXT:    smlal2 v1.2d, v16.4s, v17.4s
; CHECK-NOI8MM-NEXT:    smlal v0.2d, v16.2s, v17.2s
; CHECK-NOI8MM-NEXT:    smlal2 v4.2d, v2.4s, v3.4s
; CHECK-NOI8MM-NEXT:    smlal v18.2d, v2.2s, v3.2s
; CHECK-NOI8MM-NEXT:    add v1.2d, v4.2d, v1.2d
; CHECK-NOI8MM-NEXT:    add v0.2d, v18.2d, v0.2d
; CHECK-NOI8MM-NEXT:    ret
;
; CHECK-I8MM-LABEL: sudot_8to64:
; CHECK-I8MM:       // %bb.0: // %entry
; CHECK-I8MM-NEXT:    movi v4.2d, #0000000000000000
; CHECK-I8MM-NEXT:    usdot v4.4s, v3.16b, v2.16b
; CHECK-I8MM-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-I8MM-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-I8MM-NEXT:    ret
entry:
  %a.wide = sext <16 x i8> %a to <16 x i64>
  %b.wide = zext <16 x i8> %b to <16 x i64>
  %mult = mul nuw nsw <16 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(
  <4 x i64> %acc, <16 x i64> %mult)
  ret <4 x i64> %partial.reduce
}

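; A partial reduction of a plain zero/sign-extended input (no multiply) can
; still use [us]dot by multiplying with a constant all-ones vector.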
define <4 x i32> @udot_no_bin_op(<4 x i32> %acc, <16 x i8> %a) {
; CHECK-DOT-LABEL: udot_no_bin_op:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v2.16b, #1
; CHECK-DOT-NEXT:    udot v0.4s, v1.16b, v2.16b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_no_bin_op:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    ushll v2.8h, v1.8b, #0
; CHECK-NODOT-NEXT:    ushll2 v1.8h, v1.16b, #0
; CHECK-NODOT-NEXT:    ushll v3.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v2.4h
; CHECK-NODOT-NEXT:    uaddw2 v2.4s, v3.4s, v2.8h
; CHECK-NODOT-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT:    ret
  %a.wide = zext <16 x i8> %a to <16 x i32>
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
  ret <4 x i32> %partial.reduce
}

define <4 x i32> @sdot_no_bin_op(<4 x i32> %acc, <16 x i8> %a) {
; CHECK-DOT-LABEL: sdot_no_bin_op:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v2.16b, #1
; CHECK-DOT-NEXT:    sdot v0.4s, v1.16b, v2.16b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_no_bin_op:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    sshll v2.8h, v1.8b, #0
; CHECK-NODOT-NEXT:    sshll2 v1.8h, v1.16b, #0
; CHECK-NODOT-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v2.4h
; CHECK-NODOT-NEXT:    saddw2 v2.4s, v3.4s, v2.8h
; CHECK-NODOT-NEXT:    saddw2 v0.4s, v0.4s, v1.8h
; CHECK-NODOT-NEXT:    add v0.4s, v2.4s, v0.4s
; CHECK-NODOT-NEXT:    ret
  %a.wide = sext <16 x i8> %a to <16 x i32>
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> %acc, <16 x i32> %a.wide)
  ret <4 x i32> %partial.reduce
}

define <2 x i32> @udot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a) {
; CHECK-DOT-LABEL: udot_no_bin_op_narrow:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v2.8b, #1
; CHECK-DOT-NEXT:    udot v0.2s, v1.8b, v2.8b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_no_bin_op_narrow:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    ushll v1.8h, v1.8b, #0
; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT:    ushll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    ushll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT:    uaddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT:    ret
  %a.wide = zext <8 x i8> %a to <8 x i32>
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
  ret <2 x i32> %partial.reduce
}

define <2 x i32> @sdot_no_bin_op_narrow(<2 x i32> %acc, <8 x i8> %a) {
; CHECK-DOT-LABEL: sdot_no_bin_op_narrow:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v2.8b, #1
; CHECK-DOT-NEXT:    sdot v0.2s, v1.8b, v2.8b
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_no_bin_op_narrow:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    sshll v1.8h, v1.8b, #0
; CHECK-NODOT-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NODOT-NEXT:    sshll v2.4s, v1.4h, #0
; CHECK-NODOT-NEXT:    sshll2 v3.4s, v1.8h, #0
; CHECK-NODOT-NEXT:    ext v4.16b, v1.16b, v1.16b, #8
; CHECK-NODOT-NEXT:    saddw v0.4s, v0.4s, v1.4h
; CHECK-NODOT-NEXT:    ext v3.16b, v3.16b, v3.16b, #8
; CHECK-NODOT-NEXT:    ext v2.16b, v2.16b, v2.16b, #8
; CHECK-NODOT-NEXT:    add v0.2s, v3.2s, v0.2s
; CHECK-NODOT-NEXT:    saddw v1.4s, v2.4s, v4.4h
; CHECK-NODOT-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NODOT-NEXT:    ret
  %a.wide = sext <8 x i8> %a to <8 x i32>
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> %acc, <8 x i32> %a.wide)
  ret <2 x i32> %partial.reduce
}

define <4 x i64> @udot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a) {
; CHECK-DOT-LABEL: udot_no_bin_op_8to64:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v3.16b, #1
; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT:    udot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: udot_no_bin_op_8to64:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    ushll v3.8h, v2.8b, #0
; CHECK-NODOT-NEXT:    ushll2 v2.8h, v2.16b, #0
; CHECK-NODOT-NEXT:    ushll v4.4s, v3.4h, #0
; CHECK-NODOT-NEXT:    ushll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT:    ushll2 v3.4s, v3.8h, #0
; CHECK-NODOT-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v4.4s
; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v4.2s
; CHECK-NODOT-NEXT:    uaddl2 v4.2d, v3.4s, v5.4s
; CHECK-NODOT-NEXT:    uaddl v3.2d, v3.2s, v5.2s
; CHECK-NODOT-NEXT:    uaddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT:    uaddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
; CHECK-NODOT-NEXT:    ret
  %a.wide = zext <16 x i8> %a to <16 x i64>
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
  ret <4 x i64> %partial.reduce
}

define <4 x i64> @sdot_no_bin_op_8to64(<4 x i64> %acc, <16 x i8> %a) {
; CHECK-DOT-LABEL: sdot_no_bin_op_8to64:
; CHECK-DOT:       // %bb.0:
; CHECK-DOT-NEXT:    movi v3.16b, #1
; CHECK-DOT-NEXT:    movi v4.2d, #0000000000000000
; CHECK-DOT-NEXT:    sdot v4.4s, v2.16b, v3.16b
; CHECK-DOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-DOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-DOT-NEXT:    ret
;
; CHECK-NODOT-LABEL: sdot_no_bin_op_8to64:
; CHECK-NODOT:       // %bb.0:
; CHECK-NODOT-NEXT:    sshll v3.8h, v2.8b, #0
; CHECK-NODOT-NEXT:    sshll2 v2.8h, v2.16b, #0
; CHECK-NODOT-NEXT:    sshll v4.4s, v3.4h, #0
; CHECK-NODOT-NEXT:    sshll v5.4s, v2.4h, #0
; CHECK-NODOT-NEXT:    sshll2 v3.4s, v3.8h, #0
; CHECK-NODOT-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v4.4s
; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v4.2s
; CHECK-NODOT-NEXT:    saddl2 v4.2d, v3.4s, v5.4s
; CHECK-NODOT-NEXT:    saddl v3.2d, v3.2s, v5.2s
; CHECK-NODOT-NEXT:    saddw2 v1.2d, v1.2d, v2.4s
; CHECK-NODOT-NEXT:    saddw v0.2d, v0.2d, v2.2s
; CHECK-NODOT-NEXT:    add v1.2d, v4.2d, v1.2d
; CHECK-NODOT-NEXT:    add v0.2d, v3.2d, v0.2d
; CHECK-NODOT-NEXT:    ret
  %a.wide = sext <16 x i8> %a to <16 x i64>
  %partial.reduce = tail call <4 x i64> @llvm.experimental.vector.partial.reduce.add.v4i64.v16i64(<4 x i64> %acc, <16 x i64> %a.wide)
  ret <4 x i64> %partial.reduce
}

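; The remaining cases are not expected to select a single dot-product
; instruction (the element counts or operand types do not match), so the
; generic widening lowering is used for every feature combination.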
define <4 x i32> @not_udot(<4 x i32> %acc, <8 x i8> %u, <8 x i8> %s) #0 {
; CHECK-LABEL: not_udot:
; CHECK:       // %bb.0:
; CHECK-NEXT:    umull v1.8h, v2.8b, v1.8b
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    uaddw2 v0.4s, v0.4s, v1.8h
; CHECK-NEXT:    ret
  %u.wide = zext <8 x i8> %u to <8 x i32>
  %s.wide = zext <8 x i8> %s to <8 x i32>
  %mult = mul nuw nsw <8 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v8i32(<4 x i32> %acc, <8 x i32> %mult)
  ret <4 x i32> %partial.reduce
}

define <2 x i32> @not_udot_narrow(<2 x i32> %acc, <4 x i8> %u, <4 x i8> %s) {
; CHECK-LABEL: not_udot_narrow:
; CHECK:       // %bb.0:
; CHECK-NEXT:    bic v1.4h, #255, lsl #8
; CHECK-NEXT:    bic v2.4h, #255, lsl #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    umull v3.4s, v2.4h, v1.4h
; CHECK-NEXT:    umlal v0.4s, v2.4h, v1.4h
; CHECK-NEXT:    ext v1.16b, v3.16b, v3.16b, #8
; CHECK-NEXT:    add v0.2s, v1.2s, v0.2s
; CHECK-NEXT:    ret
  %u.wide = zext <4 x i8> %u to <4 x i32>
  %s.wide = zext <4 x i8> %s to <4 x i32>
  %mult = mul nuw nsw <4 x i32> %s.wide, %u.wide
  %partial.reduce = tail call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v4i32(<2 x i32> %acc, <4 x i32> %mult)
  ret <2 x i32> %partial.reduce
}

define <2 x i64> @udot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: udot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-NEXT:    umull v5.2d, v1.2s, v2.2s
; CHECK-NEXT:    umlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT:    umlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    umlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <8 x i16> %a to <8 x i64>
  %b.wide = zext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
}

define <2 x i64> @sdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: sdot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <8 x i16> %a to <8 x i64>
  %b.wide = sext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
}

define <2 x i64> @usdot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: usdot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    sshll v2.8h, v2.8b, #0
; CHECK-NEXT:    ushll v3.4s, v1.4h, #0
; CHECK-NEXT:    ushll2 v1.4s, v1.8h, #0
; CHECK-NEXT:    sshll v4.4s, v2.4h, #0
; CHECK-NEXT:    sshll2 v2.4s, v2.8h, #0
; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
; CHECK-NEXT:    ret
entry:
  %a.wide = zext <8 x i16> %a to <8 x i64>
  %b.wide = sext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
}

define <2 x i64> @sudot_different_types(<2 x i64> %acc, <8 x i16> %a, <8 x i8> %b) {
; CHECK-LABEL: sudot_different_types:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NEXT:    sshll v3.4s, v1.4h, #0
; CHECK-NEXT:    sshll2 v1.4s, v1.8h, #0
; CHECK-NEXT:    ushll v4.4s, v2.4h, #0
; CHECK-NEXT:    ushll2 v2.4s, v2.8h, #0
; CHECK-NEXT:    smull v5.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v0.2d, v3.2s, v4.2s
; CHECK-NEXT:    smlal2 v0.2d, v1.4s, v2.4s
; CHECK-NEXT:    smlal2 v5.2d, v3.4s, v4.4s
; CHECK-NEXT:    add v0.2d, v5.2d, v0.2d
; CHECK-NEXT:    ret
entry:
  %a.wide = sext <8 x i16> %a to <8 x i64>
  %b.wide = zext <8 x i8> %b to <8 x i64>
  %mult = mul nuw nsw <8 x i64> %a.wide, %b.wide
  %partial.reduce = tail call <2 x i64> @llvm.experimental.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> %acc, <8 x i64> %mult)
  ret <2 x i64> %partial.reduce
}