xref: /llvm-project/llvm/test/CodeGen/AArch64/neon-dot-product.ll (revision fbba818a78f591d89f25768ba31783714d526532)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple aarch64-none-linux-gnu -mattr=+dotprod    < %s | FileCheck %s
3; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65   < %s | FileCheck %s
4; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=cortex-a65ae < %s | FileCheck %s
5; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-e1  < %s | FileCheck %s
6; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n1  < %s | FileCheck %s
7; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2  < %s | FileCheck %s
8; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1      < %s | FileCheck %s
9; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1a     < %s | FileCheck %s
10; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1b     < %s | FileCheck %s
11
; Declarations of the AArch64 NEON dot-product intrinsics called by the tests
; in this file: unsigned (udot) and signed (sdot), each in a
; <2 x i32>/<8 x i8> form and a <4 x i32>/<16 x i8> form. Each takes three
; vector operands; the first operand has the same type as the result.
12declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
13declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
14declare <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
15declare <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
16
; Unsigned dot product, <2 x i32> result with <8 x i8> inputs: passes %a, %b
; and %c straight to the udot intrinsic and returns the result. The expected
; assembly is a single vector-form `udot v0.2s, v1.8b, v2.8b` followed by ret.
17define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
18; CHECK-LABEL: test_vdot_u32:
19; CHECK:       // %bb.0: // %entry
20; CHECK-NEXT:    udot v0.2s, v1.8b, v2.8b
21; CHECK-NEXT:    ret
22entry:
23  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
24  ret <2 x i32> %vdot1.i
25}
26
; Unsigned dot product, <4 x i32> result with <16 x i8> inputs: passes %a, %b
; and %c straight to the udot intrinsic and returns the result. The expected
; assembly is a single vector-form `udot v0.4s, v1.16b, v2.16b` followed by ret.
27define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
28; CHECK-LABEL: test_vdotq_u32:
29; CHECK:       // %bb.0: // %entry
30; CHECK-NEXT:    udot v0.4s, v1.16b, v2.16b
31; CHECK-NEXT:    ret
32entry:
33  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
34  ret <4 x i32> %vdot1.i
35}
36
; Signed dot product, <2 x i32> result with <8 x i8> inputs: passes %a, %b
; and %c straight to the sdot intrinsic and returns the result. The expected
; assembly is a single vector-form `sdot v0.2s, v1.8b, v2.8b` followed by ret.
37define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
38; CHECK-LABEL: test_vdot_s32:
39; CHECK:       // %bb.0: // %entry
40; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.8b
41; CHECK-NEXT:    ret
42entry:
43  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
44  ret <2 x i32> %vdot1.i
45}
46
; Signed dot product, <4 x i32> result with <16 x i8> inputs: passes %a, %b
; and %c straight to the sdot intrinsic and returns the result. The expected
; assembly is a single vector-form `sdot v0.4s, v1.16b, v2.16b` followed by ret.
47define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
48; CHECK-LABEL: test_vdotq_s32:
49; CHECK:       // %bb.0: // %entry
50; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.16b
51; CHECK-NEXT:    ret
52entry:
53  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
54  ret <4 x i32> %vdot1.i
55}
56
57
; Unsigned <2 x i32> dot product written as "udot with a zero first operand,
; then add %a". The expected assembly contains only one `udot` writing v0 and
; no separate add, i.e. the add of %a is expected to be folded into the
; instruction's accumulator operand.
58define <2 x i32> @test_vdot_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
59; CHECK-LABEL: test_vdot_u32_zero:
60; CHECK:       // %bb.0: // %entry
61; CHECK-NEXT:    udot v0.2s, v1.8b, v2.8b
62; CHECK-NEXT:    ret
63entry:
64  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
65  %ret = add <2 x i32> %vdot1.i, %a
66  ret <2 x i32> %ret
67}
68
; Unsigned <4 x i32> dot product written as "udot with a zero first operand,
; then add %a". The expected assembly contains only one `udot` writing v0 and
; no separate add, i.e. the add of %a is expected to be folded into the
; instruction's accumulator operand.
69define <4 x i32> @test_vdotq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
70; CHECK-LABEL: test_vdotq_u32_zero:
71; CHECK:       // %bb.0: // %entry
72; CHECK-NEXT:    udot v0.4s, v1.16b, v2.16b
73; CHECK-NEXT:    ret
74entry:
75  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
76  %ret = add <4 x i32> %vdot1.i, %a
77  ret <4 x i32> %ret
78}
79
; Signed <2 x i32> dot product written as "sdot with a zero first operand,
; then add %a". The expected assembly contains only one `sdot` writing v0 and
; no separate add, i.e. the add of %a is expected to be folded into the
; instruction's accumulator operand.
80define <2 x i32> @test_vdot_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
81; CHECK-LABEL: test_vdot_s32_zero:
82; CHECK:       // %bb.0: // %entry
83; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.8b
84; CHECK-NEXT:    ret
85entry:
86  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
87  %ret = add <2 x i32> %vdot1.i, %a
88  ret <2 x i32> %ret
89}
90
; Signed <4 x i32> dot product written as "sdot with a zero first operand,
; then add %a". The expected assembly contains only one `sdot` writing v0 and
; no separate add, i.e. the add of %a is expected to be folded into the
; instruction's accumulator operand.
91define <4 x i32> @test_vdotq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
92; CHECK-LABEL: test_vdotq_s32_zero:
93; CHECK:       // %bb.0: // %entry
94; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.16b
95; CHECK-NEXT:    ret
96entry:
97  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
98  %ret = add <4 x i32> %vdot1.i, %a
99  ret <4 x i32> %ret
100}
101
102
; Unsigned <2 x i32> dot product whose third operand is one 32-bit group of
; the 64-bit vector %c broadcast to every lane. The expected assembly uses the
; by-element form `udot v0.2s, v1.8b, v2.4b[1]` with no separate dup/shuffle.
; The expected output also carries a `kill` annotation re-defining $d2 as $q2
; (presumably because %c arrives as a 64-bit vector -- confirm).
103define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
104; CHECK-LABEL: test_vdot_lane_u32:
105; CHECK:       // %bb.0: // %entry
106; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
107; CHECK-NEXT:    udot v0.2s, v1.8b, v2.4b[1]
108; CHECK-NEXT:    ret
109entry:
  ; View %c as two i32 elements, splat element 1 into both lanes, then view
  ; the result as bytes again for the intrinsic call.
110  %.cast = bitcast <8 x i8> %c to <2 x i32>
111  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
112  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
113  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
114  ret <2 x i32> %vdot1.i
115}
116
; Unsigned <4 x i32> dot product whose third operand is one 32-bit group of
; the 64-bit vector %c broadcast to all four lanes. The expected assembly uses
; the by-element form `udot v0.4s, v1.16b, v2.4b[1]` with no separate
; dup/shuffle. The expected output also carries a `kill` annotation
; re-defining $d2 as $q2 (presumably because %c arrives as a 64-bit vector --
; confirm).
117define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
118; CHECK-LABEL: test_vdotq_lane_u32:
119; CHECK:       // %bb.0: // %entry
120; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
121; CHECK-NEXT:    udot v0.4s, v1.16b, v2.4b[1]
122; CHECK-NEXT:    ret
123entry:
  ; View %c as two i32 elements, splat element 1 into a four-element vector,
  ; then view the result as 16 bytes for the intrinsic call.
124  %.cast = bitcast <8 x i8> %c to <2 x i32>
125  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
126  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
127  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
128  ret <4 x i32> %vdot1.i
129}
130
; Unsigned <2 x i32> dot product whose third operand is one 32-bit group of
; the 128-bit vector %c broadcast to both lanes. The expected assembly uses
; the by-element form `udot v0.2s, v1.8b, v2.4b[1]` with no separate
; dup/shuffle.
131define <2 x i32> @test_vdot_laneq_u32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
132; CHECK-LABEL: test_vdot_laneq_u32:
133; CHECK:       // %bb.0: // %entry
134; CHECK-NEXT:    udot v0.2s, v1.8b, v2.4b[1]
135; CHECK-NEXT:    ret
136entry:
  ; View %c as four i32 elements, splat element 1 into a two-element vector,
  ; then view the result as 8 bytes for the intrinsic call.
137  %.cast = bitcast <16 x i8> %c to <4 x i32>
138  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
139  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
140  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
141  ret <2 x i32> %vdot1.i
142}
143
; Unsigned <4 x i32> dot product whose third operand is one 32-bit group of
; the 128-bit vector %c broadcast to all four lanes. The expected assembly
; uses the by-element form `udot v0.4s, v1.16b, v2.4b[1]` with no separate
; dup/shuffle.
144define <4 x i32> @test_vdotq_laneq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
145; CHECK-LABEL: test_vdotq_laneq_u32:
146; CHECK:       // %bb.0: // %entry
147; CHECK-NEXT:    udot v0.4s, v1.16b, v2.4b[1]
148; CHECK-NEXT:    ret
149entry:
  ; View %c as four i32 elements, splat element 1 into all four lanes, then
  ; view the result as 16 bytes for the intrinsic call.
150  %.cast = bitcast <16 x i8> %c to <4 x i32>
151  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
152  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
153  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
154  ret <4 x i32> %vdot1.i
155}
156
157
; Unsigned <2 x i32> by-element dot product combined with a zero first
; operand: the third operand is element 1 of %c (viewed as <2 x i32>) splatted,
; the intrinsic is called with zeroinitializer, and %a is added afterwards.
; The expected assembly is a single `udot v0.2s, v1.8b, v2.4b[1]` with neither
; a separate shuffle nor a separate add. The expected output also carries a
; `kill` annotation re-defining $d2 as $q2 (presumably because %c arrives as a
; 64-bit vector -- confirm).
158define <2 x i32> @test_vdot_lane_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
159; CHECK-LABEL: test_vdot_lane_u32_zero:
160; CHECK:       // %bb.0: // %entry
161; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
162; CHECK-NEXT:    udot v0.2s, v1.8b, v2.4b[1]
163; CHECK-NEXT:    ret
164entry:
165  %.cast = bitcast <8 x i8> %c to <2 x i32>
166  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
167  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
168  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
169  %ret = add <2 x i32> %vdot1.i, %a
170  ret <2 x i32> %ret
171}
172
; Unsigned <4 x i32> by-element dot product combined with a zero first
; operand: the third operand is element 1 of %c (viewed as <2 x i32>) splatted
; to four lanes, the intrinsic is called with zeroinitializer, and %a is added
; afterwards. The expected assembly is a single `udot v0.4s, v1.16b, v2.4b[1]`
; with neither a separate shuffle nor a separate add. The expected output also
; carries a `kill` annotation re-defining $d2 as $q2 (presumably because %c
; arrives as a 64-bit vector -- confirm).
173define <4 x i32> @test_vdotq_lane_u32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
174; CHECK-LABEL: test_vdotq_lane_u32_zero:
175; CHECK:       // %bb.0: // %entry
176; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
177; CHECK-NEXT:    udot v0.4s, v1.16b, v2.4b[1]
178; CHECK-NEXT:    ret
179entry:
180  %.cast = bitcast <8 x i8> %c to <2 x i32>
181  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
182  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
183  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
184  %ret = add <4 x i32> %vdot1.i, %a
185  ret <4 x i32> %ret
186}
187
; Unsigned <2 x i32> by-element dot product combined with a zero first
; operand: the third operand is element 1 of the 128-bit %c (viewed as
; <4 x i32>) splatted to two lanes, the intrinsic is called with
; zeroinitializer, and %a is added afterwards. The expected assembly is a
; single `udot v0.2s, v1.8b, v2.4b[1]` with neither a separate shuffle nor a
; separate add.
188define <2 x i32> @test_vdot_laneq_u32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
189; CHECK-LABEL: test_vdot_laneq_u32_zero:
190; CHECK:       // %bb.0: // %entry
191; CHECK-NEXT:    udot v0.2s, v1.8b, v2.4b[1]
192; CHECK-NEXT:    ret
193entry:
194  %.cast = bitcast <16 x i8> %c to <4 x i32>
195  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
196  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
197  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
198  %ret = add <2 x i32> %vdot1.i, %a
199  ret <2 x i32> %ret
200}
201
; Unsigned <4 x i32> by-element dot product combined with a zero first
; operand: the third operand is element 1 of the 128-bit %c (viewed as
; <4 x i32>) splatted to all four lanes, the intrinsic is called with
; zeroinitializer, and %a is added afterwards. The expected assembly is a
; single `udot v0.4s, v1.16b, v2.4b[1]` with neither a separate shuffle nor a
; separate add.
202define <4 x i32> @test_vdotq_laneq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
203; CHECK-LABEL: test_vdotq_laneq_u32_zero:
204; CHECK:       // %bb.0: // %entry
205; CHECK-NEXT:    udot v0.4s, v1.16b, v2.4b[1]
206; CHECK-NEXT:    ret
207entry:
208  %.cast = bitcast <16 x i8> %c to <4 x i32>
209  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
210  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
211  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
212  %ret = add <4 x i32> %vdot1.i, %a
213  ret <4 x i32> %ret
214}
215
216
; Signed <2 x i32> dot product whose third operand is one 32-bit group of the
; 64-bit vector %c broadcast to every lane. The expected assembly uses the
; by-element form `sdot v0.2s, v1.8b, v2.4b[1]` with no separate dup/shuffle.
; The expected output also carries a `kill` annotation re-defining $d2 as $q2
; (presumably because %c arrives as a 64-bit vector -- confirm).
217define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
218; CHECK-LABEL: test_vdot_lane_s32:
219; CHECK:       // %bb.0: // %entry
220; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
221; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.4b[1]
222; CHECK-NEXT:    ret
223entry:
  ; View %c as two i32 elements, splat element 1 into both lanes, then view
  ; the result as bytes again for the intrinsic call.
224  %.cast = bitcast <8 x i8> %c to <2 x i32>
225  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
226  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
227  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
228  ret <2 x i32> %vdot1.i
229}
230
; Signed <4 x i32> dot product whose third operand is one 32-bit group of the
; 64-bit vector %c broadcast to all four lanes. The expected assembly uses the
; by-element form `sdot v0.4s, v1.16b, v2.4b[1]` with no separate dup/shuffle.
; The expected output also carries a `kill` annotation re-defining $d2 as $q2
; (presumably because %c arrives as a 64-bit vector -- confirm).
231define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
232; CHECK-LABEL: test_vdotq_lane_s32:
233; CHECK:       // %bb.0: // %entry
234; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
235; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.4b[1]
236; CHECK-NEXT:    ret
237entry:
  ; View %c as two i32 elements, splat element 1 into a four-element vector,
  ; then view the result as 16 bytes for the intrinsic call.
238  %.cast = bitcast <8 x i8> %c to <2 x i32>
239  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
240  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
241  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
242  ret <4 x i32> %vdot1.i
243}
244
; Signed <2 x i32> dot product whose third operand is one 32-bit group of the
; 128-bit vector %c broadcast to both lanes. The expected assembly uses the
; by-element form `sdot v0.2s, v1.8b, v2.4b[1]` with no separate dup/shuffle.
245define <2 x i32> @test_vdot_laneq_s32(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
246; CHECK-LABEL: test_vdot_laneq_s32:
247; CHECK:       // %bb.0: // %entry
248; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.4b[1]
249; CHECK-NEXT:    ret
250entry:
  ; View %c as four i32 elements, splat element 1 into a two-element vector,
  ; then view the result as 8 bytes for the intrinsic call.
251  %.cast = bitcast <16 x i8> %c to <4 x i32>
252  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
253  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
254  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
255  ret <2 x i32> %vdot1.i
256}
257
; Signed <4 x i32> dot product whose third operand is one 32-bit group of the
; 128-bit vector %c broadcast to all four lanes. The expected assembly uses
; the by-element form `sdot v0.4s, v1.16b, v2.4b[1]` with no separate
; dup/shuffle.
258define <4 x i32> @test_vdotq_laneq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
259; CHECK-LABEL: test_vdotq_laneq_s32:
260; CHECK:       // %bb.0: // %entry
261; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.4b[1]
262; CHECK-NEXT:    ret
263entry:
  ; View %c as four i32 elements, splat element 1 into all four lanes, then
  ; view the result as 16 bytes for the intrinsic call.
264  %.cast = bitcast <16 x i8> %c to <4 x i32>
265  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
266  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
267  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
268  ret <4 x i32> %vdot1.i
269}
270
271
; Signed <2 x i32> by-element dot product combined with a zero first operand:
; the third operand is element 1 of %c (viewed as <2 x i32>) splatted, the
; intrinsic is called with zeroinitializer, and %a is added afterwards. The
; expected assembly is a single `sdot v0.2s, v1.8b, v2.4b[1]` with neither a
; separate shuffle nor a separate add. The expected output also carries a
; `kill` annotation re-defining $d2 as $q2 (presumably because %c arrives as a
; 64-bit vector -- confirm).
272define <2 x i32> @test_vdot_lane_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
273; CHECK-LABEL: test_vdot_lane_s32_zero:
274; CHECK:       // %bb.0: // %entry
275; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
276; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.4b[1]
277; CHECK-NEXT:    ret
278entry:
279  %.cast = bitcast <8 x i8> %c to <2 x i32>
280  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
281  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
282  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
283  %ret = add <2 x i32> %vdot1.i, %a
284  ret <2 x i32> %ret
285}
286
; Signed <4 x i32> by-element dot product combined with a zero first operand:
; the third operand is element 1 of %c (viewed as <2 x i32>) splatted to four
; lanes, the intrinsic is called with zeroinitializer, and %a is added
; afterwards. The expected assembly is a single `sdot v0.4s, v1.16b, v2.4b[1]`
; with neither a separate shuffle nor a separate add. The expected output also
; carries a `kill` annotation re-defining $d2 as $q2 (presumably because %c
; arrives as a 64-bit vector -- confirm).
287define <4 x i32> @test_vdotq_lane_s32_zero(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
288; CHECK-LABEL: test_vdotq_lane_s32_zero:
289; CHECK:       // %bb.0: // %entry
290; CHECK-NEXT:    // kill: def $d2 killed $d2 def $q2
291; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.4b[1]
292; CHECK-NEXT:    ret
293entry:
294  %.cast = bitcast <8 x i8> %c to <2 x i32>
295  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
296  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
297  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
298  %ret = add <4 x i32> %vdot1.i, %a
299  ret <4 x i32> %ret
300}
301
; Signed <2 x i32> by-element dot product combined with a zero first operand:
; the third operand is element 1 of the 128-bit %c (viewed as <4 x i32>)
; splatted to two lanes, the intrinsic is called with zeroinitializer, and %a
; is added afterwards. The expected assembly is a single
; `sdot v0.2s, v1.8b, v2.4b[1]` with neither a separate shuffle nor a separate
; add.
302define <2 x i32> @test_vdot_laneq_s32_zero(<2 x i32> %a, <8 x i8> %b, <16 x i8> %c) {
303; CHECK-LABEL: test_vdot_laneq_s32_zero:
304; CHECK:       // %bb.0: // %entry
305; CHECK-NEXT:    sdot v0.2s, v1.8b, v2.4b[1]
306; CHECK-NEXT:    ret
307entry:
308  %.cast = bitcast <16 x i8> %c to <4 x i32>
309  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <2 x i32> <i32 1, i32 1>
310  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
311  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %.cast5) #2
312  %ret = add <2 x i32> %vdot1.i, %a
313  ret <2 x i32> %ret
314}
315
; Signed <4 x i32> by-element dot product combined with a zero first operand:
; the third operand is element 1 of the 128-bit %c (viewed as <4 x i32>)
; splatted to all four lanes, the intrinsic is called with zeroinitializer,
; and %a is added afterwards. The expected assembly is a single
; `sdot v0.4s, v1.16b, v2.4b[1]` with neither a separate shuffle nor a
; separate add.
316define <4 x i32> @test_vdotq_laneq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) {
317; CHECK-LABEL: test_vdotq_laneq_s32_zero:
318; CHECK:       // %bb.0: // %entry
319; CHECK-NEXT:    sdot v0.4s, v1.16b, v2.4b[1]
320; CHECK-NEXT:    ret
321entry:
322  %.cast = bitcast <16 x i8> %c to <4 x i32>
323  %shuffle = shufflevector <4 x i32> %.cast, <4 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
324  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
325  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %.cast3) #2
326  %ret = add <4 x i32> %vdot1.i, %a
327  ret <4 x i32> %ret
328}
329