xref: /llvm-project/llvm/test/CodeGen/ARM/arm-matmul.ll (revision 7da19051253219d4bee2c50fe13f250201f1f7ec)
; RUN: llc -mtriple=arm-none-linux-gnu -mattr=+neon,+i8mm -float-abi=hard < %s -o - | FileCheck %s
2*7da19051SLuke Geeson
; Verifies that a call to the llvm.arm.neon.smmla intrinsic with a <4 x i32>
; accumulator and two <16 x i8> operands is selected as one vsmmla.s8 on
; q registers.
define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
entry:
; The label pattern ends in ':' so it matches the function's label line only.
; CHECK-LABEL: smmla.v4i32.v16i8:
; CHECK:        vsmmla.s8       q0, q1, q2
  %vmmla1.i = tail call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
  ret <4 x i32> %vmmla1.i
}
10*7da19051SLuke Geeson
; Verifies that a call to the llvm.arm.neon.ummla intrinsic with a <4 x i32>
; accumulator and two <16 x i8> operands is selected as one vummla.u8 on
; q registers.
define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
entry:
; The label pattern ends in ':' so it matches the function's label line only.
; CHECK-LABEL: ummla.v4i32.v16i8:
; CHECK:        vummla.u8       q0, q1, q2
  %vmmla1.i = tail call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
  ret <4 x i32> %vmmla1.i
}
18*7da19051SLuke Geeson
; Verifies that a call to the llvm.arm.neon.usmmla intrinsic with a <4 x i32>
; accumulator and two <16 x i8> operands is selected as one vusmmla.s8 on
; q registers.
define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
entry:
; The label pattern ends in ':' so it matches the function's label line only.
; CHECK-LABEL: usmmla.v4i32.v16i8:
; CHECK:        vusmmla.s8       q0, q1, q2
  %vusmmla1.i = tail call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
  ret <4 x i32> %vusmmla1.i
}
26*7da19051SLuke Geeson
; Verifies that a call to the llvm.arm.neon.usdot intrinsic with a <2 x i32>
; accumulator and two <8 x i8> operands is selected as the vector form of
; vusdot.s8 on d registers.
define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
entry:
; The label pattern ends in ':' so it matches the function's label line only.
; CHECK-LABEL: usdot.v2i32.v8i8:
; CHECK:        vusdot.s8       d0, d1, d2
  %vusdot1.i = tail call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
  ret <2 x i32> %vusdot1.i
}
34*7da19051SLuke Geeson
; Verifies that llvm.arm.neon.usdot whose last operand is a splat of 32-bit
; element 0 of %b is selected as the indexed form vusdot.s8 ..., d2[0].
define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
entry:
; The label pattern ends in ':' so it matches the function's label line only.
; CHECK-LABEL: usdot_lane.v2i32.v8i8:
; CHECK:        vusdot.s8       d0, d1, d2[0]
  ; View %b as two i32 elements, broadcast element 0 to both positions (all-zero
  ; shuffle mask), then view the result as bytes again.
  %0 = bitcast <8 x i8> %b to <2 x i32>
  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
  %vusdot1.i = tail call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
  ret <2 x i32> %vusdot1.i
}
45*7da19051SLuke Geeson
; Verifies that llvm.arm.neon.usdot called with the splat of 32-bit element 0
; of %b as its FIRST multiplicand and %a as its second is selected as the
; indexed vsudot.u8 ..., d2[0]. Only the usdot intrinsic is called here; the
; sudot instruction is expected purely from the swapped operand order.
define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
entry:
; The label pattern ends in ':' so it matches the function's label line only.
; CHECK-LABEL: sudot_lane.v2i32.v8i8:
; CHECK:        vsudot.u8       d0, d1, d2[0]
  ; View %b as two i32 elements, broadcast element 0 to both positions (all-zero
  ; shuffle mask), then view the result as bytes again.
  %0 = bitcast <8 x i8> %b to <2 x i32>
  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
  ; Operands are deliberately swapped relative to usdot_lane: splat first, %a second.
  %vsudot1.i = tail call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a)
  ret <2 x i32> %vsudot1.i
}
56*7da19051SLuke Geeson
; Verifies that the 128-bit llvm.arm.neon.usdot whose last operand is a
; four-way splat of 32-bit element 0 of the 64-bit vector %b is selected as the
; indexed form vusdot.s8 q0, q1, d4[0].
define <4 x i32> @usdotq_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
entry:
; The label pattern ends in ':' so it matches the function's label line only.
; CHECK-LABEL: usdotq_lane.v4i32.v16i8:
; CHECK:        vusdot.s8       q0, q1, d4[0]
  ; View %b as two i32 elements, broadcast element 0 into a four-element vector
  ; (all-zero shuffle mask), then view the result as sixteen bytes.
  %0 = bitcast <8 x i8> %b to <2 x i32>
  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
  %vusdot1.i = tail call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1)
  ret <4 x i32> %vusdot1.i
}
67*7da19051SLuke Geeson
; Verifies that the 128-bit llvm.arm.neon.usdot called with a four-way splat of
; 32-bit element 0 of %b as its FIRST multiplicand and %a as its second is
; selected as the indexed vsudot.u8 q0, q1, d4[0]. Only the usdot intrinsic is
; called here; the sudot instruction is expected purely from the swapped
; operand order.
define <4 x i32> @sudotq_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) {
entry:
; The label pattern ends in ':' so it matches the function's label line only.
; CHECK-LABEL: sudotq_lane.v4i32.v16i8:
; CHECK:        vsudot.u8       q0, q1, d4[0]
  ; View %b as two i32 elements, broadcast element 0 into a four-element vector
  ; (all-zero shuffle mask), then view the result as sixteen bytes.
  %0 = bitcast <8 x i8> %b to <2 x i32>
  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer
  %1 = bitcast <4 x i32> %shuffle to <16 x i8>
  ; Operands are deliberately swapped relative to usdotq_lane: splat first, %a second.
  %vsudot1.i = tail call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a)
  ret <4 x i32> %vsudot1.i
}
78*7da19051SLuke Geeson
; Declarations of the ARM NEON intrinsics called by the test functions in this
; file: three 128-bit matrix multiply-accumulate variants (smmla, ummla,
; usmmla) and the 64-bit and 128-bit overloads of usdot. Each takes the
; accumulator first, followed by the two byte-vector operands.
declare <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
declare <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
declare <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
declare <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
declare <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
84