; xref: /llvm-project/llvm/test/CodeGen/AArch64/mla_mls_merge.ll (revision bd07c2e266f65acb0204198ae1a441bf10499cb2)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s
3
; test_mla0: sum of two unsigned widening multiplies (<8 x i8> -> <8 x i16>).
; The assertions below expect the add to be folded into the multiply pair:
; a umull of %c * %d followed by a umlal that accumulates %a * %b, with no
; separate vector add. Only the low four lanes of the sum are returned.
define <4 x i16> @test_mla0(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: test_mla0:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v2.8h, v2.8b, v3.8b
; CHECK-NEXT:    umlal v2.8h, v0.8b, v1.8b
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %c, <8 x i8> %d)
  %add.i = add <8 x i16> %vmull.i.i, %vmull.i
  ; Mask lanes 0-3 all select from the first operand; the second is never read.
  %shuffle.i = shufflevector <8 x i16> %add.i, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i16> %shuffle.i
}
18
19
; test_mla1: sum of two signed widening multiplies (<8 x i8> -> <8 x i16>).
; The assertions below expect the add to be folded into the multiply pair:
; an smull of %c * %d followed by an smlal that accumulates %a * %b, with no
; separate vector add. Only the low four lanes of the sum are returned.
define <4 x i16> @test_mla1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: test_mla1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v2.8h, v2.8b, v3.8b
; CHECK-NEXT:    smlal v2.8h, v0.8b, v1.8b
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %c, <8 x i8> %d)
  %add.i = add <8 x i16> %vmull.i.i, %vmull.i
  ; Mask lanes 0-3 all select from the first operand; the second is never read.
  %shuffle.i = shufflevector <8 x i16> %add.i, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i16> %shuffle.i
}
34
35
; test_mla2: sum of two unsigned widening multiplies (<4 x i16> -> <4 x i32>).
; The assertions below expect the add to be folded into the multiply pair:
; a umull of %c * %d followed by a umlal that accumulates %a * %b, with no
; separate vector add. Only the low two lanes of the sum are returned.
define <2 x i32> @test_mla2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: test_mla2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v2.4s, v2.4h, v3.4h
; CHECK-NEXT:    umlal v2.4s, v0.4h, v1.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %c, <4 x i16> %d)
  %add.i = add <4 x i32> %vmull2.i.i, %vmull2.i
  ; Mask lanes 0-1 both select from the first operand; the second is never read.
  %shuffle.i = shufflevector <4 x i32> %add.i, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %shuffle.i
}
50
51
; test_mla3: sum of two signed widening multiplies (<4 x i16> -> <4 x i32>).
; The assertions below expect the add to be folded into the multiply pair:
; an smull of %c * %d followed by an smlal that accumulates %a * %b, with no
; separate vector add. Only the low two lanes of the sum are returned.
define <2 x i32> @test_mla3(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: test_mla3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v2.4s, v2.4h, v3.4h
; CHECK-NEXT:    smlal v2.4s, v0.4h, v1.4h
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %c, <4 x i16> %d)
  %add.i = add <4 x i32> %vmull2.i.i, %vmull2.i
  ; Mask lanes 0-1 both select from the first operand; the second is never read.
  %shuffle.i = shufflevector <4 x i32> %add.i, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %shuffle.i
}
66
67
; test_mla4: sum of two unsigned widening multiplies (<2 x i32> -> <2 x i64>).
; The assertions below expect the add to be folded into the multiply pair:
; a umull of %c * %d followed by a umlal that accumulates %a * %b, with no
; separate vector add. Only lane 0 of the sum is returned.
define <1 x i64> @test_mla4(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: test_mla4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v2.2d, v2.2s, v3.2s
; CHECK-NEXT:    umlal v2.2d, v0.2s, v1.2s
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %c, <2 x i32> %d)
  %add.i = add <2 x i64> %vmull2.i.i, %vmull2.i
  ; The all-zero mask selects lane 0 of the first operand; the second is never read.
  %shuffle.i = shufflevector <2 x i64> %add.i, <2 x i64> poison, <1 x i32> zeroinitializer
  ret <1 x i64> %shuffle.i
}
82
83
; test_mla5: sum of two signed widening multiplies (<2 x i32> -> <2 x i64>).
; The assertions below expect the add to be folded into the multiply pair:
; an smull of %c * %d followed by an smlal that accumulates %a * %b, with no
; separate vector add. Only lane 0 of the sum is returned.
define <1 x i64> @test_mla5(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: test_mla5:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v2.2d, v2.2s, v3.2s
; CHECK-NEXT:    smlal v2.2d, v0.2s, v1.2s
; CHECK-NEXT:    fmov d0, d2
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %c, <2 x i32> %d)
  %add.i = add <2 x i64> %vmull2.i.i, %vmull2.i
  ; The all-zero mask selects lane 0 of the first operand; the second is never read.
  %shuffle.i = shufflevector <2 x i64> %add.i, <2 x i64> poison, <1 x i32> zeroinitializer
  ret <1 x i64> %shuffle.i
}
98
99
; test_mls0: difference of two unsigned widening multiplies
; (<8 x i8> -> <8 x i16>), computed as %a * %b minus %c * %d.
; The assertions below expect the sub to be folded into the multiply pair:
; a umull of %a * %b followed by a umlsl that subtracts %c * %d, with no
; separate vector sub. Only the low four lanes of the result are returned.
define <4 x i16> @test_mls0(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: test_mls0:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    umlsl v0.8h, v2.8b, v3.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %a, <8 x i8> %b)
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8> %c, <8 x i8> %d)
  %sub.i = sub <8 x i16> %vmull.i, %vmull.i.i
  ; Mask lanes 0-3 all select from the first operand; the second is never read.
  %shuffle.i = shufflevector <8 x i16> %sub.i, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i16> %shuffle.i
}
114
115
; test_mls1: difference of two signed widening multiplies
; (<8 x i8> -> <8 x i16>), computed as %a * %b minus %c * %d.
; The assertions below expect the sub to be folded into the multiply pair:
; an smull of %a * %b followed by an smlsl that subtracts %c * %d, with no
; separate vector sub. Only the low four lanes of the result are returned.
define <4 x i16> @test_mls1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: test_mls1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.8h, v0.8b, v1.8b
; CHECK-NEXT:    smlsl v0.8h, v2.8b, v3.8b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %a, <8 x i8> %b)
  %vmull.i.i = tail call <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8> %c, <8 x i8> %d)
  %sub.i = sub <8 x i16> %vmull.i, %vmull.i.i
  ; Mask lanes 0-3 all select from the first operand; the second is never read.
  %shuffle.i = shufflevector <8 x i16> %sub.i, <8 x i16> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  ret <4 x i16> %shuffle.i
}
130
131
; test_mls2: difference of two unsigned widening multiplies
; (<4 x i16> -> <4 x i32>), computed as %a * %b minus %c * %d.
; The assertions below expect the sub to be folded into the multiply pair:
; a umull of %a * %b followed by a umlsl that subtracts %c * %d, with no
; separate vector sub. Only the low two lanes of the result are returned.
define <2 x i32> @test_mls2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: test_mls2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    umlsl v0.4s, v2.4h, v3.4h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %a, <4 x i16> %b)
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16> %c, <4 x i16> %d)
  %sub.i = sub <4 x i32> %vmull2.i, %vmull2.i.i
  ; Mask lanes 0-1 both select from the first operand; the second is never read.
  %shuffle.i = shufflevector <4 x i32> %sub.i, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %shuffle.i
}
146
147
; test_mls3: difference of two signed widening multiplies
; (<4 x i16> -> <4 x i32>), computed as %a * %b minus %c * %d.
; The assertions below expect the sub to be folded into the multiply pair:
; an smull of %a * %b followed by an smlsl that subtracts %c * %d, with no
; separate vector sub. Only the low two lanes of the result are returned.
define <2 x i32> @test_mls3(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c, <4 x i16> %d) {
; CHECK-LABEL: test_mls3:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    smlsl v0.4s, v2.4h, v3.4h
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %a, <4 x i16> %b)
  %vmull2.i.i = tail call <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16> %c, <4 x i16> %d)
  %sub.i = sub <4 x i32> %vmull2.i, %vmull2.i.i
  ; Mask lanes 0-1 both select from the first operand; the second is never read.
  %shuffle.i = shufflevector <4 x i32> %sub.i, <4 x i32> poison, <2 x i32> <i32 0, i32 1>
  ret <2 x i32> %shuffle.i
}
162
163
; test_mls4: difference of two unsigned widening multiplies
; (<2 x i32> -> <2 x i64>), computed as %a * %b minus %c * %d.
; The assertions below expect the sub to be folded into the multiply pair:
; a umull of %a * %b followed by a umlsl that subtracts %c * %d, with no
; separate vector sub. Only lane 0 of the result is returned.
define <1 x i64> @test_mls4(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: test_mls4:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    umull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    umlsl v0.2d, v2.2s, v3.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %a, <2 x i32> %b)
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32> %c, <2 x i32> %d)
  %sub.i = sub <2 x i64> %vmull2.i, %vmull2.i.i
  ; The all-zero mask selects lane 0 of the first operand; the second is never read.
  %shuffle.i = shufflevector <2 x i64> %sub.i, <2 x i64> poison, <1 x i32> zeroinitializer
  ret <1 x i64> %shuffle.i
}
178
179
; test_mls5: difference of two signed widening multiplies
; (<2 x i32> -> <2 x i64>), computed as %a * %b minus %c * %d.
; The assertions below expect the sub to be folded into the multiply pair:
; an smull of %a * %b followed by an smlsl that subtracts %c * %d, with no
; separate vector sub. Only lane 0 of the result is returned.
define <1 x i64> @test_mls5(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c, <2 x i32> %d) {
; CHECK-LABEL: test_mls5:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    smlsl v0.2d, v2.2s, v3.2s
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
entry:
  %vmull2.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %a, <2 x i32> %b)
  %vmull2.i.i = tail call <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32> %c, <2 x i32> %d)
  %sub.i = sub <2 x i64> %vmull2.i, %vmull2.i.i
  ; The all-zero mask selects lane 0 of the first operand; the second is never read.
  %shuffle.i = shufflevector <2 x i64> %sub.i, <2 x i64> poison, <1 x i32> zeroinitializer
  ret <1 x i64> %shuffle.i
}
194
; Declarations of the AArch64 NEON widening-multiply intrinsics called by the
; tests in this file: unsigned (umull) and signed (smull) variants, each taking
; two vectors and returning a vector with the same lane count and lanes twice
; as wide (i8 -> i16, i16 -> i32, i32 -> i64).
declare <8 x i16> @llvm.aarch64.neon.umull.v8i16(<8 x i8>, <8 x i8>)

declare <8 x i16> @llvm.aarch64.neon.smull.v8i16(<8 x i8>, <8 x i8>)

declare <4 x i32> @llvm.aarch64.neon.umull.v4i32(<4 x i16>, <4 x i16>)

declare <4 x i32> @llvm.aarch64.neon.smull.v4i32(<4 x i16>, <4 x i16>)

declare <2 x i64> @llvm.aarch64.neon.umull.v2i64(<2 x i32>, <2 x i32>)

declare <2 x i64> @llvm.aarch64.neon.smull.v2i64(<2 x i32>, <2 x i32>)
206