; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=aarch64 | FileCheck %s

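; This file tests lowering of widening multiply-add patterns (extend, mul,
; add, then trunc or mask) to smull/smlal/uaddw, and that lowerMUL keeps
; chain (and value) information correct for load nodes it creates; see
; func1 and func2 at the end of the file.
; A typical invocation to regenerate the CHECK lines (the llc path here is
; an example; point --llc-binary at your own build):
;   llvm/utils/update_llc_test_checks.py --llc-binary=<build-dir>/bin/llc \
;     llvm/test/CodeGen/AArch64/lowerMUL-newload.ll

; In the three mlai16 tests below, only the low 16 bits of each 32-bit lane
; survive (via trunc or an 0xffff mask), so sext + mul + add is selected as
; smull + uaddw + xtn. uaddw (a zero-extending add) is fine even though the
; IR sign-extends %vec2: the low 16 bits of a sum depend only on the low 16
; bits of its operands.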
define <4 x i16> @mlai16_trunc(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: mlai16_trunc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    ret
entry:
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = mul <4 x i32> %v1, %v0
  %v4 = add <4 x i32> %v3, %v2
  %v5 = trunc <4 x i32> %v4 to <4 x i16>
  ret <4 x i16> %v5
}

define <4 x i32> @mlai16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: mlai16_and:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
; CHECK-NEXT:    movi v1.2d, #0x00ffff0000ffff
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = mul <4 x i32> %v1, %v0
  %v4 = add <4 x i32> %v3, %v2
  %v5 = and <4 x i32> %v4, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %v5
}

define void @mlai16_loadstore(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: mlai16_loadstore:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x0, #16]
; CHECK-NEXT:    ldr d1, [x1, #16]
; CHECK-NEXT:    smull v0.4s, v1.4h, v0.4h
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x0, #16]
; CHECK-NEXT:    ret
entry:
  %scevgep0 = getelementptr i16, ptr %a, i32 8
  %vec0 = load <4 x i16>, ptr %scevgep0, align 8
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %scevgep1 = getelementptr i16, ptr %b, i32 8
  %vec1 = load <4 x i16>, ptr %scevgep1, align 8
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %scevgep2 = getelementptr i16, ptr %c, i32 8
  %vec2 = load <4 x i16>, ptr %scevgep2, align 8
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = mul <4 x i32> %v1, %v0
  %v4 = add <4 x i32> %v3, %v2
  %v5 = trunc <4 x i32> %v4 to <4 x i16>
  %scevgep3 = getelementptr i16, ptr %a, i32 8
  store <4 x i16> %v5, ptr %scevgep3, align 8
  ret void
}

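; In the addmuli16 tests below the add feeds the multiply instead:
; (%v1 + %v0) * %v2. The multiply distributes over the add,
; (v1 + v0) * v2 == v1 * v2 + v0 * v2, which lets both products use the
; narrow-source smull/smlal forms rather than a full-width mul.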
define <4 x i16> @addmuli16_trunc(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: addmuli16_trunc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v1.4s, v1.4h, v2.4h
; CHECK-NEXT:    smlal v1.4s, v0.4h, v2.4h
; CHECK-NEXT:    xtn v0.4h, v1.4s
; CHECK-NEXT:    ret
entry:
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = add <4 x i32> %v1, %v0
  %v4 = mul <4 x i32> %v3, %v2
  %v5 = trunc <4 x i32> %v4 to <4 x i16>
  ret <4 x i16> %v5
}

define <4 x i32> @addmuli16_and(<4 x i16> %vec0, <4 x i16> %vec1, <4 x i16> %vec2) {
; CHECK-LABEL: addmuli16_and:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v1.4s, v1.4h, v2.4h
; CHECK-NEXT:    smlal v1.4s, v0.4h, v2.4h
; CHECK-NEXT:    movi v0.2d, #0x00ffff0000ffff
; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-NEXT:    ret
entry:
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = add <4 x i32> %v1, %v0
  %v4 = mul <4 x i32> %v3, %v2
  %v5 = and <4 x i32> %v4, <i32 65535, i32 65535, i32 65535, i32 65535>
  ret <4 x i32> %v5
}

define void @addmuli16_loadstore(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: addmuli16_loadstore:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x1, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    ldr d2, [x0, #16]
; CHECK-NEXT:    smull v0.4s, v0.4h, v1.4h
; CHECK-NEXT:    smlal v0.4s, v2.4h, v1.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x0, #16]
; CHECK-NEXT:    ret
entry:
  %scevgep0 = getelementptr i16, ptr %a, i32 8
  %vec0 = load <4 x i16>, ptr %scevgep0, align 8
  %v0 = sext <4 x i16> %vec0 to <4 x i32>
  %scevgep1 = getelementptr i16, ptr %b, i32 8
  %vec1 = load <4 x i16>, ptr %scevgep1, align 8
  %v1 = sext <4 x i16> %vec1 to <4 x i32>
  %scevgep2 = getelementptr i16, ptr %c, i32 8
  %vec2 = load <4 x i16>, ptr %scevgep2, align 8
  %v2 = sext <4 x i16> %vec2 to <4 x i32>
  %v3 = add <4 x i32> %v1, %v0
  %v4 = mul <4 x i32> %v3, %v2
  %v5 = trunc <4 x i32> %v4 to <4 x i16>
  %scevgep3 = getelementptr i16, ptr %a, i32 8
  store <4 x i16> %v5, ptr %scevgep3, align 8
  ret void
}

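; The mlai32/addmuli32 tests below repeat the same patterns with 32-bit
; elements widening to 64 bits; the expected lowering is analogous, using
; the .2d forms of smull, uaddw, smlal, and xtn.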
define <2 x i32> @mlai32_trunc(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: mlai32_trunc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.2d, v1.2s, v0.2s
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v2.2s
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    ret
entry:
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = mul <2 x i64> %v1, %v0
  %v4 = add <2 x i64> %v3, %v2
  %v5 = trunc <2 x i64> %v4 to <2 x i32>
  ret <2 x i32> %v5
}

define <2 x i64> @mlai32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: mlai32_and:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v0.2d, v1.2s, v0.2s
; CHECK-NEXT:    movi v1.2d, #0x000000ffffffff
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v2.2s
; CHECK-NEXT:    and v0.16b, v0.16b, v1.16b
; CHECK-NEXT:    ret
entry:
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = mul <2 x i64> %v1, %v0
  %v4 = add <2 x i64> %v3, %v2
  %v5 = and <2 x i64> %v4, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %v5
}

define void @mlai32_loadstore(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: mlai32_loadstore:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x0, #32]
; CHECK-NEXT:    ldr d1, [x1, #32]
; CHECK-NEXT:    smull v0.2d, v1.2s, v0.2s
; CHECK-NEXT:    ldr d1, [x2, #32]
; CHECK-NEXT:    uaddw v0.2d, v0.2d, v1.2s
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    str d0, [x0, #32]
; CHECK-NEXT:    ret
entry:
  %scevgep0 = getelementptr i32, ptr %a, i32 8
  %vec0 = load <2 x i32>, ptr %scevgep0, align 8
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %scevgep1 = getelementptr i32, ptr %b, i32 8
  %vec1 = load <2 x i32>, ptr %scevgep1, align 8
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %scevgep2 = getelementptr i32, ptr %c, i32 8
  %vec2 = load <2 x i32>, ptr %scevgep2, align 8
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = mul <2 x i64> %v1, %v0
  %v4 = add <2 x i64> %v3, %v2
  %v5 = trunc <2 x i64> %v4 to <2 x i32>
  %scevgep3 = getelementptr i32, ptr %a, i32 8
  store <2 x i32> %v5, ptr %scevgep3, align 8
  ret void
}

define <2 x i32> @addmuli32_trunc(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: addmuli32_trunc:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v1.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v1.2d, v0.2s, v2.2s
; CHECK-NEXT:    xtn v0.2s, v1.2d
; CHECK-NEXT:    ret
entry:
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = add <2 x i64> %v1, %v0
  %v4 = mul <2 x i64> %v3, %v2
  %v5 = trunc <2 x i64> %v4 to <2 x i32>
  ret <2 x i32> %v5
}

define <2 x i64> @addmuli32_and(<2 x i32> %vec0, <2 x i32> %vec1, <2 x i32> %vec2) {
; CHECK-LABEL: addmuli32_and:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    smull v1.2d, v1.2s, v2.2s
; CHECK-NEXT:    smlal v1.2d, v0.2s, v2.2s
; CHECK-NEXT:    movi v0.2d, #0x000000ffffffff
; CHECK-NEXT:    and v0.16b, v1.16b, v0.16b
; CHECK-NEXT:    ret
entry:
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = add <2 x i64> %v1, %v0
  %v4 = mul <2 x i64> %v3, %v2
  %v5 = and <2 x i64> %v4, <i64 4294967295, i64 4294967295>
  ret <2 x i64> %v5
}

define void @addmuli32_loadstore(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: addmuli32_loadstore:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x1, #32]
; CHECK-NEXT:    ldr d1, [x2, #32]
; CHECK-NEXT:    ldr d2, [x0, #32]
; CHECK-NEXT:    smull v0.2d, v0.2s, v1.2s
; CHECK-NEXT:    smlal v0.2d, v2.2s, v1.2s
; CHECK-NEXT:    xtn v0.2s, v0.2d
; CHECK-NEXT:    str d0, [x0, #32]
; CHECK-NEXT:    ret
entry:
  %scevgep0 = getelementptr i32, ptr %a, i32 8
  %vec0 = load <2 x i32>, ptr %scevgep0, align 8
  %v0 = sext <2 x i32> %vec0 to <2 x i64>
  %scevgep1 = getelementptr i32, ptr %b, i32 8
  %vec1 = load <2 x i32>, ptr %scevgep1, align 8
  %v1 = sext <2 x i32> %vec1 to <2 x i64>
  %scevgep2 = getelementptr i32, ptr %c, i32 8
  %vec2 = load <2 x i32>, ptr %scevgep2, align 8
  %v2 = sext <2 x i32> %vec2 to <2 x i64>
  %v3 = add <2 x i64> %v1, %v0
  %v4 = mul <2 x i64> %v3, %v2
  %v5 = trunc <2 x i64> %v4 to <2 x i32>
  %scevgep3 = getelementptr i32, ptr %a, i32 8
  store <2 x i32> %v5, ptr %scevgep3, align 8
  ret void
}

define void @func1(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: func1:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x2, #16]
; CHECK-NEXT:    ldr d1, [x1, #16]
; CHECK-NEXT:    ldr d2, [x0, #16]
; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    xtn v1.4h, v0.4s
; CHECK-NEXT:    str d1, [x0, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    xtn v1.4h, v0.4s
; CHECK-NEXT:    str d1, [x1, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x0, #16]
; CHECK-NEXT:    ret
entry:
; This test case tries to vectorize the pseudo code below.
; a[i] = b[i] + c[i];
; b[i] = a[i] * c[i];
; a[i] = b[i] + a[i] * c[i];
; Checking that the vector load of a[i] for "a[i] = b[i] + a[i] * c[i]" is
; scheduled before the first vector store to a[i], the one for
; "a[i] = b[i] + c[i]".
; Checking that no vector load of a[i] is scheduled between the two vector
; stores to a[i]; otherwise the load of a[i] would be polluted by the first
; vector store to a[i].
; This test case checks that the chain information is updated during
; lowerMUL for the newly created Load SDNode.
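; An approximate scalar sketch matching the IR below: the load of a[i] is
; hoisted above both stores, so the last statement uses the pre-store value
; of a[i] (the name aOld is illustrative, not from the source):
;   aOld = a[i];
;   a[i] = b[i] + c[i];
;   b[i] = a[i] * c[i];
;   a[i] = b[i] + aOld * c[i];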

  %scevgep0 = getelementptr i16, ptr %a, i32 8
  %vec0 = load <4 x i16>, ptr %scevgep0, align 8
  %scevgep1 = getelementptr i16, ptr %b, i32 8
  %vec1 = load <4 x i16>, ptr %scevgep1, align 8
  %0 = zext <4 x i16> %vec1 to <4 x i32>
  %scevgep2 = getelementptr i16, ptr %c, i32 8
  %vec2 = load <4 x i16>, ptr %scevgep2, align 8
  %1 = sext <4 x i16> %vec2 to <4 x i32>
  %vec3 = add <4 x i32> %1, %0
  %2 = trunc <4 x i32> %vec3 to <4 x i16>
  %scevgep3 = getelementptr i16, ptr %a, i32 8
  store <4 x i16> %2, ptr %scevgep3, align 8
  %vec4 = load <4 x i16>, ptr %scevgep2, align 8
  %3 = sext <4 x i16> %vec4 to <4 x i32>
  %vec5 = mul <4 x i32> %3, %vec3
  %4 = trunc <4 x i32> %vec5 to <4 x i16>
  store <4 x i16> %4, ptr %scevgep1, align 8
  %5 = sext <4 x i16> %vec0 to <4 x i32>
  %vec6 = load <4 x i16>, ptr %scevgep2, align 8
  %6 = sext <4 x i16> %vec6 to <4 x i32>
  %vec7 = mul <4 x i32> %6, %5
  %vec8 = add <4 x i32> %vec7, %vec5
  %7 = trunc <4 x i32> %vec8 to <4 x i16>
  store <4 x i16> %7, ptr %scevgep3, align 8
  ret void
}

define void @func2(ptr %a, ptr %b, ptr %c) {
; CHECK-LABEL: func2:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ldr d0, [x2, #16]
; CHECK-NEXT:    ldr d1, [x1, #16]
; CHECK-NEXT:    ldr d2, [x0, #16]
; CHECK-NEXT:    sshll v0.4s, v0.4h, #0
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v1.4h
; CHECK-NEXT:    xtn v1.4h, v0.4s
; CHECK-NEXT:    str d1, [x0, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    sshll v1.4s, v1.4h, #0
; CHECK-NEXT:    mul v0.4s, v1.4s, v0.4s
; CHECK-NEXT:    xtn v1.4h, v0.4s
; CHECK-NEXT:    str d1, [x1, #16]
; CHECK-NEXT:    ldr d1, [x2, #16]
; CHECK-NEXT:    smlal v0.4s, v1.4h, v2.4h
; CHECK-NEXT:    uaddw v0.4s, v0.4s, v2.4h
; CHECK-NEXT:    xtn v0.4h, v0.4s
; CHECK-NEXT:    str d0, [x0, #16]
; CHECK-NEXT:    ret
entry:
; This test case tries to vectorize the pseudo code below.
; a[i] = b[i] + c[i];
; b[i] = a[i] * c[i];
; a[i] = b[i] + a[i] * c[i] + a[i];
; Checking that the vector load of a[i] for
; "a[i] = b[i] + a[i] * c[i] + a[i]" is scheduled before the first vector
; store to a[i], the one for "a[i] = b[i] + c[i]".
; Checking that no vector load of a[i] is scheduled between the first
; vector store to a[i] and the vector add of a[i]; otherwise the load of
; a[i] would be polluted by the first vector store to a[i].
; This test case checks that both the chain and the value of the newly
; created Load SDNode are updated during lowerMUL.
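; Same sketch as func1, except the last statement also adds the pre-store
; value of a[i] (again, aOld is illustrative only):
;   aOld = a[i];
;   a[i] = b[i] + c[i];
;   b[i] = a[i] * c[i];
;   a[i] = b[i] + aOld * c[i] + aOld;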

  %scevgep0 = getelementptr i16, ptr %a, i32 8
  %vec0 = load <4 x i16>, ptr %scevgep0, align 8
  %scevgep1 = getelementptr i16, ptr %b, i32 8
  %vec1 = load <4 x i16>, ptr %scevgep1, align 8
  %0 = zext <4 x i16> %vec1 to <4 x i32>
  %scevgep2 = getelementptr i16, ptr %c, i32 8
  %vec2 = load <4 x i16>, ptr %scevgep2, align 8
  %1 = sext <4 x i16> %vec2 to <4 x i32>
  %vec3 = add <4 x i32> %1, %0
  %2 = trunc <4 x i32> %vec3 to <4 x i16>
  %scevgep3 = getelementptr i16, ptr %a, i32 8
  store <4 x i16> %2, ptr %scevgep3, align 8
  %vec4 = load <4 x i16>, ptr %scevgep2, align 8
  %3 = sext <4 x i16> %vec4 to <4 x i32>
  %vec5 = mul <4 x i32> %3, %vec3
  %4 = trunc <4 x i32> %vec5 to <4 x i16>
  store <4 x i16> %4, ptr %scevgep1, align 8
  %5 = sext <4 x i16> %vec0 to <4 x i32>
  %vec6 = load <4 x i16>, ptr %scevgep2, align 8
  %6 = sext <4 x i16> %vec6 to <4 x i32>
  %vec7 = mul <4 x i32> %6, %5
  %vec8 = add <4 x i32> %vec7, %vec5
  %vec9 = add <4 x i32> %vec8, %5
  %7 = trunc <4 x i32> %vec9 to <4 x i16>
  store <4 x i16> %7, ptr %scevgep3, align 8
  ret void
}