; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
; RUN: llc < %s -verify-machineinstrs -mtriple=arm64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s

; FIXME: We should not generate ld/st for these register spills/fills: the test
; cases are simple and register pressure is low. If the spill/fill heuristics
; are improved, these cases may no longer trigger spills, at which point this
; test can be deleted.
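; Check the spill/fill of a D-register pair loaded by ld2 (two <2 x i32> values) across the call to @foo.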
define i32 @spill.DPairReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DPairReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld2 { v0.2s, v1.2s }, [x0]
; CHECK-NEXT:    cbz w1, .LBB0_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    mov w0, v0.s[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB0_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #48
; CHECK-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #48
; CHECK-NEXT:    mov w0, v0.s[1]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <2 x i32>, <2 x i32> } %vld, 0
  %res = extractelement <2 x i32> %vld.extract, i32 1
  ret i32 %res
}

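; Check the spill/fill of a D-register triple loaded by ld3 (three <4 x i16> values) across the call to @foo.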
define i16 @spill.DTripleReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DTripleReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld3 { v0.4h, v1.4h, v2.4h }, [x0]
; CHECK-NEXT:    cbz w1, .LBB1_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    umov w0, v0.h[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB1_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #64
; CHECK-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 64
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d, v2.2d }, [x8] // 48-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d, v2.2d }, [x8] // 48-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #64
; CHECK-NEXT:    umov w0, v0.h[1]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0
  %res = extractelement <4 x i16> %vld.extract, i32 1
  ret i16 %res
}

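; Check the spill/fill of a D-register quad loaded by ld4 (four <4 x i16> values) across the call to @foo.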
define i16 @spill.DQuadReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.DQuadReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld4 { v0.4h, v1.4h, v2.4h, v3.4h }, [x0]
; CHECK-NEXT:    cbz w1, .LBB2_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    umov w0, v0.h[0]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB2_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #80
; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 80
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x8] // 64-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x8] // 64-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #80
; CHECK-NEXT:    umov w0, v0.h[0]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld, 0
  %res = extractelement <4 x i16> %vld.extract, i32 0
  ret i16 %res
}

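; Check the spill/fill of a Q-register pair loaded by ld2 (two <4 x i32> values) across the call to @foo.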
define i32 @spill.QPairReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QPairReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld2 { v0.4s, v1.4s }, [x0]
; CHECK-NEXT:    cbz w1, .LBB3_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    mov w0, v0.s[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB3_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #48
; CHECK-NEXT:    stp x29, x30, [sp, #32] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 48
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #32] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d }, [x8] // 32-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #48
; CHECK-NEXT:    mov w0, v0.s[1]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <4 x i32>, <4 x i32> } %vld, 0
  %res = extractelement <4 x i32> %vld.extract, i32 1
  ret i32 %res
}

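; Check the spill/fill of a Q-register triple loaded by ld3 (three <4 x float> values) across the call to @foo.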
define float @spill.QTripleReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QTripleReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld3 { v0.4s, v1.4s, v2.4s }, [x0]
; CHECK-NEXT:    cbz w1, .LBB4_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    mov s0, v0.s[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB4_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #64
; CHECK-NEXT:    stp x29, x30, [sp, #48] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 64
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d, v2.2d }, [x8] // 48-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #48] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d, v2.2d }, [x8] // 48-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #64
; CHECK-NEXT:    mov s0, v0.s[1]
; CHECK-NEXT:    ret
entry:
  %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
  %res = extractelement <4 x float> %vld3.extract, i32 1
  ret float %res
}

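; Check the spill/fill of a Q-register quad loaded by ld4 (four <16 x i8> values) across the call to @foo.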
define i8 @spill.QQuadReg(ptr %arg1, i32 %arg2) {
; CHECK-LABEL: spill.QQuadReg:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ld4 { v0.16b, v1.16b, v2.16b, v3.16b }, [x0]
; CHECK-NEXT:    cbz w1, .LBB5_2
; CHECK-NEXT:  // %bb.1: // %if.end
; CHECK-NEXT:    umov w0, v0.b[1]
; CHECK-NEXT:    ret
; CHECK-NEXT:  .LBB5_2: // %if.then
; CHECK-NEXT:    sub sp, sp, #80
; CHECK-NEXT:    stp x29, x30, [sp, #64] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 80
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    st1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x8] // 64-byte Folded Spill
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    mov x8, sp
; CHECK-NEXT:    ldp x29, x30, [sp, #64] // 16-byte Folded Reload
; CHECK-NEXT:    ld1 { v0.2d, v1.2d, v2.2d, v3.2d }, [x8] // 64-byte Folded Reload
; CHECK-NEXT:    add sp, sp, #80
; CHECK-NEXT:    umov w0, v0.b[1]
; CHECK-NEXT:    ret
entry:
  %vld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr %arg1)
  %cmp = icmp eq i32 %arg2, 0
  br i1 %cmp, label %if.then, label %if.end

if.then:
  tail call void @foo()
  br label %if.end

if.end:
  %vld.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld, 0
  %res = extractelement <16 x i8> %vld.extract, i32 1
  ret i8 %res
}

declare { <2 x i32>, <2 x i32> } @llvm.aarch64.neon.ld2.v2i32.p0(ptr)
declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld3.v4i16.p0(ptr)
declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.aarch64.neon.ld4.v4i16.p0(ptr)
declare { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr)
declare { <4 x float>, <4 x float>, <4 x float> } @llvm.aarch64.neon.ld3.v4f32.p0(ptr)
declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld4.v16i8.p0(ptr)

declare void @foo()

; FIXME: We should not generate ld/st for these register spills/fills: the test
; cases are simple and register pressure is low. If the spill/fill heuristics
; are improved, these cases may no longer trigger spills, at which point this
; test can be deleted.
; check the spill for Register Class QPair_with_qsub_0_in_FPR128Lo
define <8 x i16> @test_2xFPR128Lo(i64 %got, ptr %ptr, <1 x i64> %a) {
; CHECK-LABEL: test_2xFPR128Lo:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    st2 { v0.d, v1.d }[0], [x1]
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  tail call void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, ptr %ptr)
  tail call void @foo()
  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
  %1 = bitcast <2 x i64> %sv to <8 x i16>
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %3 = mul <8 x i16> %2, %2
  ret <8 x i16> %3
}

; check the spill for Register Class QTriple_with_qsub_0_in_FPR128Lo
define <8 x i16> @test_3xFPR128Lo(i64 %got, ptr %ptr, <1 x i64> %a) {
; CHECK-LABEL: test_3xFPR128Lo:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    mov v2.16b, v0.16b
; CHECK-NEXT:    st3 { v0.d, v1.d, v2.d }[0], [x1]
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  tail call void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, ptr %ptr)
  tail call void @foo()
  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
  %1 = bitcast <2 x i64> %sv to <8 x i16>
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %3 = mul <8 x i16> %2, %2
  ret <8 x i16> %3
}

; check the spill for Register Class QQuad_with_qsub_0_in_FPR128Lo
define <8 x i16> @test_4xFPR128Lo(i64 %got, ptr %ptr, <1 x i64> %a) {
; CHECK-LABEL: test_4xFPR128Lo:
; CHECK:       // %bb.0:
; CHECK-NEXT:    str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w30, -16
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    mov v1.16b, v0.16b
; CHECK-NEXT:    mov v2.16b, v0.16b
; CHECK-NEXT:    mov v3.16b, v0.16b
; CHECK-NEXT:    st4 { v0.d, v1.d, v2.d, v3.d }[0], [x1]
; CHECK-NEXT:    bl foo
; CHECK-NEXT:    movi v0.2d, #0000000000000000
; CHECK-NEXT:    ldr x30, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    ret
  tail call void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, <1 x i64> zeroinitializer, i64 0, ptr %ptr)
  tail call void @foo()
  %sv = shufflevector <1 x i64> zeroinitializer, <1 x i64> %a, <2 x i32> <i32 0, i32 1>
  %1 = bitcast <2 x i64> %sv to <8 x i16>
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
  %3 = mul <8 x i16> %2, %2
  ret <8 x i16> %3
}

declare void @llvm.aarch64.neon.st2lane.v1i64.p0(<1 x i64>, <1 x i64>, i64, ptr)
declare void @llvm.aarch64.neon.st3lane.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, i64, ptr)
declare void @llvm.aarch64.neon.st4lane.v1i64.p0(<1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i64, ptr)