; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs | FileCheck %s
; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s

; Test prolog sequences for stack probing when SVE objects are involved.

; The space for SVE objects needs probing in the general case, because
; the stack adjustment may happen to be too big (i.e. greater than the
; probe size) to allocate with a single `addvl`.
; When we do know that the stack adjustment cannot exceed the probe size
; we can avoid emitting a probe loop and emit a simple `addvl; str`
; sequence instead.

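; As a rough sketch (pseudo-assembly mirroring the CHECK lines below; the
; default 4096-byte probe size is assumed), the prologues checked in this
; file take one of three shapes:
;
;   Small adjustment, within the permitted unprobed area:
;     addvl sp, sp, #-N        // no probe store needed
;
;   Adjustment bounded by the probe size:
;     addvl sp, sp, #-N
;     str   xzr, [sp]          // single probe of the lowest address
;
;   Adjustment possibly larger than the probe size:
;     addvl x9, sp, #-N        // x9 = target SP
;   loop:
;     sub   sp, sp, #4096      // step down one probe-size chunk
;     cmp   sp, x9
;     b.le  done
;     str   xzr, [sp]          // touch the page just allocated
;     b     loop
;   done:
;     mov   sp, x9
;     ldr   xzr, [sp]          // probe the final page
;
; The `.cfi_escape` directives encode DWARF expressions of the form
; `base + offset + k * VG`, where VG (DWARF register 46, the 0x2e byte in
; the escapes) is the number of 64-bit granules in an SVE vector.
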
define void @sve_1_vector(ptr %out) #0 {
; CHECK-LABEL: sve_1_vector:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec = alloca <vscale x 4 x float>, align 16
  ret void
}

; As above, but with 4 SVE vectors of stack space.
define void @sve_4_vector(ptr %out) #0 {
; CHECK-LABEL: sve_4_vector:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-4
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT:    addvl sp, sp, #4
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  ret void
}

; As above, but with 16 SVE vectors of stack space.
; The stack adjustment is less than or equal to 16 x 256 = 4096 bytes, so
; we can allocate the locals at once.
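; (16 VLs at no more than 256 bytes each is exactly the default 4096-byte
; probe size, so a single `str xzr, [sp]` probe suffices instead of a loop.)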
define void @sve_16_vector(ptr %out) #0 {
; CHECK-LABEL: sve_16_vector:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-16
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    addvl sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  %vec5 = alloca <vscale x 4 x float>, align 16
  %vec6 = alloca <vscale x 4 x float>, align 16
  %vec7 = alloca <vscale x 4 x float>, align 16
  %vec8 = alloca <vscale x 4 x float>, align 16
  %vec9 = alloca <vscale x 4 x float>, align 16
  %vec10 = alloca <vscale x 4 x float>, align 16
  %vec11 = alloca <vscale x 4 x float>, align 16
  %vec12 = alloca <vscale x 4 x float>, align 16
  %vec13 = alloca <vscale x 4 x float>, align 16
  %vec14 = alloca <vscale x 4 x float>, align 16
  %vec15 = alloca <vscale x 4 x float>, align 16
  %vec16 = alloca <vscale x 4 x float>, align 16
  ret void
}

; As above, but with 17 SVE vectors of stack space. Now we need
; a probing loop, since the stack adjustment may be greater than
; the probe size (17 x 256 = 4352 bytes).
; TODO: Allocating `k*16+r` SVE vectors can be unrolled into
; emitting `k + 1` sequences of `addvl sp, sp, #-N; str xzr, [sp]`.
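; For the 17 = 1*16+1 vectors below, for instance, the unrolled form would
; hypothetically be:
;   addvl sp, sp, #-16
;   str   xzr, [sp]
;   addvl sp, sp, #-1
;   str   xzr, [sp]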
define void @sve_17_vector(ptr %out) #0 {
; CHECK-LABEL: sve_17_vector:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl x9, sp, #-17
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
; CHECK-NEXT:  .LBB3_1: // %entry
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    cmp sp, x9
; CHECK-NEXT:    b.le .LBB3_3
; CHECK-NEXT:  // %bb.2: // %entry
; CHECK-NEXT:    // in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    b .LBB3_1
; CHECK-NEXT:  .LBB3_3: // %entry
; CHECK-NEXT:    mov sp, x9
; CHECK-NEXT:    ldr xzr, [sp]
; CHECK-NEXT:    .cfi_def_cfa_register wsp
; CHECK-NEXT:    addvl sp, sp, #17
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  %vec5 = alloca <vscale x 4 x float>, align 16
  %vec6 = alloca <vscale x 4 x float>, align 16
  %vec7 = alloca <vscale x 4 x float>, align 16
  %vec8 = alloca <vscale x 4 x float>, align 16
  %vec9 = alloca <vscale x 4 x float>, align 16
  %vec10 = alloca <vscale x 4 x float>, align 16
  %vec11 = alloca <vscale x 4 x float>, align 16
  %vec12 = alloca <vscale x 4 x float>, align 16
  %vec13 = alloca <vscale x 4 x float>, align 16
  %vec14 = alloca <vscale x 4 x float>, align 16
  %vec15 = alloca <vscale x 4 x float>, align 16
  %vec16 = alloca <vscale x 4 x float>, align 16
  %vec17 = alloca <vscale x 4 x float>, align 16
  ret void
}

; Space for callee-saved SVE registers is allocated similarly to allocating
; space for SVE locals. When we know the stack adjustment cannot exceed the
; probe size we can skip the explicit probe, since saving SVE registers serves
; as an implicit probe.
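; In @sve_1v_csr below, for example, the spill itself touches the newly
; allocated area, so no separate `str xzr, [sp]` is emitted:
;   addvl sp, sp, #-1
;   str   z8, [sp]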
define void @sve_1v_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_1v_csr:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    str z8, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT:    //APP
; CHECK-NEXT:    //NO_APP
; CHECK-NEXT:    ldr z8, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    .cfi_restore z8
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  call void asm sideeffect "", "~{z8}" ()
  ret void
}

define void @sve_4v_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_4v_csr:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-4
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT:    str z11, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
; CHECK-NEXT:    //APP
; CHECK-NEXT:    //NO_APP
; CHECK-NEXT:    ldr z11, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #4
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    .cfi_restore z8
; CHECK-NEXT:    .cfi_restore z9
; CHECK-NEXT:    .cfi_restore z10
; CHECK-NEXT:    .cfi_restore z11
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" ()
  ret void
}

define void @sve_16v_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_16v_csr:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-16
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    str z23, [sp] // 16-byte Folded Spill
; CHECK-NEXT:    str z22, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z21, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z20, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z19, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z18, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z17, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z16, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z15, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z14, [sp, #9, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z13, [sp, #10, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z12, [sp, #11, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z11, [sp, #12, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #13, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #14, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #15, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; CHECK-NEXT:    //APP
; CHECK-NEXT:    //NO_APP
; CHECK-NEXT:    ldr z23, [sp] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    .cfi_restore z8
; CHECK-NEXT:    .cfi_restore z9
; CHECK-NEXT:    .cfi_restore z10
; CHECK-NEXT:    .cfi_restore z11
; CHECK-NEXT:    .cfi_restore z12
; CHECK-NEXT:    .cfi_restore z13
; CHECK-NEXT:    .cfi_restore z14
; CHECK-NEXT:    .cfi_restore z15
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
  ret void
}

define void @sve_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_1p_csr:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    //APP
; CHECK-NEXT:    //NO_APP
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  call void asm sideeffect "", "~{p8}" ()
  ret void
}

define void @sve_4p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_4p_csr:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG
; CHECK-NEXT:    str p11, [sp, #4, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p10, [sp, #5, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p9, [sp, #6, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    //APP
; CHECK-NEXT:    //NO_APP
; CHECK-NEXT:    ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" ()
  ret void
}

define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-LABEL: sve_16v_1p_csr:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl x9, sp, #-17
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG
; CHECK-NEXT:  .LBB9_1: // %entry
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    cmp sp, x9
; CHECK-NEXT:    b.le .LBB9_3
; CHECK-NEXT:  // %bb.2: // %entry
; CHECK-NEXT:    // in Loop: Header=BB9_1 Depth=1
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    b .LBB9_1
; CHECK-NEXT:  .LBB9_3: // %entry
; CHECK-NEXT:    mov sp, x9
; CHECK-NEXT:    ldr xzr, [sp]
; CHECK-NEXT:    .cfi_def_cfa_register wsp
; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str z23, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z22, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z21, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z20, [sp, #4, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z19, [sp, #5, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z18, [sp, #6, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z17, [sp, #7, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z16, [sp, #8, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z15, [sp, #9, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z14, [sp, #10, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z13, [sp, #11, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z12, [sp, #12, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z11, [sp, #13, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #14, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #15, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #16, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; CHECK-NEXT:    //APP
; CHECK-NEXT:    //NO_APP
; CHECK-NEXT:    ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #17
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    .cfi_restore z8
; CHECK-NEXT:    .cfi_restore z9
; CHECK-NEXT:    .cfi_restore z10
; CHECK-NEXT:    .cfi_restore z11
; CHECK-NEXT:    .cfi_restore z12
; CHECK-NEXT:    .cfi_restore z13
; CHECK-NEXT:    .cfi_restore z14
; CHECK-NEXT:    .cfi_restore z15
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" ()
  ret void
}

; An SVE vector and a 16-byte fixed size object.
define void @sve_1_vector_16_arr(ptr %out) #0 {
; CHECK-LABEL: sve_1_vector_16_arr:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 32
; CHECK-NEXT:    addvl sp, sp, #-1
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG
; CHECK-NEXT:    addvl sp, sp, #1
; CHECK-NEXT:    .cfi_def_cfa wsp, 32
; CHECK-NEXT:    add sp, sp, #16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec = alloca <vscale x 4 x float>, align 16
  %arr = alloca i8, i64 16, align 1
  ret void
}

; A large SVE stack object and a large stack slot, both of which need probing.
; TODO: This could be optimised by combining the fixed-size offset into the
; loop.
define void @sve_1_vector_4096_arr(ptr %out) #0 {
; CHECK-LABEL: sve_1_vector_4096_arr:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub x9, sp, #3, lsl #12 // =12288
; CHECK-NEXT:    .cfi_def_cfa w9, 12304
; CHECK-NEXT:    addvl x9, x9, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG
; CHECK-NEXT:    addvl x9, x9, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG
; CHECK-NEXT:  .LBB11_1: // %entry
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    cmp sp, x9
; CHECK-NEXT:    b.le .LBB11_3
; CHECK-NEXT:  // %bb.2: // %entry
; CHECK-NEXT:    // in Loop: Header=BB11_1 Depth=1
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    b .LBB11_1
; CHECK-NEXT:  .LBB11_3: // %entry
; CHECK-NEXT:    mov sp, x9
; CHECK-NEXT:    ldr xzr, [sp]
; CHECK-NEXT:    .cfi_def_cfa_register wsp
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG
; CHECK-NEXT:    addvl sp, sp, #2
; CHECK-NEXT:    .cfi_def_cfa wsp, 12304
; CHECK-NEXT:    add sp, sp, #3, lsl #12 // =12288
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec = alloca <vscale x 256 x float>, align 16
  %arr = alloca i8, i64 12288, align 1
  ret void
}

; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently
; supported even without stack-probing.

; An SVE vector and a 16-byte fixed size object which
; has a large alignment requirement.
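; The 8192-byte alignment is established by rounding the probe-loop target
; down: `and x9, x9, #0xffffffffffffe000` clears the low 13 bits.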
define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 {
; CHECK-LABEL: sve_1_vector_16_arr_align_8192:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    mov x29, sp
; CHECK-NEXT:    .cfi_def_cfa w29, 16
; CHECK-NEXT:    .cfi_offset w30, -8
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    sub x9, x9, #4080
; CHECK-NEXT:    addvl x9, x9, #-1
; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
; CHECK-NEXT:  .LBB12_1: // %entry
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT:    cmp sp, x9
; CHECK-NEXT:    b.le .LBB12_3
; CHECK-NEXT:  // %bb.2: // %entry
; CHECK-NEXT:    // in Loop: Header=BB12_1 Depth=1
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    b .LBB12_1
; CHECK-NEXT:  .LBB12_3: // %entry
; CHECK-NEXT:    mov sp, x9
; CHECK-NEXT:    ldr xzr, [sp]
; CHECK-NEXT:    mov sp, x29
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w30
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec = alloca <vscale x 4 x float>, align 16
  %arr = alloca i8, i64 16, align 8192
  ret void
}

; With 64k guard pages, we can allocate more SVE stack space without a probing loop.
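; The allocation below is 256 scalable vectors, i.e. at most 256 x 256 = 65536
; bytes, which does not exceed the 64k probe size, so a single `str xzr, [sp]`
; after the final downward `addvl` suffices.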
define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
; CHECK-LABEL: sve_1024_64k_guard:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG
; CHECK-NEXT:    addvl sp, sp, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG
; CHECK-NEXT:    addvl sp, sp, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG
; CHECK-NEXT:    addvl sp, sp, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG
; CHECK-NEXT:    addvl sp, sp, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG
; CHECK-NEXT:    addvl sp, sp, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG
; CHECK-NEXT:    addvl sp, sp, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG
; CHECK-NEXT:    addvl sp, sp, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
; CHECK-NEXT:    addvl sp, sp, #8
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec = alloca <vscale x 1024 x float>, align 16
  ret void
}

define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" {
; CHECK-LABEL: sve_1028_64k_guard:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl x9, sp, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG
; CHECK-NEXT:    addvl x9, x9, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG
; CHECK-NEXT:    addvl x9, x9, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG
; CHECK-NEXT:    addvl x9, x9, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG
; CHECK-NEXT:    addvl x9, x9, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG
; CHECK-NEXT:    addvl x9, x9, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG
; CHECK-NEXT:    addvl x9, x9, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG
; CHECK-NEXT:    addvl x9, x9, #-32
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG
; CHECK-NEXT:    addvl x9, x9, #-1
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG
; CHECK-NEXT:  .LBB14_1: // %entry
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT:    cmp sp, x9
; CHECK-NEXT:    b.le .LBB14_3
; CHECK-NEXT:  // %bb.2: // %entry
; CHECK-NEXT:    // in Loop: Header=BB14_1 Depth=1
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    b .LBB14_1
; CHECK-NEXT:  .LBB14_3: // %entry
; CHECK-NEXT:    mov sp, x9
; CHECK-NEXT:    ldr xzr, [sp]
; CHECK-NEXT:    .cfi_def_cfa_register wsp
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG
; CHECK-NEXT:    addvl sp, sp, #31
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG
; CHECK-NEXT:    addvl sp, sp, #9
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec = alloca <vscale x 1024 x float>, align 16
  %vec1 = alloca <vscale x 4 x float>, align 16
  ret void
}

; With 5 SVE vectors of stack space the unprobed area
; at the top of the stack can exceed 1024 bytes (5 x 256 = 1280),
; hence we need to issue a probe.
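; (Contrast with @sve_4_vector above: 4 x 256 = 1024 bytes stays within the
; permitted unprobed area, so no probe store is emitted there.)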
define void @sve_5_vector(ptr %out) #0 {
; CHECK-LABEL: sve_5_vector:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-5
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    addvl sp, sp, #5
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  %vec5 = alloca <vscale x 4 x float>, align 16
  ret void
}

; Test with 14 scalable bytes (so up to 14 * 16 = 224 bytes) of unprobed
; area below the save location of `p9`.
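; (A predicate spill slot is 2 scalable bytes, so `p9` stored at
; `[sp, #7, mul vl]` sits 7 * 2 = 14 scalable bytes above the bottom of the
; callee-save area.)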
define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
; CHECK-LABEL: sve_unprobed_area:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    .cfi_offset w29, -16
; CHECK-NEXT:    addvl sp, sp, #-4
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT:    str xzr, [sp]
; CHECK-NEXT:    str p9, [sp, #7, mul vl] // 2-byte Folded Spill
; CHECK-NEXT:    str z10, [sp, #1, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z9, [sp, #2, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    str z8, [sp, #3, mul vl] // 16-byte Folded Spill
; CHECK-NEXT:    .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG
; CHECK-NEXT:    .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG
; CHECK-NEXT:    addvl sp, sp, #-4
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG
; CHECK-NEXT:    //APP
; CHECK-NEXT:    //NO_APP
; CHECK-NEXT:    addvl sp, sp, #4
; CHECK-NEXT:    .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
; CHECK-NEXT:    ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT:    ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT:    addvl sp, sp, #4
; CHECK-NEXT:    .cfi_def_cfa wsp, 16
; CHECK-NEXT:    .cfi_restore z8
; CHECK-NEXT:    .cfi_restore z9
; CHECK-NEXT:    .cfi_restore z10
; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    .cfi_restore w29
; CHECK-NEXT:    ret
entry:
  call void asm sideeffect "", "~{z8},~{z9},~{z10},~{p9}" ()

  %v0 = alloca <vscale x 4 x float>, align 16
  %v1 = alloca <vscale x 4 x float>, align 16
  %v2 = alloca <vscale x 4 x float>, align 16
  %v3 = alloca <vscale x 4 x float>, align 16

  ret void
}

attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" }
