xref: /llvm-project/llvm/test/CodeGen/AArch64/stack-probing-dynamic.ll (revision 15e86c2a3ed89e3c43d5124fb8461305a6c70b09)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs                                   | FileCheck %s
3; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s
4
5; Dynamically-sized allocation, needs a loop which can handle any size at
6; runtime. The final iteration of the loop will temporarily put SP below the
7; target address, but this doesn't break any of the ABI constraints on the
8; stack, and also doesn't probe below the target SP value.
9define void @dynamic(i64 %size, ptr %out) #0 {
10; CHECK-LABEL: dynamic:
11; CHECK:       // %bb.0:
12; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
13; CHECK-NEXT:    .cfi_def_cfa_offset 16
14; CHECK-NEXT:    mov x29, sp
15; CHECK-NEXT:    .cfi_def_cfa w29, 16
16; CHECK-NEXT:    .cfi_offset w30, -8
17; CHECK-NEXT:    .cfi_offset w29, -16
18; CHECK-NEXT:    add x9, x0, #15
19; CHECK-NEXT:    mov x8, sp
20; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
21; CHECK-NEXT:    sub x8, x8, x9
22; CHECK-NEXT:  .LBB0_1: // =>This Inner Loop Header: Depth=1
23; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
24; CHECK-NEXT:    cmp sp, x8
25; CHECK-NEXT:    b.le .LBB0_3
26; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB0_1 Depth=1
27; CHECK-NEXT:    str xzr, [sp]
28; CHECK-NEXT:    b .LBB0_1
29; CHECK-NEXT:  .LBB0_3:
30; CHECK-NEXT:    mov sp, x8
31; CHECK-NEXT:    ldr xzr, [sp]
32; CHECK-NEXT:    str x8, [x1]
33; CHECK-NEXT:    mov sp, x29
34; CHECK-NEXT:    .cfi_def_cfa wsp, 16
35; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
36; CHECK-NEXT:    .cfi_def_cfa_offset 0
37; CHECK-NEXT:    .cfi_restore w30
38; CHECK-NEXT:    .cfi_restore w29
39; CHECK-NEXT:    ret
  ; Stack allocation of %size bytes, where %size is a function argument. The
  ; expected output above moves SP down 4096 bytes at a time, storing xzr at
  ; each step, then sets SP to the target address and loads from it.
40  %v = alloca i8, i64 %size, align 1
  ; Store the allocation's address through %out (presumably so the alloca has
  ; a use and is kept -- confirm).
41  store ptr %v, ptr %out, align 8
42  ret void
43}
44
45; This function has a fixed-size stack slot and a dynamic one. The fixed-size
46; slot isn't large enough that we would normally probe it, but we need to do so
47; here, otherwise the CSR save and the first probe of the dynamic allocation
48; could be too far apart when the size of the dynamic allocation is close to
49; the guard size.
50define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
51; CHECK-LABEL: dynamic_fixed:
52; CHECK:       // %bb.0:
53; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
54; CHECK-NEXT:    .cfi_def_cfa_offset 16
55; CHECK-NEXT:    mov x29, sp
56; CHECK-NEXT:    .cfi_def_cfa w29, 16
57; CHECK-NEXT:    .cfi_offset w30, -8
58; CHECK-NEXT:    .cfi_offset w29, -16
59; CHECK-NEXT:    str xzr, [sp, #-64]!
60; CHECK-NEXT:    add x9, x0, #15
61; CHECK-NEXT:    mov x8, sp
62; CHECK-DAG:     sub x10, x29, #64
63; CHECK-DAG:     and x9, x9, #0xfffffffffffffff0
64; CHECK-NOT:     INVALID_TO_BREAK_UP_CHECK_DAG
65; CHECK-DAG:     str x10, [x1]
66; CHECK-DAG:     sub x8, x8, x9
67; CHECK-NEXT:  .LBB1_1: // =>This Inner Loop Header: Depth=1
68; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
69; CHECK-NEXT:    cmp sp, x8
70; CHECK-NEXT:    b.le .LBB1_3
71; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB1_1 Depth=1
72; CHECK-NEXT:    str xzr, [sp]
73; CHECK-NEXT:    b .LBB1_1
74; CHECK-NEXT:  .LBB1_3:
75; CHECK-NEXT:    mov sp, x8
76; CHECK-NEXT:    ldr xzr, [sp]
77; CHECK-NEXT:    str x8, [x2]
78; CHECK-NEXT:    mov sp, x29
79; CHECK-NEXT:    .cfi_def_cfa wsp, 16
80; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
81; CHECK-NEXT:    .cfi_def_cfa_offset 0
82; CHECK-NEXT:    .cfi_restore w30
83; CHECK-NEXT:    .cfi_restore w29
84; CHECK-NEXT:    ret
  ; Fixed-size 64-byte allocation. The expected output allocates it with a
  ; single pre-indexed store, "str xzr, [sp, #-64]!", which also writes to the
  ; new top of stack. Its address is stored through %out1.
85  %v1 = alloca i8, i64 64, align 1
86  store ptr %v1, ptr %out1, align 8
  ; Runtime-sized allocation of %size bytes, handled by the 4096-byte loop in
  ; the expected output. Its address is stored through %out2.
87  %v2 = alloca i8, i64 %size, align 1
88  store ptr %v2, ptr %out2, align 8
89  ret void
90}
91
92; Dynamic allocation, with an alignment requirement greater than the alignment
93; of SP. Done by ANDing the target SP with a constant to align it down, then
94; doing the loop as normal. Note that we also re-align the stack in the prolog,
95; which isn't actually needed because the only aligned allocations are dynamic;
96; this is done even without stack probing.
97define void @dynamic_align_64(i64 %size, ptr %out) #0 {
98; CHECK-LABEL: dynamic_align_64:
99; CHECK:       // %bb.0:
100; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
101; CHECK-NEXT:    .cfi_def_cfa_offset 32
102; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
103; CHECK-NEXT:    mov x29, sp
104; CHECK-NEXT:    .cfi_def_cfa w29, 32
105; CHECK-NEXT:    .cfi_offset w19, -16
106; CHECK-NEXT:    .cfi_offset w30, -24
107; CHECK-NEXT:    .cfi_offset w29, -32
108; CHECK-NEXT:    sub x9, sp, #32
109; CHECK-NEXT:    and sp, x9, #0xffffffffffffffc0
110; CHECK-NEXT:    add x9, x0, #15
111; CHECK-NEXT:    mov x8, sp
112; CHECK-DAG:     str xzr, [sp]
113; CHECK-DAG:     and x9, x9, #0xfffffffffffffff0
114; CHECK-NOT:     INVALID_TO_BREAK_UP_CHECK_DAG
115; CHECK-DAG:     mov x19, sp
116; CHECK-DAG:     sub x8, x8, x9
117; CHECK-NEXT:    and x8, x8, #0xffffffffffffffc0
118; CHECK-NEXT:  .LBB2_1: // =>This Inner Loop Header: Depth=1
119; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
120; CHECK-NEXT:    cmp sp, x8
121; CHECK-NEXT:    b.le .LBB2_3
122; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB2_1 Depth=1
123; CHECK-NEXT:    str xzr, [sp]
124; CHECK-NEXT:    b .LBB2_1
125; CHECK-NEXT:  .LBB2_3:
126; CHECK-NEXT:    mov sp, x8
127; CHECK-NEXT:    ldr xzr, [sp]
128; CHECK-NEXT:    str x8, [x1]
129; CHECK-NEXT:    mov sp, x29
130; CHECK-NEXT:    .cfi_def_cfa wsp, 32
131; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
132; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
133; CHECK-NEXT:    .cfi_def_cfa_offset 0
134; CHECK-NEXT:    .cfi_restore w19
135; CHECK-NEXT:    .cfi_restore w30
136; CHECK-NEXT:    .cfi_restore w29
137; CHECK-NEXT:    ret
  ; Runtime-sized allocation with 64-byte alignment. In the expected output the
  ; target SP is masked with 0xffffffffffffffc0 (rounding it down to a multiple
  ; of 64) before the 4096-byte loop runs.
138  %v = alloca i8, i64 %size, align 64
  ; Store the allocation's address through %out.
139  store ptr %v, ptr %out, align 8
140  ret void
141}
142
143; Dynamic allocation, with an alignment greater than the stack guard size. The
144; only difference to the dynamic allocation is the constant used for aligning
145; the target SP, the loop will probe the whole allocation without needing to
146; know about the alignment padding.
147define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
148; CHECK-LABEL: dynamic_align_8192:
149; CHECK:       // %bb.0:
150; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
151; CHECK-NEXT:    .cfi_def_cfa_offset 32
152; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
153; CHECK-NEXT:    mov x29, sp
154; CHECK-NEXT:    .cfi_def_cfa w29, 32
155; CHECK-NEXT:    .cfi_offset w19, -16
156; CHECK-NEXT:    .cfi_offset w30, -24
157; CHECK-NEXT:    .cfi_offset w29, -32
158; CHECK-NEXT:    sub x9, sp, #1, lsl #12 // =4096
159; CHECK-NEXT:    sub x9, x9, #4064
160; CHECK-NEXT:    and x9, x9, #0xffffffffffffe000
161; CHECK-NEXT:  .LBB3_1: // =>This Inner Loop Header: Depth=1
162; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
163; CHECK-NEXT:    cmp sp, x9
164; CHECK-NEXT:    b.le .LBB3_3
165; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB3_1 Depth=1
166; CHECK-NEXT:    str xzr, [sp]
167; CHECK-NEXT:    b .LBB3_1
168; CHECK-NEXT:  .LBB3_3:
169; CHECK-NEXT:    mov sp, x9
170; CHECK-NEXT:    add x9, x0, #15
171; CHECK-NEXT:    mov x8, sp
172; CHECK-DAG:     ldr xzr, [sp]
173; CHECK-DAG:     and x9, x9, #0xfffffffffffffff0
174; CHECK-NOT:     INVALID_TO_BREAK_UP_CHECK_DAG
175; CHECK-DAG:     mov x19, sp
176; CHECK-DAG:     sub x8, x8, x9
177; CHECK-NEXT:    and x8, x8, #0xffffffffffffe000
178; CHECK-NEXT:  .LBB3_4: // =>This Inner Loop Header: Depth=1
179; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
180; CHECK-NEXT:    cmp sp, x8
181; CHECK-NEXT:    b.le .LBB3_6
182; CHECK-NEXT:  // %bb.5: // in Loop: Header=BB3_4 Depth=1
183; CHECK-NEXT:    str xzr, [sp]
184; CHECK-NEXT:    b .LBB3_4
185; CHECK-NEXT:  .LBB3_6:
186; CHECK-NEXT:    mov sp, x8
187; CHECK-NEXT:    ldr xzr, [sp]
188; CHECK-NEXT:    str x8, [x1]
189; CHECK-NEXT:    mov sp, x29
190; CHECK-NEXT:    .cfi_def_cfa wsp, 32
191; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
192; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
193; CHECK-NEXT:    .cfi_def_cfa_offset 0
194; CHECK-NEXT:    .cfi_restore w19
195; CHECK-NEXT:    .cfi_restore w30
196; CHECK-NEXT:    .cfi_restore w29
197; CHECK-NEXT:    ret
  ; Runtime-sized allocation with 8192-byte alignment. The expected output
  ; contains two 4096-byte loops: the first (.LBB3_1) lowers SP to a value
  ; masked with 0xffffffffffffe000 before the alloca size is computed, and the
  ; second (.LBB3_4) covers the alloca itself after its target SP is masked
  ; with the same constant.
198  %v = alloca i8, i64 %size, align 8192
  ; Store the allocation's address through %out.
199  store ptr %v, ptr %out, align 8
200  ret void
201}
202
203; For 64k guard pages, the only difference is the constant subtracted from SP
204; in the loop.
205define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" {
206; CHECK-LABEL: dynamic_64k_guard:
207; CHECK:       // %bb.0:
208; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
209; CHECK-NEXT:    .cfi_def_cfa_offset 16
210; CHECK-NEXT:    mov x29, sp
211; CHECK-NEXT:    .cfi_def_cfa w29, 16
212; CHECK-NEXT:    .cfi_offset w30, -8
213; CHECK-NEXT:    .cfi_offset w29, -16
214; CHECK-NEXT:    add x9, x0, #15
215; CHECK-NEXT:    mov x8, sp
216; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
217; CHECK-NEXT:    sub x8, x8, x9
218; CHECK-NEXT:  .LBB4_1: // =>This Inner Loop Header: Depth=1
219; CHECK-NEXT:    sub sp, sp, #16, lsl #12 // =65536
220; CHECK-NEXT:    cmp sp, x8
221; CHECK-NEXT:    b.le .LBB4_3
222; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB4_1 Depth=1
223; CHECK-NEXT:    str xzr, [sp]
224; CHECK-NEXT:    b .LBB4_1
225; CHECK-NEXT:  .LBB4_3:
226; CHECK-NEXT:    mov sp, x8
227; CHECK-NEXT:    ldr xzr, [sp]
228; CHECK-NEXT:    str x8, [x1]
229; CHECK-NEXT:    mov sp, x29
230; CHECK-NEXT:    .cfi_def_cfa wsp, 16
231; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
232; CHECK-NEXT:    .cfi_def_cfa_offset 0
233; CHECK-NEXT:    .cfi_restore w30
234; CHECK-NEXT:    .cfi_restore w29
235; CHECK-NEXT:    ret
  ; Same IR body as @dynamic. The "stack-probe-size"="65536" attribute on this
  ; function is what differs; in the expected output the loop step becomes
  ; 65536 bytes ("sub sp, sp, #16, lsl #12").
236  %v = alloca i8, i64 %size, align 1
  ; Store the allocation's address through %out.
237  store ptr %v, ptr %out, align 8
238  ret void
239}
240
241; If a function has variable-sized stack objects, then any function calls which
242; need to pass arguments on the stack must allocate the stack space for them
243; dynamically, to ensure they are at the bottom of the frame. We need to probe
244; that space when it is larger than the unprobed space allowed by the ABI (1024
245; bytes), so this needs a very large number of arguments.
246define void @no_reserved_call_frame(i64 %n) #0 {
247; CHECK-LABEL: no_reserved_call_frame:
248; CHECK:       // %bb.0: // %entry
249; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
250; CHECK-NEXT:    .cfi_def_cfa_offset 16
251; CHECK-NEXT:    mov x29, sp
252; CHECK-NEXT:    .cfi_def_cfa w29, 16
253; CHECK-NEXT:    .cfi_offset w30, -8
254; CHECK-NEXT:    .cfi_offset w29, -16
255; CHECK-NEXT:    lsl x9, x0, #2
256; CHECK-NEXT:    mov x8, sp
257; CHECK-NEXT:    add x9, x9, #15
258; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
259; CHECK-NEXT:    sub x0, x8, x9
260; CHECK-NEXT:  .LBB5_1: // %entry
261; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
262; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
263; CHECK-NEXT:    cmp sp, x0
264; CHECK-NEXT:    b.le .LBB5_3
265; CHECK-NEXT:  // %bb.2: // %entry
266; CHECK-NEXT:    // in Loop: Header=BB5_1 Depth=1
267; CHECK-NEXT:    str xzr, [sp]
268; CHECK-NEXT:    b .LBB5_1
269; CHECK-NEXT:  .LBB5_3: // %entry
270; CHECK-NEXT:    mov sp, x0
271; CHECK-NEXT:    ldr xzr, [sp]
272; CHECK-NEXT:    sub sp, sp, #1104
273; CHECK-NEXT:    str xzr, [sp]
274; CHECK-NEXT:    bl callee_stack_args
275; CHECK-NEXT:    add sp, sp, #1104
276; CHECK-NEXT:    mov sp, x29
277; CHECK-NEXT:    .cfi_def_cfa wsp, 16
278; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
279; CHECK-NEXT:    .cfi_def_cfa_offset 0
280; CHECK-NEXT:    .cfi_restore w30
281; CHECK-NEXT:    .cfi_restore w29
282; CHECK-NEXT:    ret
283entry:
  ; Runtime-sized allocation of %n i32 elements; the expected output scales %n
  ; by 4 ("lsl x9, x0, #2") and rounds the byte count up to a multiple of 16.
284  %v = alloca i32, i64 %n
  ; The [138 x i64] argument is 138 * 8 = 1104 bytes. The expected output
  ; allocates those 1104 bytes with a separate "sub sp, sp, #1104" right before
  ; the call, stores xzr at the new SP, and releases them after the call.
285  call void @callee_stack_args(ptr %v, [138 x i64] undef)
286  ret void
287}
288
289; Same as above but without a variable-sized allocation, so the reserved call
290; frame can be folded into the fixed-size allocation in the prologue.
291define void @reserved_call_frame(i64 %n) #0 {
292; CHECK-LABEL: reserved_call_frame:
293; CHECK:       // %bb.0: // %entry
294; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
295; CHECK-NEXT:    .cfi_def_cfa_offset 32
296; CHECK-NEXT:    str x28, [sp, #16] // 8-byte Folded Spill
297; CHECK-NEXT:    mov x29, sp
298; CHECK-NEXT:    .cfi_def_cfa w29, 32
299; CHECK-NEXT:    .cfi_offset w28, -16
300; CHECK-NEXT:    .cfi_offset w30, -24
301; CHECK-NEXT:    .cfi_offset w29, -32
302; CHECK-NEXT:    sub sp, sp, #1504
303; CHECK-NEXT:    add x0, sp, #1104
304; CHECK-NEXT:    str xzr, [sp]
305; CHECK-NEXT:    bl callee_stack_args
306; CHECK-NEXT:    add sp, sp, #1504
307; CHECK-NEXT:    .cfi_def_cfa wsp, 32
308; CHECK-NEXT:    ldr x28, [sp, #16] // 8-byte Folded Reload
309; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
310; CHECK-NEXT:    .cfi_def_cfa_offset 0
311; CHECK-NEXT:    .cfi_restore w28
312; CHECK-NEXT:    .cfi_restore w30
313; CHECK-NEXT:    .cfi_restore w29
314; CHECK-NEXT:    ret
315entry:
  ; Constant-sized allocation: 100 x i32 = 400 bytes. %n is not used by the
  ; body.
316  %v = alloca i32, i64 100
  ; The [138 x i64] argument is 138 * 8 = 1104 bytes. The expected output
  ; allocates 1104 + 400 = 1504 bytes with one "sub sp, sp, #1504", stores xzr
  ; at the new SP once, and passes %v as sp + 1104.
317  call void @callee_stack_args(ptr %v, [138 x i64] undef)
318  ret void
319}
320
; Callee used by @no_reserved_call_frame and @reserved_call_frame. Takes a
; pointer and a 138-element i64 array (138 * 8 = 1104 bytes); only declared
; here, no body is visible in this file.
321declare void @callee_stack_args(ptr, [138 x i64])
322
323; Dynamic allocation of SVE vectors
324define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {
325; CHECK-LABEL: dynamic_sve:
326; CHECK:       // %bb.0:
327; CHECK-NEXT:    stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
328; CHECK-NEXT:    .cfi_def_cfa_offset 32
329; CHECK-NEXT:    str x19, [sp, #16] // 8-byte Folded Spill
330; CHECK-NEXT:    mov x29, sp
331; CHECK-NEXT:    .cfi_def_cfa w29, 32
332; CHECK-NEXT:    .cfi_offset w19, -16
333; CHECK-NEXT:    .cfi_offset w30, -24
334; CHECK-NEXT:    .cfi_offset w29, -32
335; CHECK-NEXT:    rdvl x9, #1
336; CHECK-NEXT:    mov x10, #15 // =0xf
337; CHECK-DAG:     mov x8, sp
338; CHECK-DAG:     madd x9, x0, x9, x10
339; CHECK-NEXT:    and x9, x9, #0xfffffffffffffff0
340; CHECK-NEXT:    sub x8, x8, x9
341; CHECK-NEXT:  .LBB7_1: // =>This Inner Loop Header: Depth=1
342; CHECK-NEXT:    sub sp, sp, #1, lsl #12 // =4096
343; CHECK-NEXT:    cmp sp, x8
344; CHECK-NEXT:    b.le .LBB7_3
345; CHECK-NEXT:  // %bb.2: // in Loop: Header=BB7_1 Depth=1
346; CHECK-NEXT:    str xzr, [sp]
347; CHECK-NEXT:    b .LBB7_1
348; CHECK-NEXT:  .LBB7_3:
349; CHECK-NEXT:    mov sp, x8
350; CHECK-NEXT:    ldr xzr, [sp]
351; CHECK-NEXT:    str x8, [x1]
352; CHECK-NEXT:    mov sp, x29
353; CHECK-NEXT:    .cfi_def_cfa wsp, 32
354; CHECK-NEXT:    ldr x19, [sp, #16] // 8-byte Folded Reload
355; CHECK-NEXT:    ldp x29, x30, [sp], #32 // 16-byte Folded Reload
356; CHECK-NEXT:    .cfi_def_cfa_offset 0
357; CHECK-NEXT:    .cfi_restore w19
358; CHECK-NEXT:    .cfi_restore w30
359; CHECK-NEXT:    .cfi_restore w29
360; CHECK-NEXT:    ret
  ; Runtime-sized allocation of %size scalable vectors of type
  ; <vscale x 4 x float>. The expected output computes the byte count as
  ; %size * (value produced by "rdvl x9, #1") + 15, masks it down to a
  ; multiple of 16, and then uses the same 4096-byte loop as the other tests.
361  %v = alloca <vscale x 4 x float>, i64 %size, align 16
  ; Store the allocation's address through %out.
362  store ptr %v, ptr %out, align 8
363  ret void
364}
365
; Attribute group referenced as #0 by every function definition visible in
; this file: async unwind tables, "probe-stack"="inline-asm", and
; "frame-pointer"="none".
366attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
367
368