; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs | FileCheck %s
; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s

; Dynamically-sized allocation: this needs a loop which can handle any size at
; runtime. The final iteration of the loop will temporarily put SP below the
; target address, but this doesn't break any of the ABI constraints on the
; stack, and the loop also doesn't probe below the target SP value.
define void @dynamic(i64 %size, ptr %out) #0 {
; CHECK-LABEL: dynamic:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: add x9, x0, #15
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
; CHECK-NEXT: sub x8, x8, x9
; CHECK-NEXT: .LBB0_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x8
; CHECK-NEXT: b.le .LBB0_3
; CHECK-NEXT: // %bb.2: // in Loop: Header=BB0_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB0_1
; CHECK-NEXT: .LBB0_3:
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: str x8, [x1]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
  %v = alloca i8, i64 %size, align 1
  store ptr %v, ptr %out, align 8
  ret void
}
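; A note on the probe loop above (and the ones below): each iteration drops SP
; by one guard page (4096 bytes unless overridden by "stack-probe-size"),
; compares it against the target SP held in a scratch register, and stores xzr
; to touch the newly exposed page; the final `ldr xzr, [sp]` probes the target
; SP itself once the loop exits. As a rough, illustrative-only C equivalent of
; @dynamic (not part of the test input):
;
;   void dynamic(unsigned long size, void **out) {
;     *out = __builtin_alloca(size);
;   }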

; This function has a fixed-size stack slot and a dynamic one. The fixed-size
; slot isn't large enough that we would normally probe it, but we need to do so
; here, otherwise the CSR save and the first probe of the dynamic allocation
; could end up too far apart when the size of the dynamic allocation is close
; to the guard size.
define void @dynamic_fixed(i64 %size, ptr %out1, ptr %out2) #0 {
; CHECK-LABEL: dynamic_fixed:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: str xzr, [sp, #-64]!
; CHECK-NEXT: add x9, x0, #15
; CHECK-NEXT: mov x8, sp
; CHECK-DAG: sub x10, x29, #64
; CHECK-DAG: and x9, x9, #0xfffffffffffffff0
; CHECK-NOT: INVALID_TO_BREAK_UP_CHECK_DAG
; CHECK-DAG: str x10, [x1]
; CHECK-DAG: sub x8, x8, x9
; CHECK-NEXT: .LBB1_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x8
; CHECK-NEXT: b.le .LBB1_3
; CHECK-NEXT: // %bb.2: // in Loop: Header=BB1_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB1_1
; CHECK-NEXT: .LBB1_3:
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: str x8, [x2]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
  %v1 = alloca i8, i64 64, align 1
  store ptr %v1, ptr %out1, align 8
  %v2 = alloca i8, i64 %size, align 1
  store ptr %v2, ptr %out2, align 8
  ret void
}

; Dynamic allocation, with an alignment requirement greater than the alignment
; of SP. This is done by ANDing the target SP with a constant to align it down,
; then doing the loop as normal. Note that we also re-align the stack in the
; prologue, which isn't actually needed because the only aligned allocations
; are dynamic; this re-alignment happens even without stack probing.
define void @dynamic_align_64(i64 %size, ptr %out) #0 {
; CHECK-LABEL: dynamic_align_64:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: .cfi_def_cfa w29, 32
; CHECK-NEXT: .cfi_offset w19, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: sub x9, sp, #32
; CHECK-NEXT: and sp, x9, #0xffffffffffffffc0
; CHECK-NEXT: add x9, x0, #15
; CHECK-NEXT: mov x8, sp
; CHECK-DAG: str xzr, [sp]
; CHECK-DAG: and x9, x9, #0xfffffffffffffff0
; CHECK-NOT: INVALID_TO_BREAK_UP_CHECK_DAG
; CHECK-DAG: mov x19, sp
; CHECK-DAG: sub x8, x8, x9
; CHECK-NEXT: and x8, x8, #0xffffffffffffffc0
; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x8
; CHECK-NEXT: b.le .LBB2_3
; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB2_1
; CHECK-NEXT: .LBB2_3:
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: str x8, [x1]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: .cfi_def_cfa wsp, 32
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w19
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
  %v = alloca i8, i64 %size, align 64
  store ptr %v, ptr %out, align 8
  ret void
}
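; In @dynamic_align_64 above, the 64-byte alignment shows up as the AND masks:
; 0xffffffffffffffc0 is ~63, so both the re-aligned SP in the prologue and the
; target SP of the dynamic allocation are rounded down to a multiple of 64.
; x19 is saved and set up as a base pointer (`mov x19, sp`), since with a
; re-aligned stack and dynamic allocations locals can no longer be addressed
; at fixed offsets from SP.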

; Dynamic allocation, with an alignment greater than the stack guard size. The
; only difference from the plain dynamic allocation is the constant used to
; align the target SP; the loop probes the whole allocation without needing to
; know about the alignment padding.
define void @dynamic_align_8192(i64 %size, ptr %out) #0 {
; CHECK-LABEL: dynamic_align_8192:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: .cfi_def_cfa w29, 32
; CHECK-NEXT: .cfi_offset w19, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096
; CHECK-NEXT: sub x9, x9, #4064
; CHECK-NEXT: and x9, x9, #0xffffffffffffe000
; CHECK-NEXT: .LBB3_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x9
; CHECK-NEXT: b.le .LBB3_3
; CHECK-NEXT: // %bb.2: // in Loop: Header=BB3_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB3_1
; CHECK-NEXT: .LBB3_3:
; CHECK-NEXT: mov sp, x9
; CHECK-NEXT: add x9, x0, #15
; CHECK-NEXT: mov x8, sp
; CHECK-DAG: ldr xzr, [sp]
; CHECK-DAG: and x9, x9, #0xfffffffffffffff0
; CHECK-NOT: INVALID_TO_BREAK_UP_CHECK_DAG
; CHECK-DAG: mov x19, sp
; CHECK-DAG: sub x8, x8, x9
; CHECK-NEXT: and x8, x8, #0xffffffffffffe000
; CHECK-NEXT: .LBB3_4: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x8
; CHECK-NEXT: b.le .LBB3_6
; CHECK-NEXT: // %bb.5: // in Loop: Header=BB3_4 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB3_4
; CHECK-NEXT: .LBB3_6:
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: str x8, [x1]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: .cfi_def_cfa wsp, 32
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w19
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
  %v = alloca i8, i64 %size, align 8192
  store ptr %v, ptr %out, align 8
  ret void
}
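; In @dynamic_align_8192 above there are two probe loops: the prologue first
; drops SP by 8160 bytes (4096 + 4064) and masks it with 0xffffffffffffe000,
; which clears the low 13 bits, i.e. aligns down to a multiple of 8192. Since
; that re-alignment can move SP by more than one guard page, it gets its own
; probe loop (.LBB3_1) before the usual loop for the dynamic allocation
; (.LBB3_4).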

; For 64k guard pages, the only difference is the constant subtracted from SP
; in the loop.
define void @dynamic_64k_guard(i64 %size, ptr %out) #0 "stack-probe-size"="65536" {
; CHECK-LABEL: dynamic_64k_guard:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: add x9, x0, #15
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
; CHECK-NEXT: sub x8, x8, x9
; CHECK-NEXT: .LBB4_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536
; CHECK-NEXT: cmp sp, x8
; CHECK-NEXT: b.le .LBB4_3
; CHECK-NEXT: // %bb.2: // in Loop: Header=BB4_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB4_1
; CHECK-NEXT: .LBB4_3:
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: str x8, [x1]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
  %v = alloca i8, i64 %size, align 1
  store ptr %v, ptr %out, align 8
  ret void
}

; If a function has variable-sized stack objects, then any function calls which
; need to pass arguments on the stack must allocate the stack space for them
; dynamically, to ensure they are at the bottom of the frame. We need to probe
; that space when it is larger than the unprobed space allowed by the ABI (1024
; bytes), so this needs a very large number of arguments.
define void @no_reserved_call_frame(i64 %n) #0 {
; CHECK-LABEL: no_reserved_call_frame:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: .cfi_def_cfa w29, 16
; CHECK-NEXT: .cfi_offset w30, -8
; CHECK-NEXT: .cfi_offset w29, -16
; CHECK-NEXT: lsl x9, x0, #2
; CHECK-NEXT: mov x8, sp
; CHECK-NEXT: add x9, x9, #15
; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
; CHECK-NEXT: sub x0, x8, x9
; CHECK-NEXT: .LBB5_1: // %entry
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x0
; CHECK-NEXT: b.le .LBB5_3
; CHECK-NEXT: // %bb.2: // %entry
; CHECK-NEXT: // in Loop: Header=BB5_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB5_1
; CHECK-NEXT: .LBB5_3: // %entry
; CHECK-NEXT: mov sp, x0
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: sub sp, sp, #1104
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: bl callee_stack_args
; CHECK-NEXT: add sp, sp, #1104
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %v = alloca i32, i64 %n
  call void @callee_stack_args(ptr %v, [138 x i64] undef)
  ret void
}
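; In @no_reserved_call_frame above, the outgoing-argument area is 1104 bytes:
; the ptr argument travels in x0 while the whole [138 x i64] aggregate ends up
; on the stack (138 * 8 = 1104). Since that exceeds the 1024-byte unprobed
; limit mentioned above, the `str xzr, [sp]` after `sub sp, sp, #1104` probes
; the call frame before the call.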

; Same as above but without a variable-sized allocation, so the reserved call
; frame can be folded into the fixed-size allocation in the prologue.
define void @reserved_call_frame(i64 %n) #0 {
; CHECK-LABEL: reserved_call_frame:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: .cfi_def_cfa w29, 32
; CHECK-NEXT: .cfi_offset w28, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: sub sp, sp, #1504
; CHECK-NEXT: add x0, sp, #1104
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: bl callee_stack_args
; CHECK-NEXT: add sp, sp, #1504
; CHECK-NEXT: .cfi_def_cfa wsp, 32
; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w28
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
entry:
  %v = alloca i32, i64 100
  call void @callee_stack_args(ptr %v, [138 x i64] undef)
  ret void
}

declare void @callee_stack_args(ptr, [138 x i64])

; Dynamic allocation of SVE vectors
define void @dynamic_sve(i64 %size, ptr %out) #0 "target-features"="+sve" {
; CHECK-LABEL: dynamic_sve:
; CHECK: // %bb.0:
; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: .cfi_def_cfa w29, 32
; CHECK-NEXT: .cfi_offset w19, -16
; CHECK-NEXT: .cfi_offset w30, -24
; CHECK-NEXT: .cfi_offset w29, -32
; CHECK-NEXT: rdvl x9, #1
; CHECK-NEXT: mov x10, #15 // =0xf
; CHECK-DAG: mov x8, sp
; CHECK-DAG: madd x9, x0, x9, x10
; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0
; CHECK-NEXT: sub x8, x8, x9
; CHECK-NEXT: .LBB7_1: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096
; CHECK-NEXT: cmp sp, x8
; CHECK-NEXT: b.le .LBB7_3
; CHECK-NEXT: // %bb.2: // in Loop: Header=BB7_1 Depth=1
; CHECK-NEXT: str xzr, [sp]
; CHECK-NEXT: b .LBB7_1
; CHECK-NEXT: .LBB7_3:
; CHECK-NEXT: mov sp, x8
; CHECK-NEXT: ldr xzr, [sp]
; CHECK-NEXT: str x8, [x1]
; CHECK-NEXT: mov sp, x29
; CHECK-NEXT: .cfi_def_cfa wsp, 32
; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload
; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: .cfi_restore w19
; CHECK-NEXT: .cfi_restore w30
; CHECK-NEXT: .cfi_restore w29
; CHECK-NEXT: ret
  %v = alloca <vscale x 4 x float>, i64 %size, align 16
  store ptr %v, ptr %out, align 8
  ret void
}

attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }
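; A note on the last two tests: in @reserved_call_frame the single
; `sub sp, sp, #1504` covers both the 400-byte alloca (100 x i32) and the
; 1104-byte call frame, and `add x0, sp, #1104` places %v just above the
; outgoing arguments. In @dynamic_sve, `rdvl x9, #1` reads the SVE register
; size in bytes, so the allocation is %size scaled by the runtime vector
; length (plus 15, masked down to a 16-byte multiple) before the usual probe
; loop.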