1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs | FileCheck %s 3; RUN: llc -mtriple=aarch64 < %s -verify-machineinstrs -global-isel -global-isel-abort=2 | FileCheck %s 4 5; Test prolog sequences for stack probing when SVE objects are involved. 6 7; The space for SVE objects needs probing in the general case, because 8; the stack adjustment may happen to be too big (i.e. greater than the 9; probe size) to allocate with a single `addvl`. 10; When we do know that the stack adjustment cannot exceed the probe size 11; we can avoid emitting a probe loop and emit a simple `addvl; str` 12; sequence instead. 13 14define void @sve_1_vector(ptr %out) #0 { 15; CHECK-LABEL: sve_1_vector: 16; CHECK: // %bb.0: // %entry 17; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 18; CHECK-NEXT: .cfi_def_cfa_offset 16 19; CHECK-NEXT: .cfi_offset w29, -16 20; CHECK-NEXT: addvl sp, sp, #-1 21; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 22; CHECK-NEXT: addvl sp, sp, #1 23; CHECK-NEXT: .cfi_def_cfa wsp, 16 24; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 25; CHECK-NEXT: .cfi_def_cfa_offset 0 26; CHECK-NEXT: .cfi_restore w29 27; CHECK-NEXT: ret 28entry: 29 %vec = alloca <vscale x 4 x float>, align 16 30 ret void 31} 32 33; As above, but with 4 SVE vectors of stack space. 34define void @sve_4_vector(ptr %out) #0 { 35; CHECK-LABEL: sve_4_vector: 36; CHECK: // %bb.0: // %entry 37; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 38; CHECK-NEXT: .cfi_def_cfa_offset 16 39; CHECK-NEXT: .cfi_offset w29, -16 40; CHECK-NEXT: addvl sp, sp, #-4 41; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG 42; CHECK-NEXT: addvl sp, sp, #4 43; CHECK-NEXT: .cfi_def_cfa wsp, 16 44; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 45; CHECK-NEXT: .cfi_def_cfa_offset 0 46; CHECK-NEXT: .cfi_restore w29 47; CHECK-NEXT: ret 48entry: 49 %vec1 = alloca <vscale x 4 x float>, align 16 50 %vec2 = alloca <vscale x 4 x float>, align 16 51 %vec3 = alloca <vscale x 4 x float>, align 16 52 %vec4 = alloca <vscale x 4 x float>, align 16 53 ret void 54} 55 56; As above, but with 16 SVE vectors of stack space. 57; The stack adjustment is less than or equal to 16 x 256 = 4096, so 58; we can allocate the locals at once. 59define void @sve_16_vector(ptr %out) #0 { 60; CHECK-LABEL: sve_16_vector: 61; CHECK: // %bb.0: // %entry 62; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 63; CHECK-NEXT: .cfi_def_cfa_offset 16 64; CHECK-NEXT: .cfi_offset w29, -16 65; CHECK-NEXT: addvl sp, sp, #-16 66; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG 67; CHECK-NEXT: str xzr, [sp] 68; CHECK-NEXT: addvl sp, sp, #16 69; CHECK-NEXT: .cfi_def_cfa wsp, 16 70; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 71; CHECK-NEXT: .cfi_def_cfa_offset 0 72; CHECK-NEXT: .cfi_restore w29 73; CHECK-NEXT: ret 74entry: 75 %vec1 = alloca <vscale x 4 x float>, align 16 76 %vec2 = alloca <vscale x 4 x float>, align 16 77 %vec3 = alloca <vscale x 4 x float>, align 16 78 %vec4 = alloca <vscale x 4 x float>, align 16 79 %vec5 = alloca <vscale x 4 x float>, align 16 80 %vec6 = alloca <vscale x 4 x float>, align 16 81 %vec7 = alloca <vscale x 4 x float>, align 16 82 %vec8 = alloca <vscale x 4 x float>, align 16 83 %vec9 = alloca <vscale x 4 x float>, align 16 84 %vec10 = alloca <vscale x 4 x float>, align 16 85 %vec11 = alloca <vscale x 4 x float>, align 16 86 %vec12 = alloca <vscale x 4 x float>, align 16 87 %vec13 = alloca <vscale x 4 x float>, align 16 88 %vec14 = alloca <vscale x 4 x float>, align 16 89 %vec15 = alloca <vscale x 4 x float>, align 16 90 %vec16 = alloca <vscale x 4 x float>, align 16 91 ret void 92} 93 94; As above, but with 17 SVE vectors of stack space. Now we need 95; a probing loop since the stack adjustment may be greater than 96; the probe size (17 x 256 = 4352 bytes) 97; TODO: Allocating `k*16+r` SVE vectors can be unrolled into 98; emitting the `k + r` sequences of `addvl sp, sp, #-N; str xzr, [sp]` 99define void @sve_17_vector(ptr %out) #0 { 100; CHECK-LABEL: sve_17_vector: 101; CHECK: // %bb.0: // %entry 102; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 103; CHECK-NEXT: .cfi_def_cfa_offset 16 104; CHECK-NEXT: .cfi_offset w29, -16 105; CHECK-NEXT: addvl x9, sp, #-17 106; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG 107; CHECK-NEXT: .LBB3_1: // %entry 108; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 109; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 110; CHECK-NEXT: cmp sp, x9 111; CHECK-NEXT: b.le .LBB3_3 112; CHECK-NEXT: // %bb.2: // %entry 113; CHECK-NEXT: // in Loop: Header=BB3_1 Depth=1 114; CHECK-NEXT: str xzr, [sp] 115; CHECK-NEXT: b .LBB3_1 116; CHECK-NEXT: .LBB3_3: // %entry 117; CHECK-NEXT: mov sp, x9 118; CHECK-NEXT: ldr xzr, [sp] 119; CHECK-NEXT: .cfi_def_cfa_register wsp 120; CHECK-NEXT: addvl sp, sp, #17 121; CHECK-NEXT: .cfi_def_cfa wsp, 16 122; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 123; CHECK-NEXT: .cfi_def_cfa_offset 0 124; CHECK-NEXT: .cfi_restore w29 125; CHECK-NEXT: ret 126entry: 127 %vec1 = alloca <vscale x 4 x float>, align 16 128 %vec2 = alloca <vscale x 4 x float>, align 16 129 %vec3 = alloca <vscale x 4 x float>, align 16 130 %vec4 = alloca <vscale x 4 x float>, align 16 131 %vec5 = alloca <vscale x 4 x float>, align 16 132 %vec6 = alloca <vscale x 4 x float>, align 16 133 %vec7 = alloca <vscale x 4 x float>, align 16 134 %vec8 = alloca <vscale x 4 x float>, align 16 135 %vec9 = alloca <vscale x 4 x float>, align 16 136 %vec10 = alloca <vscale x 4 x float>, align 16 137 %vec11 = alloca <vscale x 4 x float>, align 16 138 %vec12 = alloca <vscale x 4 x float>, align 16 139 %vec13 = alloca <vscale x 4 x float>, align 16 140 %vec14 = alloca <vscale x 4 x float>, align 16 141 %vec15 = alloca <vscale x 4 x float>, align 16 142 %vec16 = alloca <vscale x 4 x float>, align 16 143 %vec17 = alloca <vscale x 4 x float>, align 16 144 ret void 145} 146 147; Space for callee-saved SVE register is allocated similarly to allocating 148; space for SVE locals. 
When we know the stack adjustment cannot exceed the 149; probe size we can skip the explicit probe, since saving SVE registers serves 150; as an implicit probe. 151define void @sve_1v_csr(<vscale x 4 x float> %a) #0 { 152; CHECK-LABEL: sve_1v_csr: 153; CHECK: // %bb.0: // %entry 154; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 155; CHECK-NEXT: .cfi_def_cfa_offset 16 156; CHECK-NEXT: .cfi_offset w29, -16 157; CHECK-NEXT: addvl sp, sp, #-1 158; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 159; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill 160; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG 161; CHECK-NEXT: //APP 162; CHECK-NEXT: //NO_APP 163; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload 164; CHECK-NEXT: addvl sp, sp, #1 165; CHECK-NEXT: .cfi_def_cfa wsp, 16 166; CHECK-NEXT: .cfi_restore z8 167; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 168; CHECK-NEXT: .cfi_def_cfa_offset 0 169; CHECK-NEXT: .cfi_restore w29 170; CHECK-NEXT: ret 171entry: 172 call void asm sideeffect "", "~{z8}" () 173 ret void 174} 175 176define void @sve_4v_csr(<vscale x 4 x float> %a) #0 { 177; CHECK-LABEL: sve_4v_csr: 178; CHECK: // %bb.0: // %entry 179; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 180; CHECK-NEXT: .cfi_def_cfa_offset 16 181; CHECK-NEXT: .cfi_offset w29, -16 182; CHECK-NEXT: addvl sp, sp, #-4 183; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG 184; CHECK-NEXT: str z11, [sp] // 16-byte Folded Spill 185; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill 186; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill 187; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill 188; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG 189; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG 190; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG 191; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG 192; CHECK-NEXT: //APP 193; CHECK-NEXT: //NO_APP 194; CHECK-NEXT: ldr z11, [sp] // 16-byte Folded Reload 195; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload 196; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload 197; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload 198; CHECK-NEXT: addvl sp, sp, #4 199; CHECK-NEXT: .cfi_def_cfa wsp, 16 200; CHECK-NEXT: .cfi_restore z8 201; CHECK-NEXT: .cfi_restore z9 202; CHECK-NEXT: .cfi_restore z10 203; CHECK-NEXT: .cfi_restore z11 204; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 205; CHECK-NEXT: .cfi_def_cfa_offset 0 206; CHECK-NEXT: .cfi_restore w29 207; CHECK-NEXT: ret 208entry: 209 call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11}" () 210 ret void 211} 212 213define void @sve_16v_csr(<vscale x 4 x float> %a) #0 { 214; CHECK-LABEL: sve_16v_csr: 215; CHECK: // %bb.0: // %entry 216; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 217; CHECK-NEXT: .cfi_def_cfa_offset 16 218; CHECK-NEXT: .cfi_offset w29, -16 219; CHECK-NEXT: addvl sp, sp, #-16 220; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 128 * VG 221; CHECK-NEXT: str xzr, [sp] 222; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill 223; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill 224; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill 225; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill 226; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill 227; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill 228; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill 229; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill 230; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill 231; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill 232; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill 233; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill 234; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill 235; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill 236; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill 237; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill 238; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG 239; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG 240; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG 241; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG 242; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 
0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG 243; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG 244; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG 245; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG 246; CHECK-NEXT: //APP 247; CHECK-NEXT: //NO_APP 248; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload 249; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload 250; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload 251; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload 252; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload 253; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload 254; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload 255; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload 256; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload 257; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload 258; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload 259; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload 260; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload 261; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload 262; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload 263; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload 264; CHECK-NEXT: addvl sp, sp, #16 265; CHECK-NEXT: .cfi_def_cfa wsp, 16 266; CHECK-NEXT: .cfi_restore z8 267; CHECK-NEXT: .cfi_restore z9 268; CHECK-NEXT: .cfi_restore z10 269; CHECK-NEXT: .cfi_restore z11 270; CHECK-NEXT: .cfi_restore z12 271; CHECK-NEXT: .cfi_restore z13 272; CHECK-NEXT: .cfi_restore z14 273; CHECK-NEXT: .cfi_restore z15 274; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 275; CHECK-NEXT: 
.cfi_def_cfa_offset 0 276; CHECK-NEXT: .cfi_restore w29 277; CHECK-NEXT: ret 278entry: 279 call void asm sideeffect "", "~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () 280 ret void 281} 282 283define void @sve_1p_csr(<vscale x 4 x float> %a) #0 { 284; CHECK-LABEL: sve_1p_csr: 285; CHECK: // %bb.0: // %entry 286; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 287; CHECK-NEXT: .cfi_def_cfa_offset 16 288; CHECK-NEXT: .cfi_offset w29, -16 289; CHECK-NEXT: addvl sp, sp, #-1 290; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 291; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 292; CHECK-NEXT: //APP 293; CHECK-NEXT: //NO_APP 294; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 295; CHECK-NEXT: addvl sp, sp, #1 296; CHECK-NEXT: .cfi_def_cfa wsp, 16 297; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 298; CHECK-NEXT: .cfi_def_cfa_offset 0 299; CHECK-NEXT: .cfi_restore w29 300; CHECK-NEXT: ret 301entry: 302 call void asm sideeffect "", "~{p8}" () 303 ret void 304} 305 306define void @sve_4p_csr(<vscale x 4 x float> %a) #0 { 307; CHECK-LABEL: sve_4p_csr: 308; CHECK: // %bb.0: // %entry 309; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 310; CHECK-NEXT: .cfi_def_cfa_offset 16 311; CHECK-NEXT: .cfi_offset w29, -16 312; CHECK-NEXT: addvl sp, sp, #-1 313; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG 314; CHECK-NEXT: str p11, [sp, #4, mul vl] // 2-byte Folded Spill 315; CHECK-NEXT: str p10, [sp, #5, mul vl] // 2-byte Folded Spill 316; CHECK-NEXT: str p9, [sp, #6, mul vl] // 2-byte Folded Spill 317; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 318; CHECK-NEXT: //APP 319; CHECK-NEXT: //NO_APP 320; CHECK-NEXT: ldr p11, [sp, #4, mul vl] // 2-byte Folded Reload 321; CHECK-NEXT: ldr p10, [sp, #5, mul vl] // 2-byte Folded Reload 322; CHECK-NEXT: ldr p9, [sp, #6, mul vl] // 2-byte Folded Reload 323; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 324; CHECK-NEXT: addvl sp, sp, #1 325; CHECK-NEXT: .cfi_def_cfa wsp, 16 326; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 327; CHECK-NEXT: .cfi_def_cfa_offset 0 328; CHECK-NEXT: .cfi_restore w29 329; CHECK-NEXT: ret 330entry: 331 call void asm sideeffect "", "~{p8},~{p9},~{p10},~{p11}" () 332 ret void 333} 334 335define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 { 336; CHECK-LABEL: sve_16v_1p_csr: 337; CHECK: // %bb.0: // %entry 338; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 339; CHECK-NEXT: .cfi_def_cfa_offset 16 340; CHECK-NEXT: .cfi_offset w29, -16 341; CHECK-NEXT: addvl x9, sp, #-17 342; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 136 * VG 343; CHECK-NEXT: .LBB9_1: // %entry 344; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 345; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 346; CHECK-NEXT: cmp sp, x9 347; CHECK-NEXT: b.le .LBB9_3 348; CHECK-NEXT: // %bb.2: // %entry 349; CHECK-NEXT: // in Loop: Header=BB9_1 Depth=1 350; CHECK-NEXT: str xzr, [sp] 351; CHECK-NEXT: b .LBB9_1 352; CHECK-NEXT: .LBB9_3: // %entry 353; CHECK-NEXT: mov sp, x9 354; CHECK-NEXT: ldr xzr, [sp] 355; CHECK-NEXT: .cfi_def_cfa_register wsp 356; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill 357; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill 358; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill 359; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill 360; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill 361; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill 362; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill 363; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill 364; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill 365; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill 366; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill 367; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill 368; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill 369; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill 370; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill 371; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill 372; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill 373; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 
0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG 374; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG 375; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG 376; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x60, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d11 @ cfa - 16 - 32 * VG 377; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x58, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d12 @ cfa - 16 - 40 * VG 378; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x50, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d13 @ cfa - 16 - 48 * VG 379; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x48, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d14 @ cfa - 16 - 56 * VG 380; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG 381; CHECK-NEXT: //APP 382; CHECK-NEXT: //NO_APP 383; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload 384; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload 385; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload 386; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload 387; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload 388; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload 389; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload 390; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload 391; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload 392; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload 393; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload 394; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload 395; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload 396; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload 397; 
CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload 398; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload 399; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload 400; CHECK-NEXT: addvl sp, sp, #17 401; CHECK-NEXT: .cfi_def_cfa wsp, 16 402; CHECK-NEXT: .cfi_restore z8 403; CHECK-NEXT: .cfi_restore z9 404; CHECK-NEXT: .cfi_restore z10 405; CHECK-NEXT: .cfi_restore z11 406; CHECK-NEXT: .cfi_restore z12 407; CHECK-NEXT: .cfi_restore z13 408; CHECK-NEXT: .cfi_restore z14 409; CHECK-NEXT: .cfi_restore z15 410; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 411; CHECK-NEXT: .cfi_def_cfa_offset 0 412; CHECK-NEXT: .cfi_restore w29 413; CHECK-NEXT: ret 414entry: 415 call void asm sideeffect "", "~{p8},~{z8},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23}" () 416 ret void 417} 418 419; A SVE vector and a 16-byte fixed size object. 420define void @sve_1_vector_16_arr(ptr %out) #0 { 421; CHECK-LABEL: sve_1_vector_16_arr: 422; CHECK: // %bb.0: // %entry 423; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 424; CHECK-NEXT: .cfi_def_cfa_offset 16 425; CHECK-NEXT: .cfi_offset w29, -16 426; CHECK-NEXT: sub sp, sp, #16 427; CHECK-NEXT: .cfi_def_cfa_offset 32 428; CHECK-NEXT: addvl sp, sp, #-1 429; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 32 + 8 * VG 430; CHECK-NEXT: addvl sp, sp, #1 431; CHECK-NEXT: .cfi_def_cfa wsp, 32 432; CHECK-NEXT: add sp, sp, #16 433; CHECK-NEXT: .cfi_def_cfa_offset 16 434; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 435; CHECK-NEXT: .cfi_def_cfa_offset 0 436; CHECK-NEXT: .cfi_restore w29 437; CHECK-NEXT: ret 438entry: 439 %vec = alloca <vscale x 4 x float>, align 16 440 %arr = alloca i8, i64 16, align 1 441 ret void 442} 443 444; A large SVE stack object and a large stack slot, both of which need probing. 
445; TODO: This could be optimised by combining the fixed-size offset into the 446; loop. 447define void @sve_1_vector_4096_arr(ptr %out) #0 { 448; CHECK-LABEL: sve_1_vector_4096_arr: 449; CHECK: // %bb.0: // %entry 450; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 451; CHECK-NEXT: .cfi_def_cfa_offset 16 452; CHECK-NEXT: .cfi_offset w29, -16 453; CHECK-NEXT: sub x9, sp, #3, lsl #12 // =12288 454; CHECK-NEXT: .cfi_def_cfa w9, 12304 455; CHECK-NEXT: addvl x9, x9, #-32 456; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 256 * VG 457; CHECK-NEXT: addvl x9, x9, #-32 458; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x79, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 12304 + 512 * VG 459; CHECK-NEXT: .LBB11_1: // %entry 460; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 461; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 462; CHECK-NEXT: cmp sp, x9 463; CHECK-NEXT: b.le .LBB11_3 464; CHECK-NEXT: // %bb.2: // %entry 465; CHECK-NEXT: // in Loop: Header=BB11_1 Depth=1 466; CHECK-NEXT: str xzr, [sp] 467; CHECK-NEXT: b .LBB11_1 468; CHECK-NEXT: .LBB11_3: // %entry 469; CHECK-NEXT: mov sp, x9 470; CHECK-NEXT: ldr xzr, [sp] 471; CHECK-NEXT: .cfi_def_cfa_register wsp 472; CHECK-NEXT: addvl sp, sp, #31 473; CHECK-NEXT: .cfi_escape 0x0f, 0x0f, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x88, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 264 * VG 474; CHECK-NEXT: addvl sp, sp, #31 475; CHECK-NEXT: .cfi_escape 0x0f, 0x0e, 0x8f, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 12304 + 16 * VG 476; CHECK-NEXT: addvl sp, sp, #2 477; CHECK-NEXT: .cfi_def_cfa wsp, 12304 478; CHECK-NEXT: add sp, sp, #3, lsl #12 // =12288 479; CHECK-NEXT: .cfi_def_cfa_offset 16 480; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 481; CHECK-NEXT: .cfi_def_cfa_offset 0 482; CHECK-NEXT: .cfi_restore w29 483; 
CHECK-NEXT: ret 484entry: 485 %vec = alloca <vscale x 256 x float>, align 16 486 %arr = alloca i8, i64 12288, align 1 487 ret void 488} 489 490; Not tested: SVE stack objects with alignment >16 bytes, which isn't currently 491; supported even without stack-probing. 492 493; An SVE vector, and a 16-byte fixed size object, which 494; has a large alignment requirement. 495define void @sve_1_vector_16_arr_align_8192(ptr %out) #0 { 496; CHECK-LABEL: sve_1_vector_16_arr_align_8192: 497; CHECK: // %bb.0: // %entry 498; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill 499; CHECK-NEXT: .cfi_def_cfa_offset 16 500; CHECK-NEXT: mov x29, sp 501; CHECK-NEXT: .cfi_def_cfa w29, 16 502; CHECK-NEXT: .cfi_offset w30, -8 503; CHECK-NEXT: .cfi_offset w29, -16 504; CHECK-NEXT: sub x9, sp, #1, lsl #12 // =4096 505; CHECK-NEXT: sub x9, x9, #4080 506; CHECK-NEXT: addvl x9, x9, #-1 507; CHECK-NEXT: and x9, x9, #0xffffffffffffe000 508; CHECK-NEXT: .LBB12_1: // %entry 509; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 510; CHECK-NEXT: sub sp, sp, #1, lsl #12 // =4096 511; CHECK-NEXT: cmp sp, x9 512; CHECK-NEXT: b.le .LBB12_3 513; CHECK-NEXT: // %bb.2: // %entry 514; CHECK-NEXT: // in Loop: Header=BB12_1 Depth=1 515; CHECK-NEXT: str xzr, [sp] 516; CHECK-NEXT: b .LBB12_1 517; CHECK-NEXT: .LBB12_3: // %entry 518; CHECK-NEXT: mov sp, x9 519; CHECK-NEXT: ldr xzr, [sp] 520; CHECK-NEXT: mov sp, x29 521; CHECK-NEXT: .cfi_def_cfa wsp, 16 522; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload 523; CHECK-NEXT: .cfi_def_cfa_offset 0 524; CHECK-NEXT: .cfi_restore w30 525; CHECK-NEXT: .cfi_restore w29 526; CHECK-NEXT: ret 527entry: 528 %vec = alloca <vscale x 4 x float>, align 16 529 %arr = alloca i8, i64 16, align 8192 530 ret void 531} 532 533; With 64k guard pages, we can allocate bigger SVE space without a probing loop. 
534define void @sve_1024_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { 535; CHECK-LABEL: sve_1024_64k_guard: 536; CHECK: // %bb.0: // %entry 537; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 538; CHECK-NEXT: .cfi_def_cfa_offset 16 539; CHECK-NEXT: .cfi_offset w29, -16 540; CHECK-NEXT: addvl sp, sp, #-32 541; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 256 * VG 542; CHECK-NEXT: addvl sp, sp, #-32 543; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 512 * VG 544; CHECK-NEXT: addvl sp, sp, #-32 545; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 768 * VG 546; CHECK-NEXT: addvl sp, sp, #-32 547; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1024 * VG 548; CHECK-NEXT: addvl sp, sp, #-32 549; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1280 * VG 550; CHECK-NEXT: addvl sp, sp, #-32 551; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1536 * VG 552; CHECK-NEXT: addvl sp, sp, #-32 553; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1792 * VG 554; CHECK-NEXT: addvl sp, sp, #-32 555; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 2048 * VG 556; CHECK-NEXT: str xzr, [sp] 557; CHECK-NEXT: addvl sp, sp, #31 558; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1800 * VG 559; CHECK-NEXT: addvl sp, sp, #31 560; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 
0x10, 0x22, 0x11, 0x90, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1552 * VG 561; CHECK-NEXT: addvl sp, sp, #31 562; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1304 * VG 563; CHECK-NEXT: addvl sp, sp, #31 564; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1056 * VG 565; CHECK-NEXT: addvl sp, sp, #31 566; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 808 * VG 567; CHECK-NEXT: addvl sp, sp, #31 568; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 560 * VG 569; CHECK-NEXT: addvl sp, sp, #31 570; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 312 * VG 571; CHECK-NEXT: addvl sp, sp, #31 572; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG 573; CHECK-NEXT: addvl sp, sp, #8 574; CHECK-NEXT: .cfi_def_cfa wsp, 16 575; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 576; CHECK-NEXT: .cfi_def_cfa_offset 0 577; CHECK-NEXT: .cfi_restore w29 578; CHECK-NEXT: ret 579entry: 580 %vec = alloca <vscale x 1024 x float>, align 16 581 ret void 582} 583 584define void @sve_1028_64k_guard(ptr %out) #0 "stack-probe-size"="65536" { 585; CHECK-LABEL: sve_1028_64k_guard: 586; CHECK: // %bb.0: // %entry 587; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 588; CHECK-NEXT: .cfi_def_cfa_offset 16 589; CHECK-NEXT: .cfi_offset w29, -16 590; CHECK-NEXT: addvl x9, sp, #-32 591; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 256 * VG 592; CHECK-NEXT: addvl x9, x9, #-32 593; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 512 * VG 594; CHECK-NEXT: addvl x9, x9, #-32 595; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 768 * VG 596; CHECK-NEXT: addvl x9, x9, #-32 597; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1024 * VG 598; CHECK-NEXT: addvl x9, x9, #-32 599; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1280 * VG 600; CHECK-NEXT: addvl x9, x9, #-32 601; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1536 * VG 602; CHECK-NEXT: addvl x9, x9, #-32 603; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 1792 * VG 604; CHECK-NEXT: addvl x9, x9, #-32 605; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x80, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2048 * VG 606; CHECK-NEXT: addvl x9, x9, #-1 607; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x79, 0x00, 0x11, 0x10, 0x22, 0x11, 0x88, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $x9 + 16 + 2056 * VG 608; CHECK-NEXT: .LBB14_1: // %entry 609; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 610; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 611; CHECK-NEXT: cmp sp, x9 612; CHECK-NEXT: b.le .LBB14_3 613; CHECK-NEXT: // %bb.2: // %entry 614; CHECK-NEXT: // in Loop: Header=BB14_1 Depth=1 615; 
CHECK-NEXT: str xzr, [sp] 616; CHECK-NEXT: b .LBB14_1 617; CHECK-NEXT: .LBB14_3: // %entry 618; CHECK-NEXT: mov sp, x9 619; CHECK-NEXT: ldr xzr, [sp] 620; CHECK-NEXT: .cfi_def_cfa_register wsp 621; CHECK-NEXT: addvl sp, sp, #31 622; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x0e, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1808 * VG 623; CHECK-NEXT: addvl sp, sp, #31 624; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x98, 0x0c, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1560 * VG 625; CHECK-NEXT: addvl sp, sp, #31 626; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa0, 0x0a, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1312 * VG 627; CHECK-NEXT: addvl sp, sp, #31 628; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xa8, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 1064 * VG 629; CHECK-NEXT: addvl sp, sp, #31 630; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb0, 0x06, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 816 * VG 631; CHECK-NEXT: addvl sp, sp, #31 632; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xb8, 0x04, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 568 * VG 633; CHECK-NEXT: addvl sp, sp, #31 634; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x02, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 320 * VG 635; CHECK-NEXT: addvl sp, sp, #31 636; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc8, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 72 * VG 637; CHECK-NEXT: addvl sp, sp, #9 638; CHECK-NEXT: .cfi_def_cfa wsp, 16 639; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 640; CHECK-NEXT: .cfi_def_cfa_offset 0 641; CHECK-NEXT: .cfi_restore w29 642; CHECK-NEXT: ret 643entry: 644 %vec = alloca <vscale x 1024 x float>, align 16 645 %vec1 = alloca <vscale x 4 x float>, align 16 646 ret void 647} 648 649; With 5 SVE vectors of stack space the 
unprobed area 650; at the top of the stack can exceed 1024 bytes (5 x 256 == 1280), 651; hence we need to issue a probe. 652define void @sve_5_vector(ptr %out) #0 { 653; CHECK-LABEL: sve_5_vector: 654; CHECK: // %bb.0: // %entry 655; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill 656; CHECK-NEXT: .cfi_def_cfa_offset 16 657; CHECK-NEXT: .cfi_offset w29, -16 658; CHECK-NEXT: addvl sp, sp, #-5 659; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 40 * VG 660; CHECK-NEXT: str xzr, [sp] 661; CHECK-NEXT: addvl sp, sp, #5 662; CHECK-NEXT: .cfi_def_cfa wsp, 16 663; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 664; CHECK-NEXT: .cfi_def_cfa_offset 0 665; CHECK-NEXT: .cfi_restore w29 666; CHECK-NEXT: ret 667entry: 668 %vec1 = alloca <vscale x 4 x float>, align 16 669 %vec2 = alloca <vscale x 4 x float>, align 16 670 %vec3 = alloca <vscale x 4 x float>, align 16 671 %vec4 = alloca <vscale x 4 x float>, align 16 672 %vec5 = alloca <vscale x 4 x float>, align 16 673 ret void 674} 675 676; Test with 14 scalable bytes (so up to 14 * 16 = 224) of unprobed 677; area below the save location of `p9`. 678define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 { 679; CHECK-LABEL: sve_unprobed_area: 680; CHECK: // %bb.0: // %entry 681; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill 682; CHECK-NEXT: .cfi_def_cfa_offset 16 683; CHECK-NEXT: .cfi_offset w29, -16 684; CHECK-NEXT: addvl sp, sp, #-4 685; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG 686; CHECK-NEXT: str xzr, [sp] 687; CHECK-NEXT: str p9, [sp, #7, mul vl] // 2-byte Folded Spill 688; CHECK-NEXT: str z10, [sp, #1, mul vl] // 16-byte Folded Spill 689; CHECK-NEXT: str z9, [sp, #2, mul vl] // 16-byte Folded Spill 690; CHECK-NEXT: str z8, [sp, #3, mul vl] // 16-byte Folded Spill 691; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG 692; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x70, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d9 @ cfa - 16 - 16 * VG 693; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x68, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d10 @ cfa - 16 - 24 * VG 694; CHECK-NEXT: addvl sp, sp, #-4 695; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0xc0, 0x00, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 64 * VG 696; CHECK-NEXT: //APP 697; CHECK-NEXT: //NO_APP 698; CHECK-NEXT: addvl sp, sp, #4 699; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG 700; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload 701; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload 702; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload 703; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload 704; CHECK-NEXT: addvl sp, sp, #4 705; CHECK-NEXT: .cfi_def_cfa wsp, 16 706; CHECK-NEXT: .cfi_restore z8 707; CHECK-NEXT: .cfi_restore z9 708; CHECK-NEXT: .cfi_restore z10 709; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload 710; CHECK-NEXT: .cfi_def_cfa_offset 0 711; CHECK-NEXT: .cfi_restore w29 712; CHECK-NEXT: ret 713entry: 714 call void asm sideeffect "", 
"~{z8},~{z9},~{z10},~{p9}" () 715 716 %v0 = alloca <vscale x 4 x float>, align 16 717 %v1 = alloca <vscale x 4 x float>, align 16 718 %v2 = alloca <vscale x 4 x float>, align 16 719 %v3 = alloca <vscale x 4 x float>, align 16 720 721 ret void 722} 723 724attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" "target-features"="+sve" } 725