; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -O2 < %s \
; RUN: | FileCheck %s -check-prefix=RV64IV
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -O2 < %s \
; RUN: | FileCheck %s -check-prefix=RV32IV

; Tests adapted from AArch64.

; Test prolog sequences for stack probing when vector is involved.

; The space for vector objects needs probing in the general case, because
; the stack adjustment may happen to be too big (i.e. greater than the
; probe size).

define void @f_vector(ptr %out) #0 {
; RV64IV-LABEL: f_vector:
; RV64IV:       # %bb.0: # %entry
; RV64IV-NEXT:    csrr t1, vlenb
; RV64IV-NEXT:    slli t1, t1, 1
; RV64IV-NEXT:    .cfi_def_cfa t1, -16
; RV64IV-NEXT:    lui t2, 1
; RV64IV-NEXT:  .LBB0_1: # %entry
; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT:    sub sp, sp, t2
; RV64IV-NEXT:    sd zero, 0(sp)
; RV64IV-NEXT:    sub t1, t1, t2
; RV64IV-NEXT:    bge t1, t2, .LBB0_1
; RV64IV-NEXT:  # %bb.2: # %entry
; RV64IV-NEXT:    .cfi_def_cfa_register sp
; RV64IV-NEXT:    sub sp, sp, t1
; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
; RV64IV-NEXT:    csrr a0, vlenb
; RV64IV-NEXT:    slli a0, a0, 1
; RV64IV-NEXT:    add sp, sp, a0
; RV64IV-NEXT:    .cfi_def_cfa sp, 0
; RV64IV-NEXT:    ret
;
; RV32IV-LABEL: f_vector:
; RV32IV:       # %bb.0: # %entry
; RV32IV-NEXT:    csrr t1, vlenb
; RV32IV-NEXT:    slli t1, t1, 1
; RV32IV-NEXT:    .cfi_def_cfa t1, -16
; RV32IV-NEXT:    lui t2, 1
; RV32IV-NEXT:  .LBB0_1: # %entry
; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT:    sub sp, sp, t2
; RV32IV-NEXT:    sw zero, 0(sp)
; RV32IV-NEXT:    sub t1, t1, t2
; RV32IV-NEXT:    bge t1, t2, .LBB0_1
; RV32IV-NEXT:  # %bb.2: # %entry
; RV32IV-NEXT:    .cfi_def_cfa_register sp
; RV32IV-NEXT:    sub sp, sp, t1
; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 2 * vlenb
; RV32IV-NEXT:    csrr a0, vlenb
; RV32IV-NEXT:    slli a0, a0, 1
; RV32IV-NEXT:    add sp, sp, a0
; RV32IV-NEXT:    .cfi_def_cfa sp, 0
; RV32IV-NEXT:    ret
entry:
  %vec = alloca <vscale x 4 x float>, align 16
  ret void
}

; As above, but with 4 vectors of stack space.
define void @f4_vector(ptr %out) #0 {
; RV64IV-LABEL: f4_vector:
; RV64IV:       # %bb.0: # %entry
; RV64IV-NEXT:    csrr t1, vlenb
; RV64IV-NEXT:    slli t1, t1, 3
; RV64IV-NEXT:    .cfi_def_cfa t1, -64
; RV64IV-NEXT:    lui t2, 1
; RV64IV-NEXT:  .LBB1_1: # %entry
; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT:    sub sp, sp, t2
; RV64IV-NEXT:    sd zero, 0(sp)
; RV64IV-NEXT:    sub t1, t1, t2
; RV64IV-NEXT:    bge t1, t2, .LBB1_1
; RV64IV-NEXT:  # %bb.2: # %entry
; RV64IV-NEXT:    .cfi_def_cfa_register sp
; RV64IV-NEXT:    sub sp, sp, t1
; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
; RV64IV-NEXT:    csrr a0, vlenb
; RV64IV-NEXT:    slli a0, a0, 3
; RV64IV-NEXT:    add sp, sp, a0
; RV64IV-NEXT:    .cfi_def_cfa sp, 0
; RV64IV-NEXT:    ret
;
; RV32IV-LABEL: f4_vector:
; RV32IV:       # %bb.0: # %entry
; RV32IV-NEXT:    csrr t1, vlenb
; RV32IV-NEXT:    slli t1, t1, 3
; RV32IV-NEXT:    .cfi_def_cfa t1, -64
; RV32IV-NEXT:    lui t2, 1
; RV32IV-NEXT:  .LBB1_1: # %entry
; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT:    sub sp, sp, t2
; RV32IV-NEXT:    sw zero, 0(sp)
; RV32IV-NEXT:    sub t1, t1, t2
; RV32IV-NEXT:    bge t1, t2, .LBB1_1
; RV32IV-NEXT:  # %bb.2: # %entry
; RV32IV-NEXT:    .cfi_def_cfa_register sp
; RV32IV-NEXT:    sub sp, sp, t1
; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 8 * vlenb
; RV32IV-NEXT:    csrr a0, vlenb
; RV32IV-NEXT:    slli a0, a0, 3
; RV32IV-NEXT:    add sp, sp, a0
; RV32IV-NEXT:    .cfi_def_cfa sp, 0
; RV32IV-NEXT:    ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  ret void
}

; As above, but with 16 vectors of stack space.
; The stack adjustment is less than or equal to 16 x 256 = 4096, so
; we can allocate the locals at once.
define void @f16_vector(ptr %out) #0 {
; RV64IV-LABEL: f16_vector:
; RV64IV:       # %bb.0: # %entry
; RV64IV-NEXT:    csrr t1, vlenb
; RV64IV-NEXT:    slli t1, t1, 5
; RV64IV-NEXT:    .cfi_def_cfa t1, -256
; RV64IV-NEXT:    lui t2, 1
; RV64IV-NEXT:  .LBB2_1: # %entry
; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT:    sub sp, sp, t2
; RV64IV-NEXT:    sd zero, 0(sp)
; RV64IV-NEXT:    sub t1, t1, t2
; RV64IV-NEXT:    bge t1, t2, .LBB2_1
; RV64IV-NEXT:  # %bb.2: # %entry
; RV64IV-NEXT:    .cfi_def_cfa_register sp
; RV64IV-NEXT:    sub sp, sp, t1
; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
; RV64IV-NEXT:    csrr a0, vlenb
; RV64IV-NEXT:    slli a0, a0, 5
; RV64IV-NEXT:    add sp, sp, a0
; RV64IV-NEXT:    .cfi_def_cfa sp, 0
; RV64IV-NEXT:    ret
;
; RV32IV-LABEL: f16_vector:
; RV32IV:       # %bb.0: # %entry
; RV32IV-NEXT:    csrr t1, vlenb
; RV32IV-NEXT:    slli t1, t1, 5
; RV32IV-NEXT:    .cfi_def_cfa t1, -256
; RV32IV-NEXT:    lui t2, 1
; RV32IV-NEXT:  .LBB2_1: # %entry
; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT:    sub sp, sp, t2
; RV32IV-NEXT:    sw zero, 0(sp)
; RV32IV-NEXT:    sub t1, t1, t2
; RV32IV-NEXT:    bge t1, t2, .LBB2_1
; RV32IV-NEXT:  # %bb.2: # %entry
; RV32IV-NEXT:    .cfi_def_cfa_register sp
; RV32IV-NEXT:    sub sp, sp, t1
; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 32 * vlenb
; RV32IV-NEXT:    csrr a0, vlenb
; RV32IV-NEXT:    slli a0, a0, 5
; RV32IV-NEXT:    add sp, sp, a0
; RV32IV-NEXT:    .cfi_def_cfa sp, 0
; RV32IV-NEXT:    ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  %vec5 = alloca <vscale x 4 x float>, align 16
  %vec6 = alloca <vscale x 4 x float>, align 16
  %vec7 = alloca <vscale x 4 x float>, align 16
  %vec8 = alloca <vscale x 4 x float>, align 16
  %vec9 = alloca <vscale x 4 x float>, align 16
  %vec10 = alloca <vscale x 4 x float>, align 16
  %vec11 = alloca <vscale x 4 x float>, align 16
  %vec12 = alloca <vscale x 4 x float>, align 16
  %vec13 = alloca <vscale x 4 x float>, align 16
  %vec14 = alloca <vscale x 4 x float>, align 16
  %vec15 = alloca <vscale x 4 x float>, align 16
  %vec16 = alloca <vscale x 4 x float>, align 16
  ret void
}

; As above, but with 17 vectors of stack space.
define void @f17_vector(ptr %out) #0 {
; RV64IV-LABEL: f17_vector:
; RV64IV:       # %bb.0: # %entry
; RV64IV-NEXT:    csrr t1, vlenb
; RV64IV-NEXT:    li a0, 34
; RV64IV-NEXT:    mul t1, t1, a0
; RV64IV-NEXT:    .cfi_def_cfa t1, -272
; RV64IV-NEXT:    lui t2, 1
; RV64IV-NEXT:  .LBB3_1: # %entry
; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT:    sub sp, sp, t2
; RV64IV-NEXT:    sd zero, 0(sp)
; RV64IV-NEXT:    sub t1, t1, t2
; RV64IV-NEXT:    bge t1, t2, .LBB3_1
; RV64IV-NEXT:  # %bb.2: # %entry
; RV64IV-NEXT:    .cfi_def_cfa_register sp
; RV64IV-NEXT:    sub sp, sp, t1
; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
; RV64IV-NEXT:    csrr a0, vlenb
; RV64IV-NEXT:    li a1, 34
; RV64IV-NEXT:    mul a0, a0, a1
; RV64IV-NEXT:    add sp, sp, a0
; RV64IV-NEXT:    .cfi_def_cfa sp, 0
; RV64IV-NEXT:    ret
;
; RV32IV-LABEL: f17_vector:
; RV32IV:       # %bb.0: # %entry
; RV32IV-NEXT:    csrr t1, vlenb
; RV32IV-NEXT:    li a0, 34
; RV32IV-NEXT:    mul t1, t1, a0
; RV32IV-NEXT:    .cfi_def_cfa t1, -272
; RV32IV-NEXT:    lui t2, 1
; RV32IV-NEXT:  .LBB3_1: # %entry
; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT:    sub sp, sp, t2
; RV32IV-NEXT:    sw zero, 0(sp)
; RV32IV-NEXT:    sub t1, t1, t2
; RV32IV-NEXT:    bge t1, t2, .LBB3_1
; RV32IV-NEXT:  # %bb.2: # %entry
; RV32IV-NEXT:    .cfi_def_cfa_register sp
; RV32IV-NEXT:    sub sp, sp, t1
; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0a, 0x72, 0x00, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 34 * vlenb
; RV32IV-NEXT:    csrr a0, vlenb
; RV32IV-NEXT:    li a1, 34
; RV32IV-NEXT:    mul a0, a0, a1
; RV32IV-NEXT:    add sp, sp, a0
; RV32IV-NEXT:    .cfi_def_cfa sp, 0
; RV32IV-NEXT:    ret
entry:
  %vec1 = alloca <vscale x 4 x float>, align 16
  %vec2 = alloca <vscale x 4 x float>, align 16
  %vec3 = alloca <vscale x 4 x float>, align 16
  %vec4 = alloca <vscale x 4 x float>, align 16
  %vec5 = alloca <vscale x 4 x float>, align 16
  %vec6 = alloca <vscale x 4 x float>, align 16
  %vec7 = alloca <vscale x 4 x float>, align 16
  %vec8 = alloca <vscale x 4 x float>, align 16
  %vec9 = alloca <vscale x 4 x float>, align 16
  %vec10 = alloca <vscale x 4 x float>, align 16
  %vec11 = alloca <vscale x 4 x float>, align 16
  %vec12 = alloca <vscale x 4 x float>, align 16
  %vec13 = alloca <vscale x 4 x float>, align 16
  %vec14 = alloca <vscale x 4 x float>, align 16
  %vec15 = alloca <vscale x 4 x float>, align 16
  %vec16 = alloca <vscale x 4 x float>, align 16
  %vec17 = alloca <vscale x 4 x float>, align 16
  ret void
}

; A vector and a 16-byte fixed size object.
define void @f1_vector_16_arr(ptr %out) #0 {
; RV64IV-LABEL: f1_vector_16_arr:
; RV64IV:       # %bb.0: # %entry
; RV64IV-NEXT:    addi sp, sp, -16
; RV64IV-NEXT:    .cfi_def_cfa_offset 16
; RV64IV-NEXT:    csrr t1, vlenb
; RV64IV-NEXT:    slli t1, t1, 1
; RV64IV-NEXT:    .cfi_def_cfa t1, -16
; RV64IV-NEXT:    lui t2, 1
; RV64IV-NEXT:  .LBB4_1: # %entry
; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT:    sub sp, sp, t2
; RV64IV-NEXT:    sd zero, 0(sp)
; RV64IV-NEXT:    sub t1, t1, t2
; RV64IV-NEXT:    bge t1, t2, .LBB4_1
; RV64IV-NEXT:  # %bb.2: # %entry
; RV64IV-NEXT:    .cfi_def_cfa_register sp
; RV64IV-NEXT:    sub sp, sp, t1
; RV64IV-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; RV64IV-NEXT:    csrr a0, vlenb
; RV64IV-NEXT:    slli a0, a0, 1
; RV64IV-NEXT:    add sp, sp, a0
; RV64IV-NEXT:    .cfi_def_cfa sp, 16
; RV64IV-NEXT:    addi sp, sp, 16
; RV64IV-NEXT:    .cfi_def_cfa_offset 0
; RV64IV-NEXT:    ret
;
; RV32IV-LABEL: f1_vector_16_arr:
; RV32IV:       # %bb.0: # %entry
; RV32IV-NEXT:    addi sp, sp, -16
; RV32IV-NEXT:    .cfi_def_cfa_offset 16
; RV32IV-NEXT:    csrr t1, vlenb
; RV32IV-NEXT:    slli t1, t1, 1
; RV32IV-NEXT:    .cfi_def_cfa t1, -16
; RV32IV-NEXT:    lui t2, 1
; RV32IV-NEXT:  .LBB4_1: # %entry
; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT:    sub sp, sp, t2
; RV32IV-NEXT:    sw zero, 0(sp)
; RV32IV-NEXT:    sub t1, t1, t2
; RV32IV-NEXT:    bge t1, t2, .LBB4_1
; RV32IV-NEXT:  # %bb.2: # %entry
; RV32IV-NEXT:    .cfi_def_cfa_register sp
; RV32IV-NEXT:    sub sp, sp, t1
; RV32IV-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 2 * vlenb
; RV32IV-NEXT:    csrr a0, vlenb
; RV32IV-NEXT:    slli a0, a0, 1
; RV32IV-NEXT:    add sp, sp, a0
; RV32IV-NEXT:    .cfi_def_cfa sp, 16
; RV32IV-NEXT:    addi sp, sp, 16
; RV32IV-NEXT:    .cfi_def_cfa_offset 0
; RV32IV-NEXT:    ret
entry:
  %vec = alloca <vscale x 4 x float>, align 16
  %arr = alloca i8, i64 16, align 1
  ret void
}

; A large vector object and a large slot, both of which need probing.
define void @f1_vector_4096_arr(ptr %out) #0 {
; RV64IV-LABEL: f1_vector_4096_arr:
; RV64IV:       # %bb.0: # %entry
; RV64IV-NEXT:    lui a0, 1
; RV64IV-NEXT:    sub sp, sp, a0
; RV64IV-NEXT:    sd zero, 0(sp)
; RV64IV-NEXT:    .cfi_def_cfa_offset 4096
; RV64IV-NEXT:    lui a0, 1
; RV64IV-NEXT:    sub sp, sp, a0
; RV64IV-NEXT:    sd zero, 0(sp)
; RV64IV-NEXT:    .cfi_def_cfa_offset 8192
; RV64IV-NEXT:    lui a0, 1
; RV64IV-NEXT:    sub sp, sp, a0
; RV64IV-NEXT:    sd zero, 0(sp)
; RV64IV-NEXT:    .cfi_def_cfa_offset 12288
; RV64IV-NEXT:    addi sp, sp, -16
; RV64IV-NEXT:    .cfi_def_cfa_offset 12304
; RV64IV-NEXT:    csrr t1, vlenb
; RV64IV-NEXT:    slli t1, t1, 7
; RV64IV-NEXT:    .cfi_def_cfa t1, -1024
; RV64IV-NEXT:    lui t2, 1
; RV64IV-NEXT:  .LBB5_1: # %entry
; RV64IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV64IV-NEXT:    sub sp, sp, t2
; RV64IV-NEXT:    sd zero, 0(sp)
; RV64IV-NEXT:    sub t1, t1, t2
; RV64IV-NEXT:    bge t1, t2, .LBB5_1
; RV64IV-NEXT:  # %bb.2: # %entry
; RV64IV-NEXT:    .cfi_def_cfa_register sp
; RV64IV-NEXT:    sub sp, sp, t1
; RV64IV-NEXT:    .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
; RV64IV-NEXT:    csrr a0, vlenb
; RV64IV-NEXT:    slli a0, a0, 7
; RV64IV-NEXT:    add sp, sp, a0
; RV64IV-NEXT:    .cfi_def_cfa sp, 12304
; RV64IV-NEXT:    lui a0, 3
; RV64IV-NEXT:    addiw a0, a0, 16
; RV64IV-NEXT:    add sp, sp, a0
; RV64IV-NEXT:    .cfi_def_cfa_offset 0
; RV64IV-NEXT:    ret
;
; RV32IV-LABEL: f1_vector_4096_arr:
; RV32IV:       # %bb.0: # %entry
; RV32IV-NEXT:    lui a0, 1
; RV32IV-NEXT:    sub sp, sp, a0
; RV32IV-NEXT:    sw zero, 0(sp)
; RV32IV-NEXT:    .cfi_def_cfa_offset 4096
; RV32IV-NEXT:    lui a0, 1
; RV32IV-NEXT:    sub sp, sp, a0
; RV32IV-NEXT:    sw zero, 0(sp)
; RV32IV-NEXT:    .cfi_def_cfa_offset 8192
; RV32IV-NEXT:    lui a0, 1
; RV32IV-NEXT:    sub sp, sp, a0
; RV32IV-NEXT:    sw zero, 0(sp)
; RV32IV-NEXT:    .cfi_def_cfa_offset 12288
; RV32IV-NEXT:    addi sp, sp, -16
; RV32IV-NEXT:    .cfi_def_cfa_offset 12304
; RV32IV-NEXT:    csrr t1, vlenb
; RV32IV-NEXT:    slli t1, t1, 7
; RV32IV-NEXT:    .cfi_def_cfa t1, -1024
; RV32IV-NEXT:    lui t2, 1
; RV32IV-NEXT:  .LBB5_1: # %entry
; RV32IV-NEXT:    # =>This Inner Loop Header: Depth=1
; RV32IV-NEXT:    sub sp, sp, t2
; RV32IV-NEXT:    sw zero, 0(sp)
; RV32IV-NEXT:    sub t1, t1, t2
; RV32IV-NEXT:    bge t1, t2, .LBB5_1
; RV32IV-NEXT:  # %bb.2: # %entry
; RV32IV-NEXT:    .cfi_def_cfa_register sp
; RV32IV-NEXT:    sub sp, sp, t1
; RV32IV-NEXT:    .cfi_escape 0x0f, 0x10, 0x72, 0x00, 0x11, 0x90, 0xe0, 0x00, 0x22, 0x11, 0x80, 0x01, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 12304 + 128 * vlenb
; RV32IV-NEXT:    csrr a0, vlenb
; RV32IV-NEXT:    slli a0, a0, 7
; RV32IV-NEXT:    add sp, sp, a0
; RV32IV-NEXT:    .cfi_def_cfa sp, 12304
; RV32IV-NEXT:    lui a0, 3
; RV32IV-NEXT:    addi a0, a0, 16
; RV32IV-NEXT:    add sp, sp, a0
; RV32IV-NEXT:    .cfi_def_cfa_offset 0
; RV32IV-NEXT:    ret
entry:
  %vec = alloca <vscale x 256 x float>, align 16
  %arr = alloca i8, i64 12288, align 1
  ret void
}

attributes #0 = { uwtable(async) "probe-stack"="inline-asm" "frame-pointer"="none" }