; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s \
; RUN: | FileCheck %s --check-prefixes=CHECK,RV64

define fastcc <vscale x 4 x i8> @ret_nxv4i8(ptr %p) {
; CHECK-LABEL: ret_nxv4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 4 x i8>, ptr %p
  ret <vscale x 4 x i8> %v
}

define fastcc <vscale x 4 x i32> @ret_nxv4i32(ptr %p) {
; CHECK-LABEL: ret_nxv4i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vl2re32.v v8, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 4 x i32>, ptr %p
  ret <vscale x 4 x i32> %v
}

define fastcc <vscale x 8 x i32> @ret_nxv8i32(ptr %p) {
; CHECK-LABEL: ret_nxv8i32:
; CHECK: # %bb.0:
; CHECK-NEXT: vl4re32.v v8, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 8 x i32>, ptr %p
  ret <vscale x 8 x i32> %v
}

define fastcc <vscale x 16 x i64> @ret_nxv16i64(ptr %p) {
; CHECK-LABEL: ret_nxv16i64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, a0, a1
; CHECK-NEXT: vl8re64.v v16, (a1)
; CHECK-NEXT: vl8re64.v v8, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 16 x i64>, ptr %p
  ret <vscale x 16 x i64> %v
}

define fastcc <vscale x 8 x i1> @ret_mask_nxv8i1(ptr %p) {
; CHECK-LABEL: ret_mask_nxv8i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 8 x i1>, ptr %p
  ret <vscale x 8 x i1> %v
}

define fastcc <vscale x 32 x i1> @ret_mask_nxv32i1(ptr %p) {
; CHECK-LABEL: ret_mask_nxv32i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a1, zero, e8, m4, ta, ma
; CHECK-NEXT: vlm.v v0, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 32 x i1>, ptr %p
  ret <vscale x 32 x i1> %v
}

; Return the vector via registers v8-v23
define fastcc <vscale x 64 x i32> @ret_split_nxv64i32(ptr %x) {
; CHECK-LABEL: ret_split_nxv64i32:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: vl8re32.v v8, (a1)
; CHECK-NEXT: slli a3, a2, 3
; CHECK-NEXT: slli a4, a2, 5
; CHECK-NEXT: slli a2, a2, 4
; CHECK-NEXT: sub a4, a4, a3
; CHECK-NEXT: add a5, a1, a2
; CHECK-NEXT: vl8re32.v v16, (a5)
; CHECK-NEXT: add a5, a1, a3
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: add a3, a0, a3
; CHECK-NEXT: add a1, a1, a4
; CHECK-NEXT: vl8re32.v v24, (a5)
; CHECK-NEXT: vl8re32.v v0, (a1)
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: vs8r.v v16, (a2)
; CHECK-NEXT: vs8r.v v24, (a3)
; CHECK-NEXT: add a0, a0, a4
; CHECK-NEXT: vs8r.v v0, (a0)
; CHECK-NEXT: ret
  %v = load <vscale x 64 x i32>, ptr %x
  ret <vscale x 64 x i32> %v
}

; Return the vector fully via the stack
define fastcc <vscale x 128 x i32> @ret_split_nxv128i32(ptr %x) {
; CHECK-LABEL: ret_split_nxv128i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: li a3, 40
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: sub sp, sp, a2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x28, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 40 * vlenb
; CHECK-NEXT: csrr a2, vlenb
; CHECK-NEXT: li a3, 40
; CHECK-NEXT: vl8re32.v v8, (a1)
; CHECK-NEXT: csrr a4, vlenb
; CHECK-NEXT: slli a4, a4, 5
; CHECK-NEXT: add a4, sp, a4
; CHECK-NEXT: addi a4, a4, 16
; CHECK-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill
; CHECK-NEXT: slli a4, a2, 3
; CHECK-NEXT: slli a5, a2, 5
; CHECK-NEXT: slli a6, a2, 4
; CHECK-NEXT: slli a7, a2, 6
; CHECK-NEXT: mul a2, a2, a3
; CHECK-NEXT: sub a3, a5, a4
; CHECK-NEXT: sub t0, a7, a6
; CHECK-NEXT: sub a7, a7, a4
; CHECK-NEXT: add t1, a1, a4
; CHECK-NEXT: add t2, a1, a6
; CHECK-NEXT: add t3, a1, a5
; CHECK-NEXT: vl8re32.v v8, (t1)
; CHECK-NEXT: csrr t1, vlenb
; CHECK-NEXT: li t4, 24
; CHECK-NEXT: mul t1, t1, t4
; CHECK-NEXT: add t1, sp, t1
; CHECK-NEXT: addi t1, t1, 16
; CHECK-NEXT: vs8r.v v8, (t1) # Unknown-size Folded Spill
; CHECK-NEXT: add t1, a1, a2
; CHECK-NEXT: vl8re32.v v8, (t2)
; CHECK-NEXT: csrr t2, vlenb
; CHECK-NEXT: slli t2, t2, 3
; CHECK-NEXT: add t2, sp, t2
; CHECK-NEXT: addi t2, t2, 16
; CHECK-NEXT: vs8r.v v8, (t2) # Unknown-size Folded Spill
; CHECK-NEXT: add t2, a1, a3
; CHECK-NEXT: vl8re32.v v16, (t3)
; CHECK-NEXT: add t3, a1, t0
; CHECK-NEXT: add a1, a1, a7
; CHECK-NEXT: vl8re32.v v8, (t1)
; CHECK-NEXT: vl8re32.v v24, (t2)
; CHECK-NEXT: csrr t1, vlenb
; CHECK-NEXT: slli t1, t1, 4
; CHECK-NEXT: add t1, sp, t1
; CHECK-NEXT: addi t1, t1, 16
; CHECK-NEXT: vs8r.v v24, (t1) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re32.v v24, (t3)
; CHECK-NEXT: addi t1, sp, 16
; CHECK-NEXT: vs8r.v v24, (t1) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re32.v v24, (a1)
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 5
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vs8r.v v0, (a0)
; CHECK-NEXT: add a2, a0, a2
; CHECK-NEXT: vs8r.v v8, (a2)
; CHECK-NEXT: add a5, a0, a5
; CHECK-NEXT: vs8r.v v16, (a5)
; CHECK-NEXT: add a6, a0, a6
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vs8r.v v8, (a6)
; CHECK-NEXT: add a4, a0, a4
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a2, 24
; CHECK-NEXT: mul a1, a1, a2
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vs8r.v v8, (a4)
; CHECK-NEXT: add a7, a0, a7
; CHECK-NEXT: vs8r.v v24, (a7)
; CHECK-NEXT: add t0, a0, t0
; CHECK-NEXT: addi a1, sp, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vs8r.v v8, (t0)
; CHECK-NEXT: add a0, a0, a3
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; CHECK-NEXT: vs8r.v v8, (a0)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 40
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
  %v = load <vscale x 128 x i32>, ptr %x
  ret <vscale x 128 x i32> %v
}

define fastcc <vscale x 4 x i8> @ret_nxv4i8_param_nxv4i8_nxv4i8(<vscale x 4 x i8> %v, <vscale x 4 x i8> %w) {
; CHECK-LABEL: ret_nxv4i8_param_nxv4i8_nxv4i8:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e8, mf2, ta, ma
; CHECK-NEXT: vadd.vv v8, v8, v9
; CHECK-NEXT: ret
  %r = add <vscale x 4 x i8> %v, %w
  ret <vscale x 4 x i8> %r
}

define fastcc <vscale x 4 x i64> @ret_nxv4i64_param_nxv4i64_nxv4i64(<vscale x 4 x i64> %v, <vscale x 4 x i64> %w) {
; CHECK-LABEL: ret_nxv4i64_param_nxv4i64_nxv4i64:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e64, m4, ta, ma
; CHECK-NEXT: vadd.vv v8, v8, v12
; CHECK-NEXT: ret
  %r = add <vscale x 4 x i64> %v, %w
  ret <vscale x 4 x i64> %r
}

define fastcc <vscale x 8 x i1> @ret_nxv8i1_param_nxv8i1_nxv8i1(<vscale x 8 x i1> %v, <vscale x 8 x i1> %w) {
; CHECK-LABEL: ret_nxv8i1_param_nxv8i1_nxv8i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e8, m1, ta, ma
; CHECK-NEXT: vmxor.mm v0, v0, v8
; CHECK-NEXT: ret
  %r = xor <vscale x 8 x i1> %v, %w
  ret <vscale x 8 x i1> %r
}

define fastcc <vscale x 32 x i1> @ret_nxv32i1_param_nxv32i1_nxv32i1(<vscale x 32 x i1> %v, <vscale x 32 x i1> %w) {
; CHECK-LABEL: ret_nxv32i1_param_nxv32i1_nxv32i1:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetvli a0, zero, e8, m4, ta, ma
; CHECK-NEXT: vmand.mm v0, v0, v8
; CHECK-NEXT: ret
  %r = and <vscale x 32 x i1> %v, %w
  ret <vscale x 32 x i1> %r
}

define fastcc <vscale x 32 x i32> @ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z, i32 %w) {
; CHECK-LABEL: ret_nxv32i32_param_nxv32i32_nxv32i32_nxv32i32_i32:
; CHECK: # %bb.0:
; CHECK-NEXT: addi sp, sp, -16
; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: li a3, 24
; CHECK-NEXT: mul a1, a1, a3
; CHECK-NEXT: sub sp, sp, a1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 4
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a1, sp, a1
; CHECK-NEXT: addi a1, a1, 16
; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill
; CHECK-NEXT: csrr a1, vlenb
; CHECK-NEXT: vl8re32.v v8, (a2)
; CHECK-NEXT: addi a3, sp, 16
; CHECK-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill
; CHECK-NEXT: vl8re32.v v0, (a0)
; CHECK-NEXT: slli a1, a1, 3
; CHECK-NEXT: add a2, a2, a1
; CHECK-NEXT: add a0, a0, a1
; CHECK-NEXT: vl8re32.v v8, (a0)
; CHECK-NEXT: vl8re32.v v16, (a2)
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vadd.vv v0, v24, v0
; CHECK-NEXT: addi a0, sp, 16
; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vadd.vv v24, v0, v24
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 4
; CHECK-NEXT: add a0, sp, a0
; CHECK-NEXT: addi a0, a0, 16
; CHECK-NEXT: vl8r.v v0, (a0) # Unknown-size Folded Reload
; CHECK-NEXT: vadd.vv v8, v0, v8
; CHECK-NEXT: vadd.vv v8, v8, v16
; CHECK-NEXT: vadd.vx v16, v8, a4
; CHECK-NEXT: vadd.vx v8, v24, a4
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: li a1, 24
; CHECK-NEXT: mul a0, a0, a1
; CHECK-NEXT: add sp, sp, a0
; CHECK-NEXT: .cfi_def_cfa sp, 16
; CHECK-NEXT: addi sp, sp, 16
; CHECK-NEXT: .cfi_def_cfa_offset 0
; CHECK-NEXT: ret
  %r = add <vscale x 32 x i32> %x, %y
  %s = add <vscale x 32 x i32> %r, %z
  %head = insertelement <vscale x 32 x i32> poison, i32 %w, i32 0
  %splat = shufflevector <vscale x 32 x i32> %head, <vscale x 32 x i32> poison, <vscale x 32 x i32> zeroinitializer
  %t = add <vscale x 32 x i32> %s, %splat
  ret <vscale x 32 x i32> %t
}

declare <vscale x 32 x i32> @ext2(<vscale x 32 x i32>, <vscale x 32 x i32>, i32, i32)
declare <vscale x 32 x i32> @ext3(<vscale x 32 x i32>, <vscale x 32 x i32>, <vscale x 32 x i32>, i32, i32)

define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_i32(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, i32 %w) {
; RV32-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_i32:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -144
; RV32-NEXT: .cfi_def_cfa_offset 144
; RV32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: addi s0, sp, 144
; RV32-NEXT: .cfi_def_cfa s0, 0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 4
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: andi sp, sp, -128
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a3, a0, a1
; RV32-NEXT: vl8re32.v v24, (a3)
; RV32-NEXT: vl8re32.v v0, (a0)
; RV32-NEXT: addi a3, sp, 128
; RV32-NEXT: addi a0, sp, 128
; RV32-NEXT: vs8r.v v8, (a3)
; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: li a3, 2
; RV32-NEXT: vs8r.v v16, (a1)
; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV32-NEXT: vmv8r.v v8, v0
; RV32-NEXT: vmv8r.v v16, v24
; RV32-NEXT: call ext2
; RV32-NEXT: addi sp, s0, -144
; RV32-NEXT: .cfi_def_cfa sp, 144
; RV32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: addi sp, sp, 144
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_i32:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -144
; RV64-NEXT: .cfi_def_cfa_offset 144
; RV64-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 128(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: addi s0, sp, 144
; RV64-NEXT: .cfi_def_cfa s0, 0
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 4
; RV64-NEXT: sub sp, sp, a1
; RV64-NEXT: andi sp, sp, -128
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a3, a0, a1
; RV64-NEXT: vl8re32.v v24, (a3)
; RV64-NEXT: vl8re32.v v0, (a0)
; RV64-NEXT: addi a3, sp, 128
; RV64-NEXT: addi a0, sp, 128
; RV64-NEXT: vs8r.v v8, (a3)
; RV64-NEXT: add a1, a3, a1
; RV64-NEXT: li a3, 2
; RV64-NEXT: vs8r.v v16, (a1)
; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64-NEXT: vmv8r.v v8, v0
; RV64-NEXT: vmv8r.v v16, v24
; RV64-NEXT: call ext2
; RV64-NEXT: addi sp, s0, -144
; RV64-NEXT: .cfi_def_cfa sp, 144
; RV64-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 128(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: addi sp, sp, 144
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
  %t = call fastcc <vscale x 32 x i32> @ext2(<vscale x 32 x i32> %y, <vscale x 32 x i32> %x, i32 %w, i32 2)
  ret <vscale x 32 x i32> %t
}

define fastcc <vscale x 32 x i32> @ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_i32(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z, i32 %w) {
; RV32-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_i32:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -144
; RV32-NEXT: .cfi_def_cfa_offset 144
; RV32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: addi s0, sp, 144
; RV32-NEXT: .cfi_def_cfa s0, 0
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: li a3, 48
; RV32-NEXT: mul a1, a1, a3
; RV32-NEXT: sub sp, sp, a1
; RV32-NEXT: andi sp, sp, -128
; RV32-NEXT: addi a1, sp, 128
; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: vl8re32.v v16, (a2)
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 3
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 128
; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a2, a2, a1
; RV32-NEXT: add a3, a0, a1
; RV32-NEXT: vl8re32.v v0, (a2)
; RV32-NEXT: vl8re32.v v24, (a3)
; RV32-NEXT: vl8re32.v v16, (a0)
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 4
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 128
; RV32-NEXT: vs8r.v v8, (a0)
; RV32-NEXT: csrr a3, vlenb
; RV32-NEXT: slli a3, a3, 5
; RV32-NEXT: add a3, sp, a3
; RV32-NEXT: addi a3, a3, 128
; RV32-NEXT: vs8r.v v16, (a3)
; RV32-NEXT: add a0, a0, a1
; RV32-NEXT: addi a2, sp, 128
; RV32-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV32-NEXT: vs8r.v v8, (a0)
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: add a0, sp, a0
; RV32-NEXT: addi a0, a0, 128
; RV32-NEXT: csrr a2, vlenb
; RV32-NEXT: slli a2, a2, 4
; RV32-NEXT: add a2, sp, a2
; RV32-NEXT: addi a2, a2, 128
; RV32-NEXT: add a1, a3, a1
; RV32-NEXT: li a5, 42
; RV32-NEXT: vs8r.v v24, (a1)
; RV32-NEXT: csrr a1, vlenb
; RV32-NEXT: slli a1, a1, 3
; RV32-NEXT: add a1, sp, a1
; RV32-NEXT: addi a1, a1, 128
; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV32-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV32-NEXT: vmv8r.v v16, v0
; RV32-NEXT: call ext3
; RV32-NEXT: addi sp, s0, -144
; RV32-NEXT: .cfi_def_cfa sp, 144
; RV32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: addi sp, sp, 144
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: ret_nxv32i32_call_nxv32i32_nxv32i32_nxv32i32_i32:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -144
; RV64-NEXT: .cfi_def_cfa_offset 144
; RV64-NEXT: sd ra, 136(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 128(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: addi s0, sp, 144
; RV64-NEXT: .cfi_def_cfa s0, 0
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: li a3, 48
; RV64-NEXT: mul a1, a1, a3
; RV64-NEXT: sub sp, sp, a1
; RV64-NEXT: andi sp, sp, -128
; RV64-NEXT: addi a1, sp, 128
; RV64-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: vl8re32.v v16, (a2)
; RV64-NEXT: csrr a3, vlenb
; RV64-NEXT: slli a3, a3, 3
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 128
; RV64-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill
; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a2, a2, a1
; RV64-NEXT: add a3, a0, a1
; RV64-NEXT: vl8re32.v v0, (a2)
; RV64-NEXT: vl8re32.v v24, (a3)
; RV64-NEXT: vl8re32.v v16, (a0)
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 4
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 128
; RV64-NEXT: vs8r.v v8, (a0)
; RV64-NEXT: csrr a3, vlenb
; RV64-NEXT: slli a3, a3, 5
; RV64-NEXT: add a3, sp, a3
; RV64-NEXT: addi a3, a3, 128
; RV64-NEXT: vs8r.v v16, (a3)
; RV64-NEXT: add a0, a0, a1
; RV64-NEXT: addi a2, sp, 128
; RV64-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
; RV64-NEXT: vs8r.v v8, (a0)
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 5
; RV64-NEXT: add a0, sp, a0
; RV64-NEXT: addi a0, a0, 128
; RV64-NEXT: csrr a2, vlenb
; RV64-NEXT: slli a2, a2, 4
; RV64-NEXT: add a2, sp, a2
; RV64-NEXT: addi a2, a2, 128
; RV64-NEXT: add a1, a3, a1
; RV64-NEXT: li a5, 42
; RV64-NEXT: vs8r.v v24, (a1)
; RV64-NEXT: csrr a1, vlenb
; RV64-NEXT: slli a1, a1, 3
; RV64-NEXT: add a1, sp, a1
; RV64-NEXT: addi a1, a1, 128
; RV64-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload
; RV64-NEXT: vsetivli zero, 1, e8, m1, ta, ma
; RV64-NEXT: vmv8r.v v16, v0
; RV64-NEXT: call ext3
; RV64-NEXT: addi sp, s0, -144
; RV64-NEXT: .cfi_def_cfa sp, 144
; RV64-NEXT: ld ra, 136(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 128(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: addi sp, sp, 144
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
  %t = call fastcc <vscale x 32 x i32> @ext3(<vscale x 32 x i32> %z, <vscale x 32 x i32> %y, <vscale x 32 x i32> %x, i32 %w, i32 42)
  ret <vscale x 32 x i32> %t
}

; A test case where the normal calling convention would pass directly via the
; stack, but with fastcc can pass indirectly with the extra GPR registers
; allowed.
define fastcc <vscale x 32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z, i32 %8) {
; CHECK-LABEL: vector_arg_indirect_stack:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
; CHECK-NEXT: slli a0, a0, 3
; CHECK-NEXT: add a0, t5, a0
; CHECK-NEXT: vl8re32.v v24, (t5)
; CHECK-NEXT: vl8re32.v v0, (a0)
; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; CHECK-NEXT: vadd.vv v8, v8, v24
; CHECK-NEXT: vadd.vv v16, v16, v0
; CHECK-NEXT: ret
  %s = add <vscale x 32 x i32> %x, %z
  ret <vscale x 32 x i32> %s
}

; Calling the function above. Ensure we pass the arguments correctly.
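; The vector operands that do not fit in v8-v23 are materialized with vmv.v.i,
; written to aligned stack objects, and passed by address in the extra GPRs
; (t5 carries the address the callee above uses for %z), while the final i32
; argument (8) is stored to the outgoing argument area at 0(sp).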
define fastcc <vscale x 32 x i32> @pass_vector_arg_indirect_stack(<vscale x 32 x i32> %x, <vscale x 32 x i32> %y, <vscale x 32 x i32> %z) {
; RV32-LABEL: pass_vector_arg_indirect_stack:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -144
; RV32-NEXT: .cfi_def_cfa_offset 144
; RV32-NEXT: sw ra, 140(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 136(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 132(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: .cfi_offset s1, -12
; RV32-NEXT: addi s0, sp, 144
; RV32-NEXT: .cfi_def_cfa s0, 0
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 5
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -128
; RV32-NEXT: mv s1, sp
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
; RV32-NEXT: addi t0, s1, 128
; RV32-NEXT: csrr t1, vlenb
; RV32-NEXT: slli t1, t1, 4
; RV32-NEXT: add t1, s1, t1
; RV32-NEXT: addi t1, t1, 128
; RV32-NEXT: li a7, 8
; RV32-NEXT: li a1, 1
; RV32-NEXT: li a2, 2
; RV32-NEXT: li a3, 3
; RV32-NEXT: li a4, 4
; RV32-NEXT: li a5, 5
; RV32-NEXT: li a6, 6
; RV32-NEXT: vs8r.v v8, (t0)
; RV32-NEXT: vs8r.v v8, (t1)
; RV32-NEXT: sw a7, 0(sp)
; RV32-NEXT: li a7, 7
; RV32-NEXT: add t0, t0, a0
; RV32-NEXT: add a0, t1, a0
; RV32-NEXT: csrr t3, vlenb
; RV32-NEXT: slli t3, t3, 4
; RV32-NEXT: add t3, s1, t3
; RV32-NEXT: addi t3, t3, 128
; RV32-NEXT: vs8r.v v8, (t0)
; RV32-NEXT: addi t5, s1, 128
; RV32-NEXT: vs8r.v v8, (a0)
; RV32-NEXT: li a0, 0
; RV32-NEXT: vmv.v.i v16, 0
; RV32-NEXT: call vector_arg_indirect_stack
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: addi sp, s0, -144
; RV32-NEXT: .cfi_def_cfa sp, 144
; RV32-NEXT: lw ra, 140(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 136(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 132(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: addi sp, sp, 144
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: pass_vector_arg_indirect_stack:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -160
; RV64-NEXT: .cfi_def_cfa_offset 160
; RV64-NEXT: sd ra, 152(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 144(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s1, 136(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: .cfi_offset s1, -24
; RV64-NEXT: addi s0, sp, 160
; RV64-NEXT: .cfi_def_cfa s0, 0
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 5
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -128
; RV64-NEXT: mv s1, sp
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: vsetvli a1, zero, e32, m8, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
; RV64-NEXT: addi t0, s1, 128
; RV64-NEXT: csrr t1, vlenb
; RV64-NEXT: slli t1, t1, 4
; RV64-NEXT: add t1, s1, t1
; RV64-NEXT: addi t1, t1, 128
; RV64-NEXT: li a7, 8
; RV64-NEXT: li a1, 1
; RV64-NEXT: li a2, 2
; RV64-NEXT: li a3, 3
; RV64-NEXT: li a4, 4
; RV64-NEXT: li a5, 5
; RV64-NEXT: li a6, 6
; RV64-NEXT: vs8r.v v8, (t0)
; RV64-NEXT: vs8r.v v8, (t1)
; RV64-NEXT: sd a7, 0(sp)
; RV64-NEXT: li a7, 7
; RV64-NEXT: add t0, t0, a0
; RV64-NEXT: add a0, t1, a0
; RV64-NEXT: csrr t3, vlenb
; RV64-NEXT: slli t3, t3, 4
; RV64-NEXT: add t3, s1, t3
; RV64-NEXT: addi t3, t3, 128
; RV64-NEXT: vs8r.v v8, (t0)
; RV64-NEXT: addi t5, s1, 128
; RV64-NEXT: vs8r.v v8, (a0)
; RV64-NEXT: li a0, 0
; RV64-NEXT: vmv.v.i v16, 0
; RV64-NEXT: call vector_arg_indirect_stack
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: addi sp, s0, -160
; RV64-NEXT: .cfi_def_cfa sp, 160
; RV64-NEXT: ld ra, 152(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 144(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 136(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: addi sp, sp, 160
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
  %s = call fastcc <vscale x 32 x i32> @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, <vscale x 32 x i32> zeroinitializer, i32 8)
  ret <vscale x 32 x i32> %s
}

; Test case where we are out of registers for the vector and all GPRs are used.
define fastcc <vscale x 16 x i32> @vector_arg_indirect_stack_no_gpr(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, <vscale x 16 x i32> %x, <vscale x 16 x i32> %y, <vscale x 16 x i32> %z) {
; RV32-LABEL: vector_arg_indirect_stack_no_gpr:
; RV32: # %bb.0:
; RV32-NEXT: lw a0, 0(sp)
; RV32-NEXT: vl8re32.v v16, (a0)
; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; RV32-NEXT: vadd.vv v8, v8, v16
; RV32-NEXT: ret
;
; RV64-LABEL: vector_arg_indirect_stack_no_gpr:
; RV64: # %bb.0:
; RV64-NEXT: ld a0, 0(sp)
; RV64-NEXT: vl8re32.v v16, (a0)
; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; RV64-NEXT: vadd.vv v8, v8, v16
; RV64-NEXT: ret
  %s = add <vscale x 16 x i32> %x, %z
  ret <vscale x 16 x i32> %s
}

; Calling the function above. Ensure we pass the arguments correctly.
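; With a0-a7 and t3-t6 all occupied by the twelve i32 arguments, the address of
; %z's aligned stack object is itself passed on the stack at 0(sp) (see the
; lw/ld in the callee above).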
define fastcc <vscale x 16 x i32> @pass_vector_arg_indirect_stack_no_gpr(<vscale x 16 x i32> %x, <vscale x 16 x i32> %y, <vscale x 16 x i32> %z) {
; RV32-LABEL: pass_vector_arg_indirect_stack_no_gpr:
; RV32: # %bb.0:
; RV32-NEXT: addi sp, sp, -80
; RV32-NEXT: .cfi_def_cfa_offset 80
; RV32-NEXT: sw ra, 76(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s0, 72(sp) # 4-byte Folded Spill
; RV32-NEXT: sw s1, 68(sp) # 4-byte Folded Spill
; RV32-NEXT: .cfi_offset ra, -4
; RV32-NEXT: .cfi_offset s0, -8
; RV32-NEXT: .cfi_offset s1, -12
; RV32-NEXT: addi s0, sp, 80
; RV32-NEXT: .cfi_def_cfa s0, 0
; RV32-NEXT: csrr a0, vlenb
; RV32-NEXT: slli a0, a0, 3
; RV32-NEXT: sub sp, sp, a0
; RV32-NEXT: andi sp, sp, -64
; RV32-NEXT: mv s1, sp
; RV32-NEXT: addi sp, sp, -16
; RV32-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; RV32-NEXT: vmv.v.i v8, 0
; RV32-NEXT: addi a0, s1, 64
; RV32-NEXT: li a1, 1
; RV32-NEXT: li a2, 2
; RV32-NEXT: li a3, 3
; RV32-NEXT: li a4, 4
; RV32-NEXT: li a5, 5
; RV32-NEXT: li a6, 6
; RV32-NEXT: li a7, 7
; RV32-NEXT: li t3, 8
; RV32-NEXT: li t4, 9
; RV32-NEXT: li t5, 10
; RV32-NEXT: li t6, 11
; RV32-NEXT: vs8r.v v8, (a0)
; RV32-NEXT: sw a0, 0(sp)
; RV32-NEXT: li a0, 0
; RV32-NEXT: vmv.v.i v16, 0
; RV32-NEXT: call vector_arg_indirect_stack_no_gpr
; RV32-NEXT: addi sp, sp, 16
; RV32-NEXT: addi sp, s0, -80
; RV32-NEXT: .cfi_def_cfa sp, 80
; RV32-NEXT: lw ra, 76(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s0, 72(sp) # 4-byte Folded Reload
; RV32-NEXT: lw s1, 68(sp) # 4-byte Folded Reload
; RV32-NEXT: .cfi_restore ra
; RV32-NEXT: .cfi_restore s0
; RV32-NEXT: .cfi_restore s1
; RV32-NEXT: addi sp, sp, 80
; RV32-NEXT: .cfi_def_cfa_offset 0
; RV32-NEXT: ret
;
; RV64-LABEL: pass_vector_arg_indirect_stack_no_gpr:
; RV64: # %bb.0:
; RV64-NEXT: addi sp, sp, -96
; RV64-NEXT: .cfi_def_cfa_offset 96
; RV64-NEXT: sd ra, 88(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s0, 80(sp) # 8-byte Folded Spill
; RV64-NEXT: sd s1, 72(sp) # 8-byte Folded Spill
; RV64-NEXT: .cfi_offset ra, -8
; RV64-NEXT: .cfi_offset s0, -16
; RV64-NEXT: .cfi_offset s1, -24
; RV64-NEXT: addi s0, sp, 96
; RV64-NEXT: .cfi_def_cfa s0, 0
; RV64-NEXT: csrr a0, vlenb
; RV64-NEXT: slli a0, a0, 3
; RV64-NEXT: sub sp, sp, a0
; RV64-NEXT: andi sp, sp, -64
; RV64-NEXT: mv s1, sp
; RV64-NEXT: addi sp, sp, -16
; RV64-NEXT: vsetvli a0, zero, e32, m8, ta, ma
; RV64-NEXT: vmv.v.i v8, 0
; RV64-NEXT: addi a0, s1, 64
; RV64-NEXT: li a1, 1
; RV64-NEXT: li a2, 2
; RV64-NEXT: li a3, 3
; RV64-NEXT: li a4, 4
; RV64-NEXT: li a5, 5
; RV64-NEXT: li a6, 6
; RV64-NEXT: li a7, 7
; RV64-NEXT: li t3, 8
; RV64-NEXT: li t4, 9
; RV64-NEXT: li t5, 10
; RV64-NEXT: li t6, 11
; RV64-NEXT: vs8r.v v8, (a0)
; RV64-NEXT: sd a0, 0(sp)
; RV64-NEXT: li a0, 0
; RV64-NEXT: vmv.v.i v16, 0
; RV64-NEXT: call vector_arg_indirect_stack_no_gpr
; RV64-NEXT: addi sp, sp, 16
; RV64-NEXT: addi sp, s0, -96
; RV64-NEXT: .cfi_def_cfa sp, 96
; RV64-NEXT: ld ra, 88(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s0, 80(sp) # 8-byte Folded Reload
; RV64-NEXT: ld s1, 72(sp) # 8-byte Folded Reload
; RV64-NEXT: .cfi_restore ra
; RV64-NEXT: .cfi_restore s0
; RV64-NEXT: .cfi_restore s1
; RV64-NEXT: addi sp, sp, 96
; RV64-NEXT: .cfi_def_cfa_offset 0
; RV64-NEXT: ret
  %s = call fastcc <vscale x 16 x i32> @vector_arg_indirect_stack_no_gpr(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> zeroinitializer)
  ret <vscale x 16 x i32> %s
}