; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s

define fastcc <4 x i8> @ret_v4i8(ptr %p) {
; CHECK-LABEL: ret_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i8>, ptr %p
  ret <4 x i8> %v
}

define fastcc <4 x i32> @ret_v4i32(ptr %p) {
; CHECK-LABEL: ret_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <4 x i32>, ptr %p
  ret <4 x i32> %v
}

define fastcc <8 x i32> @ret_v8i32(ptr %p) {
; CHECK-LABEL: ret_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i32>, ptr %p
  ret <8 x i32> %v
}

define fastcc <16 x i64> @ret_v16i64(ptr %p) {
; CHECK-LABEL: ret_v16i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; CHECK-NEXT:    vle64.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <16 x i64>, ptr %p
  ret <16 x i64> %v
}

define fastcc <8 x i1> @ret_mask_v8i1(ptr %p) {
; CHECK-LABEL: ret_mask_v8i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vlm.v v0, (a0)
; CHECK-NEXT:    ret
  %v = load <8 x i1>, ptr %p
  ret <8 x i1> %v
}

define fastcc <32 x i1> @ret_mask_v32i1(ptr %p) {
; CHECK-LABEL: ret_mask_v32i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT:    vlm.v v0, (a0)
; CHECK-NEXT:    ret
  %v = load <32 x i1>, ptr %p
  ret <32 x i1> %v
}

; Return the vector via registers v8-v23
define fastcc <64 x i32> @ret_split_v64i32(ptr %x) {
; CHECK-LABEL: ret_split_v64i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle32.v v16, (a0)
; CHECK-NEXT:    ret
  %v = load <64 x i32>, ptr %x
  ret <64 x i32> %v
}

; Return the vector fully via the stack
define fastcc <128 x i32> @ret_split_v128i32(ptr %x) {
; CHECK-LABEL: ret_split_v128i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a2, a1, 128
; CHECK-NEXT:    li a3, 32
; CHECK-NEXT:    vsetvli zero, a3, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a2)
; CHECK-NEXT:    addi a2, a1, 256
; CHECK-NEXT:    vle32.v v16, (a2)
; CHECK-NEXT:    addi a2, a1, 384
; CHECK-NEXT:    vle32.v v24, (a1)
; CHECK-NEXT:    addi a1, a0, 384
; CHECK-NEXT:    vle32.v v0, (a2)
; CHECK-NEXT:    addi a2, a0, 256
; CHECK-NEXT:    vse32.v v24, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vse32.v v0, (a1)
; CHECK-NEXT:    vse32.v v16, (a2)
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
  %v = load <128 x i32>, ptr %x
  ret <128 x i32> %v
}

define fastcc <4 x i8> @ret_v4i8_param_v4i8(<4 x i8> %v) {
; CHECK-LABEL: ret_v4i8_param_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vadd.vi v8, v8, 2
; CHECK-NEXT:    ret
  %r = add <4 x i8> %v, <i8 2, i8 2, i8 2, i8 2>
  ret <4 x i8> %r
}

define fastcc <4 x i8> @ret_v4i8_param_v4i8_v4i8(<4 x i8> %v, <4 x i8> %w) {
; CHECK-LABEL: ret_v4i8_param_v4i8_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    ret
  %r = add <4 x i8> %v, %w
  ret <4 x i8> %r
}

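; The two LMUL=2 arguments below must each occupy an even-aligned pair of
; vector registers, so %v arrives in v8-v9 and %w in v10-v11, as the vadd.vv
; operands in the CHECK lines show.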
define fastcc <4 x i64> @ret_v4i64_param_v4i64_v4i64(<4 x i64> %v, <4 x i64> %w) {
; CHECK-LABEL: ret_v4i64_param_v4i64_v4i64:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vadd.vv v8, v8, v10
; CHECK-NEXT:    ret
  %r = add <4 x i64> %v, %w
  ret <4 x i64> %r
}

define fastcc <8 x i1> @ret_v8i1_param_v8i1_v8i1(<8 x i1> %v, <8 x i1> %w) {
; CHECK-LABEL: ret_v8i1_param_v8i1_v8i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vmxor.mm v0, v0, v8
; CHECK-NEXT:    ret
  %r = xor <8 x i1> %v, %w
  ret <8 x i1> %r
}

define fastcc <32 x i1> @ret_v32i1_param_v32i1_v32i1(<32 x i1> %v, <32 x i1> %w) {
; CHECK-LABEL: ret_v32i1_param_v32i1_v32i1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
; CHECK-NEXT:    vmand.mm v0, v0, v8
; CHECK-NEXT:    ret
  %r = and <32 x i1> %v, %w
  ret <32 x i1> %r
}

define fastcc <32 x i32> @ret_v32i32_param_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) {
; CHECK-LABEL: ret_v32i32_param_v32i32_v32i32_v32i32_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a2, 32
; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v24, (a0)
; CHECK-NEXT:    vadd.vv v8, v8, v16
; CHECK-NEXT:    vadd.vv v8, v8, v24
; CHECK-NEXT:    vadd.vx v8, v8, a1
; CHECK-NEXT:    ret
  %r = add <32 x i32> %x, %y
  %s = add <32 x i32> %r, %z
  %head = insertelement <32 x i32> poison, i32 %w, i32 0
  %splat = shufflevector <32 x i32> %head, <32 x i32> poison, <32 x i32> zeroinitializer
  %t = add <32 x i32> %s, %splat
  ret <32 x i32> %t
}

declare <32 x i32> @ext2(<32 x i32>, <32 x i32>, i32, i32)
declare <32 x i32> @ext3(<32 x i32>, <32 x i32>, <32 x i32>, i32, i32)

define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, i32 %w) {
; CHECK-LABEL: ret_v32i32_call_v32i32_v32i32_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -16
; CHECK-NEXT:    .cfi_def_cfa_offset 16
; CHECK-NEXT:    sd ra, 8(sp) # 8-byte Folded Spill
; CHECK-NEXT:    .cfi_offset ra, -8
; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT:    vmv8r.v v24, v8
; CHECK-NEXT:    li a1, 2
; CHECK-NEXT:    vmv8r.v v8, v16
; CHECK-NEXT:    vmv8r.v v16, v24
; CHECK-NEXT:    call ext2
; CHECK-NEXT:    ld ra, 8(sp) # 8-byte Folded Reload
; CHECK-NEXT:    .cfi_restore ra
; CHECK-NEXT:    addi sp, sp, 16
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    ret
  %t = call fastcc <32 x i32> @ext2(<32 x i32> %y, <32 x i32> %x, i32 %w, i32 2)
  ret <32 x i32> %t
}

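; With three <32 x i32> arguments, only two fit in the v8-v23 argument
; registers: in the callee below %z arrives indirectly via a0, and the nested
; call to ext3 likewise spills %x to a 128-byte-aligned stack slot and passes
; its address in a0.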
define fastcc <32 x i32> @ret_v32i32_call_v32i32_v32i32_v32i32_i32(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %w) {
; CHECK-LABEL: ret_v32i32_call_v32i32_v32i32_v32i32_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -256
; CHECK-NEXT:    .cfi_def_cfa_offset 256
; CHECK-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
; CHECK-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
; CHECK-NEXT:    .cfi_offset ra, -8
; CHECK-NEXT:    .cfi_offset s0, -16
; CHECK-NEXT:    addi s0, sp, 256
; CHECK-NEXT:    .cfi_def_cfa s0, 0
; CHECK-NEXT:    andi sp, sp, -128
; CHECK-NEXT:    li a2, 32
; CHECK-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v24, (a0)
; CHECK-NEXT:    mv a3, sp
; CHECK-NEXT:    mv a0, sp
; CHECK-NEXT:    li a2, 42
; CHECK-NEXT:    vse32.v v8, (a3)
; CHECK-NEXT:    vmv.v.v v8, v24
; CHECK-NEXT:    call ext3
; CHECK-NEXT:    addi sp, s0, -256
; CHECK-NEXT:    .cfi_def_cfa sp, 256
; CHECK-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
; CHECK-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
; CHECK-NEXT:    .cfi_restore ra
; CHECK-NEXT:    .cfi_restore s0
; CHECK-NEXT:    addi sp, sp, 256
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    ret
  %t = call fastcc <32 x i32> @ext3(<32 x i32> %z, <32 x i32> %y, <32 x i32> %x, i32 %w, i32 42)
  ret <32 x i32> %t
}

; A test case where the normal calling convention would pass directly via the
; stack, but fastcc can pass indirectly using the extra GPR argument registers
; it is allowed (the pointer to %z arrives in t3 below).
define fastcc <32 x i32> @vector_arg_indirect_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %8) {
; CHECK-LABEL: vector_arg_indirect_stack:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v16, (t3)
; CHECK-NEXT:    vadd.vv v8, v8, v16
; CHECK-NEXT:    ret
  %s = add <32 x i32> %x, %z
  ret <32 x i32> %s
}

; Calling the function above. Ensure we pass the arguments correctly.
define fastcc <32 x i32> @pass_vector_arg_indirect_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
; CHECK-LABEL: pass_vector_arg_indirect_stack:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -256
; CHECK-NEXT:    .cfi_def_cfa_offset 256
; CHECK-NEXT:    sd ra, 248(sp) # 8-byte Folded Spill
; CHECK-NEXT:    sd s0, 240(sp) # 8-byte Folded Spill
; CHECK-NEXT:    .cfi_offset ra, -8
; CHECK-NEXT:    .cfi_offset s0, -16
; CHECK-NEXT:    addi s0, sp, 256
; CHECK-NEXT:    .cfi_def_cfa s0, 0
; CHECK-NEXT:    andi sp, sp, -128
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    mv t0, sp
; CHECK-NEXT:    li a1, 1
; CHECK-NEXT:    li a2, 2
; CHECK-NEXT:    li a3, 3
; CHECK-NEXT:    li a4, 4
; CHECK-NEXT:    li a5, 5
; CHECK-NEXT:    li a6, 6
; CHECK-NEXT:    li a7, 7
; CHECK-NEXT:    mv t3, sp
; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT:    vmv.v.i v8, 0
; CHECK-NEXT:    li t4, 8
; CHECK-NEXT:    vse32.v v8, (t0)
; CHECK-NEXT:    li a0, 0
; CHECK-NEXT:    vmv.v.i v16, 0
; CHECK-NEXT:    call vector_arg_indirect_stack
; CHECK-NEXT:    addi sp, s0, -256
; CHECK-NEXT:    .cfi_def_cfa sp, 256
; CHECK-NEXT:    ld ra, 248(sp) # 8-byte Folded Reload
; CHECK-NEXT:    ld s0, 240(sp) # 8-byte Folded Reload
; CHECK-NEXT:    .cfi_restore ra
; CHECK-NEXT:    .cfi_restore s0
; CHECK-NEXT:    addi sp, sp, 256
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    ret
  %s = call fastcc <32 x i32> @vector_arg_indirect_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 8)
  ret <32 x i32> %s
}

; A pathological test case where even with fastcc we must use the stack for
; arguments %12, %13, and %z.
define fastcc <32 x i32> @vector_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, i32 %last) {
; CHECK-LABEL: vector_arg_direct_stack:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    addi a1, sp, 16
; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v24, (a1)
; CHECK-NEXT:    vadd.vv v8, v8, v16
; CHECK-NEXT:    vadd.vv v8, v8, v24
; CHECK-NEXT:    ret
  %s = add <32 x i32> %x, %y
  %t = add <32 x i32> %s, %z
  ret <32 x i32> %t
}

; Calling the function above. Ensure we pass the arguments correctly.
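; %12 and %13 are stored to 0(sp) and 8(sp), the 128 bytes of %z directly to
; 16(sp), and %last to 144(sp).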
define fastcc <32 x i32> @pass_vector_arg_direct_stack(<32 x i32> %x, <32 x i32> %y, <32 x i32> %z) {
; CHECK-LABEL: pass_vector_arg_direct_stack:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi sp, sp, -176
; CHECK-NEXT:    .cfi_def_cfa_offset 176
; CHECK-NEXT:    sd ra, 168(sp) # 8-byte Folded Spill
; CHECK-NEXT:    sd s0, 160(sp) # 8-byte Folded Spill
; CHECK-NEXT:    .cfi_offset ra, -8
; CHECK-NEXT:    .cfi_offset s0, -16
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    addi t0, sp, 16
; CHECK-NEXT:    li t1, 1
; CHECK-NEXT:    li t2, 13
; CHECK-NEXT:    li s0, 12
; CHECK-NEXT:    li a1, 1
; CHECK-NEXT:    li a2, 2
; CHECK-NEXT:    li a3, 3
; CHECK-NEXT:    li a4, 4
; CHECK-NEXT:    li a5, 5
; CHECK-NEXT:    li a6, 6
; CHECK-NEXT:    li a7, 7
; CHECK-NEXT:    li t3, 8
; CHECK-NEXT:    vsetvli zero, a0, e32, m8, ta, ma
; CHECK-NEXT:    vmv.v.i v8, 0
; CHECK-NEXT:    vse32.v v8, (t0)
; CHECK-NEXT:    li t4, 9
; CHECK-NEXT:    li t5, 10
; CHECK-NEXT:    sd t1, 144(sp)
; CHECK-NEXT:    li t6, 11
; CHECK-NEXT:    sd s0, 0(sp)
; CHECK-NEXT:    sd t2, 8(sp)
; CHECK-NEXT:    li a0, 0
; CHECK-NEXT:    vmv.v.i v16, 0
; CHECK-NEXT:    call vector_arg_direct_stack
; CHECK-NEXT:    ld ra, 168(sp) # 8-byte Folded Reload
; CHECK-NEXT:    ld s0, 160(sp) # 8-byte Folded Reload
; CHECK-NEXT:    .cfi_restore ra
; CHECK-NEXT:    .cfi_restore s0
; CHECK-NEXT:    addi sp, sp, 176
; CHECK-NEXT:    .cfi_def_cfa_offset 0
; CHECK-NEXT:    ret
  %s = call fastcc <32 x i32> @vector_arg_direct_stack(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, <32 x i32> zeroinitializer, i32 1)
  ret <32 x i32> %s
}

; A pathological test case where even with fastcc we must use the stack for
; mask argument %m2. %m1 is passed via v0; %m2 is loaded from 144(sp).
define fastcc <4 x i1> @vector_mask_arg_direct_stack(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12, i32 %13, <32 x i32> %x, <32 x i32> %y, <32 x i32> %z, <4 x i1> %m1, <4 x i1> %m2, i32 %last) {
; CHECK-LABEL: vector_mask_arg_direct_stack:
; CHECK:       # %bb.0:
; CHECK-NEXT:    addi a0, sp, 144
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vlm.v v8, (a0)
; CHECK-NEXT:    vmxor.mm v0, v0, v8
; CHECK-NEXT:    ret
  %r = xor <4 x i1> %m1, %m2
  ret <4 x i1> %r
}