// RUN: %clang_cc1 -O3 -triple aarch64 -target-feature +sve -target-feature +sve2p1 -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-AAPCS
// RUN: %clang_cc1 -O3 -triple arm64-apple-ios7.0 -target-abi darwinpcs -target-feature +sve -target-feature +sve2p1 -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DARWIN
// RUN: %clang_cc1 -O3 -triple aarch64-linux-gnu -target-feature +sve -target-feature +sve2p1 -mvscale-min=1 -mvscale-max=1 -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-AAPCS

// REQUIRES: aarch64-registered-target

#include <arm_neon.h>
#include <arm_sve.h>
#include <stdarg.h>

typedef svfloat32_t fvec32 __attribute__((arm_sve_vector_bits(128)));
typedef svfloat64_t fvec64 __attribute__((arm_sve_vector_bits(128)));
typedef svbool_t bvec __attribute__((arm_sve_vector_bits(128)));
typedef svmfloat8_t mfvec8 __attribute__((arm_sve_vector_bits(128)));

typedef struct {
    float f[4];
} HFA;

typedef struct {
    mfloat8x16_t f[4];
} HVA;

// Pure Scalable Type, needs 4 Z-regs, 2 P-regs
typedef struct {
    bvec a;
    fvec64 x;
    fvec32 y[2];
    mfvec8 z;
    bvec b;
} PST;

// Pure Scalable Type, 1 Z-reg
typedef struct {
    fvec32 x;
} SmallPST;

// Big PST, does not fit in registers.
typedef struct {
    struct {
        bvec a;
        fvec32 x[4];
    } u[2];
    fvec64 v;
} BigPST;

// A small aggregate type
typedef struct {
    char data[16];
} SmallAgg;

// CHECK: %struct.PST = type { <2 x i8>, <2 x double>, [2 x <4 x float>], <16 x i8>, <2 x i8> }

// Test argument passing of Pure Scalable Types by examining the generated
// LLVM IR function declarations. A PST argument in C/C++ should map to:
//   a) a `ptr` argument, if passed indirectly through memory
//   b) a series of scalable vector arguments, if passed via registers
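
// With -mvscale-min=1 -mvscale-max=1 each 128-bit data vector lowers to a
// 16-byte fixed vector and each predicate to 2 bytes (16 lanes, one bit per
// byte of data vector), which gives the in-memory %struct.PST layout checked
// above. A minimal illustration (not exercised by any RUN line) of the
// implicit conversions the ACLE allows between a fixed-length SVE type and
// its sizeless counterpart when the vector lengths match; being an unused
// static inline function, it emits no IR and leaves the checks unaffected:
static inline svfloat32_t fixed_to_sizeless(fvec32 v) {
    return v; // fvec32 -> svfloat32_t, both 128 bits wide here
}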

// Simple argument passing, PST expanded into registers.
//   a    -> p0
//   b    -> p1
//   x    -> q0
//   y[0] -> q1
//   y[1] -> q2
//   z    -> q3
void test_argpass_simple(PST *p) {
    void argpass_simple_callee(PST);
    argpass_simple_callee(*p);
}
// CHECK-AAPCS: define dso_local void @test_argpass_simple(ptr noundef readonly captures(none) %p)
// CHECK-AAPCS-NEXT: entry:
// CHECK-AAPCS-NEXT: %0 = load <2 x i8>, ptr %p, align 16
// CHECK-AAPCS-NEXT: %cast.scalable = tail call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> poison, <2 x i8> %0, i64 0)
// CHECK-AAPCS-NEXT: %1 = bitcast <vscale x 2 x i8> %cast.scalable to <vscale x 16 x i1>
// CHECK-AAPCS-NEXT: %2 = getelementptr inbounds nuw i8, ptr %p, i64 16
// CHECK-AAPCS-NEXT: %3 = load <2 x double>, ptr %2, align 16
// CHECK-AAPCS-NEXT: %cast.scalable1 = tail call <vscale x 2 x double> @llvm.vector.insert.nxv2f64.v2f64(<vscale x 2 x double> poison, <2 x double> %3, i64 0)
// CHECK-AAPCS-NEXT: %4 = getelementptr inbounds nuw i8, ptr %p, i64 32
// CHECK-AAPCS-NEXT: %5 = load <4 x float>, ptr %4, align 16
// CHECK-AAPCS-NEXT: %cast.scalable2 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %5, i64 0)
// CHECK-AAPCS-NEXT: %6 = getelementptr inbounds nuw i8, ptr %p, i64 48
// CHECK-AAPCS-NEXT: %7 = load <4 x float>, ptr %6, align 16
// CHECK-AAPCS-NEXT: %cast.scalable3 = tail call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %7, i64 0)
// CHECK-AAPCS-NEXT: %8 = getelementptr inbounds nuw i8, ptr %p, i64 64
// CHECK-AAPCS-NEXT: %9 = load <16 x i8>, ptr %8, align 16
// CHECK-AAPCS-NEXT: %cast.scalable4 = tail call <vscale x 16 x i8> @llvm.vector.insert.nxv16i8.v16i8(<vscale x 16 x i8> poison, <16 x i8> %9, i64 0)
// CHECK-AAPCS-NEXT: %10 = getelementptr inbounds nuw i8, ptr %p, i64 80
// CHECK-AAPCS-NEXT: %11 = load <2 x i8>, ptr %10, align 16
// CHECK-AAPCS-NEXT: %cast.scalable5 = tail call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> poison, <2 x i8> %11, i64 0)
// CHECK-AAPCS-NEXT: %12 = bitcast <vscale x 2 x i8> %cast.scalable5 to <vscale x 16 x i1>
// CHECK-AAPCS-NEXT: tail call void @argpass_simple_callee(<vscale x 16 x i1> %1, <vscale x 2 x double> %cast.scalable1, <vscale x 4 x float> %cast.scalable2, <vscale x 4 x float> %cast.scalable3, <vscale x 16 x i8> %cast.scalable4, <vscale x 16 x i1> %12)
// CHECK-AAPCS-NEXT: ret void

// CHECK-AAPCS: declare void @argpass_simple_callee(<vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>)
// CHECK-DARWIN: declare void @argpass_simple_callee(ptr noundef)
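
// The expansion above fits because the AAPCS64 provides eight vector
// argument registers (z0-z7, whose low 128 bits are q0-q7) and four
// predicate argument registers (p0-p3), while this PST needs only 4 Z-regs
// and 2 P-regs. The tests below probe the boundaries of that budget.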

// Boundary case of using the last available Z-reg, PST expanded.
//   0.0  -> d0-d3
//   a    -> p0
//   b    -> p1
//   x    -> q4
//   y[0] -> q5
//   y[1] -> q6
//   z    -> q7
void test_argpass_last_z(PST *p) {
    void argpass_last_z_callee(double, double, double, double, PST);
    argpass_last_z_callee(.0, .0, .0, .0, *p);
}
// CHECK-AAPCS: declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, <vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>)
// CHECK-DARWIN: declare void @argpass_last_z_callee(double noundef, double noundef, double noundef, double noundef, ptr noundef)


// Like the above, but using a tuple type to occupy some registers.
//   x    -> z0.d-z3.d
//   a    -> p0
//   b    -> p1
//   x    -> q4
//   y[0] -> q5
//   y[1] -> q6
//   z    -> q7
void test_argpass_last_z_tuple(PST *p, svfloat64x4_t x) {
    void argpass_last_z_tuple_callee(svfloat64x4_t, PST);
    argpass_last_z_tuple_callee(x, *p);
}
// CHECK-AAPCS: declare void @argpass_last_z_tuple_callee(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>)
// CHECK-DARWIN: declare void @argpass_last_z_tuple_callee(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, ptr noundef)


// Boundary case of using the last available P-reg, PST expanded.
//   false -> p0-p1
//   a     -> p2
//   b     -> p3
//   x     -> q0
//   y[0]  -> q1
//   y[1]  -> q2
//   z     -> q3
void test_argpass_last_p(PST *p) {
    void argpass_last_p_callee(svbool_t, svcount_t, PST);
    argpass_last_p_callee(svpfalse(), svpfalse_c(), *p);
}
// CHECK-AAPCS: declare void @argpass_last_p_callee(<vscale x 16 x i1>, target("aarch64.svcount"), <vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>)
// CHECK-DARWIN: declare void @argpass_last_p_callee(<vscale x 16 x i1>, target("aarch64.svcount"), ptr noundef)


// Not enough Z-regs: push the PST to memory and pass a pointer; Z-regs and
// P-regs remain available for later arguments.
//   u    -> z0
//   v    -> q1
//   w    -> q2
//   0.0  -> d3-d4
//   1    -> w0
//   *p   -> memory, address -> x1
//   2    -> w2
//   3.0  -> d5
//   true -> p0
void test_argpass_no_z(PST *p, double dummy, svmfloat8_t u, int8x16_t v, mfloat8x16_t w) {
    void argpass_no_z_callee(svmfloat8_t, int8x16_t, mfloat8x16_t, double, double, int, PST, int, double, svbool_t);
    argpass_no_z_callee(u, v, w, .0, .0, 1, *p, 2, 3.0, svptrue_b64());
}
// CHECK: declare void @argpass_no_z_callee(<vscale x 16 x i8>, <16 x i8> noundef, <16 x i8>, double noundef, double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)
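
// In test_argpass_no_z above, five of the eight vector registers (z0, q1,
// q2, d3, d4) are taken before the PST is considered, leaving three for a
// type that needs four, so the PST goes to memory even though P-regs remain
// free for the trailing predicate.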

// Like the above, using a tuple to occupy some registers.
//   x    -> z0.d-z3.d
//   0.0  -> d4
//   1    -> w0
//   *p   -> memory, address -> x1
//   2    -> w2
//   3.0  -> d5
//   true -> p0
void test_argpass_no_z_tuple_f64(PST *p, float dummy, svfloat64x4_t x) {
    void argpass_no_z_tuple_f64_callee(svfloat64x4_t, double, int, PST, int,
                                       double, svbool_t);
    argpass_no_z_tuple_f64_callee(x, .0, 1, *p, 2, 3.0, svptrue_b64());
}
// CHECK: declare void @argpass_no_z_tuple_f64_callee(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)


// Likewise, using a different tuple.
//   x    -> z0.b-z3.b
//   0.0  -> d4
//   1    -> w0
//   *p   -> memory, address -> x1
//   2    -> w2
//   3.0  -> d5
//   true -> p0
void test_argpass_no_z_tuple_mfp8(PST *p, float dummy, svmfloat8x4_t x) {
    void argpass_no_z_tuple_mfp8_callee(svmfloat8x4_t, double, int, PST, int,
                                        double, svbool_t);
    argpass_no_z_tuple_mfp8_callee(x, .0, 1, *p, 2, 3.0, svptrue_b64());
}
// CHECK: declare void @argpass_no_z_tuple_mfp8_callee(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, double noundef, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)


// Not enough Z-regs (consumed by an HFA), PST passed indirectly.
//   0.0  -> d0
//   *h   -> s1-s4
//   1    -> w0
//   *p   -> memory, address -> x1
//   2    -> w2
//   true -> p0
void test_argpass_no_z_hfa(HFA *h, PST *p) {
    void argpass_no_z_hfa_callee(double, HFA, int, PST, int, svbool_t);
    argpass_no_z_hfa_callee(.0, *h, 1, *p, 2, svptrue_b64());
}
// CHECK-AAPCS: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float] alignstack(8), i32 noundef, ptr noundef, i32 noundef, <vscale x 16 x i1>)
// CHECK-DARWIN: declare void @argpass_no_z_hfa_callee(double noundef, [4 x float], i32 noundef, ptr noundef, i32 noundef, <vscale x 16 x i1>)

// Not enough Z-regs (consumed by an HVA), PST passed indirectly.
//   0.0  -> d0
//   *h   -> q1-q4
//   1    -> w0
//   *p   -> memory, address -> x1
//   2    -> w2
//   true -> p0
void test_argpass_no_z_hva(HVA *h, PST *p) {
    void argpass_no_z_hva_callee(double, HVA, int, PST, int, svbool_t);
    argpass_no_z_hva_callee(.0, *h, 1, *p, 2, svptrue_b64());
}
// CHECK-AAPCS: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>] alignstack(16), i32 noundef, ptr noundef, i32 noundef, <vscale x 16 x i1>)
// CHECK-DARWIN: declare void @argpass_no_z_hva_callee(double noundef, [4 x <16 x i8>], i32 noundef, ptr noundef, i32 noundef, <vscale x 16 x i1>)

// Not enough P-regs, PST passed indirectly; Z-regs and P-regs remain
// available for later arguments.
//   true -> p0-p2
//   1    -> w0
//   *p   -> memory, address -> x1
//   2    -> w2
//   3.0  -> d0
//   true -> p3
void test_argpass_no_p(PST *p) {
    void argpass_no_p_callee(svbool_t, svbool_t, svbool_t, int, PST, int, double, svbool_t);
    argpass_no_p_callee(svptrue_b8(), svptrue_b16(), svptrue_b32(), 1, *p, 2, 3.0, svptrue_b64());
}
// CHECK: declare void @argpass_no_p_callee(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)
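
// HFAs and HVAs are allocated to the same v0-v7 registers that back z0-z7,
// so an HFA or HVA ahead of a PST consumes Z-regs just as scalar
// floating-point arguments do.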

// Like above, using a tuple to occupy some registers.
// P-regs still available.
//   v    -> p0-p1
//   u    -> p2
//   1    -> w0
//   *p   -> memory, address -> x1
//   2    -> w2
//   3.0  -> d0
//   true -> p3
void test_argpass_no_p_tuple(PST *p, svbool_t u, svboolx2_t v) {
    void argpass_no_p_tuple_callee(svboolx2_t, svbool_t, int, PST, int, double,
                                   svbool_t);
    argpass_no_p_tuple_callee(v, u, 1, *p, 2, 3.0, svptrue_b64());
}
// CHECK: declare void @argpass_no_p_tuple_callee(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, i32 noundef, ptr noundef, i32 noundef, double noundef, <vscale x 16 x i1>)


// HFAs go back-to-back to memory, afterwards Z-regs not available, PST passed indirectly.
//   0.0   -> d0-d4
//   *h    -> memory
//   *p    -> memory, address -> x0
//   *h    -> memory
//   false -> p0
void test_after_hfa(HFA *h, PST *p) {
    void after_hfa_callee(double, double, double, double, double, HFA, PST, HFA, svbool_t);
    after_hfa_callee(.0, .0, .0, .0, .0, *h, *p, *h, svpfalse());
}
// CHECK-AAPCS: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float] alignstack(8), ptr noundef, [4 x float] alignstack(8), <vscale x 16 x i1>)
// CHECK-DARWIN: declare void @after_hfa_callee(double noundef, double noundef, double noundef, double noundef, double noundef, [4 x float], ptr noundef, [4 x float], <vscale x 16 x i1>)

// Small PST, not enough registers, passed indirectly, unlike other small
// aggregates.
//   *s  -> x0-x1
//   0.0 -> d0-d7
//   *p  -> memory, address -> x2
//   1.0 -> memory
//   2.0 -> memory (next to the above)
void test_small_pst(SmallPST *p, SmallAgg *s) {
    void small_pst_callee(SmallAgg, double, double, double, double, double, double, double, double, double, SmallPST, double);
    small_pst_callee(*s, .0, .0, .0, .0, .0, .0, .0, .0, 1.0, *p, 2.0);
}
// CHECK-AAPCS: declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, ptr noundef, double noundef)
// CHECK-DARWIN: declare void @small_pst_callee([2 x i64], double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, double noundef, i128, double noundef)


// Simple return, PST expanded to registers.
//   p->a    -> p0
//   p->x    -> q0
//   p->y[0] -> q1
//   p->y[1] -> q2
//   p->z    -> q3
//   p->b    -> p1
PST test_return(PST *p) {
    return *p;
}
// CHECK-AAPCS: define dso_local <{ <vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1> }> @test_return(ptr
// CHECK-DARWIN: define void @test_return(ptr dead_on_unwind noalias writable writeonly sret(%struct.PST) align 16 captures(none) initializes((0, 96)) %agg.result, ptr noundef readonly captures(none) %p)
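
// A hypothetical helper (illustrative only; it is an unused static inline
// function, so it emits no IR and does not disturb the checks): a PST can be
// assembled from sizeless values through the implicit sizeless-to-fixed
// conversions, mirroring how the expanded return value above is put together
// from individual registers.
static inline PST make_pst(svbool_t a, svfloat64_t x, svfloat32_t y0,
                           svfloat32_t y1, svmfloat8_t z, svbool_t b) {
    PST r;
    r.a = a;       // svbool_t    -> bvec
    r.x = x;       // svfloat64_t -> fvec64
    r.y[0] = y0;   // svfloat32_t -> fvec32
    r.y[1] = y1;
    r.z = z;       // svmfloat8_t -> mfvec8
    r.b = b;
    return r;
}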

// Corner case of 1-element aggregate
//   p->x -> q0
SmallPST test_return_small_pst(SmallPST *p) {
    return *p;
}
// CHECK-AAPCS: define dso_local <vscale x 4 x float> @test_return_small_pst(ptr
// CHECK-DARWIN: define i128 @test_return_small_pst(ptr noundef readonly captures(none) %p)


// Big PST, returned indirectly
//   *p -> *x8
BigPST test_return_big_pst(BigPST *p) {
    return *p;
}
// CHECK-AAPCS: define dso_local void @test_return_big_pst(ptr dead_on_unwind noalias writable writeonly sret(%struct.BigPST) align 16 captures(none) initializes((0, 176)) %agg.result, ptr noundef readonly captures(none) %p)
// CHECK-DARWIN: define void @test_return_big_pst(ptr dead_on_unwind noalias writable writeonly sret(%struct.BigPST) align 16 captures(none) initializes((0, 176)) %agg.result, ptr noundef readonly captures(none) %p)

// Variadic arguments are unnamed, PST passed indirectly.
// (Passing SVE types to a variadic function is currently unsupported by
// the AArch64 backend.)
//   p->a    -> p0
//   p->x    -> q0
//   p->y[0] -> q1
//   p->y[1] -> q2
//   p->z    -> q3
//   p->b    -> p1
//   *q      -> memory, address -> x1
void test_pass_variadic(PST *p, PST *q) {
    void pass_variadic_callee(PST, ...);
    pass_variadic_callee(*p, *q);
}
// CHECK-AAPCS: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp, ptr noundef nonnull align 16 dereferenceable(96) %q, i64 96, i1 false)
// CHECK-AAPCS: call void (<vscale x 16 x i1>, <vscale x 2 x double>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 16 x i8>, <vscale x 16 x i1>, ...) @pass_variadic_callee(<vscale x 16 x i1> %1, <vscale x 2 x double> %cast.scalable1, <vscale x 4 x float> %cast.scalable2, <vscale x 4 x float> %cast.scalable3, <vscale x 16 x i8> %cast.scalable4, <vscale x 16 x i1> %12, ptr noundef nonnull %byval-temp)

// CHECK-DARWIN: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp, ptr noundef nonnull align 16 dereferenceable(96) %p, i64 96, i1 false)
// CHECK-DARWIN: call void @llvm.lifetime.start.p0(i64 96, ptr nonnull %byval-temp1)
// CHECK-DARWIN: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(96) %byval-temp1, ptr noundef nonnull align 16 dereferenceable(96) %q, i64 96, i1 false)
// CHECK-DARWIN: call void (ptr, ...) @pass_variadic_callee(ptr noundef nonnull %byval-temp, ptr noundef nonnull %byval-temp1)


// Test passing a small PST, still passed indirectly, despite being <= 128 bits
void test_small_pst_variadic(SmallPST *p) {
    void small_pst_variadic_callee(int, ...);
    small_pst_variadic_callee(0, *p);
}
// CHECK-AAPCS: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 16 dereferenceable(16) %byval-temp, ptr noundef nonnull align 16 dereferenceable(16) %p, i64 16, i1 false)
// CHECK-AAPCS: call void (i32, ...) @small_pst_variadic_callee(i32 noundef 0, ptr noundef nonnull %byval-temp)

// CHECK-DARWIN: %0 = load i128, ptr %p, align 16
// CHECK-DARWIN: tail call void (i32, ...) @small_pst_variadic_callee(i32 noundef 0, i128 %0)
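
// Note the asymmetry above: under the AAPCS an anonymous (variadic) PST is
// always passed indirectly, while on Darwin an anonymous aggregate of 16
// bytes or fewer is passed by value, so the SmallPST is loaded and passed
// as an i128.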

// Test handling of a PST argument when passed in registers, from the callee side.
void test_argpass_callee_side(PST v) {
    void use(PST *p);
    use(&v);
}
// CHECK-AAPCS: define dso_local void @test_argpass_callee_side(<vscale x 16 x i1> %0, <vscale x 2 x double> %.coerce1, <vscale x 4 x float> %.coerce3, <vscale x 4 x float> %.coerce5, <vscale x 16 x i8> %.coerce7, <vscale x 16 x i1> %1)
// CHECK-AAPCS-NEXT: entry:
// CHECK-AAPCS-NEXT: %v = alloca %struct.PST, align 16
// CHECK-AAPCS-NEXT: %.coerce = bitcast <vscale x 16 x i1> %0 to <vscale x 2 x i8>
// CHECK-AAPCS-NEXT: %cast.fixed = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %.coerce, i64 0)
// CHECK-AAPCS-NEXT: store <2 x i8> %cast.fixed, ptr %v, align 16
// CHECK-AAPCS-NEXT: %2 = getelementptr inbounds nuw i8, ptr %v, i64 16
// CHECK-AAPCS-NEXT: %cast.fixed2 = tail call <2 x double> @llvm.vector.extract.v2f64.nxv2f64(<vscale x 2 x double> %.coerce1, i64 0)
// CHECK-AAPCS-NEXT: store <2 x double> %cast.fixed2, ptr %2, align 16
// CHECK-AAPCS-NEXT: %3 = getelementptr inbounds nuw i8, ptr %v, i64 32
// CHECK-AAPCS-NEXT: %cast.fixed4 = tail call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %.coerce3, i64 0)
// CHECK-AAPCS-NEXT: store <4 x float> %cast.fixed4, ptr %3, align 16
// CHECK-AAPCS-NEXT: %4 = getelementptr inbounds nuw i8, ptr %v, i64 48
// CHECK-AAPCS-NEXT: %cast.fixed6 = tail call <4 x float> @llvm.vector.extract.v4f32.nxv4f32(<vscale x 4 x float> %.coerce5, i64 0)
// CHECK-AAPCS-NEXT: store <4 x float> %cast.fixed6, ptr %4, align 16
// CHECK-AAPCS-NEXT: %5 = getelementptr inbounds nuw i8, ptr %v, i64 64
// CHECK-AAPCS-NEXT: %cast.fixed8 = tail call <16 x i8> @llvm.vector.extract.v16i8.nxv16i8(<vscale x 16 x i8> %.coerce7, i64 0)
// CHECK-AAPCS-NEXT: store <16 x i8> %cast.fixed8, ptr %5, align 16
// CHECK-AAPCS-NEXT: %6 = getelementptr inbounds nuw i8, ptr %v, i64 80
// CHECK-AAPCS-NEXT: %.coerce9 = bitcast <vscale x 16 x i1> %1 to <vscale x 2 x i8>
// CHECK-AAPCS-NEXT: %cast.fixed10 = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %.coerce9, i64 0)
// CHECK-AAPCS-NEXT: store <2 x i8> %cast.fixed10, ptr %6, align 16
// CHECK-AAPCS-NEXT: call void @use(ptr noundef nonnull %v)
// CHECK-AAPCS-NEXT: ret void
// CHECK-AAPCS-NEXT: }
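
// On the callee side the expansion runs in reverse: each incoming scalable
// register value is narrowed back to the fixed-length struct member (a
// bitcast for the predicates, llvm.vector.extract for the data vectors) and
// stored into the local copy of the argument.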

// Test va_arg operation
#ifdef __cplusplus
extern "C"
#endif
void test_va_arg(int n, ...) {
    va_list ap;
    va_start(ap, n);
    PST v = va_arg(ap, PST);
    va_end(ap);

    void use1(bvec, fvec32);
    use1(v.a, v.y[1]);
}
// CHECK-AAPCS: define dso_local void @test_va_arg(i32 noundef %n, ...)
// CHECK-AAPCS-NEXT: entry:
// CHECK-AAPCS-NEXT: %ap = alloca %struct.__va_list, align 8
// CHECK-AAPCS-NEXT: call void @llvm.lifetime.start.p0(i64 32, ptr nonnull %ap)
// CHECK-AAPCS-NEXT: call void @llvm.va_start.p0(ptr nonnull %ap)
// CHECK-AAPCS-NEXT: %gr_offs_p = getelementptr inbounds nuw i8, ptr %ap, i64 24
// CHECK-AAPCS-NEXT: %gr_offs = load i32, ptr %gr_offs_p, align 8
// CHECK-AAPCS-NEXT: %0 = icmp sgt i32 %gr_offs, -1
// CHECK-AAPCS-NEXT: br i1 %0, label %vaarg.on_stack, label %vaarg.maybe_reg
// CHECK-AAPCS-EMPTY:
// CHECK-AAPCS-NEXT: vaarg.maybe_reg: ; preds = %entry

// Increment by 8, the size of the pointer to the argument value, not the
// size of the argument value itself.

// CHECK-AAPCS-NEXT: %new_reg_offs = add nsw i32 %gr_offs, 8
// CHECK-AAPCS-NEXT: store i32 %new_reg_offs, ptr %gr_offs_p, align 8
// CHECK-AAPCS-NEXT: %inreg = icmp samesign ult i32 %gr_offs, -7
// CHECK-AAPCS-NEXT: br i1 %inreg, label %vaarg.in_reg, label %vaarg.on_stack
// CHECK-AAPCS-EMPTY:
// CHECK-AAPCS-NEXT: vaarg.in_reg: ; preds = %vaarg.maybe_reg
// CHECK-AAPCS-NEXT: %reg_top_p = getelementptr inbounds nuw i8, ptr %ap, i64 8
// CHECK-AAPCS-NEXT: %reg_top = load ptr, ptr %reg_top_p, align 8
// CHECK-AAPCS-NEXT: %1 = sext i32 %gr_offs to i64
// CHECK-AAPCS-NEXT: %2 = getelementptr inbounds i8, ptr %reg_top, i64 %1
// CHECK-AAPCS-NEXT: br label %vaarg.end
// CHECK-AAPCS-EMPTY:
// CHECK-AAPCS-NEXT: vaarg.on_stack: ; preds = %vaarg.maybe_reg, %entry
// CHECK-AAPCS-NEXT: %stack = load ptr, ptr %ap, align 8
// CHECK-AAPCS-NEXT: %new_stack = getelementptr inbounds nuw i8, ptr %stack, i64 8
// CHECK-AAPCS-NEXT: store ptr %new_stack, ptr %ap, align 8
// CHECK-AAPCS-NEXT: br label %vaarg.end
// CHECK-AAPCS-EMPTY:
// CHECK-AAPCS-NEXT: vaarg.end: ; preds = %vaarg.on_stack, %vaarg.in_reg
// CHECK-AAPCS-NEXT: %vaargs.addr = phi ptr [ %2, %vaarg.in_reg ], [ %stack, %vaarg.on_stack ]

// Extra indirection, for a composite passed indirectly.
// CHECK-AAPCS-NEXT: %vaarg.addr = load ptr, ptr %vaargs.addr, align 8

// CHECK-AAPCS-NEXT: %v.sroa.0.0.copyload = load <2 x i8>, ptr %vaarg.addr, align 16
// CHECK-AAPCS-NEXT: %v.sroa.43.0.vaarg.addr.sroa_idx = getelementptr inbounds nuw i8, ptr %vaarg.addr, i64 48
// CHECK-AAPCS-NEXT: %v.sroa.43.0.copyload = load <4 x float>, ptr %v.sroa.43.0.vaarg.addr.sroa_idx, align 16
// CHECK-AAPCS-NEXT: call void @llvm.va_end.p0(ptr nonnull %ap)
// CHECK-AAPCS-NEXT: %cast.scalable = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> poison, <2 x i8> %v.sroa.0.0.copyload, i64 0)
// CHECK-AAPCS-NEXT: %3 = bitcast <vscale x 2 x i8> %cast.scalable to <vscale x 16 x i1>
// CHECK-AAPCS-NEXT: %cast.scalable2 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %v.sroa.43.0.copyload, i64 0)
// CHECK-AAPCS-NEXT: call void @use1(<vscale x 16 x i1> noundef %3, <vscale x 4 x float> noundef %cast.scalable2)
// CHECK-AAPCS-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull %ap)
// CHECK-AAPCS-NEXT: ret void
// CHECK-AAPCS-NEXT: }
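
// Darwin's va_list is a plain stack pointer rather than the 32-byte AAPCS
// __va_list structure, so the sequence below is a simple 8-byte pointer
// bump followed by the same extra indirection through the stored address.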
// CHECK-DARWIN: define void @test_va_arg(i32 noundef %n, ...)
// CHECK-DARWIN-NEXT: entry:
// CHECK-DARWIN-NEXT: %ap = alloca ptr, align 8
// CHECK-DARWIN-NEXT: call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %ap)
// CHECK-DARWIN-NEXT: call void @llvm.va_start.p0(ptr nonnull %ap)
// CHECK-DARWIN-NEXT: %argp.cur = load ptr, ptr %ap, align 8
// CHECK-DARWIN-NEXT: %argp.next = getelementptr inbounds nuw i8, ptr %argp.cur, i64 8
// CHECK-DARWIN-NEXT: store ptr %argp.next, ptr %ap, align 8
// CHECK-DARWIN-NEXT: %0 = load ptr, ptr %argp.cur, align 8
// CHECK-DARWIN-NEXT: %v.sroa.0.0.copyload = load <2 x i8>, ptr %0, align 16
// CHECK-DARWIN-NEXT: %v.sroa.43.0..sroa_idx = getelementptr inbounds nuw i8, ptr %0, i64 48
// CHECK-DARWIN-NEXT: %v.sroa.43.0.copyload = load <4 x float>, ptr %v.sroa.43.0..sroa_idx, align 16
// CHECK-DARWIN-NEXT: call void @llvm.va_end.p0(ptr nonnull %ap)
// CHECK-DARWIN-NEXT: %cast.scalable = call <vscale x 2 x i8> @llvm.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> poison, <2 x i8> %v.sroa.0.0.copyload, i64 0)
// CHECK-DARWIN-NEXT: %1 = bitcast <vscale x 2 x i8> %cast.scalable to <vscale x 16 x i1>
// CHECK-DARWIN-NEXT: %cast.scalable2 = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.v4f32(<vscale x 4 x float> poison, <4 x float> %v.sroa.43.0.copyload, i64 0)
// CHECK-DARWIN-NEXT: call void @use1(<vscale x 16 x i1> noundef %1, <vscale x 4 x float> noundef %cast.scalable2)
// CHECK-DARWIN-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %ap)
// CHECK-DARWIN-NEXT: ret void
// CHECK-DARWIN-NEXT: }

// Regression test for incorrect passing of SVE vector tuples.
// The whole of `y` needs to be passed indirectly: the seven copies of `x`
// occupy z0-z6, leaving a single Z-reg for a tuple that needs two.
void test_tuple_reg_count(svfloat32_t x, svfloat32x2_t y) {
    void test_tuple_reg_count_callee(svfloat32_t, svfloat32_t, svfloat32_t, svfloat32_t,
                                     svfloat32_t, svfloat32_t, svfloat32_t, svfloat32x2_t);
    test_tuple_reg_count_callee(x, x, x, x, x, x, x, y);
}
// CHECK-AAPCS: declare void @test_tuple_reg_count_callee(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, ptr noundef)
// CHECK-DARWIN: declare void @test_tuple_reg_count_callee(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)

// Regression test for incorrect passing of SVE vector tuples.
// The whole of `y` needs to be passed indirectly: `x` occupies p0-p3,
// leaving no P-regs for the second tuple.
void test_tuple_reg_count_bool(svboolx4_t x, svboolx4_t y) {
    void test_tuple_reg_count_bool_callee(svboolx4_t, svboolx4_t);
    test_tuple_reg_count_bool_callee(x, y);
}
// CHECK-AAPCS: declare void @test_tuple_reg_count_bool_callee(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, ptr noundef)
// CHECK-DARWIN: declare void @test_tuple_reg_count_bool_callee(<vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>, <vscale x 16 x i1>)