; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 -aarch64-enable-atomic-cfg-tidy=false < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

; Test we can code generate patterns of the form:
;   fixed_length_vector = ISD::EXTRACT_SUBVECTOR scalable_vector, 0
;   scalable_vector = ISD::INSERT_SUBVECTOR scalable_vector, fixed_length_vector, 0
;
; NOTE: Currently shufflevector does not support scalable vectors so it cannot
; be used to model the above operations. Instead these tests rely on knowing
; how fixed length operations are lowered to scalable ones, with multiple blocks
; ensuring insert/extract sequences are not folded away.

target triple = "aarch64-unknown-linux-gnu"

; Every subvector_* test below loads a fixed-length vector in the entry block
; and stores it from a second block. Functions that carry vscale_range are
; expected to produce the same code for every RUN line; the functions without
; it (one 512-bit vector per element type) are checked separately for the
; 256-bit RUN line and for the wider ones.

;
; i16 elements
;

define void @subvector_v8i16(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x i16>, ptr %in
  br label %bb1

bb1:
  store <8 x i16> %a, ptr %out
  ret void
}

define void @subvector_v16i16(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i16>, ptr %in
  br label %bb1

bb1:
  store <16 x i16> %a, ptr %out
  ret void
}

define void @subvector_v32i16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x i16>, ptr %in
  br label %bb1

bb1:
  store <32 x i16> %a, ptr %out
  ret void
}

define void @subvector_v64i16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i16>, ptr %in
  br label %bb1

bb1:
  store <64 x i16> %a, ptr %out
  ret void
}

;
; i32 elements
;

define void @subvector_v8i32(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  br label %bb1

bb1:
  store <8 x i32> %a, ptr %out
  ret void
}

define void @subvector_v16i32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x i32>, ptr %in
  br label %bb1

bb1:
  store <16 x i32> %a, ptr %out
  ret void
}

define void @subvector_v32i32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i32>, ptr %in
  br label %bb1

bb1:
  store <32 x i32> %a, ptr %out
  ret void
}

define void @subvector_v64i32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x i32>, ptr %in
  br label %bb1

bb1:
  store <64 x i32> %a, ptr %out
  ret void
}

;
; i64 elements
;

; As with the other element types, the 512-bit vector is left without
; vscale_range so that the 256-bit RUN line covers the two-register split and
; the wider RUN lines cover the single-register form.
define void @subvector_v8i64(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x i64>, ptr %in
  br label %bb1

bb1:
  store <8 x i64> %a, ptr %out
  ret void
}

define void @subvector_v16i64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x i64>, ptr %in
  br label %bb1

bb1:
  store <16 x i64> %a, ptr %out
  ret void
}

define void @subvector_v32i64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x i64>, ptr %in
  br label %bb1

bb1:
  store <32 x i64> %a, ptr %out
  ret void
}

;
; half elements
;

define void @subvector_v8f16(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ldr q0, [x0]
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x half>, ptr %in
  br label %bb1

bb1:
  store <8 x half> %a, ptr %out
  ret void
}

define void @subvector_v16f16(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x half>, ptr %in
  br label %bb1

bb1:
  store <16 x half> %a, ptr %out
  ret void
}

define void @subvector_v32f16(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <32 x half>, ptr %in
  br label %bb1

bb1:
  store <32 x half> %a, ptr %out
  ret void
}

define void @subvector_v64f16(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x half>, ptr %in
  br label %bb1

bb1:
  store <64 x half> %a, ptr %out
  ret void
}

;
; float elements
;

define void @subvector_v8f32(ptr %in, ptr %out) vscale_range(2,0) #0 {
; CHECK-LABEL: subvector_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <8 x float>, ptr %in
  br label %bb1

bb1:
  store <8 x float> %a, ptr %out
  ret void
}

define void @subvector_v16f32(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <16 x float>, ptr %in
  br label %bb1

bb1:
  store <16 x float> %a, ptr %out
  ret void
}

define void @subvector_v32f32(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x float>, ptr %in
  br label %bb1

bb1:
  store <32 x float> %a, ptr %out
  ret void
}

define void @subvector_v64f32(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <64 x float>, ptr %in
  br label %bb1

bb1:
  store <64 x float> %a, ptr %out
  ret void
}

;
; double elements
;

define void @subvector_v8f64(ptr %in, ptr %out) #0 {
; VBITS_GE_256-LABEL: subvector_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: subvector_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %a = load <8 x double>, ptr %in
  br label %bb1

bb1:
  store <8 x double> %a, ptr %out
  ret void
}

define void @subvector_v16f64(ptr %in, ptr %out) vscale_range(8,0) #0 {
; CHECK-LABEL: subvector_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <16 x double>, ptr %in
  br label %bb1

bb1:
  store <16 x double> %a, ptr %out
  ret void
}

define void @subvector_v32f64(ptr %in, ptr %out) vscale_range(16,0) #0 {
; CHECK-LABEL: subvector_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %a = load <32 x double>, ptr %in
  br label %bb1

bb1:
  store <32 x double> %a, ptr %out
  ret void
}

; The vector is loaded in the entry block and compared against zero in the
; following block, returning the <8 x i1> result of the compare.
define <8 x i1> @no_warn_dropped_scalable(ptr %in) #0 {
; CHECK-LABEL: no_warn_dropped_scalable:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    cmpgt p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    mov z0.s, p0/z, #-1 // =0xffffffffffffffff
; CHECK-NEXT:    uzp1 z0.h, z0.h, z0.h
; CHECK-NEXT:    uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT:    ret
  %a = load <8 x i32>, ptr %in
  br label %bb1

bb1:
  %cond = icmp sgt <8 x i32> %a, zeroinitializer
  ret <8 x i1> %cond
}

; binop(insert_subvec(a), insert_subvec(b)) -> insert_subvec(binop(a,b)) like
; combines remove redundant subvector operations. This test ensures it's not
; performed when the input idiom is the result of operation legalisation. When
; not prevented the test triggers infinite combine->legalise->combine->...
define void @no_subvector_binop_hang(ptr %in, ptr %out, i1 %cond) #0 {
; CHECK-LABEL: no_subvector_binop_hang:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tbz w2, #0, .LBB23_2
; CHECK-NEXT:  // %bb.1: // %bb.1
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    orr z0.d, z0.d, z1.d
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:  .LBB23_2: // %bb.2
; CHECK-NEXT:    ret
  ; Both operands are loaded in the entry block; the `or` that combines them
  ; sits in %bb.1, which is reached only when %cond is true.
  ; NOTE(review): the expected label .LBB23_2 presumably encodes this
  ; function's position in the file, so regenerate the assertions if functions
  ; are added or removed ahead of it.
  %a = load <8 x i32>, ptr %in
  %b = load <8 x i32>, ptr %out
  br i1 %cond, label %bb.1, label %bb.2

bb.1:
  ; The result overwrites the second operand's memory at %out.
  %or = or <8 x i32> %a, %b
  store <8 x i32> %or, ptr %out
  br label %bb.2

bb.2:
  ret void
}

; Attribute group referenced as #0 by the function definitions: turns on the
; SVE target feature.
attributes #0 = { "target-features"="+sve" }