; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; The 2048-bit run intentionally reuses the VBITS_GE_512 check prefix: the
; vscale_range attributes on each function cap codegen so the output matches.
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

; i8

; Don't use SVE for 64-bit vectors.
define <4 x i8> @extract_subvector_v8i8(<8 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    zip2 v0.8b, v0.8b, v0.8b
; CHECK-NEXT:    ret
  %ret = call <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8> %op, i64 4)
  ret <4 x i8> %ret
}

; Don't use SVE for 128-bit vectors.
define <8 x i8> @extract_subvector_v16i8(<16 x i8> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> %op, i64 8)
  ret <8 x i8> %ret
}

define void @extract_subvector_v32i8(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %ret = call <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8> %op, i64 16)
  store <16 x i8> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.b, vl32
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1b { z0.b }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %ret = call <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8> %op, i64 32)
  store <32 x i8> %ret, ptr %b
  ret void
}

define void @extract_subvector_v128i8(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl64
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %ret = call <64 x i8> @llvm.vector.extract.v64i8.v128i8(<128 x i8> %op, i64 64)
  store <64 x i8> %ret, ptr %b
  ret void
}

define void @extract_subvector_v256i8(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1b { z0.b }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %ret = call <128 x i8> @llvm.vector.extract.v128i8.v256i8(<256 x i8> %op, i64 128)
  store <128 x i8> %ret, ptr %b
  ret void
}

; i16

; Don't use SVE for 64-bit vectors.
define <2 x i16> @extract_subvector_v4i16(<4 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v0.4s, v0.4h, #0
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16> %op, i64 2)
  ret <2 x i16> %ret
}

; Don't use SVE for 128-bit vectors.
define <4 x i16> @extract_subvector_v8i16(<8 x i16> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16> %op, i64 4)
  ret <4 x i16> %ret
}

define void @extract_subvector_v16i16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %ret = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> %op, i64 8)
  store <8 x i16> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %ret = call <16 x i16> @llvm.vector.extract.v16i16.v32i16(<32 x i16> %op, i64 16)
  store <16 x i16> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64i16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %ret = call <32 x i16> @llvm.vector.extract.v32i16.v64i16(<64 x i16> %op, i64 32)
  store <32 x i16> %ret, ptr %b
  ret void
}

define void @extract_subvector_v128i16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %ret = call <64 x i16> @llvm.vector.extract.v64i16.v128i16(<128 x i16> %op, i64 64)
  store <64 x i16> %ret, ptr %b
  ret void
}

; i32

; Don't use SVE for 64-bit vectors.
define <1 x i32> @extract_subvector_v2i32(<2 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.2s, v0.s[1]
; CHECK-NEXT:    ret
  %ret = call <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32> %op, i64 1)
  ret <1 x i32> %ret
}

; Don't use SVE for 128-bit vectors.
define <2 x i32> @extract_subvector_v4i32(<4 x i32> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32> %op, i64 2)
  ret <2 x i32> %ret
}

define void @extract_subvector_v8i32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %ret = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> %op, i64 4)
  store <4 x i32> %ret, ptr %b
  ret void
}

define void @extract_subvector_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %ret = call <8 x i32> @llvm.vector.extract.v8i32.v16i32(<16 x i32> %op, i64 8)
  store <8 x i32> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32i32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %ret = call <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32> %op, i64 16)
  store <16 x i32> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64i32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %ret = call <32 x i32> @llvm.vector.extract.v32i32.v64i32(<64 x i32> %op, i64 32)
  store <32 x i32> %ret, ptr %b
  ret void
}

; i64

; Don't use SVE for 128-bit vectors.
define <1 x i64> @extract_subvector_v2i64(<2 x i64> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64> %op, i64 1)
  ret <1 x i64> %ret
}

define void @extract_subvector_v4i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %ret = call <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64> %op, i64 2)
  store <2 x i64> %ret, ptr %b
  ret void
}

define void @extract_subvector_v8i64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    mov x8, #4 // =0x4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %ret = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> %op, i64 4)
  store <4 x i64> %ret, ptr %b
  ret void
}

; NOTE(review): only VBITS_GE_256 assertions are present for this function in
; the autogenerated output below; the 512-bit run is covered by the shared
; CHECK-less prefix scheme — confirm on regeneration.
define void @extract_subvector_v16i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v16i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #12 // =0xc
; VBITS_GE_256-NEXT:    mov x9, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0, x9, lsl #3]
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %ret = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> %op, i64 8)
  store <8 x i64> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32i64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    mov x8, #16 // =0x10
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %ret = call <16 x i64> @llvm.vector.extract.v16i64.v32i64(<32 x i64> %op, i64 16)
  store <16 x i64> %ret, ptr %b
  ret void
}

; f16

; Don't use SVE for 64-bit vectors.
define <2 x half> @extract_subvector_v4f16(<4 x half> %op) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.2s, v0.s[1]
; CHECK-NEXT:    ret
  %ret = call <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half> %op, i64 2)
  ret <2 x half> %ret
}

; Don't use SVE for 128-bit vectors.
define <4 x half> @extract_subvector_v8f16(<8 x half> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half> %op, i64 4)
  ret <4 x half> %ret
}

define void @extract_subvector_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <16 x half>, ptr %a
  %ret = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> %op, i64 8)
  store <8 x half> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.h, vl16
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x half>, ptr %a
  %ret = call <16 x half> @llvm.vector.extract.v16f16.v32f16(<32 x half> %op, i64 16)
  store <16 x half> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl32
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <64 x half>, ptr %a
  %ret = call <32 x half> @llvm.vector.extract.v32f16.v64f16(<64 x half> %op, i64 32)
  store <32 x half> %ret, ptr %b
  ret void
}

define void @extract_subvector_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1h { z0.h }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <128 x half>, ptr %a
  %ret = call <64 x half> @llvm.vector.extract.v64f16.v128f16(<128 x half> %op, i64 64)
  store <64 x half> %ret, ptr %b
  ret void
}

; f32

; Don't use SVE for 64-bit vectors.
define <1 x float> @extract_subvector_v2f32(<2 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    dup v0.2s, v0.s[1]
; CHECK-NEXT:    ret
  %ret = call <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float> %op, i64 1)
  ret <1 x float> %ret
}

; Don't use SVE for 128-bit vectors.
define <2 x float> @extract_subvector_v4f32(<4 x float> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float> %op, i64 2)
  ret <2 x float> %ret
}

define void @extract_subvector_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <8 x float>, ptr %a
  %ret = call <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float> %op, i64 4)
  store <4 x float> %ret, ptr %b
  ret void
}

define void @extract_subvector_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.s, vl8
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x float>, ptr %a
  %ret = call <8 x float> @llvm.vector.extract.v8f32.v16f32(<16 x float> %op, i64 8)
  store <8 x float> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl16
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x float>, ptr %a
  %ret = call <16 x float> @llvm.vector.extract.v16f32.v32f32(<32 x float> %op, i64 16)
  store <16 x float> %ret, ptr %b
  ret void
}

define void @extract_subvector_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1w { z0.s }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <64 x float>, ptr %a
  %ret = call <32 x float> @llvm.vector.extract.v32f32.v64f32(<64 x float> %op, i64 32)
  store <32 x float> %ret, ptr %b
  ret void
}

; f64

; Don't use SVE for 128-bit vectors.
define <1 x double> @extract_subvector_v2f64(<2 x double> %op) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
; CHECK-NEXT:    // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT:    ret
  %ret = call <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double> %op, i64 1)
  ret <1 x double> %ret
}

define void @extract_subvector_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: extract_subvector_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #16
; CHECK-NEXT:    str q0, [x1]
; CHECK-NEXT:    ret
  %op = load <4 x double>, ptr %a
  %ret = call <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double> %op, i64 2)
  store <2 x double> %ret, ptr %b
  ret void
}

define void @extract_subvector_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: extract_subvector_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: extract_subvector_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ptrue p0.d, vl4
; VBITS_GE_512-NEXT:    ext z0.b, z0.b, z0.b, #32
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x1]
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x double>, ptr %a
  %ret = call <4 x double> @llvm.vector.extract.v4f64.v8f64(<8 x double> %op, i64 4)
  store <4 x double> %ret, ptr %b
  ret void
}

define void @extract_subvector_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: extract_subvector_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl8
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #64
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <16 x double>, ptr %a
  %ret = call <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double> %op, i64 8)
  store <8 x double> %ret, ptr %b
  ret void
}

define void @extract_subvector_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: extract_subvector_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ext z0.b, z0.b, z0.b, #128
; CHECK-NEXT:    st1d { z0.d }, p0, [x1]
; CHECK-NEXT:    ret
  %op = load <32 x double>, ptr %a
  %ret = call <16 x double> @llvm.vector.extract.v16f64.v32f64(<32 x double> %op, i64 16)
  store <16 x double> %ret, ptr %b
  ret void
}

; Test for infinite loop due to fold:
; extract_subvector(insert_subvector(x,y,c1),c2)--> extract_subvector(y,c2-c1)
define void @extract_subvector_legalization_v8i32() vscale_range(2,2) #0 {
; CHECK-LABEL: extract_subvector_legalization_v8i32:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    adrp x8, .LCPI40_0
; CHECK-NEXT:    add x8, x8, :lo12:.LCPI40_0
; CHECK-NEXT:    movi v2.2d, #0000000000000000
; CHECK-NEXT:    ptrue p1.d
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x8]
; CHECK-NEXT:    mov z1.d, z0.d
; CHECK-NEXT:    ext z1.b, z1.b, z0.b, #16
; CHECK-NEXT:    cmeq v0.4s, v0.4s, v2.4s
; CHECK-NEXT:    cmeq v1.4s, v1.4s, v2.4s
; CHECK-NEXT:    sunpklo z0.d, z0.s
; CHECK-NEXT:    sunpklo z1.d, z1.s
; CHECK-NEXT:    cmpne p0.d, p1/z, z1.d, #0
; CHECK-NEXT:    cmpne p1.d, p1/z, z0.d, #0
; CHECK-NEXT:  .LBB40_1: // %body
; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
; CHECK-NEXT:    st1d { z0.d }, p1, [x8]
; CHECK-NEXT:    st1d { z0.d }, p0, [x8]
; CHECK-NEXT:    b .LBB40_1
entry:
  %splat = shufflevector <8 x i32> poison, <8 x i32> poison, <8 x i32> zeroinitializer
  br label %body
body:
  %0 = icmp eq <8 x i32> zeroinitializer, %splat
  tail call void @llvm.masked.store.v8f64.p0(<8 x double> poison, ptr poison, i32 8, <8 x i1> %0)
  br label %body
}
declare void @llvm.masked.store.v8f64.p0(<8 x double>, ptr nocapture, i32 immarg, <8 x i1>)

declare <4 x i8> @llvm.vector.extract.v4i8.v8i8(<8 x i8>, i64)
declare <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8>, i64)
declare <16 x i8> @llvm.vector.extract.v16i8.v32i8(<32 x i8>, i64)
declare <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8>, i64)
declare <64 x i8> @llvm.vector.extract.v64i8.v128i8(<128 x i8>, i64)
declare <128 x i8> @llvm.vector.extract.v128i8.v256i8(<256 x i8>, i64)

declare <2 x i16> @llvm.vector.extract.v2i16.v4i16(<4 x i16>, i64)
declare <4 x i16> @llvm.vector.extract.v4i16.v8i16(<8 x i16>, i64)
declare <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16>, i64)
declare <16 x i16> @llvm.vector.extract.v16i16.v32i16(<32 x i16>, i64)
declare <32 x i16> @llvm.vector.extract.v32i16.v64i16(<64 x i16>, i64)
declare <64 x i16> @llvm.vector.extract.v64i16.v128i16(<128 x i16>, i64)

declare <1 x i32> @llvm.vector.extract.v1i32.v2i32(<2 x i32>, i64)
declare <2 x i32> @llvm.vector.extract.v2i32.v4i32(<4 x i32>, i64)
declare <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32>, i64)
declare <8 x i32> @llvm.vector.extract.v8i32.v16i32(<16 x i32>, i64)
declare <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32>, i64)
declare <32 x i32> @llvm.vector.extract.v32i32.v64i32(<64 x i32>, i64)

declare <1 x i64> @llvm.vector.extract.v1i64.v2i64(<2 x i64>, i64)
declare <2 x i64> @llvm.vector.extract.v2i64.v4i64(<4 x i64>, i64)
declare <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64>, i64)
declare <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64>, i64)
declare <16 x i64> @llvm.vector.extract.v16i64.v32i64(<32 x i64>, i64)

declare <2 x half> @llvm.vector.extract.v2f16.v4f16(<4 x half>, i64)
declare <4 x half> @llvm.vector.extract.v4f16.v8f16(<8 x half>, i64)
declare <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half>, i64)
declare <16 x half> @llvm.vector.extract.v16f16.v32f16(<32 x half>, i64)
declare <32 x half> @llvm.vector.extract.v32f16.v64f16(<64 x half>, i64)
declare <64 x half> @llvm.vector.extract.v64f16.v128f16(<128 x half>, i64)

declare <1 x float> @llvm.vector.extract.v1f32.v2f32(<2 x float>, i64)
declare <2 x float> @llvm.vector.extract.v2f32.v4f32(<4 x float>, i64)
declare <4 x float> @llvm.vector.extract.v4f32.v8f32(<8 x float>, i64)
declare <8 x float> @llvm.vector.extract.v8f32.v16f32(<16 x float>, i64)
declare <16 x float> @llvm.vector.extract.v16f32.v32f32(<32 x float>, i64)
declare <32 x float> @llvm.vector.extract.v32f32.v64f32(<64 x float>, i64)

declare <1 x double> @llvm.vector.extract.v1f64.v2f64(<2 x double>, i64)
declare <2 x double> @llvm.vector.extract.v2f64.v4f64(<4 x double>, i64)
declare <4 x double> @llvm.vector.extract.v4f64.v8f64(<8 x double>, i64)
declare <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double>, i64)
declare <16 x double> @llvm.vector.extract.v16f64.v32f64(<32 x double>, i64)

attributes #0 = { "target-features"="+sve" }