; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE for 64-bit vectors.
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.4h, w8
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <4 x half> %op1, <4 x half> %op2
  ret <4 x half> %sel
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.8h, w8
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <8 x half> %op1, <8 x half> %op2
  ret <8 x half> %sel
}

define void @select_v16f16(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.h, w2
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ptrue p1.h, vl16
; CHECK-NEXT:    and z0.h, z0.h, #0x1
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p1/z, [x1]
; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <16 x half>, ptr %a
  %op2 = load volatile <16 x half>, ptr %b
  %sel = select i1 %mask, <16 x half> %op1, <16 x half> %op2
  store <16 x half> %sel, ptr %a
  ret void
}

define void @select_v32f16(ptr %a, ptr %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    mov z0.h, w2
; VBITS_GE_256-NEXT:    ptrue p0.h
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ptrue p1.h, vl16
; VBITS_GE_256-NEXT:    and z0.h, z0.h, #0x1
; VBITS_GE_256-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p1/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p1/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p1/z, [x1]
; VBITS_GE_256-NEXT:    sel z0.h, p0, z0.h, z2.h
; VBITS_GE_256-NEXT:    sel z1.h, p0, z1.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p1, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p1, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    mov z0.h, w2
; VBITS_GE_512-NEXT:    ptrue p0.h
; VBITS_GE_512-NEXT:    ptrue p1.h, vl32
; VBITS_GE_512-NEXT:    and z0.h, z0.h, #0x1
; VBITS_GE_512-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p1/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p1/z, [x1]
; VBITS_GE_512-NEXT:    sel z0.h, p0, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p1, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load volatile <32 x half>, ptr %a
  %op2 = load volatile <32 x half>, ptr %b
  %sel = select i1 %mask, <32 x half> %op1, <32 x half> %op2
  store <32 x half> %sel, ptr %a
  ret void
}

define void @select_v64f16(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.h, w2
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ptrue p1.h, vl64
; CHECK-NEXT:    and z0.h, z0.h, #0x1
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p1/z, [x1]
; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <64 x half>, ptr %a
  %op2 = load volatile <64 x half>, ptr %b
  %sel = select i1 %mask, <64 x half> %op1, <64 x half> %op2
  store <64 x half> %sel, ptr %a
  ret void
}

define void @select_v128f16(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    mov z0.h, w2
; CHECK-NEXT:    ptrue p0.h
; CHECK-NEXT:    ptrue p1.h, vl128
; CHECK-NEXT:    and z0.h, z0.h, #0x1
; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
; CHECK-NEXT:    ld1h { z0.h }, p1/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p1/z, [x1]
; CHECK-NEXT:    sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <128 x half>, ptr %a
  %op2 = load volatile <128 x half>, ptr %b
  %sel = select i1 %mask, <128 x half> %op1, <128 x half> %op2
  store <128 x half> %sel, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.2s, w8
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <2 x float> %op1, <2 x float> %op2
  ret <2 x float> %sel
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm w8, ne
; CHECK-NEXT:    dup v2.4s, w8
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <4 x float> %op1, <4 x float> %op2
  ret <4 x float> %sel
}

define void @select_v8f32(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and w8, w2, #0x1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z0.s, w8
; CHECK-NEXT:    ptrue p1.s, vl8
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <8 x float>, ptr %a
  %op2 = load volatile <8 x float>, ptr %b
  %sel = select i1 %mask, <8 x float> %op1, <8 x float> %op2
  store <8 x float> %sel, ptr %a
  ret void
}

define void @select_v16f32(ptr %a, ptr %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    and w8, w2, #0x1
; VBITS_GE_256-NEXT:    ptrue p0.s
; VBITS_GE_256-NEXT:    mov z0.s, w8
; VBITS_GE_256-NEXT:    ptrue p1.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p1/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p1/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p1/z, [x1]
; VBITS_GE_256-NEXT:    sel z0.s, p0, z0.s, z2.s
; VBITS_GE_256-NEXT:    sel z1.s, p0, z1.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p1, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p1, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    and w8, w2, #0x1
; VBITS_GE_512-NEXT:    ptrue p0.s
; VBITS_GE_512-NEXT:    mov z0.s, w8
; VBITS_GE_512-NEXT:    ptrue p1.s, vl16
; VBITS_GE_512-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p1/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p1/z, [x1]
; VBITS_GE_512-NEXT:    sel z0.s, p0, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p1, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load volatile <16 x float>, ptr %a
  %op2 = load volatile <16 x float>, ptr %b
  %sel = select i1 %mask, <16 x float> %op1, <16 x float> %op2
  store <16 x float> %sel, ptr %a
  ret void
}

define void @select_v32f32(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and w8, w2, #0x1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z0.s, w8
; CHECK-NEXT:    ptrue p1.s, vl32
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <32 x float>, ptr %a
  %op2 = load volatile <32 x float>, ptr %b
  %sel = select i1 %mask, <32 x float> %op1, <32 x float> %op2
  store <32 x float> %sel, ptr %a
  ret void
}

define void @select_v64f32(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    and w8, w2, #0x1
; CHECK-NEXT:    ptrue p0.s
; CHECK-NEXT:    mov z0.s, w8
; CHECK-NEXT:    ptrue p1.s, vl64
; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
; CHECK-NEXT:    ld1w { z0.s }, p1/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p1/z, [x1]
; CHECK-NEXT:    sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <64 x float>, ptr %a
  %op2 = load volatile <64 x float>, ptr %b
  %sel = select i1 %mask, <64 x float> %op1, <64 x float> %op2
  store <64 x float> %sel, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm x8, ne
; CHECK-NEXT:    fmov d2, x8
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <1 x double> %op1, <1 x double> %op2
  ret <1 x double> %sel
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm x8, ne
; CHECK-NEXT:    dup v2.2d, x8
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select i1 %mask, <2 x double> %op1, <2 x double> %op2
  ret <2 x double> %sel
}

define void @select_v4f64(ptr %a, ptr %b, i1 %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT:    and x8, x2, #0x1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z0.d, x8
; CHECK-NEXT:    ptrue p1.d, vl4
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <4 x double>, ptr %a
  %op2 = load volatile <4 x double>, ptr %b
  %sel = select i1 %mask, <4 x double> %op1, <4 x double> %op2
  store <4 x double> %sel, ptr %a
  ret void
}

define void @select_v8f64(ptr %a, ptr %b, i1 %mask) #0 {
; VBITS_GE_256-LABEL: select_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    // kill: def $w2 killed $w2 def $x2
; VBITS_GE_256-NEXT:    and x8, x2, #0x1
; VBITS_GE_256-NEXT:    ptrue p0.d
; VBITS_GE_256-NEXT:    mov z0.d, x8
; VBITS_GE_256-NEXT:    ptrue p1.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p1/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p1/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p1/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p1/z, [x1]
; VBITS_GE_256-NEXT:    sel z0.d, p0, z0.d, z2.d
; VBITS_GE_256-NEXT:    sel z1.d, p0, z1.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p1, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p1, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    // kill: def $w2 killed $w2 def $x2
; VBITS_GE_512-NEXT:    and x8, x2, #0x1
; VBITS_GE_512-NEXT:    ptrue p0.d
; VBITS_GE_512-NEXT:    mov z0.d, x8
; VBITS_GE_512-NEXT:    ptrue p1.d, vl8
; VBITS_GE_512-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p1/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p1/z, [x1]
; VBITS_GE_512-NEXT:    sel z0.d, p0, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p1, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load volatile <8 x double>, ptr %a
  %op2 = load volatile <8 x double>, ptr %b
  %sel = select i1 %mask, <8 x double> %op1, <8 x double> %op2
  store <8 x double> %sel, ptr %a
  ret void
}

define void @select_v16f64(ptr %a, ptr %b, i1 %mask) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT:    and x8, x2, #0x1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z0.d, x8
; CHECK-NEXT:    ptrue p1.d, vl16
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <16 x double>, ptr %a
  %op2 = load volatile <16 x double>, ptr %b
  %sel = select i1 %mask, <16 x double> %op1, <16 x double> %op2
  store <16 x double> %sel, ptr %a
  ret void
}

define void @select_v32f64(ptr %a, ptr %b, i1 %mask) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $w2 killed $w2 def $x2
; CHECK-NEXT:    and x8, x2, #0x1
; CHECK-NEXT:    ptrue p0.d
; CHECK-NEXT:    mov z0.d, x8
; CHECK-NEXT:    ptrue p1.d, vl32
; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
; CHECK-NEXT:    ld1d { z0.d }, p1/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p1/z, [x1]
; CHECK-NEXT:    sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p1, [x0]
; CHECK-NEXT:    ret
  %op1 = load volatile <32 x double>, ptr %a
  %op2 = load volatile <32 x double>, ptr %b
  %sel = select i1 %mask, <32 x double> %op1, <32 x double> %op2
  store <32 x double> %sel, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }