; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

; Don't use SVE for 64-bit vectors.
define <4 x half> @select_v4f16(<4 x half> %op1, <4 x half> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl v2.4h, v2.4h, #15
; CHECK-NEXT:    cmlt v2.4h, v2.4h, #0
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select <4 x i1> %mask, <4 x half> %op1, <4 x half> %op2
  ret <4 x half> %sel
}

; Don't use SVE for 128-bit vectors.
define <8 x half> @select_v8f16(<8 x half> %op1, <8 x half> %op2, <8 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v2.8h, v2.8b, #0
; CHECK-NEXT:    shl v2.8h, v2.8h, #15
; CHECK-NEXT:    cmlt v2.8h, v2.8h, #0
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select <8 x i1> %mask, <8 x half> %op1, <8 x half> %op2
  ret <8 x half> %sel
}

define void @select_v16f16(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v16f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x half>, ptr %a
  %op2 = load <16 x half>, ptr %b
  %mask = fcmp oeq <16 x half> %op1, %op2
  %sel = select <16 x i1> %mask, <16 x half> %op1, <16 x half> %op2
  store <16 x half> %sel, ptr %a
  ret void
}

define void @select_v32f16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: select_v32f16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z2.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_256-NEXT:    fcmeq p2.h, p0/z, z2.h, z3.h
; VBITS_GE_256-NEXT:    sel z0.h, p1, z0.h, z1.h
; VBITS_GE_256-NEXT:    sel z1.h, p2, z2.h, z3.h
; VBITS_GE_256-NEXT:    st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v32f16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1h { z1.h }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; VBITS_GE_512-NEXT:    sel z0.h, p1, z0.h, z1.h
; VBITS_GE_512-NEXT:    st1h { z0.h }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <32 x half>, ptr %a
  %op2 = load <32 x half>, ptr %b
  %mask = fcmp oeq <32 x half> %op1, %op2
  %sel = select <32 x i1> %mask, <32 x half> %op1, <32 x half> %op2
  store <32 x half> %sel, ptr %a
  ret void
}

define void @select_v64f16(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v64f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x half>, ptr %a
  %op2 = load <64 x half>, ptr %b
  %mask = fcmp oeq <64 x half> %op1, %op2
  %sel = select <64 x i1> %mask, <64 x half> %op1, <64 x half> %op2
  store <64 x half> %sel, ptr %a
  ret void
}

define void @select_v128f16(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v128f16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    ld1h { z1.h }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.h, p0/z, z0.h, z1.h
; CHECK-NEXT:    sel z0.h, p1, z0.h, z1.h
; CHECK-NEXT:    st1h { z0.h }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <128 x half>, ptr %a
  %op2 = load <128 x half>, ptr %b
  %mask = fcmp oeq <128 x half> %op1, %op2
  %sel = select <128 x i1> %mask, <128 x half> %op1, <128 x half> %op2
  store <128 x half> %sel, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <2 x float> @select_v2f32(<2 x float> %op1, <2 x float> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    shl v2.2s, v2.2s, #31
; CHECK-NEXT:    cmlt v2.2s, v2.2s, #0
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select <2 x i1> %mask, <2 x float> %op1, <2 x float> %op2
  ret <2 x float> %sel
}

; Don't use SVE for 128-bit vectors.
define <4 x float> @select_v4f32(<4 x float> %op1, <4 x float> %op2, <4 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v2.4s, v2.4h, #0
; CHECK-NEXT:    shl v2.4s, v2.4s, #31
; CHECK-NEXT:    cmlt v2.4s, v2.4s, #0
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select <4 x i1> %mask, <4 x float> %op1, <4 x float> %op2
  ret <4 x float> %sel
}

define void @select_v8f32(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v8f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <8 x float>, ptr %a
  %op2 = load <8 x float>, ptr %b
  %mask = fcmp oeq <8 x float> %op1, %op2
  %sel = select <8 x i1> %mask, <8 x float> %op1, <8 x float> %op2
  store <8 x float> %sel, ptr %a
  ret void
}

define void @select_v16f32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: select_v16f32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z2.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_256-NEXT:    fcmeq p2.s, p0/z, z2.s, z3.s
; VBITS_GE_256-NEXT:    sel z0.s, p1, z0.s, z1.s
; VBITS_GE_256-NEXT:    sel z1.s, p2, z2.s, z3.s
; VBITS_GE_256-NEXT:    st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v16f32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1w { z1.s }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; VBITS_GE_512-NEXT:    sel z0.s, p1, z0.s, z1.s
; VBITS_GE_512-NEXT:    st1w { z0.s }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <16 x float>, ptr %a
  %op2 = load <16 x float>, ptr %b
  %mask = fcmp oeq <16 x float> %op1, %op2
  %sel = select <16 x i1> %mask, <16 x float> %op1, <16 x float> %op2
  store <16 x float> %sel, ptr %a
  ret void
}

define void @select_v32f32(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v32f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x float>, ptr %a
  %op2 = load <32 x float>, ptr %b
  %mask = fcmp oeq <32 x float> %op1, %op2
  %sel = select <32 x i1> %mask, <32 x float> %op1, <32 x float> %op2
  store <32 x float> %sel, ptr %a
  ret void
}

define void @select_v64f32(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v64f32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    ld1w { z1.s }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.s, p0/z, z0.s, z1.s
; CHECK-NEXT:    sel z0.s, p1, z0.s, z1.s
; CHECK-NEXT:    st1w { z0.s }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <64 x float>, ptr %a
  %op2 = load <64 x float>, ptr %b
  %mask = fcmp oeq <64 x float> %op1, %op2
  %sel = select <64 x i1> %mask, <64 x float> %op1, <64 x float> %op2
  store <64 x float> %sel, ptr %a
  ret void
}

; Don't use SVE for 64-bit vectors.
define <1 x double> @select_v1f64(<1 x double> %op1, <1 x double> %op2, <1 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v1f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    tst w0, #0x1
; CHECK-NEXT:    csetm x8, ne
; CHECK-NEXT:    fmov d2, x8
; CHECK-NEXT:    bif v0.8b, v1.8b, v2.8b
; CHECK-NEXT:    ret
  %sel = select <1 x i1> %mask, <1 x double> %op1, <1 x double> %op2
  ret <1 x double> %sel
}

; Don't use SVE for 128-bit vectors.
define <2 x double> @select_v2f64(<2 x double> %op1, <2 x double> %op2, <2 x i1> %mask) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v2f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ushll v2.2d, v2.2s, #0
; CHECK-NEXT:    shl v2.2d, v2.2d, #63
; CHECK-NEXT:    cmlt v2.2d, v2.2d, #0
; CHECK-NEXT:    bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT:    ret
  %sel = select <2 x i1> %mask, <2 x double> %op1, <2 x double> %op2
  ret <2 x double> %sel
}

define void @select_v4f64(ptr %a, ptr %b) vscale_range(2,0) #0 {
; CHECK-LABEL: select_v4f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <4 x double>, ptr %a
  %op2 = load <4 x double>, ptr %b
  %mask = fcmp oeq <4 x double> %op1, %op2
  %sel = select <4 x i1> %mask, <4 x double> %op1, <4 x double> %op2
  store <4 x double> %sel, ptr %a
  ret void
}

define void @select_v8f64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: select_v8f64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z2.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_256-NEXT:    fcmeq p2.d, p0/z, z2.d, z3.d
; VBITS_GE_256-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_256-NEXT:    sel z1.d, p2, z2.d, z3.d
; VBITS_GE_256-NEXT:    st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: select_v8f64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; VBITS_GE_512-NEXT:    sel z0.d, p1, z0.d, z1.d
; VBITS_GE_512-NEXT:    st1d { z0.d }, p0, [x0]
; VBITS_GE_512-NEXT:    ret
  %op1 = load <8 x double>, ptr %a
  %op2 = load <8 x double>, ptr %b
  %mask = fcmp oeq <8 x double> %op1, %op2
  %sel = select <8 x i1> %mask, <8 x double> %op1, <8 x double> %op2
  store <8 x double> %sel, ptr %a
  ret void
}

define void @select_v16f64(ptr %a, ptr %b) vscale_range(8,0) #0 {
; CHECK-LABEL: select_v16f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <16 x double>, ptr %a
  %op2 = load <16 x double>, ptr %b
  %mask = fcmp oeq <16 x double> %op1, %op2
  %sel = select <16 x i1> %mask, <16 x double> %op1, <16 x double> %op2
  store <16 x double> %sel, ptr %a
  ret void
}

define void @select_v32f64(ptr %a, ptr %b) vscale_range(16,0) #0 {
; CHECK-LABEL: select_v32f64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    ld1d { z1.d }, p0/z, [x1]
; CHECK-NEXT:    fcmeq p1.d, p0/z, z0.d, z1.d
; CHECK-NEXT:    sel z0.d, p1, z0.d, z1.d
; CHECK-NEXT:    st1d { z0.d }, p0, [x0]
; CHECK-NEXT:    ret
  %op1 = load <32 x double>, ptr %a
  %op2 = load <32 x double>, ptr %b
  %mask = fcmp oeq <32 x double> %op1, %op2
  %sel = select <32 x i1> %mask, <32 x double> %op1, <32 x double> %op2
  store <32 x double> %sel, ptr %a
  ret void
}

attributes #0 = { "target-features"="+sve" }