; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; ANDV
;

; No single instruction NEON ANDV support. Use SVE.
define i8 @andv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i8 @andv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @andv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @andv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @andv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @andv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    andv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i16 @andv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @andv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @andv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @andv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @andv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    andv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ANDV support. Use SVE.
define i32 @andv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @andv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @andv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @andv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @andv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    andv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @andv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors
define i64 @andv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @andv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: andv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @andv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: andv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    and z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    andv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: andv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    andv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @andv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: andv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @andv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: andv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    andv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; EORV
;

; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i8 @eorv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @eorv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @eorv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    eorv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i16 @eorv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @eorv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @eorv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    eorv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.xor.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON EORV support. Use SVE.
define i32 @eorv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @eorv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @eorv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    eorv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @eorv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors
define i64 @eorv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @eorv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: eorv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: eorv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    eor z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    eorv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: eorv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    eorv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: eorv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @eorv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: eorv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    eorv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; ORV
;

; No single instruction NEON ORV support. Use SVE.
define i8 @orv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl8
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %a)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
define i8 @orv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v16i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl16
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @orv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v32i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl32
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @orv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v64i8:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.b, vl32
; VBITS_GE_256-NEXT:    mov w8, #32 // =0x20
; VBITS_GE_256-NEXT:    ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT:    ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv b0, p0, z0.b
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v64i8:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.b, vl64
; VBITS_GE_512-NEXT:    ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv b0, p0, z0.b
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @orv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v128i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl128
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @orv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v256i8:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.b, vl256
; CHECK-NEXT:    ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT:    orv b0, p0, z0.b
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %op)
  ret i8 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl4
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %a)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i16 @orv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl8
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @orv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v16i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl16
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @orv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v32i16:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.h, vl16
; VBITS_GE_256-NEXT:    mov x8, #16 // =0x10
; VBITS_GE_256-NEXT:    ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT:    ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv h0, p0, z0.h
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v32i16:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.h, vl32
; VBITS_GE_512-NEXT:    ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv h0, p0, z0.h
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @orv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v64i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl64
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @orv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v128i16:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.h, vl128
; CHECK-NEXT:    ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT:    orv h0, p0, z0.h
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %op)
  ret i16 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v2i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl2
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %a)
  ret i32 %res
}

; No single instruction NEON ORV support. Use SVE.
define i32 @orv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl4
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %res = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @orv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v8i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl8
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @orv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v16i32:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.s, vl8
; VBITS_GE_256-NEXT:    mov x8, #8 // =0x8
; VBITS_GE_256-NEXT:    ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT:    ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv s0, p0, z0.s
; VBITS_GE_256-NEXT:    fmov w0, s0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v16i32:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.s, vl16
; VBITS_GE_512-NEXT:    ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv s0, p0, z0.s
; VBITS_GE_512-NEXT:    fmov w0, s0
; VBITS_GE_512-NEXT:    ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @orv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v32i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl32
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @orv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v64i32:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.s, vl64
; CHECK-NEXT:    ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT:    orv s0, p0, z0.s
; CHECK-NEXT:    fmov w0, s0
; CHECK-NEXT:    ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @orv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v1i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Use SVE for 128-bit vectors
define i64 @orv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v2i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl2
; CHECK-NEXT:    // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %res = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @orv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: orv_v4i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl4
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @orv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: orv_v8i64:
; VBITS_GE_256:       // %bb.0:
; VBITS_GE_256-NEXT:    ptrue p0.d, vl4
; VBITS_GE_256-NEXT:    mov x8, #4 // =0x4
; VBITS_GE_256-NEXT:    ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT:    ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT:    orr z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT:    orv d0, p0, z0.d
; VBITS_GE_256-NEXT:    fmov x0, d0
; VBITS_GE_256-NEXT:    ret
;
; VBITS_GE_512-LABEL: orv_v8i64:
; VBITS_GE_512:       // %bb.0:
; VBITS_GE_512-NEXT:    ptrue p0.d, vl8
; VBITS_GE_512-NEXT:    ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT:    orv d0, p0, z0.d
; VBITS_GE_512-NEXT:    fmov x0, d0
; VBITS_GE_512-NEXT:    ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @orv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: orv_v16i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl16
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @orv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: orv_v32i64:
; CHECK:       // %bb.0:
; CHECK-NEXT:    ptrue p0.d, vl32
; CHECK-NEXT:    ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT:    orv d0, p0, z0.d
; CHECK-NEXT:    fmov x0, d0
; CHECK-NEXT:    ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %op)
  ret i64 %res
}

attributes #0 = { "target-features"="+sve" }

declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>)