; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

target triple = "aarch64-unknown-linux-gnu"

;
; UADDV
;

; Don't use SVE for 64-bit vectors.
define i8 @uaddv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: addv b0, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @uaddv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: addv b0, v0.16b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @uaddv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.b, z1.b, z0.b
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.b
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.b
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @uaddv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @uaddv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: addv h0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @uaddv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: addv h0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @uaddv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.h, z1.h, z0.h
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.h
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.h
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @uaddv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @uaddv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: addp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @uaddv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: addv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @uaddv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.s, z1.s, z0.s
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.s
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.s
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @uaddv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @uaddv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %a)
  ret i64 %res
}

; Don't use SVE for 128-bit vectors.
define i64 @uaddv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: addp d0, v0.2d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @uaddv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uaddv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: uaddv_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: add z0.d, z1.d, z0.d
; VBITS_GE_256-NEXT: uaddv d0, p0, z0.d
; VBITS_GE_256-NEXT: fmov x0, d0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uaddv_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: uaddv d0, p0, z0.d
; VBITS_GE_512-NEXT: fmov x0, d0
; VBITS_GE_512-NEXT: ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uaddv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @uaddv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uaddv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uaddv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; SMAXV
;

; Don't use SVE for 64-bit vectors.
define i8 @smaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: smaxv b0, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @smaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: smaxv b0, v0.16b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @smaxv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: smaxv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: smax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT: smaxv b0, p0, z0.b
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: smaxv_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: smaxv b0, p0, z0.b
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: smaxv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @smaxv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: smaxv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @smaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: smaxv h0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @smaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: smaxv h0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @smaxv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: smaxv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: smax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT: smaxv h0, p0, z0.h
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: smaxv_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: smaxv h0, p0, z0.h
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: smaxv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @smaxv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: smaxv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @smaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: smaxp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @smaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: smaxv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @smaxv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: smaxv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: smax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: smaxv s0, p0, z0.s
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: smaxv_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: smaxv s0, p0, z0.s
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: smaxv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @smaxv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: smaxv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @smaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector SMAXV support. Use SVE.
define i64 @smaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: smaxv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @smaxv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: smaxv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: smaxv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @smaxv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: smaxv_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: smax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: smaxv d0, p0, z0.d
; VBITS_GE_256-NEXT: fmov x0, d0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: smaxv_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: smaxv d0, p0, z0.d
; VBITS_GE_512-NEXT: fmov x0, d0
; VBITS_GE_512-NEXT: ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @smaxv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: smaxv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: smaxv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @smaxv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: smaxv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: smaxv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; SMINV
;

; Don't use SVE for 64-bit vectors.
define i8 @sminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sminv b0, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @sminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: sminv b0, v0.16b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @sminv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: sminv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @sminv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: sminv_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: smin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT: sminv b0, p0, z0.b
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sminv_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: sminv b0, p0, z0.b
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @sminv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: sminv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @sminv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: sminv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @sminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sminv h0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @sminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: sminv h0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @sminv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: sminv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @sminv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: sminv_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: smin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT: sminv h0, p0, z0.h
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sminv_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: sminv h0, p0, z0.h
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @sminv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: sminv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @sminv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: sminv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @sminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sminp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @sminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: sminv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @sminv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: sminv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @sminv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: sminv_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: smin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: sminv s0, p0, z0.s
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sminv_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: sminv s0, p0, z0.s
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @sminv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: sminv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @sminv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: sminv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @sminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector SMINV support. Use SVE.
define i64 @sminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: sminv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @sminv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: sminv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: sminv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @sminv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: sminv_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: smin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: sminv d0, p0, z0.d
; VBITS_GE_256-NEXT: fmov x0, d0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: sminv_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: sminv d0, p0, z0.d
; VBITS_GE_512-NEXT: fmov x0, d0
; VBITS_GE_512-NEXT: ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @sminv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: sminv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: sminv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @sminv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: sminv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: sminv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; UMAXV
;

; Don't use SVE for 64-bit vectors.
define i8 @umaxv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: umaxv b0, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @umaxv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: umaxv b0, v0.16b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @umaxv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: umaxv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @umaxv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: umaxv_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: umax z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT: umaxv b0, p0, z0.b
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: umaxv_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: umaxv b0, p0, z0.b
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @umaxv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: umaxv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @umaxv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: umaxv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @umaxv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: umaxv h0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @umaxv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: umaxv h0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @umaxv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: umaxv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @umaxv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: umaxv_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: umax z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT: umaxv h0, p0, z0.h
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: umaxv_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: umaxv h0, p0, z0.h
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @umaxv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: umaxv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @umaxv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: umaxv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @umaxv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: umaxp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @umaxv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: umaxv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @umaxv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: umaxv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @umaxv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: umaxv_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: umax z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: umaxv s0, p0, z0.s
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: umaxv_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: umaxv s0, p0, z0.s
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @umaxv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: umaxv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @umaxv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: umaxv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @umaxv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector UMAXV support. Use SVE.
define i64 @umaxv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: umaxv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @umaxv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: umaxv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: umaxv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @umaxv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: umaxv_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: umax z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: umaxv d0, p0, z0.d
; VBITS_GE_256-NEXT: fmov x0, d0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: umaxv_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: umaxv d0, p0, z0.d
; VBITS_GE_512-NEXT: fmov x0, d0
; VBITS_GE_512-NEXT: ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @umaxv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: umaxv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: umaxv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @umaxv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: umaxv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: umaxv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %op)
  ret i64 %res
}

;
; UMINV
;

; Don't use SVE for 64-bit vectors.
define i8 @uminv_v8i8(<8 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v8i8:
; CHECK: // %bb.0:
; CHECK-NEXT: uminv b0, v0.8b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %a)
  ret i8 %res
}

; Don't use SVE for 128-bit vectors.
define i8 @uminv_v16i8(<16 x i8> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v16i8:
; CHECK: // %bb.0:
; CHECK-NEXT: uminv b0, v0.16b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %a)
  ret i8 %res
}

define i8 @uminv_v32i8(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v32i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl32
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uminv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %op)
  ret i8 %res
}

define i8 @uminv_v64i8(ptr %a) #0 {
; VBITS_GE_256-LABEL: uminv_v64i8:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: umin z0.b, p0/m, z0.b, z1.b
; VBITS_GE_256-NEXT: uminv b0, p0, z0.b
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uminv_v64i8:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: uminv b0, p0, z0.b
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <64 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %op)
  ret i8 %res
}

define i8 @uminv_v128i8(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v128i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl128
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uminv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %op)
  ret i8 %res
}

define i8 @uminv_v256i8(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v256i8:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.b, vl256
; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
; CHECK-NEXT: uminv b0, p0, z0.b
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <256 x i8>, ptr %a
  %res = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %op)
  ret i8 %res
}

; Don't use SVE for 64-bit vectors.
define i16 @uminv_v4i16(<4 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i16:
; CHECK: // %bb.0:
; CHECK-NEXT: uminv h0, v0.4h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %a)
  ret i16 %res
}

; Don't use SVE for 128-bit vectors.
define i16 @uminv_v8i16(<8 x i16> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v8i16:
; CHECK: // %bb.0:
; CHECK-NEXT: uminv h0, v0.8h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a)
  ret i16 %res
}

define i16 @uminv_v16i16(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v16i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl16
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uminv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <16 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %op)
  ret i16 %res
}

define i16 @uminv_v32i16(ptr %a) #0 {
; VBITS_GE_256-LABEL: uminv_v32i16:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: umin z0.h, p0/m, z0.h, z1.h
; VBITS_GE_256-NEXT: uminv h0, p0, z0.h
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uminv_v32i16:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: uminv h0, p0, z0.h
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <32 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %op)
  ret i16 %res
}

define i16 @uminv_v64i16(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v64i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl64
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uminv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %op)
  ret i16 %res
}

define i16 @uminv_v128i16(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v128i16:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.h, vl128
; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
; CHECK-NEXT: uminv h0, p0, z0.h
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <128 x i16>, ptr %a
  %res = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %op)
  ret i16 %res
}

; Don't use SVE for 64-bit vectors.
define i32 @uminv_v2i32(<2 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v2i32:
; CHECK: // %bb.0:
; CHECK-NEXT: uminp v0.2s, v0.2s, v0.2s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %a)
  ret i32 %res
}

; Don't use SVE for 128-bit vectors.
define i32 @uminv_v4i32(<4 x i32> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i32:
; CHECK: // %bb.0:
; CHECK-NEXT: uminv s0, v0.4s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %res = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %a)
  ret i32 %res
}

define i32 @uminv_v8i32(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v8i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl8
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uminv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <8 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %op)
  ret i32 %res
}

define i32 @uminv_v16i32(ptr %a) #0 {
; VBITS_GE_256-LABEL: uminv_v16i32:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: umin z0.s, p0/m, z0.s, z1.s
; VBITS_GE_256-NEXT: uminv s0, p0, z0.s
; VBITS_GE_256-NEXT: fmov w0, s0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uminv_v16i32:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: uminv s0, p0, z0.s
; VBITS_GE_512-NEXT: fmov w0, s0
; VBITS_GE_512-NEXT: ret
  %op = load <16 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %op)
  ret i32 %res
}

define i32 @uminv_v32i32(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v32i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl32
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uminv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <32 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %op)
  ret i32 %res
}

define i32 @uminv_v64i32(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v64i32:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s, vl64
; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
; CHECK-NEXT: uminv s0, p0, z0.s
; CHECK-NEXT: fmov w0, s0
; CHECK-NEXT: ret
  %op = load <64 x i32>, ptr %a
  %res = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %op)
  ret i32 %res
}

; Nothing to do for single element vectors.
define i64 @uminv_v1i64(<1 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v1i64:
; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %a)
  ret i64 %res
}

; No NEON 64-bit vector UMINV support. Use SVE.
define i64 @uminv_v2i64(<2 x i64> %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v2i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: uminv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %res = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %a)
  ret i64 %res
}

define i64 @uminv_v4i64(ptr %a) vscale_range(2,0) #0 {
; CHECK-LABEL: uminv_v4i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl4
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uminv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <4 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v8i64(ptr %a) #0 {
; VBITS_GE_256-LABEL: uminv_v8i64:
; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: umin z0.d, p0/m, z0.d, z1.d
; VBITS_GE_256-NEXT: uminv d0, p0, z0.d
; VBITS_GE_256-NEXT: fmov x0, d0
; VBITS_GE_256-NEXT: ret
;
; VBITS_GE_512-LABEL: uminv_v8i64:
; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: uminv d0, p0, z0.d
; VBITS_GE_512-NEXT: fmov x0, d0
; VBITS_GE_512-NEXT: ret
  %op = load <8 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v16i64(ptr %a) vscale_range(8,0) #0 {
; CHECK-LABEL: uminv_v16i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl16
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uminv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <16 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %op)
  ret i64 %res
}

define i64 @uminv_v32i64(ptr %a) vscale_range(16,0) #0 {
; CHECK-LABEL: uminv_v32i64:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.d, vl32
; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
; CHECK-NEXT: uminv d0, p0, z0.d
; CHECK-NEXT: fmov x0, d0
; CHECK-NEXT: ret
  %op = load <32 x i64>, ptr %a
  %res = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %op)
  ret i64 %res
}

attributes #0 = { "target-features"="+sve" }

declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.smin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)

declare i8 @llvm.vector.reduce.umin.v8i8(<8 x i8>)
declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>)
declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>)
declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>)
declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>)
declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>)

declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>)
declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>)
declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>)
declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>)
declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>)

declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>)
declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>)
declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>)
declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>)
declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>)

declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>)
declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>)
declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>)
declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>)
declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>)