; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+m,+v -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

declare i8 @llvm.vector.reduce.add.v1i8(<1 x i8>)

define i8 @vreduce_add_v1i8(<1 x i8> %v) {
; CHECK-LABEL: vreduce_add_v1i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %red = call i8 @llvm.vector.reduce.add.v1i8(<1 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>)

define i8 @vreduce_add_v2i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v2i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v3i8(<3 x i8>)

define i8 @vreduce_add_v3i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v3i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <3 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v3i8(<3 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>)

define i8 @vreduce_add_v4i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>)

define i8 @vreduce_add_v8i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v8i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>)

define i8 @vreduce_add_v16i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v32i8(<32 x i8>)

define i8 @vreduce_add_v32i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v32i8(<32 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v64i8(<64 x i8>)

define i8 @vreduce_add_v64i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v64i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v64i8(<64 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>)

define i8 @vreduce_add_v128i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v128i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 128
; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <128 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.add.v256i8(<256 x i8>)

define i8 @vreduce_add_v256i8(ptr %x) {
; CHECK-LABEL: vreduce_add_v256i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 128
; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle8.v v16, (a0)
; CHECK-NEXT:    vadd.vv v8, v8, v16
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <256 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.add.v256i8(<256 x i8> %v)
  ret i8 %red
}

declare i16 @llvm.vector.reduce.add.v1i16(<1 x i16>)

define i16 @vreduce_add_v1i16(<1 x i16> %v) {
; CHECK-LABEL: vreduce_add_v1i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %v)
  ret i16 %red
}

define i16 @vwreduce_add_v1i16(<1 x i8> %v) {
; CHECK-LABEL: vwreduce_add_v1i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT:    vsext.vf2 v9, v8
; CHECK-NEXT:    vmv.x.s a0, v9
; CHECK-NEXT:    ret
  %e = sext <1 x i8> %v to <1 x i16>
  %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e)
  ret i16 %red
}

define i16 @vwreduce_uadd_v1i16(<1 x i8> %v) {
; CHECK-LABEL: vwreduce_uadd_v1i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e16, mf4, ta, ma
; CHECK-NEXT:    vzext.vf2 v9, v8
; CHECK-NEXT:    vmv.x.s a0, v9
; CHECK-NEXT:    ret
  %e = zext <1 x i8> %v to <1 x i16>
  %red = call i16 @llvm.vector.reduce.add.v1i16(<1 x i16> %e)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>)

define i16 @vreduce_add_v2i16(ptr %x) {
; CHECK-LABEL: vreduce_add_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %v)
  ret i16 %red
}

define i16 @vwreduce_add_v2i16(ptr %x) {
; CHECK-LABEL: vwreduce_add_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i8>, ptr %x
  %e = sext <2 x i8> %v to <2 x i16>
  %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e)
  ret i16 %red
}

define i16 @vwreduce_uadd_v2i16(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i8>, ptr %x
  %e = zext <2 x i8> %v to <2 x i16>
  %red = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %e)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

define i16 @vreduce_add_v4i16(ptr %x) {
; CHECK-LABEL: vreduce_add_v4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %v)
  ret i16 %red
}

define i16 @vwreduce_add_v4i16(ptr %x) {
; CHECK-LABEL: vwreduce_add_v4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i8>, ptr %x
  %e = sext <4 x i8> %v to <4 x i16>
  %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e)
  ret i16 %red
}

define i16 @vwreduce_uadd_v4i16(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i8>, ptr %x
  %e = zext <4 x i8> %v to <4 x i16>
  %red = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %e)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>)

define i16 @vreduce_add_v8i16(ptr %x) {
; CHECK-LABEL: vreduce_add_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %v)
  ret i16 %red
}

define i16 @vwreduce_add_v8i16(ptr %x) {
; CHECK-LABEL: vwreduce_add_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i8>, ptr %x
  %e = sext <8 x i8> %v to <8 x i16>
  %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e)
  ret i16 %red
}

define i16 @vwreduce_uadd_v8i16(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i8>, ptr %x
  %e = zext <8 x i8> %v to <8 x i16>
  %red = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %e)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>)

define i16 @vreduce_add_v16i16(ptr %x) {
; CHECK-LABEL: vreduce_add_v16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %v)
  ret i16 %red
}

define i16 @vwreduce_add_v16i16(ptr %x) {
; CHECK-LABEL: vwreduce_add_v16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i8>, ptr %x
  %e = sext <16 x i8> %v to <16 x i16>
  %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e)
  ret i16 %red
}

define i16 @vwreduce_uadd_v16i16(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, m1, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i8>, ptr %x
  %e = zext <16 x i8> %v to <16 x i16>
  %red = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %e)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.add.v32i16(<32 x i16>)

define i16 @vreduce_add_v32i16(ptr %x) {
; CHECK-LABEL: vreduce_add_v32i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %v)
  ret i16 %red
}

define i16 @vwreduce_add_v32i16(ptr %x) {
; CHECK-LABEL: vwreduce_add_v32i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i8>, ptr %x
  %e = sext <32 x i8> %v to <32 x i16>
  %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e)
  ret i16 %red
}

define i16 @vwreduce_uadd_v32i16(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v32i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i8>, ptr %x
  %e = zext <32 x i8> %v to <32 x i16>
  %red = call i16 @llvm.vector.reduce.add.v32i16(<32 x i16> %e)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.add.v64i16(<64 x i16>)

define i16 @vreduce_add_v64i16(ptr %x) {
; CHECK-LABEL: vreduce_add_v64i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %v)
  ret i16 %red
}

define i16 @vwreduce_add_v64i16(ptr %x) {
; CHECK-LABEL: vwreduce_add_v64i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v12
; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i8>, ptr %x
  %e = sext <64 x i8> %v to <64 x i16>
  %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e)
  ret i16 %red
}

define i16 @vwreduce_uadd_v64i16(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v64i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vsetvli zero, zero, e8, m4, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v12
; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i8>, ptr %x
  %e = zext <64 x i8> %v to <64 x i16>
  %red = call i16 @llvm.vector.reduce.add.v64i16(<64 x i16> %e)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.add.v128i16(<128 x i16>)

define i16 @vreduce_add_v128i16(ptr %x) {
; CHECK-LABEL: vreduce_add_v128i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle16.v v16, (a0)
; CHECK-NEXT:    vadd.vv v8, v8, v16
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <128 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %v)
  ret i16 %red
}

define i16 @vwreduce_add_v128i16(ptr %x) {
; CHECK-LABEL: vwreduce_add_v128i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 128
; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    li a0, 64
; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
; CHECK-NEXT:    vwadd.vv v24, v8, v16
; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v24, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <128 x i8>, ptr %x
  %e = sext <128 x i8> %v to <128 x i16>
  %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e)
  ret i16 %red
}

define i16 @vwreduce_uadd_v128i16(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v128i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 128
; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    li a0, 64
; CHECK-NEXT:    vsetvli zero, a0, e8, m8, ta, ma
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vsetvli zero, a0, e8, m4, ta, ma
; CHECK-NEXT:    vwaddu.vv v24, v8, v16
; CHECK-NEXT:    vsetvli zero, zero, e16, m8, ta, ma
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v24, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <128 x i8>, ptr %x
  %e = zext <128 x i8> %v to <128 x i16>
  %red = call i16 @llvm.vector.reduce.add.v128i16(<128 x i16> %e)
  ret i16 %red
}

declare i32 @llvm.vector.reduce.add.v1i32(<1 x i32>)

define i32 @vreduce_add_v1i32(<1 x i32> %v) {
; CHECK-LABEL: vreduce_add_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %v)
  ret i32 %red
}

define i32 @vwreduce_add_v1i32(<1 x i16> %v) {
; CHECK-LABEL: vwreduce_add_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT:    vsext.vf2 v9, v8
; CHECK-NEXT:    vmv.x.s a0, v9
; CHECK-NEXT:    ret
  %e = sext <1 x i16> %v to <1 x i32>
  %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e)
  ret i32 %red
}

define i32 @vwreduce_uadd_v1i32(<1 x i16> %v) {
; CHECK-LABEL: vwreduce_uadd_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e32, mf2, ta, ma
; CHECK-NEXT:    vzext.vf2 v9, v8
; CHECK-NEXT:    vmv.x.s a0, v9
; CHECK-NEXT:    ret
  %e = zext <1 x i16> %v to <1 x i32>
  %red = call i32 @llvm.vector.reduce.add.v1i32(<1 x i32> %e)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>)

define i32 @vreduce_add_v2i32(ptr %x) {
; CHECK-LABEL: vreduce_add_v2i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %v)
  ret i32 %red
}

define i32 @vwreduce_add_v2i32(ptr %x) {
; CHECK-LABEL: vwreduce_add_v2i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i16>, ptr %x
  %e = sext <2 x i16> %v to <2 x i32>
  %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e)
  ret i32 %red
}

define i32 @vwreduce_uadd_v2i32(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v2i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, mf4, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i16>, ptr %x
  %e = zext <2 x i16> %v to <2 x i32>
  %red = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %e)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)

define i32 @vreduce_add_v4i32(ptr %x) {
; CHECK-LABEL: vreduce_add_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v)
  ret i32 %red
}

define i32 @vwreduce_add_v4i32(ptr %x) {
; CHECK-LABEL: vwreduce_add_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i16>, ptr %x
  %e = sext <4 x i16> %v to <4 x i32>
  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
  ret i32 %red
}

define i32 @vwreduce_uadd_v4i32(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, mf2, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i16>, ptr %x
  %e = zext <4 x i16> %v to <4 x i32>
  %red = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %e)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>)

define i32 @vreduce_add_v8i32(ptr %x) {
; CHECK-LABEL: vreduce_add_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %v)
  ret i32 %red
}

define i32 @vwreduce_add_v8i32(ptr %x) {
; CHECK-LABEL: vwreduce_add_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i16>, ptr %x
  %e = sext <8 x i16> %v to <8 x i32>
  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
  ret i32 %red
}

define i32 @vwreduce_uadd_v8i32(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i16>, ptr %x
  %e = zext <8 x i16> %v to <8 x i32>
  %red = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %e)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>)

define i32 @vreduce_add_v16i32(ptr %x) {
; CHECK-LABEL: vreduce_add_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %v)
  ret i32 %red
}

define i32 @vwreduce_add_v16i32(ptr %x) {
; CHECK-LABEL: vwreduce_add_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i16>, ptr %x
  %e = sext <16 x i16> %v to <16 x i32>
  %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
  ret i32 %red
}

define i32 @vwreduce_uadd_v16i32(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i16>, ptr %x
  %e = zext <16 x i16> %v to <16 x i32>
  %red = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %e)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.add.v32i32(<32 x i32>)

define i32 @vreduce_add_v32i32(ptr %x) {
; CHECK-LABEL: vreduce_add_v32i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %v)
  ret i32 %red
}

define i32 @vwreduce_add_v32i32(ptr %x) {
; CHECK-LABEL: vwreduce_add_v32i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT:    vwredsum.vs v8, v8, v12
; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i16>, ptr %x
  %e = sext <32 x i16> %v to <32 x i32>
  %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e)
  ret i32 %red
}

define i32 @vwreduce_uadd_v32i32(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v32i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v12
; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i16>, ptr %x
  %e = zext <32 x i16> %v to <32 x i32>
  %red = call i32 @llvm.vector.reduce.add.v32i32(<32 x i32> %e)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.add.v64i32(<64 x i32>)

define i32 @vreduce_add_v64i32(ptr %x) {
; CHECK-LABEL: vreduce_add_v64i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle32.v v16, (a0)
; CHECK-NEXT:    vadd.vv v8, v8, v16
; CHECK-NEXT:    vmv.s.x v16, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %v)
  ret i32 %red
}

define i32 @vwreduce_add_v64i32(ptr %x) {
; CHECK-LABEL: vwreduce_add_v64i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
; CHECK-NEXT:    vwadd.vv v24, v8, v16
; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v24, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i16>, ptr %x
  %e = sext <64 x i16> %v to <64 x i32>
  %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e)
  ret i32 %red
}

define i32 @vwreduce_uadd_v64i32(ptr %x) {
; CHECK-LABEL: vwreduce_uadd_v64i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vsetvli zero, a0, e16, m8, ta, ma
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vsetvli zero, a0, e16, m4, ta, ma
; CHECK-NEXT:    vwaddu.vv v24, v8, v16
; CHECK-NEXT:    vsetvli zero, zero, e32, m8, ta, ma
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v24, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i16>, ptr %x
  %e = zext <64 x i16> %v to <64 x i32>
  %red = call i32 @llvm.vector.reduce.add.v64i32(<64 x i32> %e)
  ret i32 %red
}

declare i64 @llvm.vector.reduce.add.v1i64(<1 x i64>)

define i64 @vreduce_add_v1i64(<1 x i64> %v) {
; RV32-LABEL: vreduce_add_v1i64:
; RV32:       # %bb.0:
; RV32-NEXT:    li a0, 32
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v9, v8, a0
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_add_v1i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %v)
  ret i64 %red
}

define i64 @vwreduce_add_v1i64(<1 x i32> %v) {
; RV32-LABEL: vwreduce_add_v1i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsext.vf2 v9, v8
; RV32-NEXT:    li a0, 32
; RV32-NEXT:    vsrl.vx v8, v9, a0
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    vmv.x.s a0, v9
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_add_v1i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT:    vsext.vf2 v9, v8
; RV64-NEXT:    vmv.x.s a0, v9
; RV64-NEXT:    ret
  %e = sext <1 x i32> %v to <1 x i64>
  %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
  ret i64 %red
}

define i64 @vwreduce_uadd_v1i64(<1 x i32> %v) {
; RV32-LABEL: vwreduce_uadd_v1i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vzext.vf2 v9, v8
; RV32-NEXT:    li a0, 32
; RV32-NEXT:    vsrl.vx v8, v9, a0
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    vmv.x.s a0, v9
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_uadd_v1i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT:    vzext.vf2 v9, v8
; RV64-NEXT:    vmv.x.s a0, v9
; RV64-NEXT:    ret
  %e = zext <1 x i32> %v to <1 x i64>
  %red = call i64 @llvm.vector.reduce.add.v1i64(<1 x i64> %e)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>)

define i64 @vreduce_add_v2i64(ptr %x) {
; RV32-LABEL: vreduce_add_v2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    vmv.s.x v9, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vredsum.vs v8, v8, v9
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_add_v2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vmv.s.x v9, zero
; RV64-NEXT:    vredsum.vs v8, v8, v9
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <2 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %v)
  ret i64 %red
}

define i64 @vwreduce_add_v2i64(ptr %x) {
; RV32-LABEL: vwreduce_add_v2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vmv.s.x v9, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV32-NEXT:    vwredsum.vs v8, v8, v9
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_add_v2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vmv.s.x v9, zero
; RV64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV64-NEXT:    vwredsum.vs v8, v8, v9
; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <2 x i32>, ptr %x
  %e = sext <2 x i32> %v to <2 x i64>
  %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
  ret i64 %red
}

define i64 @vwreduce_uadd_v2i64(ptr %x) {
; RV32-LABEL: vwreduce_uadd_v2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vmv.s.x v9, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV32-NEXT:    vwredsumu.vs v8, v8, v9
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_uadd_v2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vmv.s.x v9, zero
; RV64-NEXT:    vsetvli zero, zero, e32, mf2, ta, ma
; RV64-NEXT:    vwredsumu.vs v8, v8, v9
; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <2 x i32>, ptr %x
  %e = zext <2 x i32> %v to <2 x i64>
  %red = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %e)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)

define i64 @vreduce_add_v4i64(ptr %x) {
; RV32-LABEL: vreduce_add_v4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    vmv.s.x v10, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vredsum.vs v8, v8, v10
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_add_v4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vmv.s.x v10, zero
; RV64-NEXT:    vredsum.vs v8, v8, v10
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <4 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  ret i64 %red
}

define i64 @vwreduce_add_v4i64(ptr %x) {
; RV32-LABEL: vwreduce_add_v4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vmv.s.x v9, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; RV32-NEXT:    vwredsum.vs v8, v8, v9
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_add_v4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vmv.s.x v9, zero
; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; RV64-NEXT:    vwredsum.vs v8, v8, v9
; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <4 x i32>, ptr %x
  %e = sext <4 x i32> %v to <4 x i64>
  %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
  ret i64 %red
}

define i64 @vwreduce_uadd_v4i64(ptr %x) {
; RV32-LABEL: vwreduce_uadd_v4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vmv.s.x v9, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; RV32-NEXT:    vwredsumu.vs v8, v8, v9
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_uadd_v4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vmv.s.x v9, zero
; RV64-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; RV64-NEXT:    vwredsumu.vs v8, v8, v9
; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <4 x i32>, ptr %x
  %e = zext <4 x i32> %v to <4 x i64>
  %red = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %e)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>)

define i64 @vreduce_add_v8i64(ptr %x) {
; RV32-LABEL: vreduce_add_v8i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    vmv.s.x v12, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vredsum.vs v8, v8, v12
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_add_v8i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vmv.s.x v12, zero
; RV64-NEXT:    vredsum.vs v8, v8, v12
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <8 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %v)
  ret i64 %red
}

define i64 @vwreduce_add_v8i64(ptr %x) {
; RV32-LABEL: vwreduce_add_v8i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vmv.s.x v10, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; RV32-NEXT:    vwredsum.vs v8, v8, v10
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_add_v8i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vmv.s.x v10, zero
; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; RV64-NEXT:    vwredsum.vs v8, v8, v10
; RV64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <8 x i32>, ptr %x
  %e = sext <8 x i32> %v to <8 x i64>
  %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
  ret i64 %red
}

define i64 @vwreduce_uadd_v8i64(ptr %x) {
; RV32-LABEL: vwreduce_uadd_v8i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vmv.s.x v10, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; RV32-NEXT:    vwredsumu.vs v8, v8, v10
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_uadd_v8i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vmv.s.x v10, zero
; RV64-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; RV64-NEXT:    vwredsumu.vs v8, v8, v10
; RV64-NEXT:    vsetvli zero, zero, e64, m4, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <8 x i32>, ptr %x
  %e = zext <8 x i32> %v to <8 x i64>
  %red = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %e)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>)

define i64 @vreduce_add_v16i64(ptr %x) {
; RV32-LABEL: vreduce_add_v16i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    vmv.s.x v16, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vredsum.vs v8, v8, v16
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_add_v16i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vmv.s.x v16, zero
; RV64-NEXT:    vredsum.vs v8, v8, v16
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <16 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %v)
  ret i64 %red
}

define i64 @vwreduce_add_v16i64(ptr %x) {
; RV32-LABEL: vwreduce_add_v16i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vmv.s.x v12, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; RV32-NEXT:    vwredsum.vs v8, v8, v12
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_add_v16i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vmv.s.x v12, zero
; RV64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; RV64-NEXT:    vwredsum.vs v8, v8, v12
; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <16 x i32>, ptr %x
  %e = sext <16 x i32> %v to <16 x i64>
  %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
  ret i64 %red
}

define i64 @vwreduce_uadd_v16i64(ptr %x) {
; RV32-LABEL: vwreduce_uadd_v16i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vmv.s.x v12, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; RV32-NEXT:    vwredsumu.vs v8, v8, v12
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_uadd_v16i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vmv.s.x v12, zero
; RV64-NEXT:    vsetvli zero, zero, e32, m4, ta, ma
; RV64-NEXT:    vwredsumu.vs v8, v8, v12
; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <16 x i32>, ptr %x
  %e = zext <16 x i32> %v to <16 x i64>
  %red = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %e)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.add.v32i64(<32 x i64>)

define i64 @vreduce_add_v32i64(ptr %x) {
; RV32-LABEL: vreduce_add_v32i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a0, a0, 128
; RV32-NEXT:    vle64.v v16, (a0)
; RV32-NEXT:    vadd.vv v8, v8, v16
; RV32-NEXT:    vmv.s.x v16, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vredsum.vs v8, v8, v16
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_add_v32i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    addi a0, a0, 128
; RV64-NEXT:    vle64.v v16, (a0)
; RV64-NEXT:    vadd.vv v8, v8, v16
; RV64-NEXT:    vmv.s.x v16, zero
; RV64-NEXT:    vredsum.vs v8, v8, v16
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <32 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %v)
  ret i64 %red
}

define i64 @vwreduce_add_v32i64(ptr %x) {
; RV32-LABEL: vwreduce_add_v32i64:
; RV32:       # %bb.0:
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT:    vslidedown.vi v16, v8, 16
; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT:    vwadd.vv v24, v8, v16
; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT:    vmv.s.x v8, zero
; RV32-NEXT:    vredsum.vs v8, v24, v8
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_add_v32i64:
; RV64:       # %bb.0:
; RV64-NEXT:    li a1, 32
; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV64-NEXT:    vslidedown.vi v16, v8, 16
; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV64-NEXT:    vwadd.vv v24, v8, v16
; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT:    vmv.s.x v8, zero
; RV64-NEXT:    vredsum.vs v8, v24, v8
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <32 x i32>, ptr %x
  %e = sext <32 x i32> %v to <32 x i64>
  %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
  ret i64 %red
}

define i64 @vwreduce_uadd_v32i64(ptr %x) {
; RV32-LABEL: vwreduce_uadd_v32i64:
; RV32:       # %bb.0:
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; RV32-NEXT:    vle32.v v8, (a0)
; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT:    vslidedown.vi v16, v8, 16
; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT:    vwaddu.vv v24, v8, v16
; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT:    vmv.s.x v8, zero
; RV32-NEXT:    vredsum.vs v8, v24, v8
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_uadd_v32i64:
; RV64:       # %bb.0:
; RV64-NEXT:    li a1, 32
; RV64-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV64-NEXT:    vslidedown.vi v16, v8, 16
; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV64-NEXT:    vwaddu.vv v24, v8, v16
; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT:    vmv.s.x v8, zero
; RV64-NEXT:    vredsum.vs v8, v24, v8
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <32 x i32>, ptr %x
  %e = zext <32 x i32> %v to <32 x i64>
  %red = call i64 @llvm.vector.reduce.add.v32i64(<32 x i64> %e)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.add.v64i64(<64 x i64>)

define i64 @vreduce_add_v64i64(ptr %x) nounwind {
; RV32-LABEL: vreduce_add_v64i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi a1, a0, 384
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v24, (a1)
; RV32-NEXT:    addi a1, a0, 128
; RV32-NEXT:    vle64.v v0, (a1)
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a0, a0, 256
; RV32-NEXT:    vle64.v v16, (a0)
; RV32-NEXT:    vadd.vv v24, v0, v24
; RV32-NEXT:    vmv.s.x v7, zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vadd.vv v8, v8, v16
; RV32-NEXT:    vadd.vv v8, v8, v24
; RV32-NEXT:    vredsum.vs v8, v8, v7
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_add_v64i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    addi a1, a0, 384
; RV64-NEXT:    vle64.v v16, (a1)
; RV64-NEXT:    addi a1, a0, 256
; RV64-NEXT:    addi a0, a0, 128
; RV64-NEXT:    vle64.v v24, (a0)
; RV64-NEXT:    vle64.v v0, (a1)
; RV64-NEXT:    vadd.vv v16, v24, v16
; RV64-NEXT:    vadd.vv v8, v8, v0
; RV64-NEXT:    vadd.vv v8, v8, v16
; RV64-NEXT:    vmv.s.x v16, zero
; RV64-NEXT:    vredsum.vs v8, v8, v16
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <64 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %v)
  ret i64 %red
}

define i64 @vwreduce_add_v64i64(ptr %x) {
; RV32-LABEL: vwreduce_add_v64i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi a1, a0, 128
; RV32-NEXT:    li a2, 32
; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT:    vle32.v v16, (a0)
; RV32-NEXT:    vle32.v v8, (a1)
; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT:    vslidedown.vi v0, v16, 16
; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT:    vwadd.vv v24, v16, v8
; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT:    vslidedown.vi v8, v8, 16
; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT:    vwadd.vv v16, v0, v8
; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT:    vadd.vv v8, v24, v16
; RV32-NEXT:    vmv.s.x v16, zero
; RV32-NEXT:    vredsum.vs v8, v8, v16
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a2
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_add_v64i64:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    addi a1, a0, 128
; RV64-NEXT:    li a2, 32
; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vle32.v v16, (a1)
; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV64-NEXT:    vslidedown.vi v24, v8, 16
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
; RV64-NEXT:    vslidedown.vi v0, v16, 16
; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV64-NEXT:    vwadd.vv v24, v8, v16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vwadd.vv v8, v16, v0
; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT:    vadd.vv v8, v24, v8
; RV64-NEXT:    vmv.s.x v16, zero
; RV64-NEXT:    vredsum.vs v8, v8, v16
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    add sp, sp, a1
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = load <64 x i32>, ptr %x
  %e = sext <64 x i32> %v to <64 x i64>
  %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
  ret i64 %red
}

define i64 @vwreduce_uadd_v64i64(ptr %x) {
; RV32-LABEL: vwreduce_uadd_v64i64:
; RV32:       # %bb.0:
; RV32-NEXT:    addi a1, a0, 128
; RV32-NEXT:    li a2, 32
; RV32-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
; RV32-NEXT:    vle32.v v16, (a0)
; RV32-NEXT:    vle32.v v8, (a1)
; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT:    vslidedown.vi v0, v16, 16
; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT:    vwaddu.vv v24, v16, v8
; RV32-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV32-NEXT:    vslidedown.vi v8, v8, 16
; RV32-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV32-NEXT:    vwaddu.vv v16, v0, v8
; RV32-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV32-NEXT:    vadd.vv v8, v24, v16
; RV32-NEXT:    vmv.s.x v16, zero
; RV32-NEXT:    vredsum.vs v8, v8, v16
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a2
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vwreduce_uadd_v64i64:
; RV64:       # %bb.0:
; RV64-NEXT:    addi sp, sp, -16
; RV64-NEXT:    .cfi_def_cfa_offset 16
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    sub sp, sp, a1
; RV64-NEXT:    .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
; RV64-NEXT:    addi a1, a0, 128
; RV64-NEXT:    li a2, 32
; RV64-NEXT:    vsetvli zero, a2, e32, m8, ta, ma
; RV64-NEXT:    vle32.v v8, (a0)
; RV64-NEXT:    vle32.v v16, (a1)
; RV64-NEXT:    vsetivli zero, 16, e32, m8, ta, ma
; RV64-NEXT:    vslidedown.vi v24, v8, 16
; RV64-NEXT:    addi a0, sp, 16
; RV64-NEXT:    vs8r.v v24, (a0) # Unknown-size Folded Spill
; RV64-NEXT:    vslidedown.vi v0, v16, 16
; RV64-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; RV64-NEXT:    vwaddu.vv v24, v8, v16
; RV64-NEXT:    vl8r.v v16, (a0) # Unknown-size Folded Reload
; RV64-NEXT:    vwaddu.vv v8, v16, v0
; RV64-NEXT:    vsetvli zero, zero, e64, m8, ta, ma
; RV64-NEXT:    vadd.vv v8, v24, v8
; RV64-NEXT:    vmv.s.x v16, zero
; RV64-NEXT:    vredsum.vs v8, v8, v16
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    csrr a1, vlenb
; RV64-NEXT:    slli a1, a1, 3
; RV64-NEXT:    add sp, sp, a1
; RV64-NEXT:    .cfi_def_cfa sp, 16
; RV64-NEXT:    addi sp, sp, 16
; RV64-NEXT:    .cfi_def_cfa_offset 0
; RV64-NEXT:    ret
  %v = load <64 x i32>, ptr %x
  %e = zext <64 x i32> %v to <64 x i64>
  %red = call i64 @llvm.vector.reduce.add.v64i64(<64 x i64> %e)
  ret i64 %red
}

declare i8 @llvm.vector.reduce.and.v1i8(<1 x i8>)

define i8 @vreduce_and_v1i8(<1 x i8> %v) {
; CHECK-LABEL: vreduce_and_v1i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %red = call i8 @llvm.vector.reduce.and.v1i8(<1 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.and.v2i8(<2 x i8>)

define i8 @vreduce_and_v2i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v2i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.and.v2i8(<2 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.and.v3i8(<3 x i8>)

define i8 @vreduce_and_v3i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v3i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    li a0, -1
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vredand.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <3 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.and.v3i8(<3 x i8> %v)
  ret i8 %red
}


declare i8 @llvm.vector.reduce.and.v4i8(<4 x i8>)

define i8 @vreduce_and_v4i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.and.v4i8(<4 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.and.v8i8(<8 x i8>)

define i8 @vreduce_and_v8i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v8i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)

define i8 @vreduce_and_v16i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.and.v32i8(<32 x i8>)

define i8 @vreduce_and_v32i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.and.v32i8(<32 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.and.v64i8(<64 x i8>)

define i8 @vreduce_and_v64i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v64i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.and.v64i8(<64 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.and.v128i8(<128 x i8>)

define i8 @vreduce_and_v128i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v128i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 128
; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <128 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.and.v128i8(<128 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.and.v256i8(<256 x i8>)

define i8 @vreduce_and_v256i8(ptr %x) {
; CHECK-LABEL: vreduce_and_v256i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 128
; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle8.v v16, (a0)
; CHECK-NEXT:    vand.vv v8, v8, v16
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <256 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.and.v256i8(<256 x i8> %v)
  ret i8 %red
}

declare i16 @llvm.vector.reduce.and.v1i16(<1 x i16>)

define i16 @vreduce_and_v1i16(<1 x i16> %v) {
; CHECK-LABEL: vreduce_and_v1i16:
CHECK-LABEL: vreduce_and_v1i16: 1809; CHECK: # %bb.0: 1810; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma 1811; CHECK-NEXT: vmv.x.s a0, v8 1812; CHECK-NEXT: ret 1813 %red = call i16 @llvm.vector.reduce.and.v1i16(<1 x i16> %v) 1814 ret i16 %red 1815} 1816 1817declare i16 @llvm.vector.reduce.and.v2i16(<2 x i16>) 1818 1819define i16 @vreduce_and_v2i16(ptr %x) { 1820; CHECK-LABEL: vreduce_and_v2i16: 1821; CHECK: # %bb.0: 1822; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma 1823; CHECK-NEXT: vle16.v v8, (a0) 1824; CHECK-NEXT: vredand.vs v8, v8, v8 1825; CHECK-NEXT: vmv.x.s a0, v8 1826; CHECK-NEXT: ret 1827 %v = load <2 x i16>, ptr %x 1828 %red = call i16 @llvm.vector.reduce.and.v2i16(<2 x i16> %v) 1829 ret i16 %red 1830} 1831 1832declare i16 @llvm.vector.reduce.and.v4i16(<4 x i16>) 1833 1834define i16 @vreduce_and_v4i16(ptr %x) { 1835; CHECK-LABEL: vreduce_and_v4i16: 1836; CHECK: # %bb.0: 1837; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma 1838; CHECK-NEXT: vle16.v v8, (a0) 1839; CHECK-NEXT: vredand.vs v8, v8, v8 1840; CHECK-NEXT: vmv.x.s a0, v8 1841; CHECK-NEXT: ret 1842 %v = load <4 x i16>, ptr %x 1843 %red = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> %v) 1844 ret i16 %red 1845} 1846 1847declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>) 1848 1849define i16 @vreduce_and_v8i16(ptr %x) { 1850; CHECK-LABEL: vreduce_and_v8i16: 1851; CHECK: # %bb.0: 1852; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma 1853; CHECK-NEXT: vle16.v v8, (a0) 1854; CHECK-NEXT: vredand.vs v8, v8, v8 1855; CHECK-NEXT: vmv.x.s a0, v8 1856; CHECK-NEXT: ret 1857 %v = load <8 x i16>, ptr %x 1858 %red = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %v) 1859 ret i16 %red 1860} 1861 1862declare i16 @llvm.vector.reduce.and.v16i16(<16 x i16>) 1863 1864define i16 @vreduce_and_v16i16(ptr %x) { 1865; CHECK-LABEL: vreduce_and_v16i16: 1866; CHECK: # %bb.0: 1867; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma 1868; CHECK-NEXT: vle16.v v8, (a0) 1869; CHECK-NEXT: vredand.vs v8, v8, v8 1870; CHECK-NEXT: vmv.x.s a0, v8 1871; CHECK-NEXT: ret 1872 %v = load <16 x i16>, ptr %x 1873 %red = call i16 @llvm.vector.reduce.and.v16i16(<16 x i16> %v) 1874 ret i16 %red 1875} 1876 1877declare i16 @llvm.vector.reduce.and.v32i16(<32 x i16>) 1878 1879define i16 @vreduce_and_v32i16(ptr %x) { 1880; CHECK-LABEL: vreduce_and_v32i16: 1881; CHECK: # %bb.0: 1882; CHECK-NEXT: li a1, 32 1883; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 1884; CHECK-NEXT: vle16.v v8, (a0) 1885; CHECK-NEXT: vredand.vs v8, v8, v8 1886; CHECK-NEXT: vmv.x.s a0, v8 1887; CHECK-NEXT: ret 1888 %v = load <32 x i16>, ptr %x 1889 %red = call i16 @llvm.vector.reduce.and.v32i16(<32 x i16> %v) 1890 ret i16 %red 1891} 1892 1893declare i16 @llvm.vector.reduce.and.v64i16(<64 x i16>) 1894 1895define i16 @vreduce_and_v64i16(ptr %x) { 1896; CHECK-LABEL: vreduce_and_v64i16: 1897; CHECK: # %bb.0: 1898; CHECK-NEXT: li a1, 64 1899; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 1900; CHECK-NEXT: vle16.v v8, (a0) 1901; CHECK-NEXT: vredand.vs v8, v8, v8 1902; CHECK-NEXT: vmv.x.s a0, v8 1903; CHECK-NEXT: ret 1904 %v = load <64 x i16>, ptr %x 1905 %red = call i16 @llvm.vector.reduce.and.v64i16(<64 x i16> %v) 1906 ret i16 %red 1907} 1908 1909declare i16 @llvm.vector.reduce.and.v128i16(<128 x i16>) 1910 1911define i16 @vreduce_and_v128i16(ptr %x) { 1912; CHECK-LABEL: vreduce_and_v128i16: 1913; CHECK: # %bb.0: 1914; CHECK-NEXT: li a1, 64 1915; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 1916; CHECK-NEXT: vle16.v v8, (a0) 1917; CHECK-NEXT: addi a0, a0, 128 1918; CHECK-NEXT: vle16.v v16, (a0) 1919; 
CHECK-NEXT: vand.vv v8, v8, v16 1920; CHECK-NEXT: vredand.vs v8, v8, v8 1921; CHECK-NEXT: vmv.x.s a0, v8 1922; CHECK-NEXT: ret 1923 %v = load <128 x i16>, ptr %x 1924 %red = call i16 @llvm.vector.reduce.and.v128i16(<128 x i16> %v) 1925 ret i16 %red 1926} 1927 1928declare i32 @llvm.vector.reduce.and.v1i32(<1 x i32>) 1929 1930define i32 @vreduce_and_v1i32(<1 x i32> %v) { 1931; CHECK-LABEL: vreduce_and_v1i32: 1932; CHECK: # %bb.0: 1933; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma 1934; CHECK-NEXT: vmv.x.s a0, v8 1935; CHECK-NEXT: ret 1936 %red = call i32 @llvm.vector.reduce.and.v1i32(<1 x i32> %v) 1937 ret i32 %red 1938} 1939 1940declare i32 @llvm.vector.reduce.and.v2i32(<2 x i32>) 1941 1942define i32 @vreduce_and_v2i32(ptr %x) { 1943; CHECK-LABEL: vreduce_and_v2i32: 1944; CHECK: # %bb.0: 1945; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma 1946; CHECK-NEXT: vle32.v v8, (a0) 1947; CHECK-NEXT: vredand.vs v8, v8, v8 1948; CHECK-NEXT: vmv.x.s a0, v8 1949; CHECK-NEXT: ret 1950 %v = load <2 x i32>, ptr %x 1951 %red = call i32 @llvm.vector.reduce.and.v2i32(<2 x i32> %v) 1952 ret i32 %red 1953} 1954 1955declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>) 1956 1957define i32 @vreduce_and_v4i32(ptr %x) { 1958; CHECK-LABEL: vreduce_and_v4i32: 1959; CHECK: # %bb.0: 1960; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma 1961; CHECK-NEXT: vle32.v v8, (a0) 1962; CHECK-NEXT: vredand.vs v8, v8, v8 1963; CHECK-NEXT: vmv.x.s a0, v8 1964; CHECK-NEXT: ret 1965 %v = load <4 x i32>, ptr %x 1966 %red = call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %v) 1967 ret i32 %red 1968} 1969 1970declare i32 @llvm.vector.reduce.and.v8i32(<8 x i32>) 1971 1972define i32 @vreduce_and_v8i32(ptr %x) { 1973; CHECK-LABEL: vreduce_and_v8i32: 1974; CHECK: # %bb.0: 1975; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma 1976; CHECK-NEXT: vle32.v v8, (a0) 1977; CHECK-NEXT: vredand.vs v8, v8, v8 1978; CHECK-NEXT: vmv.x.s a0, v8 1979; CHECK-NEXT: ret 1980 %v = load <8 x i32>, ptr %x 1981 %red = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> %v) 1982 ret i32 %red 1983} 1984 1985declare i32 @llvm.vector.reduce.and.v16i32(<16 x i32>) 1986 1987define i32 @vreduce_and_v16i32(ptr %x) { 1988; CHECK-LABEL: vreduce_and_v16i32: 1989; CHECK: # %bb.0: 1990; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma 1991; CHECK-NEXT: vle32.v v8, (a0) 1992; CHECK-NEXT: vredand.vs v8, v8, v8 1993; CHECK-NEXT: vmv.x.s a0, v8 1994; CHECK-NEXT: ret 1995 %v = load <16 x i32>, ptr %x 1996 %red = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> %v) 1997 ret i32 %red 1998} 1999 2000declare i32 @llvm.vector.reduce.and.v32i32(<32 x i32>) 2001 2002define i32 @vreduce_and_v32i32(ptr %x) { 2003; CHECK-LABEL: vreduce_and_v32i32: 2004; CHECK: # %bb.0: 2005; CHECK-NEXT: li a1, 32 2006; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 2007; CHECK-NEXT: vle32.v v8, (a0) 2008; CHECK-NEXT: vredand.vs v8, v8, v8 2009; CHECK-NEXT: vmv.x.s a0, v8 2010; CHECK-NEXT: ret 2011 %v = load <32 x i32>, ptr %x 2012 %red = call i32 @llvm.vector.reduce.and.v32i32(<32 x i32> %v) 2013 ret i32 %red 2014} 2015 2016declare i32 @llvm.vector.reduce.and.v64i32(<64 x i32>) 2017 2018define i32 @vreduce_and_v64i32(ptr %x) { 2019; CHECK-LABEL: vreduce_and_v64i32: 2020; CHECK: # %bb.0: 2021; CHECK-NEXT: li a1, 32 2022; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 2023; CHECK-NEXT: vle32.v v8, (a0) 2024; CHECK-NEXT: addi a0, a0, 128 2025; CHECK-NEXT: vle32.v v16, (a0) 2026; CHECK-NEXT: vand.vv v8, v8, v16 2027; CHECK-NEXT: vredand.vs v8, v8, v8 2028; CHECK-NEXT: vmv.x.s a0, v8 2029; CHECK-NEXT: ret 2030 %v = 
load <64 x i32>, ptr %x 2031 %red = call i32 @llvm.vector.reduce.and.v64i32(<64 x i32> %v) 2032 ret i32 %red 2033} 2034 2035declare i64 @llvm.vector.reduce.and.v1i64(<1 x i64>) 2036 2037define i64 @vreduce_and_v1i64(<1 x i64> %v) { 2038; RV32-LABEL: vreduce_and_v1i64: 2039; RV32: # %bb.0: 2040; RV32-NEXT: li a0, 32 2041; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2042; RV32-NEXT: vsrl.vx v9, v8, a0 2043; RV32-NEXT: vmv.x.s a1, v9 2044; RV32-NEXT: vmv.x.s a0, v8 2045; RV32-NEXT: ret 2046; 2047; RV64-LABEL: vreduce_and_v1i64: 2048; RV64: # %bb.0: 2049; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2050; RV64-NEXT: vmv.x.s a0, v8 2051; RV64-NEXT: ret 2052 %red = call i64 @llvm.vector.reduce.and.v1i64(<1 x i64> %v) 2053 ret i64 %red 2054} 2055 2056declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>) 2057 2058define i64 @vreduce_and_v2i64(ptr %x) { 2059; RV32-LABEL: vreduce_and_v2i64: 2060; RV32: # %bb.0: 2061; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma 2062; RV32-NEXT: vle64.v v8, (a0) 2063; RV32-NEXT: li a0, 32 2064; RV32-NEXT: vredand.vs v8, v8, v8 2065; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2066; RV32-NEXT: vsrl.vx v9, v8, a0 2067; RV32-NEXT: vmv.x.s a1, v9 2068; RV32-NEXT: vmv.x.s a0, v8 2069; RV32-NEXT: ret 2070; 2071; RV64-LABEL: vreduce_and_v2i64: 2072; RV64: # %bb.0: 2073; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma 2074; RV64-NEXT: vle64.v v8, (a0) 2075; RV64-NEXT: vredand.vs v8, v8, v8 2076; RV64-NEXT: vmv.x.s a0, v8 2077; RV64-NEXT: ret 2078 %v = load <2 x i64>, ptr %x 2079 %red = call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %v) 2080 ret i64 %red 2081} 2082 2083declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>) 2084 2085define i64 @vreduce_and_v4i64(ptr %x) { 2086; RV32-LABEL: vreduce_and_v4i64: 2087; RV32: # %bb.0: 2088; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma 2089; RV32-NEXT: vle64.v v8, (a0) 2090; RV32-NEXT: li a1, 32 2091; RV32-NEXT: vredand.vs v8, v8, v8 2092; RV32-NEXT: vmv.x.s a0, v8 2093; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2094; RV32-NEXT: vsrl.vx v8, v8, a1 2095; RV32-NEXT: vmv.x.s a1, v8 2096; RV32-NEXT: ret 2097; 2098; RV64-LABEL: vreduce_and_v4i64: 2099; RV64: # %bb.0: 2100; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma 2101; RV64-NEXT: vle64.v v8, (a0) 2102; RV64-NEXT: vredand.vs v8, v8, v8 2103; RV64-NEXT: vmv.x.s a0, v8 2104; RV64-NEXT: ret 2105 %v = load <4 x i64>, ptr %x 2106 %red = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v) 2107 ret i64 %red 2108} 2109 2110declare i64 @llvm.vector.reduce.and.v8i64(<8 x i64>) 2111 2112define i64 @vreduce_and_v8i64(ptr %x) { 2113; RV32-LABEL: vreduce_and_v8i64: 2114; RV32: # %bb.0: 2115; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma 2116; RV32-NEXT: vle64.v v8, (a0) 2117; RV32-NEXT: li a1, 32 2118; RV32-NEXT: vredand.vs v8, v8, v8 2119; RV32-NEXT: vmv.x.s a0, v8 2120; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2121; RV32-NEXT: vsrl.vx v8, v8, a1 2122; RV32-NEXT: vmv.x.s a1, v8 2123; RV32-NEXT: ret 2124; 2125; RV64-LABEL: vreduce_and_v8i64: 2126; RV64: # %bb.0: 2127; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma 2128; RV64-NEXT: vle64.v v8, (a0) 2129; RV64-NEXT: vredand.vs v8, v8, v8 2130; RV64-NEXT: vmv.x.s a0, v8 2131; RV64-NEXT: ret 2132 %v = load <8 x i64>, ptr %x 2133 %red = call i64 @llvm.vector.reduce.and.v8i64(<8 x i64> %v) 2134 ret i64 %red 2135} 2136 2137declare i64 @llvm.vector.reduce.and.v16i64(<16 x i64>) 2138 2139define i64 @vreduce_and_v16i64(ptr %x) { 2140; RV32-LABEL: vreduce_and_v16i64: 2141; RV32: # %bb.0: 2142; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2143; RV32-NEXT: 
vle64.v v8, (a0) 2144; RV32-NEXT: li a1, 32 2145; RV32-NEXT: vredand.vs v8, v8, v8 2146; RV32-NEXT: vmv.x.s a0, v8 2147; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2148; RV32-NEXT: vsrl.vx v8, v8, a1 2149; RV32-NEXT: vmv.x.s a1, v8 2150; RV32-NEXT: ret 2151; 2152; RV64-LABEL: vreduce_and_v16i64: 2153; RV64: # %bb.0: 2154; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2155; RV64-NEXT: vle64.v v8, (a0) 2156; RV64-NEXT: vredand.vs v8, v8, v8 2157; RV64-NEXT: vmv.x.s a0, v8 2158; RV64-NEXT: ret 2159 %v = load <16 x i64>, ptr %x 2160 %red = call i64 @llvm.vector.reduce.and.v16i64(<16 x i64> %v) 2161 ret i64 %red 2162} 2163 2164declare i64 @llvm.vector.reduce.and.v32i64(<32 x i64>) 2165 2166define i64 @vreduce_and_v32i64(ptr %x) { 2167; RV32-LABEL: vreduce_and_v32i64: 2168; RV32: # %bb.0: 2169; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2170; RV32-NEXT: vle64.v v8, (a0) 2171; RV32-NEXT: addi a0, a0, 128 2172; RV32-NEXT: vle64.v v16, (a0) 2173; RV32-NEXT: li a1, 32 2174; RV32-NEXT: vand.vv v8, v8, v16 2175; RV32-NEXT: vredand.vs v8, v8, v8 2176; RV32-NEXT: vmv.x.s a0, v8 2177; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2178; RV32-NEXT: vsrl.vx v8, v8, a1 2179; RV32-NEXT: vmv.x.s a1, v8 2180; RV32-NEXT: ret 2181; 2182; RV64-LABEL: vreduce_and_v32i64: 2183; RV64: # %bb.0: 2184; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2185; RV64-NEXT: vle64.v v8, (a0) 2186; RV64-NEXT: addi a0, a0, 128 2187; RV64-NEXT: vle64.v v16, (a0) 2188; RV64-NEXT: vand.vv v8, v8, v16 2189; RV64-NEXT: vredand.vs v8, v8, v8 2190; RV64-NEXT: vmv.x.s a0, v8 2191; RV64-NEXT: ret 2192 %v = load <32 x i64>, ptr %x 2193 %red = call i64 @llvm.vector.reduce.and.v32i64(<32 x i64> %v) 2194 ret i64 %red 2195} 2196 2197declare i64 @llvm.vector.reduce.and.v64i64(<64 x i64>) 2198 2199define i64 @vreduce_and_v64i64(ptr %x) nounwind { 2200; RV32-LABEL: vreduce_and_v64i64: 2201; RV32: # %bb.0: 2202; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2203; RV32-NEXT: vle64.v v8, (a0) 2204; RV32-NEXT: addi a1, a0, 384 2205; RV32-NEXT: vle64.v v16, (a1) 2206; RV32-NEXT: addi a1, a0, 256 2207; RV32-NEXT: addi a0, a0, 128 2208; RV32-NEXT: vle64.v v0, (a0) 2209; RV32-NEXT: vle64.v v24, (a1) 2210; RV32-NEXT: li a1, 32 2211; RV32-NEXT: vand.vv v16, v0, v16 2212; RV32-NEXT: vand.vv v8, v8, v24 2213; RV32-NEXT: vand.vv v8, v8, v16 2214; RV32-NEXT: vredand.vs v8, v8, v8 2215; RV32-NEXT: vmv.x.s a0, v8 2216; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2217; RV32-NEXT: vsrl.vx v8, v8, a1 2218; RV32-NEXT: vmv.x.s a1, v8 2219; RV32-NEXT: ret 2220; 2221; RV64-LABEL: vreduce_and_v64i64: 2222; RV64: # %bb.0: 2223; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2224; RV64-NEXT: vle64.v v8, (a0) 2225; RV64-NEXT: addi a1, a0, 384 2226; RV64-NEXT: vle64.v v16, (a1) 2227; RV64-NEXT: addi a1, a0, 256 2228; RV64-NEXT: addi a0, a0, 128 2229; RV64-NEXT: vle64.v v24, (a0) 2230; RV64-NEXT: vle64.v v0, (a1) 2231; RV64-NEXT: vand.vv v16, v24, v16 2232; RV64-NEXT: vand.vv v8, v8, v0 2233; RV64-NEXT: vand.vv v8, v8, v16 2234; RV64-NEXT: vredand.vs v8, v8, v8 2235; RV64-NEXT: vmv.x.s a0, v8 2236; RV64-NEXT: ret 2237 %v = load <64 x i64>, ptr %x 2238 %red = call i64 @llvm.vector.reduce.and.v64i64(<64 x i64> %v) 2239 ret i64 %red 2240} 2241 2242declare i8 @llvm.vector.reduce.or.v1i8(<1 x i8>) 2243 2244define i8 @vreduce_or_v1i8(<1 x i8> %v) { 2245; CHECK-LABEL: vreduce_or_v1i8: 2246; CHECK: # %bb.0: 2247; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma 2248; CHECK-NEXT: vmv.x.s a0, v8 2249; CHECK-NEXT: ret 2250 %red = call i8 @llvm.vector.reduce.or.v1i8(<1 x i8> %v) 2251 ret 
i8 %red 2252} 2253 2254declare i8 @llvm.vector.reduce.or.v2i8(<2 x i8>) 2255 2256define i8 @vreduce_or_v2i8(ptr %x) { 2257; CHECK-LABEL: vreduce_or_v2i8: 2258; CHECK: # %bb.0: 2259; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma 2260; CHECK-NEXT: vle8.v v8, (a0) 2261; CHECK-NEXT: vredor.vs v8, v8, v8 2262; CHECK-NEXT: vmv.x.s a0, v8 2263; CHECK-NEXT: ret 2264 %v = load <2 x i8>, ptr %x 2265 %red = call i8 @llvm.vector.reduce.or.v2i8(<2 x i8> %v) 2266 ret i8 %red 2267} 2268 2269declare i8 @llvm.vector.reduce.or.v3i8(<3 x i8>) 2270 2271define i8 @vreduce_or_v3i8(ptr %x) { 2272; CHECK-LABEL: vreduce_or_v3i8: 2273; CHECK: # %bb.0: 2274; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma 2275; CHECK-NEXT: vle8.v v8, (a0) 2276; CHECK-NEXT: vmv.s.x v9, zero 2277; CHECK-NEXT: vredor.vs v8, v8, v9 2278; CHECK-NEXT: vmv.x.s a0, v8 2279; CHECK-NEXT: ret 2280 %v = load <3 x i8>, ptr %x 2281 %red = call i8 @llvm.vector.reduce.or.v3i8(<3 x i8> %v) 2282 ret i8 %red 2283} 2284 2285declare i8 @llvm.vector.reduce.or.v4i8(<4 x i8>) 2286 2287define i8 @vreduce_or_v4i8(ptr %x) { 2288; CHECK-LABEL: vreduce_or_v4i8: 2289; CHECK: # %bb.0: 2290; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma 2291; CHECK-NEXT: vle8.v v8, (a0) 2292; CHECK-NEXT: vredor.vs v8, v8, v8 2293; CHECK-NEXT: vmv.x.s a0, v8 2294; CHECK-NEXT: ret 2295 %v = load <4 x i8>, ptr %x 2296 %red = call i8 @llvm.vector.reduce.or.v4i8(<4 x i8> %v) 2297 ret i8 %red 2298} 2299 2300declare i8 @llvm.vector.reduce.or.v8i8(<8 x i8>) 2301 2302define i8 @vreduce_or_v8i8(ptr %x) { 2303; CHECK-LABEL: vreduce_or_v8i8: 2304; CHECK: # %bb.0: 2305; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma 2306; CHECK-NEXT: vle8.v v8, (a0) 2307; CHECK-NEXT: vredor.vs v8, v8, v8 2308; CHECK-NEXT: vmv.x.s a0, v8 2309; CHECK-NEXT: ret 2310 %v = load <8 x i8>, ptr %x 2311 %red = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> %v) 2312 ret i8 %red 2313} 2314 2315declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>) 2316 2317define i8 @vreduce_or_v16i8(ptr %x) { 2318; CHECK-LABEL: vreduce_or_v16i8: 2319; CHECK: # %bb.0: 2320; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma 2321; CHECK-NEXT: vle8.v v8, (a0) 2322; CHECK-NEXT: vredor.vs v8, v8, v8 2323; CHECK-NEXT: vmv.x.s a0, v8 2324; CHECK-NEXT: ret 2325 %v = load <16 x i8>, ptr %x 2326 %red = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %v) 2327 ret i8 %red 2328} 2329 2330declare i8 @llvm.vector.reduce.or.v32i8(<32 x i8>) 2331 2332define i8 @vreduce_or_v32i8(ptr %x) { 2333; CHECK-LABEL: vreduce_or_v32i8: 2334; CHECK: # %bb.0: 2335; CHECK-NEXT: li a1, 32 2336; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma 2337; CHECK-NEXT: vle8.v v8, (a0) 2338; CHECK-NEXT: vredor.vs v8, v8, v8 2339; CHECK-NEXT: vmv.x.s a0, v8 2340; CHECK-NEXT: ret 2341 %v = load <32 x i8>, ptr %x 2342 %red = call i8 @llvm.vector.reduce.or.v32i8(<32 x i8> %v) 2343 ret i8 %red 2344} 2345 2346declare i8 @llvm.vector.reduce.or.v64i8(<64 x i8>) 2347 2348define i8 @vreduce_or_v64i8(ptr %x) { 2349; CHECK-LABEL: vreduce_or_v64i8: 2350; CHECK: # %bb.0: 2351; CHECK-NEXT: li a1, 64 2352; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma 2353; CHECK-NEXT: vle8.v v8, (a0) 2354; CHECK-NEXT: vredor.vs v8, v8, v8 2355; CHECK-NEXT: vmv.x.s a0, v8 2356; CHECK-NEXT: ret 2357 %v = load <64 x i8>, ptr %x 2358 %red = call i8 @llvm.vector.reduce.or.v64i8(<64 x i8> %v) 2359 ret i8 %red 2360} 2361 2362declare i8 @llvm.vector.reduce.or.v128i8(<128 x i8>) 2363 2364define i8 @vreduce_or_v128i8(ptr %x) { 2365; CHECK-LABEL: vreduce_or_v128i8: 2366; CHECK: # %bb.0: 2367; CHECK-NEXT: li a1, 128 2368; CHECK-NEXT: vsetvli 
zero, a1, e8, m8, ta, ma 2369; CHECK-NEXT: vle8.v v8, (a0) 2370; CHECK-NEXT: vredor.vs v8, v8, v8 2371; CHECK-NEXT: vmv.x.s a0, v8 2372; CHECK-NEXT: ret 2373 %v = load <128 x i8>, ptr %x 2374 %red = call i8 @llvm.vector.reduce.or.v128i8(<128 x i8> %v) 2375 ret i8 %red 2376} 2377 2378declare i8 @llvm.vector.reduce.or.v256i8(<256 x i8>) 2379 2380define i8 @vreduce_or_v256i8(ptr %x) { 2381; CHECK-LABEL: vreduce_or_v256i8: 2382; CHECK: # %bb.0: 2383; CHECK-NEXT: li a1, 128 2384; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 2385; CHECK-NEXT: vle8.v v8, (a0) 2386; CHECK-NEXT: addi a0, a0, 128 2387; CHECK-NEXT: vle8.v v16, (a0) 2388; CHECK-NEXT: vor.vv v8, v8, v16 2389; CHECK-NEXT: vredor.vs v8, v8, v8 2390; CHECK-NEXT: vmv.x.s a0, v8 2391; CHECK-NEXT: ret 2392 %v = load <256 x i8>, ptr %x 2393 %red = call i8 @llvm.vector.reduce.or.v256i8(<256 x i8> %v) 2394 ret i8 %red 2395} 2396 2397declare i16 @llvm.vector.reduce.or.v1i16(<1 x i16>) 2398 2399define i16 @vreduce_or_v1i16(<1 x i16> %v) { 2400; CHECK-LABEL: vreduce_or_v1i16: 2401; CHECK: # %bb.0: 2402; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma 2403; CHECK-NEXT: vmv.x.s a0, v8 2404; CHECK-NEXT: ret 2405 %red = call i16 @llvm.vector.reduce.or.v1i16(<1 x i16> %v) 2406 ret i16 %red 2407} 2408 2409declare i16 @llvm.vector.reduce.or.v2i16(<2 x i16>) 2410 2411define i16 @vreduce_or_v2i16(ptr %x) { 2412; CHECK-LABEL: vreduce_or_v2i16: 2413; CHECK: # %bb.0: 2414; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma 2415; CHECK-NEXT: vle16.v v8, (a0) 2416; CHECK-NEXT: vredor.vs v8, v8, v8 2417; CHECK-NEXT: vmv.x.s a0, v8 2418; CHECK-NEXT: ret 2419 %v = load <2 x i16>, ptr %x 2420 %red = call i16 @llvm.vector.reduce.or.v2i16(<2 x i16> %v) 2421 ret i16 %red 2422} 2423 2424declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>) 2425 2426define i16 @vreduce_or_v4i16(ptr %x) { 2427; CHECK-LABEL: vreduce_or_v4i16: 2428; CHECK: # %bb.0: 2429; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma 2430; CHECK-NEXT: vle16.v v8, (a0) 2431; CHECK-NEXT: vredor.vs v8, v8, v8 2432; CHECK-NEXT: vmv.x.s a0, v8 2433; CHECK-NEXT: ret 2434 %v = load <4 x i16>, ptr %x 2435 %red = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %v) 2436 ret i16 %red 2437} 2438 2439declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>) 2440 2441define i16 @vreduce_or_v8i16(ptr %x) { 2442; CHECK-LABEL: vreduce_or_v8i16: 2443; CHECK: # %bb.0: 2444; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma 2445; CHECK-NEXT: vle16.v v8, (a0) 2446; CHECK-NEXT: vredor.vs v8, v8, v8 2447; CHECK-NEXT: vmv.x.s a0, v8 2448; CHECK-NEXT: ret 2449 %v = load <8 x i16>, ptr %x 2450 %red = call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %v) 2451 ret i16 %red 2452} 2453 2454declare i16 @llvm.vector.reduce.or.v16i16(<16 x i16>) 2455 2456define i16 @vreduce_or_v16i16(ptr %x) { 2457; CHECK-LABEL: vreduce_or_v16i16: 2458; CHECK: # %bb.0: 2459; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma 2460; CHECK-NEXT: vle16.v v8, (a0) 2461; CHECK-NEXT: vredor.vs v8, v8, v8 2462; CHECK-NEXT: vmv.x.s a0, v8 2463; CHECK-NEXT: ret 2464 %v = load <16 x i16>, ptr %x 2465 %red = call i16 @llvm.vector.reduce.or.v16i16(<16 x i16> %v) 2466 ret i16 %red 2467} 2468 2469declare i16 @llvm.vector.reduce.or.v32i16(<32 x i16>) 2470 2471define i16 @vreduce_or_v32i16(ptr %x) { 2472; CHECK-LABEL: vreduce_or_v32i16: 2473; CHECK: # %bb.0: 2474; CHECK-NEXT: li a1, 32 2475; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 2476; CHECK-NEXT: vle16.v v8, (a0) 2477; CHECK-NEXT: vredor.vs v8, v8, v8 2478; CHECK-NEXT: vmv.x.s a0, v8 2479; CHECK-NEXT: ret 2480 %v = load <32 x i16>, ptr 
%x 2481 %red = call i16 @llvm.vector.reduce.or.v32i16(<32 x i16> %v) 2482 ret i16 %red 2483} 2484 2485declare i16 @llvm.vector.reduce.or.v64i16(<64 x i16>) 2486 2487define i16 @vreduce_or_v64i16(ptr %x) { 2488; CHECK-LABEL: vreduce_or_v64i16: 2489; CHECK: # %bb.0: 2490; CHECK-NEXT: li a1, 64 2491; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 2492; CHECK-NEXT: vle16.v v8, (a0) 2493; CHECK-NEXT: vredor.vs v8, v8, v8 2494; CHECK-NEXT: vmv.x.s a0, v8 2495; CHECK-NEXT: ret 2496 %v = load <64 x i16>, ptr %x 2497 %red = call i16 @llvm.vector.reduce.or.v64i16(<64 x i16> %v) 2498 ret i16 %red 2499} 2500 2501declare i16 @llvm.vector.reduce.or.v128i16(<128 x i16>) 2502 2503define i16 @vreduce_or_v128i16(ptr %x) { 2504; CHECK-LABEL: vreduce_or_v128i16: 2505; CHECK: # %bb.0: 2506; CHECK-NEXT: li a1, 64 2507; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 2508; CHECK-NEXT: vle16.v v8, (a0) 2509; CHECK-NEXT: addi a0, a0, 128 2510; CHECK-NEXT: vle16.v v16, (a0) 2511; CHECK-NEXT: vor.vv v8, v8, v16 2512; CHECK-NEXT: vredor.vs v8, v8, v8 2513; CHECK-NEXT: vmv.x.s a0, v8 2514; CHECK-NEXT: ret 2515 %v = load <128 x i16>, ptr %x 2516 %red = call i16 @llvm.vector.reduce.or.v128i16(<128 x i16> %v) 2517 ret i16 %red 2518} 2519 2520declare i32 @llvm.vector.reduce.or.v1i32(<1 x i32>) 2521 2522define i32 @vreduce_or_v1i32(<1 x i32> %v) { 2523; CHECK-LABEL: vreduce_or_v1i32: 2524; CHECK: # %bb.0: 2525; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma 2526; CHECK-NEXT: vmv.x.s a0, v8 2527; CHECK-NEXT: ret 2528 %red = call i32 @llvm.vector.reduce.or.v1i32(<1 x i32> %v) 2529 ret i32 %red 2530} 2531 2532declare i32 @llvm.vector.reduce.or.v2i32(<2 x i32>) 2533 2534define i32 @vreduce_or_v2i32(ptr %x) { 2535; CHECK-LABEL: vreduce_or_v2i32: 2536; CHECK: # %bb.0: 2537; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma 2538; CHECK-NEXT: vle32.v v8, (a0) 2539; CHECK-NEXT: vredor.vs v8, v8, v8 2540; CHECK-NEXT: vmv.x.s a0, v8 2541; CHECK-NEXT: ret 2542 %v = load <2 x i32>, ptr %x 2543 %red = call i32 @llvm.vector.reduce.or.v2i32(<2 x i32> %v) 2544 ret i32 %red 2545} 2546 2547declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>) 2548 2549define i32 @vreduce_or_v4i32(ptr %x) { 2550; CHECK-LABEL: vreduce_or_v4i32: 2551; CHECK: # %bb.0: 2552; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma 2553; CHECK-NEXT: vle32.v v8, (a0) 2554; CHECK-NEXT: vredor.vs v8, v8, v8 2555; CHECK-NEXT: vmv.x.s a0, v8 2556; CHECK-NEXT: ret 2557 %v = load <4 x i32>, ptr %x 2558 %red = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %v) 2559 ret i32 %red 2560} 2561 2562declare i32 @llvm.vector.reduce.or.v8i32(<8 x i32>) 2563 2564define i32 @vreduce_or_v8i32(ptr %x) { 2565; CHECK-LABEL: vreduce_or_v8i32: 2566; CHECK: # %bb.0: 2567; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma 2568; CHECK-NEXT: vle32.v v8, (a0) 2569; CHECK-NEXT: vredor.vs v8, v8, v8 2570; CHECK-NEXT: vmv.x.s a0, v8 2571; CHECK-NEXT: ret 2572 %v = load <8 x i32>, ptr %x 2573 %red = call i32 @llvm.vector.reduce.or.v8i32(<8 x i32> %v) 2574 ret i32 %red 2575} 2576 2577declare i32 @llvm.vector.reduce.or.v16i32(<16 x i32>) 2578 2579define i32 @vreduce_or_v16i32(ptr %x) { 2580; CHECK-LABEL: vreduce_or_v16i32: 2581; CHECK: # %bb.0: 2582; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma 2583; CHECK-NEXT: vle32.v v8, (a0) 2584; CHECK-NEXT: vredor.vs v8, v8, v8 2585; CHECK-NEXT: vmv.x.s a0, v8 2586; CHECK-NEXT: ret 2587 %v = load <16 x i32>, ptr %x 2588 %red = call i32 @llvm.vector.reduce.or.v16i32(<16 x i32> %v) 2589 ret i32 %red 2590} 2591 2592declare i32 @llvm.vector.reduce.or.v32i32(<32 x i32>) 2593 2594define 
i32 @vreduce_or_v32i32(ptr %x) { 2595; CHECK-LABEL: vreduce_or_v32i32: 2596; CHECK: # %bb.0: 2597; CHECK-NEXT: li a1, 32 2598; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 2599; CHECK-NEXT: vle32.v v8, (a0) 2600; CHECK-NEXT: vredor.vs v8, v8, v8 2601; CHECK-NEXT: vmv.x.s a0, v8 2602; CHECK-NEXT: ret 2603 %v = load <32 x i32>, ptr %x 2604 %red = call i32 @llvm.vector.reduce.or.v32i32(<32 x i32> %v) 2605 ret i32 %red 2606} 2607 2608declare i32 @llvm.vector.reduce.or.v64i32(<64 x i32>) 2609 2610define i32 @vreduce_or_v64i32(ptr %x) { 2611; CHECK-LABEL: vreduce_or_v64i32: 2612; CHECK: # %bb.0: 2613; CHECK-NEXT: li a1, 32 2614; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 2615; CHECK-NEXT: vle32.v v8, (a0) 2616; CHECK-NEXT: addi a0, a0, 128 2617; CHECK-NEXT: vle32.v v16, (a0) 2618; CHECK-NEXT: vor.vv v8, v8, v16 2619; CHECK-NEXT: vredor.vs v8, v8, v8 2620; CHECK-NEXT: vmv.x.s a0, v8 2621; CHECK-NEXT: ret 2622 %v = load <64 x i32>, ptr %x 2623 %red = call i32 @llvm.vector.reduce.or.v64i32(<64 x i32> %v) 2624 ret i32 %red 2625} 2626 2627declare i64 @llvm.vector.reduce.or.v1i64(<1 x i64>) 2628 2629define i64 @vreduce_or_v1i64(<1 x i64> %v) { 2630; RV32-LABEL: vreduce_or_v1i64: 2631; RV32: # %bb.0: 2632; RV32-NEXT: li a0, 32 2633; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2634; RV32-NEXT: vsrl.vx v9, v8, a0 2635; RV32-NEXT: vmv.x.s a1, v9 2636; RV32-NEXT: vmv.x.s a0, v8 2637; RV32-NEXT: ret 2638; 2639; RV64-LABEL: vreduce_or_v1i64: 2640; RV64: # %bb.0: 2641; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2642; RV64-NEXT: vmv.x.s a0, v8 2643; RV64-NEXT: ret 2644 %red = call i64 @llvm.vector.reduce.or.v1i64(<1 x i64> %v) 2645 ret i64 %red 2646} 2647 2648declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>) 2649 2650define i64 @vreduce_or_v2i64(ptr %x) { 2651; RV32-LABEL: vreduce_or_v2i64: 2652; RV32: # %bb.0: 2653; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma 2654; RV32-NEXT: vle64.v v8, (a0) 2655; RV32-NEXT: li a0, 32 2656; RV32-NEXT: vredor.vs v8, v8, v8 2657; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2658; RV32-NEXT: vsrl.vx v9, v8, a0 2659; RV32-NEXT: vmv.x.s a1, v9 2660; RV32-NEXT: vmv.x.s a0, v8 2661; RV32-NEXT: ret 2662; 2663; RV64-LABEL: vreduce_or_v2i64: 2664; RV64: # %bb.0: 2665; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma 2666; RV64-NEXT: vle64.v v8, (a0) 2667; RV64-NEXT: vredor.vs v8, v8, v8 2668; RV64-NEXT: vmv.x.s a0, v8 2669; RV64-NEXT: ret 2670 %v = load <2 x i64>, ptr %x 2671 %red = call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %v) 2672 ret i64 %red 2673} 2674 2675declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>) 2676 2677define i64 @vreduce_or_v4i64(ptr %x) { 2678; RV32-LABEL: vreduce_or_v4i64: 2679; RV32: # %bb.0: 2680; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma 2681; RV32-NEXT: vle64.v v8, (a0) 2682; RV32-NEXT: li a1, 32 2683; RV32-NEXT: vredor.vs v8, v8, v8 2684; RV32-NEXT: vmv.x.s a0, v8 2685; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2686; RV32-NEXT: vsrl.vx v8, v8, a1 2687; RV32-NEXT: vmv.x.s a1, v8 2688; RV32-NEXT: ret 2689; 2690; RV64-LABEL: vreduce_or_v4i64: 2691; RV64: # %bb.0: 2692; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma 2693; RV64-NEXT: vle64.v v8, (a0) 2694; RV64-NEXT: vredor.vs v8, v8, v8 2695; RV64-NEXT: vmv.x.s a0, v8 2696; RV64-NEXT: ret 2697 %v = load <4 x i64>, ptr %x 2698 %red = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v) 2699 ret i64 %red 2700} 2701 2702declare i64 @llvm.vector.reduce.or.v8i64(<8 x i64>) 2703 2704define i64 @vreduce_or_v8i64(ptr %x) { 2705; RV32-LABEL: vreduce_or_v8i64: 2706; RV32: # %bb.0: 2707; RV32-NEXT: vsetivli 
zero, 8, e64, m4, ta, ma 2708; RV32-NEXT: vle64.v v8, (a0) 2709; RV32-NEXT: li a1, 32 2710; RV32-NEXT: vredor.vs v8, v8, v8 2711; RV32-NEXT: vmv.x.s a0, v8 2712; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2713; RV32-NEXT: vsrl.vx v8, v8, a1 2714; RV32-NEXT: vmv.x.s a1, v8 2715; RV32-NEXT: ret 2716; 2717; RV64-LABEL: vreduce_or_v8i64: 2718; RV64: # %bb.0: 2719; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma 2720; RV64-NEXT: vle64.v v8, (a0) 2721; RV64-NEXT: vredor.vs v8, v8, v8 2722; RV64-NEXT: vmv.x.s a0, v8 2723; RV64-NEXT: ret 2724 %v = load <8 x i64>, ptr %x 2725 %red = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> %v) 2726 ret i64 %red 2727} 2728 2729declare i64 @llvm.vector.reduce.or.v16i64(<16 x i64>) 2730 2731define i64 @vreduce_or_v16i64(ptr %x) { 2732; RV32-LABEL: vreduce_or_v16i64: 2733; RV32: # %bb.0: 2734; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2735; RV32-NEXT: vle64.v v8, (a0) 2736; RV32-NEXT: li a1, 32 2737; RV32-NEXT: vredor.vs v8, v8, v8 2738; RV32-NEXT: vmv.x.s a0, v8 2739; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2740; RV32-NEXT: vsrl.vx v8, v8, a1 2741; RV32-NEXT: vmv.x.s a1, v8 2742; RV32-NEXT: ret 2743; 2744; RV64-LABEL: vreduce_or_v16i64: 2745; RV64: # %bb.0: 2746; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2747; RV64-NEXT: vle64.v v8, (a0) 2748; RV64-NEXT: vredor.vs v8, v8, v8 2749; RV64-NEXT: vmv.x.s a0, v8 2750; RV64-NEXT: ret 2751 %v = load <16 x i64>, ptr %x 2752 %red = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> %v) 2753 ret i64 %red 2754} 2755 2756declare i64 @llvm.vector.reduce.or.v32i64(<32 x i64>) 2757 2758define i64 @vreduce_or_v32i64(ptr %x) { 2759; RV32-LABEL: vreduce_or_v32i64: 2760; RV32: # %bb.0: 2761; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2762; RV32-NEXT: vle64.v v8, (a0) 2763; RV32-NEXT: addi a0, a0, 128 2764; RV32-NEXT: vle64.v v16, (a0) 2765; RV32-NEXT: li a1, 32 2766; RV32-NEXT: vor.vv v8, v8, v16 2767; RV32-NEXT: vredor.vs v8, v8, v8 2768; RV32-NEXT: vmv.x.s a0, v8 2769; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2770; RV32-NEXT: vsrl.vx v8, v8, a1 2771; RV32-NEXT: vmv.x.s a1, v8 2772; RV32-NEXT: ret 2773; 2774; RV64-LABEL: vreduce_or_v32i64: 2775; RV64: # %bb.0: 2776; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2777; RV64-NEXT: vle64.v v8, (a0) 2778; RV64-NEXT: addi a0, a0, 128 2779; RV64-NEXT: vle64.v v16, (a0) 2780; RV64-NEXT: vor.vv v8, v8, v16 2781; RV64-NEXT: vredor.vs v8, v8, v8 2782; RV64-NEXT: vmv.x.s a0, v8 2783; RV64-NEXT: ret 2784 %v = load <32 x i64>, ptr %x 2785 %red = call i64 @llvm.vector.reduce.or.v32i64(<32 x i64> %v) 2786 ret i64 %red 2787} 2788 2789declare i64 @llvm.vector.reduce.or.v64i64(<64 x i64>) 2790 2791define i64 @vreduce_or_v64i64(ptr %x) nounwind { 2792; RV32-LABEL: vreduce_or_v64i64: 2793; RV32: # %bb.0: 2794; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2795; RV32-NEXT: vle64.v v8, (a0) 2796; RV32-NEXT: addi a1, a0, 384 2797; RV32-NEXT: vle64.v v16, (a1) 2798; RV32-NEXT: addi a1, a0, 256 2799; RV32-NEXT: addi a0, a0, 128 2800; RV32-NEXT: vle64.v v0, (a0) 2801; RV32-NEXT: vle64.v v24, (a1) 2802; RV32-NEXT: li a1, 32 2803; RV32-NEXT: vor.vv v16, v0, v16 2804; RV32-NEXT: vor.vv v8, v8, v24 2805; RV32-NEXT: vor.vv v8, v8, v16 2806; RV32-NEXT: vredor.vs v8, v8, v8 2807; RV32-NEXT: vmv.x.s a0, v8 2808; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 2809; RV32-NEXT: vsrl.vx v8, v8, a1 2810; RV32-NEXT: vmv.x.s a1, v8 2811; RV32-NEXT: ret 2812; 2813; RV64-LABEL: vreduce_or_v64i64: 2814; RV64: # %bb.0: 2815; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 2816; RV64-NEXT: vle64.v v8, (a0) 2817; 
RV64-NEXT: addi a1, a0, 384 2818; RV64-NEXT: vle64.v v16, (a1) 2819; RV64-NEXT: addi a1, a0, 256 2820; RV64-NEXT: addi a0, a0, 128 2821; RV64-NEXT: vle64.v v24, (a0) 2822; RV64-NEXT: vle64.v v0, (a1) 2823; RV64-NEXT: vor.vv v16, v24, v16 2824; RV64-NEXT: vor.vv v8, v8, v0 2825; RV64-NEXT: vor.vv v8, v8, v16 2826; RV64-NEXT: vredor.vs v8, v8, v8 2827; RV64-NEXT: vmv.x.s a0, v8 2828; RV64-NEXT: ret 2829 %v = load <64 x i64>, ptr %x 2830 %red = call i64 @llvm.vector.reduce.or.v64i64(<64 x i64> %v) 2831 ret i64 %red 2832} 2833 2834declare i8 @llvm.vector.reduce.xor.v1i8(<1 x i8>) 2835 2836define i8 @vreduce_xor_v1i8(<1 x i8> %v) { 2837; CHECK-LABEL: vreduce_xor_v1i8: 2838; CHECK: # %bb.0: 2839; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma 2840; CHECK-NEXT: vmv.x.s a0, v8 2841; CHECK-NEXT: ret 2842 %red = call i8 @llvm.vector.reduce.xor.v1i8(<1 x i8> %v) 2843 ret i8 %red 2844} 2845 2846declare i8 @llvm.vector.reduce.xor.v2i8(<2 x i8>) 2847 2848define i8 @vreduce_xor_v2i8(ptr %x) { 2849; CHECK-LABEL: vreduce_xor_v2i8: 2850; CHECK: # %bb.0: 2851; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma 2852; CHECK-NEXT: vle8.v v8, (a0) 2853; CHECK-NEXT: vmv.s.x v9, zero 2854; CHECK-NEXT: vredxor.vs v8, v8, v9 2855; CHECK-NEXT: vmv.x.s a0, v8 2856; CHECK-NEXT: ret 2857 %v = load <2 x i8>, ptr %x 2858 %red = call i8 @llvm.vector.reduce.xor.v2i8(<2 x i8> %v) 2859 ret i8 %red 2860} 2861 2862declare i8 @llvm.vector.reduce.xor.v3i8(<3 x i8>) 2863 2864define i8 @vreduce_xor_v3i8(ptr %x) { 2865; CHECK-LABEL: vreduce_xor_v3i8: 2866; CHECK: # %bb.0: 2867; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma 2868; CHECK-NEXT: vle8.v v8, (a0) 2869; CHECK-NEXT: vmv.s.x v9, zero 2870; CHECK-NEXT: vredxor.vs v8, v8, v9 2871; CHECK-NEXT: vmv.x.s a0, v8 2872; CHECK-NEXT: ret 2873 %v = load <3 x i8>, ptr %x 2874 %red = call i8 @llvm.vector.reduce.xor.v3i8(<3 x i8> %v) 2875 ret i8 %red 2876} 2877 2878declare i8 @llvm.vector.reduce.xor.v4i8(<4 x i8>) 2879 2880define i8 @vreduce_xor_v4i8(ptr %x) { 2881; CHECK-LABEL: vreduce_xor_v4i8: 2882; CHECK: # %bb.0: 2883; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma 2884; CHECK-NEXT: vle8.v v8, (a0) 2885; CHECK-NEXT: vmv.s.x v9, zero 2886; CHECK-NEXT: vredxor.vs v8, v8, v9 2887; CHECK-NEXT: vmv.x.s a0, v8 2888; CHECK-NEXT: ret 2889 %v = load <4 x i8>, ptr %x 2890 %red = call i8 @llvm.vector.reduce.xor.v4i8(<4 x i8> %v) 2891 ret i8 %red 2892} 2893 2894declare i8 @llvm.vector.reduce.xor.v8i8(<8 x i8>) 2895 2896define i8 @vreduce_xor_v8i8(ptr %x) { 2897; CHECK-LABEL: vreduce_xor_v8i8: 2898; CHECK: # %bb.0: 2899; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma 2900; CHECK-NEXT: vle8.v v8, (a0) 2901; CHECK-NEXT: vmv.s.x v9, zero 2902; CHECK-NEXT: vredxor.vs v8, v8, v9 2903; CHECK-NEXT: vmv.x.s a0, v8 2904; CHECK-NEXT: ret 2905 %v = load <8 x i8>, ptr %x 2906 %red = call i8 @llvm.vector.reduce.xor.v8i8(<8 x i8> %v) 2907 ret i8 %red 2908} 2909 2910declare i8 @llvm.vector.reduce.xor.v16i8(<16 x i8>) 2911 2912define i8 @vreduce_xor_v16i8(ptr %x) { 2913; CHECK-LABEL: vreduce_xor_v16i8: 2914; CHECK: # %bb.0: 2915; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma 2916; CHECK-NEXT: vle8.v v8, (a0) 2917; CHECK-NEXT: vmv.s.x v9, zero 2918; CHECK-NEXT: vredxor.vs v8, v8, v9 2919; CHECK-NEXT: vmv.x.s a0, v8 2920; CHECK-NEXT: ret 2921 %v = load <16 x i8>, ptr %x 2922 %red = call i8 @llvm.vector.reduce.xor.v16i8(<16 x i8> %v) 2923 ret i8 %red 2924} 2925 2926declare i8 @llvm.vector.reduce.xor.v32i8(<32 x i8>) 2927 2928define i8 @vreduce_xor_v32i8(ptr %x) { 2929; CHECK-LABEL: vreduce_xor_v32i8: 2930; CHECK: # %bb.0: 
2931; CHECK-NEXT: li a1, 32 2932; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma 2933; CHECK-NEXT: vle8.v v8, (a0) 2934; CHECK-NEXT: vmv.s.x v10, zero 2935; CHECK-NEXT: vredxor.vs v8, v8, v10 2936; CHECK-NEXT: vmv.x.s a0, v8 2937; CHECK-NEXT: ret 2938 %v = load <32 x i8>, ptr %x 2939 %red = call i8 @llvm.vector.reduce.xor.v32i8(<32 x i8> %v) 2940 ret i8 %red 2941} 2942 2943declare i8 @llvm.vector.reduce.xor.v64i8(<64 x i8>) 2944 2945define i8 @vreduce_xor_v64i8(ptr %x) { 2946; CHECK-LABEL: vreduce_xor_v64i8: 2947; CHECK: # %bb.0: 2948; CHECK-NEXT: li a1, 64 2949; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma 2950; CHECK-NEXT: vle8.v v8, (a0) 2951; CHECK-NEXT: vmv.s.x v12, zero 2952; CHECK-NEXT: vredxor.vs v8, v8, v12 2953; CHECK-NEXT: vmv.x.s a0, v8 2954; CHECK-NEXT: ret 2955 %v = load <64 x i8>, ptr %x 2956 %red = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> %v) 2957 ret i8 %red 2958} 2959 2960declare i8 @llvm.vector.reduce.xor.v128i8(<128 x i8>) 2961 2962define i8 @vreduce_xor_v128i8(ptr %x) { 2963; CHECK-LABEL: vreduce_xor_v128i8: 2964; CHECK: # %bb.0: 2965; CHECK-NEXT: li a1, 128 2966; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 2967; CHECK-NEXT: vle8.v v8, (a0) 2968; CHECK-NEXT: vmv.s.x v16, zero 2969; CHECK-NEXT: vredxor.vs v8, v8, v16 2970; CHECK-NEXT: vmv.x.s a0, v8 2971; CHECK-NEXT: ret 2972 %v = load <128 x i8>, ptr %x 2973 %red = call i8 @llvm.vector.reduce.xor.v128i8(<128 x i8> %v) 2974 ret i8 %red 2975} 2976 2977declare i8 @llvm.vector.reduce.xor.v256i8(<256 x i8>) 2978 2979define i8 @vreduce_xor_v256i8(ptr %x) { 2980; CHECK-LABEL: vreduce_xor_v256i8: 2981; CHECK: # %bb.0: 2982; CHECK-NEXT: li a1, 128 2983; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 2984; CHECK-NEXT: vle8.v v8, (a0) 2985; CHECK-NEXT: addi a0, a0, 128 2986; CHECK-NEXT: vle8.v v16, (a0) 2987; CHECK-NEXT: vxor.vv v8, v8, v16 2988; CHECK-NEXT: vmv.s.x v16, zero 2989; CHECK-NEXT: vredxor.vs v8, v8, v16 2990; CHECK-NEXT: vmv.x.s a0, v8 2991; CHECK-NEXT: ret 2992 %v = load <256 x i8>, ptr %x 2993 %red = call i8 @llvm.vector.reduce.xor.v256i8(<256 x i8> %v) 2994 ret i8 %red 2995} 2996 2997declare i16 @llvm.vector.reduce.xor.v1i16(<1 x i16>) 2998 2999define i16 @vreduce_xor_v1i16(<1 x i16> %v) { 3000; CHECK-LABEL: vreduce_xor_v1i16: 3001; CHECK: # %bb.0: 3002; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma 3003; CHECK-NEXT: vmv.x.s a0, v8 3004; CHECK-NEXT: ret 3005 %red = call i16 @llvm.vector.reduce.xor.v1i16(<1 x i16> %v) 3006 ret i16 %red 3007} 3008 3009declare i16 @llvm.vector.reduce.xor.v2i16(<2 x i16>) 3010 3011define i16 @vreduce_xor_v2i16(ptr %x) { 3012; CHECK-LABEL: vreduce_xor_v2i16: 3013; CHECK: # %bb.0: 3014; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma 3015; CHECK-NEXT: vle16.v v8, (a0) 3016; CHECK-NEXT: vmv.s.x v9, zero 3017; CHECK-NEXT: vredxor.vs v8, v8, v9 3018; CHECK-NEXT: vmv.x.s a0, v8 3019; CHECK-NEXT: ret 3020 %v = load <2 x i16>, ptr %x 3021 %red = call i16 @llvm.vector.reduce.xor.v2i16(<2 x i16> %v) 3022 ret i16 %red 3023} 3024 3025declare i16 @llvm.vector.reduce.xor.v4i16(<4 x i16>) 3026 3027define i16 @vreduce_xor_v4i16(ptr %x) { 3028; CHECK-LABEL: vreduce_xor_v4i16: 3029; CHECK: # %bb.0: 3030; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma 3031; CHECK-NEXT: vle16.v v8, (a0) 3032; CHECK-NEXT: vmv.s.x v9, zero 3033; CHECK-NEXT: vredxor.vs v8, v8, v9 3034; CHECK-NEXT: vmv.x.s a0, v8 3035; CHECK-NEXT: ret 3036 %v = load <4 x i16>, ptr %x 3037 %red = call i16 @llvm.vector.reduce.xor.v4i16(<4 x i16> %v) 3038 ret i16 %red 3039} 3040 3041declare i16 @llvm.vector.reduce.xor.v8i16(<8 x i16>) 3042 
3043define i16 @vreduce_xor_v8i16(ptr %x) { 3044; CHECK-LABEL: vreduce_xor_v8i16: 3045; CHECK: # %bb.0: 3046; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma 3047; CHECK-NEXT: vle16.v v8, (a0) 3048; CHECK-NEXT: vmv.s.x v9, zero 3049; CHECK-NEXT: vredxor.vs v8, v8, v9 3050; CHECK-NEXT: vmv.x.s a0, v8 3051; CHECK-NEXT: ret 3052 %v = load <8 x i16>, ptr %x 3053 %red = call i16 @llvm.vector.reduce.xor.v8i16(<8 x i16> %v) 3054 ret i16 %red 3055} 3056 3057declare i16 @llvm.vector.reduce.xor.v16i16(<16 x i16>) 3058 3059define i16 @vreduce_xor_v16i16(ptr %x) { 3060; CHECK-LABEL: vreduce_xor_v16i16: 3061; CHECK: # %bb.0: 3062; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma 3063; CHECK-NEXT: vle16.v v8, (a0) 3064; CHECK-NEXT: vmv.s.x v10, zero 3065; CHECK-NEXT: vredxor.vs v8, v8, v10 3066; CHECK-NEXT: vmv.x.s a0, v8 3067; CHECK-NEXT: ret 3068 %v = load <16 x i16>, ptr %x 3069 %red = call i16 @llvm.vector.reduce.xor.v16i16(<16 x i16> %v) 3070 ret i16 %red 3071} 3072 3073declare i16 @llvm.vector.reduce.xor.v32i16(<32 x i16>) 3074 3075define i16 @vreduce_xor_v32i16(ptr %x) { 3076; CHECK-LABEL: vreduce_xor_v32i16: 3077; CHECK: # %bb.0: 3078; CHECK-NEXT: li a1, 32 3079; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 3080; CHECK-NEXT: vle16.v v8, (a0) 3081; CHECK-NEXT: vmv.s.x v12, zero 3082; CHECK-NEXT: vredxor.vs v8, v8, v12 3083; CHECK-NEXT: vmv.x.s a0, v8 3084; CHECK-NEXT: ret 3085 %v = load <32 x i16>, ptr %x 3086 %red = call i16 @llvm.vector.reduce.xor.v32i16(<32 x i16> %v) 3087 ret i16 %red 3088} 3089 3090declare i16 @llvm.vector.reduce.xor.v64i16(<64 x i16>) 3091 3092define i16 @vreduce_xor_v64i16(ptr %x) { 3093; CHECK-LABEL: vreduce_xor_v64i16: 3094; CHECK: # %bb.0: 3095; CHECK-NEXT: li a1, 64 3096; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 3097; CHECK-NEXT: vle16.v v8, (a0) 3098; CHECK-NEXT: vmv.s.x v16, zero 3099; CHECK-NEXT: vredxor.vs v8, v8, v16 3100; CHECK-NEXT: vmv.x.s a0, v8 3101; CHECK-NEXT: ret 3102 %v = load <64 x i16>, ptr %x 3103 %red = call i16 @llvm.vector.reduce.xor.v64i16(<64 x i16> %v) 3104 ret i16 %red 3105} 3106 3107declare i16 @llvm.vector.reduce.xor.v128i16(<128 x i16>) 3108 3109define i16 @vreduce_xor_v128i16(ptr %x) { 3110; CHECK-LABEL: vreduce_xor_v128i16: 3111; CHECK: # %bb.0: 3112; CHECK-NEXT: li a1, 64 3113; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 3114; CHECK-NEXT: vle16.v v8, (a0) 3115; CHECK-NEXT: addi a0, a0, 128 3116; CHECK-NEXT: vle16.v v16, (a0) 3117; CHECK-NEXT: vxor.vv v8, v8, v16 3118; CHECK-NEXT: vmv.s.x v16, zero 3119; CHECK-NEXT: vredxor.vs v8, v8, v16 3120; CHECK-NEXT: vmv.x.s a0, v8 3121; CHECK-NEXT: ret 3122 %v = load <128 x i16>, ptr %x 3123 %red = call i16 @llvm.vector.reduce.xor.v128i16(<128 x i16> %v) 3124 ret i16 %red 3125} 3126 3127declare i32 @llvm.vector.reduce.xor.v1i32(<1 x i32>) 3128 3129define i32 @vreduce_xor_v1i32(<1 x i32> %v) { 3130; CHECK-LABEL: vreduce_xor_v1i32: 3131; CHECK: # %bb.0: 3132; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma 3133; CHECK-NEXT: vmv.x.s a0, v8 3134; CHECK-NEXT: ret 3135 %red = call i32 @llvm.vector.reduce.xor.v1i32(<1 x i32> %v) 3136 ret i32 %red 3137} 3138 3139declare i32 @llvm.vector.reduce.xor.v2i32(<2 x i32>) 3140 3141define i32 @vreduce_xor_v2i32(ptr %x) { 3142; CHECK-LABEL: vreduce_xor_v2i32: 3143; CHECK: # %bb.0: 3144; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma 3145; CHECK-NEXT: vle32.v v8, (a0) 3146; CHECK-NEXT: vmv.s.x v9, zero 3147; CHECK-NEXT: vredxor.vs v8, v8, v9 3148; CHECK-NEXT: vmv.x.s a0, v8 3149; CHECK-NEXT: ret 3150 %v = load <2 x i32>, ptr %x 3151 %red = call i32 
@llvm.vector.reduce.xor.v2i32(<2 x i32> %v) 3152 ret i32 %red 3153} 3154 3155declare i32 @llvm.vector.reduce.xor.v4i32(<4 x i32>) 3156 3157define i32 @vreduce_xor_v4i32(ptr %x) { 3158; CHECK-LABEL: vreduce_xor_v4i32: 3159; CHECK: # %bb.0: 3160; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma 3161; CHECK-NEXT: vle32.v v8, (a0) 3162; CHECK-NEXT: vmv.s.x v9, zero 3163; CHECK-NEXT: vredxor.vs v8, v8, v9 3164; CHECK-NEXT: vmv.x.s a0, v8 3165; CHECK-NEXT: ret 3166 %v = load <4 x i32>, ptr %x 3167 %red = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> %v) 3168 ret i32 %red 3169} 3170 3171declare i32 @llvm.vector.reduce.xor.v8i32(<8 x i32>) 3172 3173define i32 @vreduce_xor_v8i32(ptr %x) { 3174; CHECK-LABEL: vreduce_xor_v8i32: 3175; CHECK: # %bb.0: 3176; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma 3177; CHECK-NEXT: vle32.v v8, (a0) 3178; CHECK-NEXT: vmv.s.x v10, zero 3179; CHECK-NEXT: vredxor.vs v8, v8, v10 3180; CHECK-NEXT: vmv.x.s a0, v8 3181; CHECK-NEXT: ret 3182 %v = load <8 x i32>, ptr %x 3183 %red = call i32 @llvm.vector.reduce.xor.v8i32(<8 x i32> %v) 3184 ret i32 %red 3185} 3186 3187declare i32 @llvm.vector.reduce.xor.v16i32(<16 x i32>) 3188 3189define i32 @vreduce_xor_v16i32(ptr %x) { 3190; CHECK-LABEL: vreduce_xor_v16i32: 3191; CHECK: # %bb.0: 3192; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma 3193; CHECK-NEXT: vle32.v v8, (a0) 3194; CHECK-NEXT: vmv.s.x v12, zero 3195; CHECK-NEXT: vredxor.vs v8, v8, v12 3196; CHECK-NEXT: vmv.x.s a0, v8 3197; CHECK-NEXT: ret 3198 %v = load <16 x i32>, ptr %x 3199 %red = call i32 @llvm.vector.reduce.xor.v16i32(<16 x i32> %v) 3200 ret i32 %red 3201} 3202 3203declare i32 @llvm.vector.reduce.xor.v32i32(<32 x i32>) 3204 3205define i32 @vreduce_xor_v32i32(ptr %x) { 3206; CHECK-LABEL: vreduce_xor_v32i32: 3207; CHECK: # %bb.0: 3208; CHECK-NEXT: li a1, 32 3209; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 3210; CHECK-NEXT: vle32.v v8, (a0) 3211; CHECK-NEXT: vmv.s.x v16, zero 3212; CHECK-NEXT: vredxor.vs v8, v8, v16 3213; CHECK-NEXT: vmv.x.s a0, v8 3214; CHECK-NEXT: ret 3215 %v = load <32 x i32>, ptr %x 3216 %red = call i32 @llvm.vector.reduce.xor.v32i32(<32 x i32> %v) 3217 ret i32 %red 3218} 3219 3220declare i32 @llvm.vector.reduce.xor.v64i32(<64 x i32>) 3221 3222define i32 @vreduce_xor_v64i32(ptr %x) { 3223; CHECK-LABEL: vreduce_xor_v64i32: 3224; CHECK: # %bb.0: 3225; CHECK-NEXT: li a1, 32 3226; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 3227; CHECK-NEXT: vle32.v v8, (a0) 3228; CHECK-NEXT: addi a0, a0, 128 3229; CHECK-NEXT: vle32.v v16, (a0) 3230; CHECK-NEXT: vxor.vv v8, v8, v16 3231; CHECK-NEXT: vmv.s.x v16, zero 3232; CHECK-NEXT: vredxor.vs v8, v8, v16 3233; CHECK-NEXT: vmv.x.s a0, v8 3234; CHECK-NEXT: ret 3235 %v = load <64 x i32>, ptr %x 3236 %red = call i32 @llvm.vector.reduce.xor.v64i32(<64 x i32> %v) 3237 ret i32 %red 3238} 3239 3240declare i64 @llvm.vector.reduce.xor.v1i64(<1 x i64>) 3241 3242define i64 @vreduce_xor_v1i64(<1 x i64> %v) { 3243; RV32-LABEL: vreduce_xor_v1i64: 3244; RV32: # %bb.0: 3245; RV32-NEXT: li a0, 32 3246; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3247; RV32-NEXT: vsrl.vx v9, v8, a0 3248; RV32-NEXT: vmv.x.s a1, v9 3249; RV32-NEXT: vmv.x.s a0, v8 3250; RV32-NEXT: ret 3251; 3252; RV64-LABEL: vreduce_xor_v1i64: 3253; RV64: # %bb.0: 3254; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3255; RV64-NEXT: vmv.x.s a0, v8 3256; RV64-NEXT: ret 3257 %red = call i64 @llvm.vector.reduce.xor.v1i64(<1 x i64> %v) 3258 ret i64 %red 3259} 3260 3261declare i64 @llvm.vector.reduce.xor.v2i64(<2 x i64>) 3262 3263define i64 @vreduce_xor_v2i64(ptr %x) { 
3264; RV32-LABEL: vreduce_xor_v2i64: 3265; RV32: # %bb.0: 3266; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma 3267; RV32-NEXT: vle64.v v8, (a0) 3268; RV32-NEXT: vmv.s.x v9, zero 3269; RV32-NEXT: li a1, 32 3270; RV32-NEXT: vredxor.vs v8, v8, v9 3271; RV32-NEXT: vmv.x.s a0, v8 3272; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3273; RV32-NEXT: vsrl.vx v8, v8, a1 3274; RV32-NEXT: vmv.x.s a1, v8 3275; RV32-NEXT: ret 3276; 3277; RV64-LABEL: vreduce_xor_v2i64: 3278; RV64: # %bb.0: 3279; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma 3280; RV64-NEXT: vle64.v v8, (a0) 3281; RV64-NEXT: vmv.s.x v9, zero 3282; RV64-NEXT: vredxor.vs v8, v8, v9 3283; RV64-NEXT: vmv.x.s a0, v8 3284; RV64-NEXT: ret 3285 %v = load <2 x i64>, ptr %x 3286 %red = call i64 @llvm.vector.reduce.xor.v2i64(<2 x i64> %v) 3287 ret i64 %red 3288} 3289 3290declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>) 3291 3292define i64 @vreduce_xor_v4i64(ptr %x) { 3293; RV32-LABEL: vreduce_xor_v4i64: 3294; RV32: # %bb.0: 3295; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma 3296; RV32-NEXT: vle64.v v8, (a0) 3297; RV32-NEXT: vmv.s.x v10, zero 3298; RV32-NEXT: li a1, 32 3299; RV32-NEXT: vredxor.vs v8, v8, v10 3300; RV32-NEXT: vmv.x.s a0, v8 3301; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3302; RV32-NEXT: vsrl.vx v8, v8, a1 3303; RV32-NEXT: vmv.x.s a1, v8 3304; RV32-NEXT: ret 3305; 3306; RV64-LABEL: vreduce_xor_v4i64: 3307; RV64: # %bb.0: 3308; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma 3309; RV64-NEXT: vle64.v v8, (a0) 3310; RV64-NEXT: vmv.s.x v10, zero 3311; RV64-NEXT: vredxor.vs v8, v8, v10 3312; RV64-NEXT: vmv.x.s a0, v8 3313; RV64-NEXT: ret 3314 %v = load <4 x i64>, ptr %x 3315 %red = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v) 3316 ret i64 %red 3317} 3318 3319declare i64 @llvm.vector.reduce.xor.v8i64(<8 x i64>) 3320 3321define i64 @vreduce_xor_v8i64(ptr %x) { 3322; RV32-LABEL: vreduce_xor_v8i64: 3323; RV32: # %bb.0: 3324; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma 3325; RV32-NEXT: vle64.v v8, (a0) 3326; RV32-NEXT: vmv.s.x v12, zero 3327; RV32-NEXT: li a1, 32 3328; RV32-NEXT: vredxor.vs v8, v8, v12 3329; RV32-NEXT: vmv.x.s a0, v8 3330; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3331; RV32-NEXT: vsrl.vx v8, v8, a1 3332; RV32-NEXT: vmv.x.s a1, v8 3333; RV32-NEXT: ret 3334; 3335; RV64-LABEL: vreduce_xor_v8i64: 3336; RV64: # %bb.0: 3337; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma 3338; RV64-NEXT: vle64.v v8, (a0) 3339; RV64-NEXT: vmv.s.x v12, zero 3340; RV64-NEXT: vredxor.vs v8, v8, v12 3341; RV64-NEXT: vmv.x.s a0, v8 3342; RV64-NEXT: ret 3343 %v = load <8 x i64>, ptr %x 3344 %red = call i64 @llvm.vector.reduce.xor.v8i64(<8 x i64> %v) 3345 ret i64 %red 3346} 3347 3348declare i64 @llvm.vector.reduce.xor.v16i64(<16 x i64>) 3349 3350define i64 @vreduce_xor_v16i64(ptr %x) { 3351; RV32-LABEL: vreduce_xor_v16i64: 3352; RV32: # %bb.0: 3353; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3354; RV32-NEXT: vle64.v v8, (a0) 3355; RV32-NEXT: vmv.s.x v16, zero 3356; RV32-NEXT: li a1, 32 3357; RV32-NEXT: vredxor.vs v8, v8, v16 3358; RV32-NEXT: vmv.x.s a0, v8 3359; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3360; RV32-NEXT: vsrl.vx v8, v8, a1 3361; RV32-NEXT: vmv.x.s a1, v8 3362; RV32-NEXT: ret 3363; 3364; RV64-LABEL: vreduce_xor_v16i64: 3365; RV64: # %bb.0: 3366; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3367; RV64-NEXT: vle64.v v8, (a0) 3368; RV64-NEXT: vmv.s.x v16, zero 3369; RV64-NEXT: vredxor.vs v8, v8, v16 3370; RV64-NEXT: vmv.x.s a0, v8 3371; RV64-NEXT: ret 3372 %v = load <16 x i64>, ptr %x 3373 %red = call i64 
@llvm.vector.reduce.xor.v16i64(<16 x i64> %v) 3374 ret i64 %red 3375} 3376 3377declare i64 @llvm.vector.reduce.xor.v32i64(<32 x i64>) 3378 3379define i64 @vreduce_xor_v32i64(ptr %x) { 3380; RV32-LABEL: vreduce_xor_v32i64: 3381; RV32: # %bb.0: 3382; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3383; RV32-NEXT: vle64.v v8, (a0) 3384; RV32-NEXT: addi a0, a0, 128 3385; RV32-NEXT: vle64.v v16, (a0) 3386; RV32-NEXT: vxor.vv v8, v8, v16 3387; RV32-NEXT: vmv.s.x v16, zero 3388; RV32-NEXT: li a1, 32 3389; RV32-NEXT: vredxor.vs v8, v8, v16 3390; RV32-NEXT: vmv.x.s a0, v8 3391; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3392; RV32-NEXT: vsrl.vx v8, v8, a1 3393; RV32-NEXT: vmv.x.s a1, v8 3394; RV32-NEXT: ret 3395; 3396; RV64-LABEL: vreduce_xor_v32i64: 3397; RV64: # %bb.0: 3398; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3399; RV64-NEXT: vle64.v v8, (a0) 3400; RV64-NEXT: addi a0, a0, 128 3401; RV64-NEXT: vle64.v v16, (a0) 3402; RV64-NEXT: vxor.vv v8, v8, v16 3403; RV64-NEXT: vmv.s.x v16, zero 3404; RV64-NEXT: vredxor.vs v8, v8, v16 3405; RV64-NEXT: vmv.x.s a0, v8 3406; RV64-NEXT: ret 3407 %v = load <32 x i64>, ptr %x 3408 %red = call i64 @llvm.vector.reduce.xor.v32i64(<32 x i64> %v) 3409 ret i64 %red 3410} 3411 3412declare i64 @llvm.vector.reduce.xor.v64i64(<64 x i64>) 3413 3414define i64 @vreduce_xor_v64i64(ptr %x) nounwind { 3415; RV32-LABEL: vreduce_xor_v64i64: 3416; RV32: # %bb.0: 3417; RV32-NEXT: addi a1, a0, 384 3418; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3419; RV32-NEXT: vle64.v v24, (a1) 3420; RV32-NEXT: addi a1, a0, 128 3421; RV32-NEXT: vle64.v v0, (a1) 3422; RV32-NEXT: vle64.v v8, (a0) 3423; RV32-NEXT: addi a0, a0, 256 3424; RV32-NEXT: vle64.v v16, (a0) 3425; RV32-NEXT: vxor.vv v24, v0, v24 3426; RV32-NEXT: vmv.s.x v7, zero 3427; RV32-NEXT: li a1, 32 3428; RV32-NEXT: vxor.vv v8, v8, v16 3429; RV32-NEXT: vxor.vv v8, v8, v24 3430; RV32-NEXT: vredxor.vs v8, v8, v7 3431; RV32-NEXT: vmv.x.s a0, v8 3432; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3433; RV32-NEXT: vsrl.vx v8, v8, a1 3434; RV32-NEXT: vmv.x.s a1, v8 3435; RV32-NEXT: ret 3436; 3437; RV64-LABEL: vreduce_xor_v64i64: 3438; RV64: # %bb.0: 3439; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3440; RV64-NEXT: vle64.v v8, (a0) 3441; RV64-NEXT: addi a1, a0, 384 3442; RV64-NEXT: vle64.v v16, (a1) 3443; RV64-NEXT: addi a1, a0, 256 3444; RV64-NEXT: addi a0, a0, 128 3445; RV64-NEXT: vle64.v v24, (a0) 3446; RV64-NEXT: vle64.v v0, (a1) 3447; RV64-NEXT: vxor.vv v16, v24, v16 3448; RV64-NEXT: vxor.vv v8, v8, v0 3449; RV64-NEXT: vxor.vv v8, v8, v16 3450; RV64-NEXT: vmv.s.x v16, zero 3451; RV64-NEXT: vredxor.vs v8, v8, v16 3452; RV64-NEXT: vmv.x.s a0, v8 3453; RV64-NEXT: ret 3454 %v = load <64 x i64>, ptr %x 3455 %red = call i64 @llvm.vector.reduce.xor.v64i64(<64 x i64> %v) 3456 ret i64 %red 3457} 3458 3459declare i8 @llvm.vector.reduce.smin.v1i8(<1 x i8>) 3460 3461define i8 @vreduce_smin_v1i8(<1 x i8> %v) { 3462; CHECK-LABEL: vreduce_smin_v1i8: 3463; CHECK: # %bb.0: 3464; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma 3465; CHECK-NEXT: vmv.x.s a0, v8 3466; CHECK-NEXT: ret 3467 %red = call i8 @llvm.vector.reduce.smin.v1i8(<1 x i8> %v) 3468 ret i8 %red 3469} 3470 3471declare i8 @llvm.vector.reduce.smin.v2i8(<2 x i8>) 3472 3473define i8 @vreduce_smin_v2i8(ptr %x) { 3474; CHECK-LABEL: vreduce_smin_v2i8: 3475; CHECK: # %bb.0: 3476; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma 3477; CHECK-NEXT: vle8.v v8, (a0) 3478; CHECK-NEXT: vredmin.vs v8, v8, v8 3479; CHECK-NEXT: vmv.x.s a0, v8 3480; CHECK-NEXT: ret 3481 %v = load <2 x i8>, ptr %x 3482 %red 
= call i8 @llvm.vector.reduce.smin.v2i8(<2 x i8> %v) 3483 ret i8 %red 3484} 3485 3486declare i8 @llvm.vector.reduce.smin.v3i8(<3 x i8>) 3487 3488define i8 @vreduce_smin_v3i8(ptr %x) { 3489; CHECK-LABEL: vreduce_smin_v3i8: 3490; CHECK: # %bb.0: 3491; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma 3492; CHECK-NEXT: vle8.v v8, (a0) 3493; CHECK-NEXT: li a0, 127 3494; CHECK-NEXT: vmv.s.x v9, a0 3495; CHECK-NEXT: vredmin.vs v8, v8, v9 3496; CHECK-NEXT: vmv.x.s a0, v8 3497; CHECK-NEXT: ret 3498 %v = load <3 x i8>, ptr %x 3499 %red = call i8 @llvm.vector.reduce.smin.v3i8(<3 x i8> %v) 3500 ret i8 %red 3501} 3502 3503declare i8 @llvm.vector.reduce.smin.v4i8(<4 x i8>) 3504 3505define i8 @vreduce_smin_v4i8(ptr %x) { 3506; CHECK-LABEL: vreduce_smin_v4i8: 3507; CHECK: # %bb.0: 3508; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma 3509; CHECK-NEXT: vle8.v v8, (a0) 3510; CHECK-NEXT: vredmin.vs v8, v8, v8 3511; CHECK-NEXT: vmv.x.s a0, v8 3512; CHECK-NEXT: ret 3513 %v = load <4 x i8>, ptr %x 3514 %red = call i8 @llvm.vector.reduce.smin.v4i8(<4 x i8> %v) 3515 ret i8 %red 3516} 3517 3518declare i8 @llvm.vector.reduce.smin.v8i8(<8 x i8>) 3519 3520define i8 @vreduce_smin_v8i8(ptr %x) { 3521; CHECK-LABEL: vreduce_smin_v8i8: 3522; CHECK: # %bb.0: 3523; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma 3524; CHECK-NEXT: vle8.v v8, (a0) 3525; CHECK-NEXT: vredmin.vs v8, v8, v8 3526; CHECK-NEXT: vmv.x.s a0, v8 3527; CHECK-NEXT: ret 3528 %v = load <8 x i8>, ptr %x 3529 %red = call i8 @llvm.vector.reduce.smin.v8i8(<8 x i8> %v) 3530 ret i8 %red 3531} 3532 3533declare i8 @llvm.vector.reduce.smin.v16i8(<16 x i8>) 3534 3535define i8 @vreduce_smin_v16i8(ptr %x) { 3536; CHECK-LABEL: vreduce_smin_v16i8: 3537; CHECK: # %bb.0: 3538; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma 3539; CHECK-NEXT: vle8.v v8, (a0) 3540; CHECK-NEXT: vredmin.vs v8, v8, v8 3541; CHECK-NEXT: vmv.x.s a0, v8 3542; CHECK-NEXT: ret 3543 %v = load <16 x i8>, ptr %x 3544 %red = call i8 @llvm.vector.reduce.smin.v16i8(<16 x i8> %v) 3545 ret i8 %red 3546} 3547 3548declare i8 @llvm.vector.reduce.smin.v32i8(<32 x i8>) 3549 3550define i8 @vreduce_smin_v32i8(ptr %x) { 3551; CHECK-LABEL: vreduce_smin_v32i8: 3552; CHECK: # %bb.0: 3553; CHECK-NEXT: li a1, 32 3554; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma 3555; CHECK-NEXT: vle8.v v8, (a0) 3556; CHECK-NEXT: vredmin.vs v8, v8, v8 3557; CHECK-NEXT: vmv.x.s a0, v8 3558; CHECK-NEXT: ret 3559 %v = load <32 x i8>, ptr %x 3560 %red = call i8 @llvm.vector.reduce.smin.v32i8(<32 x i8> %v) 3561 ret i8 %red 3562} 3563 3564declare i8 @llvm.vector.reduce.smin.v64i8(<64 x i8>) 3565 3566define i8 @vreduce_smin_v64i8(ptr %x) { 3567; CHECK-LABEL: vreduce_smin_v64i8: 3568; CHECK: # %bb.0: 3569; CHECK-NEXT: li a1, 64 3570; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma 3571; CHECK-NEXT: vle8.v v8, (a0) 3572; CHECK-NEXT: vredmin.vs v8, v8, v8 3573; CHECK-NEXT: vmv.x.s a0, v8 3574; CHECK-NEXT: ret 3575 %v = load <64 x i8>, ptr %x 3576 %red = call i8 @llvm.vector.reduce.smin.v64i8(<64 x i8> %v) 3577 ret i8 %red 3578} 3579 3580declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) 3581 3582define i8 @vreduce_smin_v128i8(ptr %x) { 3583; CHECK-LABEL: vreduce_smin_v128i8: 3584; CHECK: # %bb.0: 3585; CHECK-NEXT: li a1, 128 3586; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 3587; CHECK-NEXT: vle8.v v8, (a0) 3588; CHECK-NEXT: vredmin.vs v8, v8, v8 3589; CHECK-NEXT: vmv.x.s a0, v8 3590; CHECK-NEXT: ret 3591 %v = load <128 x i8>, ptr %x 3592 %red = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> %v) 3593 ret i8 %red 3594} 3595 3596declare i8 
@llvm.vector.reduce.smin.v256i8(<256 x i8>) 3597 3598define i8 @vreduce_smin_v256i8(ptr %x) { 3599; CHECK-LABEL: vreduce_smin_v256i8: 3600; CHECK: # %bb.0: 3601; CHECK-NEXT: li a1, 128 3602; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 3603; CHECK-NEXT: vle8.v v8, (a0) 3604; CHECK-NEXT: addi a0, a0, 128 3605; CHECK-NEXT: vle8.v v16, (a0) 3606; CHECK-NEXT: vmin.vv v8, v8, v16 3607; CHECK-NEXT: vredmin.vs v8, v8, v8 3608; CHECK-NEXT: vmv.x.s a0, v8 3609; CHECK-NEXT: ret 3610 %v = load <256 x i8>, ptr %x 3611 %red = call i8 @llvm.vector.reduce.smin.v256i8(<256 x i8> %v) 3612 ret i8 %red 3613} 3614 3615declare i16 @llvm.vector.reduce.smin.v1i16(<1 x i16>) 3616 3617define i16 @vreduce_smin_v1i16(<1 x i16> %v) { 3618; CHECK-LABEL: vreduce_smin_v1i16: 3619; CHECK: # %bb.0: 3620; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma 3621; CHECK-NEXT: vmv.x.s a0, v8 3622; CHECK-NEXT: ret 3623 %red = call i16 @llvm.vector.reduce.smin.v1i16(<1 x i16> %v) 3624 ret i16 %red 3625} 3626 3627declare i16 @llvm.vector.reduce.smin.v2i16(<2 x i16>) 3628 3629define i16 @vreduce_smin_v2i16(ptr %x) { 3630; CHECK-LABEL: vreduce_smin_v2i16: 3631; CHECK: # %bb.0: 3632; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma 3633; CHECK-NEXT: vle16.v v8, (a0) 3634; CHECK-NEXT: vredmin.vs v8, v8, v8 3635; CHECK-NEXT: vmv.x.s a0, v8 3636; CHECK-NEXT: ret 3637 %v = load <2 x i16>, ptr %x 3638 %red = call i16 @llvm.vector.reduce.smin.v2i16(<2 x i16> %v) 3639 ret i16 %red 3640} 3641 3642declare i16 @llvm.vector.reduce.smin.v4i16(<4 x i16>) 3643 3644define i16 @vreduce_smin_v4i16(ptr %x) { 3645; CHECK-LABEL: vreduce_smin_v4i16: 3646; CHECK: # %bb.0: 3647; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma 3648; CHECK-NEXT: vle16.v v8, (a0) 3649; CHECK-NEXT: vredmin.vs v8, v8, v8 3650; CHECK-NEXT: vmv.x.s a0, v8 3651; CHECK-NEXT: ret 3652 %v = load <4 x i16>, ptr %x 3653 %red = call i16 @llvm.vector.reduce.smin.v4i16(<4 x i16> %v) 3654 ret i16 %red 3655} 3656 3657declare i16 @llvm.vector.reduce.smin.v8i16(<8 x i16>) 3658 3659define i16 @vreduce_smin_v8i16(ptr %x) { 3660; CHECK-LABEL: vreduce_smin_v8i16: 3661; CHECK: # %bb.0: 3662; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma 3663; CHECK-NEXT: vle16.v v8, (a0) 3664; CHECK-NEXT: vredmin.vs v8, v8, v8 3665; CHECK-NEXT: vmv.x.s a0, v8 3666; CHECK-NEXT: ret 3667 %v = load <8 x i16>, ptr %x 3668 %red = call i16 @llvm.vector.reduce.smin.v8i16(<8 x i16> %v) 3669 ret i16 %red 3670} 3671 3672declare i16 @llvm.vector.reduce.smin.v16i16(<16 x i16>) 3673 3674define i16 @vreduce_smin_v16i16(ptr %x) { 3675; CHECK-LABEL: vreduce_smin_v16i16: 3676; CHECK: # %bb.0: 3677; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma 3678; CHECK-NEXT: vle16.v v8, (a0) 3679; CHECK-NEXT: vredmin.vs v8, v8, v8 3680; CHECK-NEXT: vmv.x.s a0, v8 3681; CHECK-NEXT: ret 3682 %v = load <16 x i16>, ptr %x 3683 %red = call i16 @llvm.vector.reduce.smin.v16i16(<16 x i16> %v) 3684 ret i16 %red 3685} 3686 3687declare i16 @llvm.vector.reduce.smin.v32i16(<32 x i16>) 3688 3689define i16 @vreduce_smin_v32i16(ptr %x) { 3690; CHECK-LABEL: vreduce_smin_v32i16: 3691; CHECK: # %bb.0: 3692; CHECK-NEXT: li a1, 32 3693; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 3694; CHECK-NEXT: vle16.v v8, (a0) 3695; CHECK-NEXT: vredmin.vs v8, v8, v8 3696; CHECK-NEXT: vmv.x.s a0, v8 3697; CHECK-NEXT: ret 3698 %v = load <32 x i16>, ptr %x 3699 %red = call i16 @llvm.vector.reduce.smin.v32i16(<32 x i16> %v) 3700 ret i16 %red 3701} 3702 3703declare i16 @llvm.vector.reduce.smin.v64i16(<64 x i16>) 3704 3705define i16 @vreduce_smin_v64i16(ptr %x) { 3706; CHECK-LABEL: 
vreduce_smin_v64i16: 3707; CHECK: # %bb.0: 3708; CHECK-NEXT: li a1, 64 3709; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 3710; CHECK-NEXT: vle16.v v8, (a0) 3711; CHECK-NEXT: vredmin.vs v8, v8, v8 3712; CHECK-NEXT: vmv.x.s a0, v8 3713; CHECK-NEXT: ret 3714 %v = load <64 x i16>, ptr %x 3715 %red = call i16 @llvm.vector.reduce.smin.v64i16(<64 x i16> %v) 3716 ret i16 %red 3717} 3718 3719declare i16 @llvm.vector.reduce.smin.v128i16(<128 x i16>) 3720 3721define i16 @vreduce_smin_v128i16(ptr %x) { 3722; CHECK-LABEL: vreduce_smin_v128i16: 3723; CHECK: # %bb.0: 3724; CHECK-NEXT: li a1, 64 3725; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 3726; CHECK-NEXT: vle16.v v8, (a0) 3727; CHECK-NEXT: addi a0, a0, 128 3728; CHECK-NEXT: vle16.v v16, (a0) 3729; CHECK-NEXT: vmin.vv v8, v8, v16 3730; CHECK-NEXT: vredmin.vs v8, v8, v8 3731; CHECK-NEXT: vmv.x.s a0, v8 3732; CHECK-NEXT: ret 3733 %v = load <128 x i16>, ptr %x 3734 %red = call i16 @llvm.vector.reduce.smin.v128i16(<128 x i16> %v) 3735 ret i16 %red 3736} 3737 3738declare i32 @llvm.vector.reduce.smin.v1i32(<1 x i32>) 3739 3740define i32 @vreduce_smin_v1i32(<1 x i32> %v) { 3741; CHECK-LABEL: vreduce_smin_v1i32: 3742; CHECK: # %bb.0: 3743; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma 3744; CHECK-NEXT: vmv.x.s a0, v8 3745; CHECK-NEXT: ret 3746 %red = call i32 @llvm.vector.reduce.smin.v1i32(<1 x i32> %v) 3747 ret i32 %red 3748} 3749 3750declare i32 @llvm.vector.reduce.smin.v2i32(<2 x i32>) 3751 3752define i32 @vreduce_smin_v2i32(ptr %x) { 3753; CHECK-LABEL: vreduce_smin_v2i32: 3754; CHECK: # %bb.0: 3755; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma 3756; CHECK-NEXT: vle32.v v8, (a0) 3757; CHECK-NEXT: vredmin.vs v8, v8, v8 3758; CHECK-NEXT: vmv.x.s a0, v8 3759; CHECK-NEXT: ret 3760 %v = load <2 x i32>, ptr %x 3761 %red = call i32 @llvm.vector.reduce.smin.v2i32(<2 x i32> %v) 3762 ret i32 %red 3763} 3764 3765declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) 3766 3767define i32 @vreduce_smin_v4i32(ptr %x) { 3768; CHECK-LABEL: vreduce_smin_v4i32: 3769; CHECK: # %bb.0: 3770; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma 3771; CHECK-NEXT: vle32.v v8, (a0) 3772; CHECK-NEXT: vredmin.vs v8, v8, v8 3773; CHECK-NEXT: vmv.x.s a0, v8 3774; CHECK-NEXT: ret 3775 %v = load <4 x i32>, ptr %x 3776 %red = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> %v) 3777 ret i32 %red 3778} 3779 3780declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) 3781 3782define i32 @vreduce_smin_v8i32(ptr %x) { 3783; CHECK-LABEL: vreduce_smin_v8i32: 3784; CHECK: # %bb.0: 3785; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma 3786; CHECK-NEXT: vle32.v v8, (a0) 3787; CHECK-NEXT: vredmin.vs v8, v8, v8 3788; CHECK-NEXT: vmv.x.s a0, v8 3789; CHECK-NEXT: ret 3790 %v = load <8 x i32>, ptr %x 3791 %red = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> %v) 3792 ret i32 %red 3793} 3794 3795declare i32 @llvm.vector.reduce.smin.v16i32(<16 x i32>) 3796 3797define i32 @vreduce_smin_v16i32(ptr %x) { 3798; CHECK-LABEL: vreduce_smin_v16i32: 3799; CHECK: # %bb.0: 3800; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma 3801; CHECK-NEXT: vle32.v v8, (a0) 3802; CHECK-NEXT: vredmin.vs v8, v8, v8 3803; CHECK-NEXT: vmv.x.s a0, v8 3804; CHECK-NEXT: ret 3805 %v = load <16 x i32>, ptr %x 3806 %red = call i32 @llvm.vector.reduce.smin.v16i32(<16 x i32> %v) 3807 ret i32 %red 3808} 3809 3810declare i32 @llvm.vector.reduce.smin.v32i32(<32 x i32>) 3811 3812define i32 @vreduce_smin_v32i32(ptr %x) { 3813; CHECK-LABEL: vreduce_smin_v32i32: 3814; CHECK: # %bb.0: 3815; CHECK-NEXT: li a1, 32 3816; CHECK-NEXT: vsetvli zero, a1, 
e32, m8, ta, ma 3817; CHECK-NEXT: vle32.v v8, (a0) 3818; CHECK-NEXT: vredmin.vs v8, v8, v8 3819; CHECK-NEXT: vmv.x.s a0, v8 3820; CHECK-NEXT: ret 3821 %v = load <32 x i32>, ptr %x 3822 %red = call i32 @llvm.vector.reduce.smin.v32i32(<32 x i32> %v) 3823 ret i32 %red 3824} 3825 3826declare i32 @llvm.vector.reduce.smin.v64i32(<64 x i32>) 3827 3828define i32 @vreduce_smin_v64i32(ptr %x) { 3829; CHECK-LABEL: vreduce_smin_v64i32: 3830; CHECK: # %bb.0: 3831; CHECK-NEXT: li a1, 32 3832; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 3833; CHECK-NEXT: vle32.v v8, (a0) 3834; CHECK-NEXT: addi a0, a0, 128 3835; CHECK-NEXT: vle32.v v16, (a0) 3836; CHECK-NEXT: vmin.vv v8, v8, v16 3837; CHECK-NEXT: vredmin.vs v8, v8, v8 3838; CHECK-NEXT: vmv.x.s a0, v8 3839; CHECK-NEXT: ret 3840 %v = load <64 x i32>, ptr %x 3841 %red = call i32 @llvm.vector.reduce.smin.v64i32(<64 x i32> %v) 3842 ret i32 %red 3843} 3844 3845declare i64 @llvm.vector.reduce.smin.v1i64(<1 x i64>) 3846 3847define i64 @vreduce_smin_v1i64(<1 x i64> %v) { 3848; RV32-LABEL: vreduce_smin_v1i64: 3849; RV32: # %bb.0: 3850; RV32-NEXT: li a0, 32 3851; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3852; RV32-NEXT: vsrl.vx v9, v8, a0 3853; RV32-NEXT: vmv.x.s a1, v9 3854; RV32-NEXT: vmv.x.s a0, v8 3855; RV32-NEXT: ret 3856; 3857; RV64-LABEL: vreduce_smin_v1i64: 3858; RV64: # %bb.0: 3859; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3860; RV64-NEXT: vmv.x.s a0, v8 3861; RV64-NEXT: ret 3862 %red = call i64 @llvm.vector.reduce.smin.v1i64(<1 x i64> %v) 3863 ret i64 %red 3864} 3865 3866declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) 3867 3868define i64 @vreduce_smin_v2i64(ptr %x) { 3869; RV32-LABEL: vreduce_smin_v2i64: 3870; RV32: # %bb.0: 3871; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma 3872; RV32-NEXT: vle64.v v8, (a0) 3873; RV32-NEXT: li a0, 32 3874; RV32-NEXT: vredmin.vs v8, v8, v8 3875; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3876; RV32-NEXT: vsrl.vx v9, v8, a0 3877; RV32-NEXT: vmv.x.s a1, v9 3878; RV32-NEXT: vmv.x.s a0, v8 3879; RV32-NEXT: ret 3880; 3881; RV64-LABEL: vreduce_smin_v2i64: 3882; RV64: # %bb.0: 3883; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma 3884; RV64-NEXT: vle64.v v8, (a0) 3885; RV64-NEXT: vredmin.vs v8, v8, v8 3886; RV64-NEXT: vmv.x.s a0, v8 3887; RV64-NEXT: ret 3888 %v = load <2 x i64>, ptr %x 3889 %red = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> %v) 3890 ret i64 %red 3891} 3892 3893declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) 3894 3895define i64 @vreduce_smin_v4i64(ptr %x) { 3896; RV32-LABEL: vreduce_smin_v4i64: 3897; RV32: # %bb.0: 3898; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma 3899; RV32-NEXT: vle64.v v8, (a0) 3900; RV32-NEXT: li a1, 32 3901; RV32-NEXT: vredmin.vs v8, v8, v8 3902; RV32-NEXT: vmv.x.s a0, v8 3903; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3904; RV32-NEXT: vsrl.vx v8, v8, a1 3905; RV32-NEXT: vmv.x.s a1, v8 3906; RV32-NEXT: ret 3907; 3908; RV64-LABEL: vreduce_smin_v4i64: 3909; RV64: # %bb.0: 3910; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma 3911; RV64-NEXT: vle64.v v8, (a0) 3912; RV64-NEXT: vredmin.vs v8, v8, v8 3913; RV64-NEXT: vmv.x.s a0, v8 3914; RV64-NEXT: ret 3915 %v = load <4 x i64>, ptr %x 3916 %red = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v) 3917 ret i64 %red 3918} 3919 3920declare i64 @llvm.vector.reduce.smin.v8i64(<8 x i64>) 3921 3922define i64 @vreduce_smin_v8i64(ptr %x) { 3923; RV32-LABEL: vreduce_smin_v8i64: 3924; RV32: # %bb.0: 3925; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma 3926; RV32-NEXT: vle64.v v8, (a0) 3927; RV32-NEXT: li a1, 32 3928; RV32-NEXT: 
vredmin.vs v8, v8, v8 3929; RV32-NEXT: vmv.x.s a0, v8 3930; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3931; RV32-NEXT: vsrl.vx v8, v8, a1 3932; RV32-NEXT: vmv.x.s a1, v8 3933; RV32-NEXT: ret 3934; 3935; RV64-LABEL: vreduce_smin_v8i64: 3936; RV64: # %bb.0: 3937; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma 3938; RV64-NEXT: vle64.v v8, (a0) 3939; RV64-NEXT: vredmin.vs v8, v8, v8 3940; RV64-NEXT: vmv.x.s a0, v8 3941; RV64-NEXT: ret 3942 %v = load <8 x i64>, ptr %x 3943 %red = call i64 @llvm.vector.reduce.smin.v8i64(<8 x i64> %v) 3944 ret i64 %red 3945} 3946 3947declare i64 @llvm.vector.reduce.smin.v16i64(<16 x i64>) 3948 3949define i64 @vreduce_smin_v16i64(ptr %x) { 3950; RV32-LABEL: vreduce_smin_v16i64: 3951; RV32: # %bb.0: 3952; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3953; RV32-NEXT: vle64.v v8, (a0) 3954; RV32-NEXT: li a1, 32 3955; RV32-NEXT: vredmin.vs v8, v8, v8 3956; RV32-NEXT: vmv.x.s a0, v8 3957; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3958; RV32-NEXT: vsrl.vx v8, v8, a1 3959; RV32-NEXT: vmv.x.s a1, v8 3960; RV32-NEXT: ret 3961; 3962; RV64-LABEL: vreduce_smin_v16i64: 3963; RV64: # %bb.0: 3964; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3965; RV64-NEXT: vle64.v v8, (a0) 3966; RV64-NEXT: vredmin.vs v8, v8, v8 3967; RV64-NEXT: vmv.x.s a0, v8 3968; RV64-NEXT: ret 3969 %v = load <16 x i64>, ptr %x 3970 %red = call i64 @llvm.vector.reduce.smin.v16i64(<16 x i64> %v) 3971 ret i64 %red 3972} 3973 3974declare i64 @llvm.vector.reduce.smin.v32i64(<32 x i64>) 3975 3976define i64 @vreduce_smin_v32i64(ptr %x) { 3977; RV32-LABEL: vreduce_smin_v32i64: 3978; RV32: # %bb.0: 3979; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3980; RV32-NEXT: vle64.v v8, (a0) 3981; RV32-NEXT: addi a0, a0, 128 3982; RV32-NEXT: vle64.v v16, (a0) 3983; RV32-NEXT: li a1, 32 3984; RV32-NEXT: vmin.vv v8, v8, v16 3985; RV32-NEXT: vredmin.vs v8, v8, v8 3986; RV32-NEXT: vmv.x.s a0, v8 3987; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 3988; RV32-NEXT: vsrl.vx v8, v8, a1 3989; RV32-NEXT: vmv.x.s a1, v8 3990; RV32-NEXT: ret 3991; 3992; RV64-LABEL: vreduce_smin_v32i64: 3993; RV64: # %bb.0: 3994; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 3995; RV64-NEXT: vle64.v v8, (a0) 3996; RV64-NEXT: addi a0, a0, 128 3997; RV64-NEXT: vle64.v v16, (a0) 3998; RV64-NEXT: vmin.vv v8, v8, v16 3999; RV64-NEXT: vredmin.vs v8, v8, v8 4000; RV64-NEXT: vmv.x.s a0, v8 4001; RV64-NEXT: ret 4002 %v = load <32 x i64>, ptr %x 4003 %red = call i64 @llvm.vector.reduce.smin.v32i64(<32 x i64> %v) 4004 ret i64 %red 4005} 4006 4007declare i64 @llvm.vector.reduce.smin.v64i64(<64 x i64>) 4008 4009define i64 @vreduce_smin_v64i64(ptr %x) nounwind { 4010; RV32-LABEL: vreduce_smin_v64i64: 4011; RV32: # %bb.0: 4012; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 4013; RV32-NEXT: vle64.v v8, (a0) 4014; RV32-NEXT: addi a1, a0, 384 4015; RV32-NEXT: vle64.v v16, (a1) 4016; RV32-NEXT: addi a1, a0, 256 4017; RV32-NEXT: addi a0, a0, 128 4018; RV32-NEXT: vle64.v v0, (a0) 4019; RV32-NEXT: vle64.v v24, (a1) 4020; RV32-NEXT: li a1, 32 4021; RV32-NEXT: vmin.vv v16, v0, v16 4022; RV32-NEXT: vmin.vv v8, v8, v24 4023; RV32-NEXT: vmin.vv v8, v8, v16 4024; RV32-NEXT: vredmin.vs v8, v8, v8 4025; RV32-NEXT: vmv.x.s a0, v8 4026; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 4027; RV32-NEXT: vsrl.vx v8, v8, a1 4028; RV32-NEXT: vmv.x.s a1, v8 4029; RV32-NEXT: ret 4030; 4031; RV64-LABEL: vreduce_smin_v64i64: 4032; RV64: # %bb.0: 4033; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 4034; RV64-NEXT: vle64.v v8, (a0) 4035; RV64-NEXT: addi a1, a0, 384 4036; RV64-NEXT: vle64.v v16, 
(a1) 4037; RV64-NEXT: addi a1, a0, 256 4038; RV64-NEXT: addi a0, a0, 128 4039; RV64-NEXT: vle64.v v24, (a0) 4040; RV64-NEXT: vle64.v v0, (a1) 4041; RV64-NEXT: vmin.vv v16, v24, v16 4042; RV64-NEXT: vmin.vv v8, v8, v0 4043; RV64-NEXT: vmin.vv v8, v8, v16 4044; RV64-NEXT: vredmin.vs v8, v8, v8 4045; RV64-NEXT: vmv.x.s a0, v8 4046; RV64-NEXT: ret 4047 %v = load <64 x i64>, ptr %x 4048 %red = call i64 @llvm.vector.reduce.smin.v64i64(<64 x i64> %v) 4049 ret i64 %red 4050} 4051 4052declare i8 @llvm.vector.reduce.smax.v1i8(<1 x i8>) 4053 4054define i8 @vreduce_smax_v1i8(<1 x i8> %v) { 4055; CHECK-LABEL: vreduce_smax_v1i8: 4056; CHECK: # %bb.0: 4057; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma 4058; CHECK-NEXT: vmv.x.s a0, v8 4059; CHECK-NEXT: ret 4060 %red = call i8 @llvm.vector.reduce.smax.v1i8(<1 x i8> %v) 4061 ret i8 %red 4062} 4063 4064declare i8 @llvm.vector.reduce.smax.v2i8(<2 x i8>) 4065 4066define i8 @vreduce_smax_v2i8(ptr %x) { 4067; CHECK-LABEL: vreduce_smax_v2i8: 4068; CHECK: # %bb.0: 4069; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma 4070; CHECK-NEXT: vle8.v v8, (a0) 4071; CHECK-NEXT: vredmax.vs v8, v8, v8 4072; CHECK-NEXT: vmv.x.s a0, v8 4073; CHECK-NEXT: ret 4074 %v = load <2 x i8>, ptr %x 4075 %red = call i8 @llvm.vector.reduce.smax.v2i8(<2 x i8> %v) 4076 ret i8 %red 4077} 4078 4079declare i8 @llvm.vector.reduce.smax.v3i8(<3 x i8>) 4080 4081define i8 @vreduce_smax_v3i8(ptr %x) { 4082; CHECK-LABEL: vreduce_smax_v3i8: 4083; CHECK: # %bb.0: 4084; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma 4085; CHECK-NEXT: vle8.v v8, (a0) 4086; CHECK-NEXT: li a0, -128 4087; CHECK-NEXT: vmv.s.x v9, a0 4088; CHECK-NEXT: vredmax.vs v8, v8, v9 4089; CHECK-NEXT: vmv.x.s a0, v8 4090; CHECK-NEXT: ret 4091 %v = load <3 x i8>, ptr %x 4092 %red = call i8 @llvm.vector.reduce.smax.v3i8(<3 x i8> %v) 4093 ret i8 %red 4094} 4095 4096declare i8 @llvm.vector.reduce.smax.v4i8(<4 x i8>) 4097 4098define i8 @vreduce_smax_v4i8(ptr %x) { 4099; CHECK-LABEL: vreduce_smax_v4i8: 4100; CHECK: # %bb.0: 4101; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma 4102; CHECK-NEXT: vle8.v v8, (a0) 4103; CHECK-NEXT: vredmax.vs v8, v8, v8 4104; CHECK-NEXT: vmv.x.s a0, v8 4105; CHECK-NEXT: ret 4106 %v = load <4 x i8>, ptr %x 4107 %red = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> %v) 4108 ret i8 %red 4109} 4110 4111declare i8 @llvm.vector.reduce.smax.v8i8(<8 x i8>) 4112 4113define i8 @vreduce_smax_v8i8(ptr %x) { 4114; CHECK-LABEL: vreduce_smax_v8i8: 4115; CHECK: # %bb.0: 4116; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma 4117; CHECK-NEXT: vle8.v v8, (a0) 4118; CHECK-NEXT: vredmax.vs v8, v8, v8 4119; CHECK-NEXT: vmv.x.s a0, v8 4120; CHECK-NEXT: ret 4121 %v = load <8 x i8>, ptr %x 4122 %red = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> %v) 4123 ret i8 %red 4124} 4125 4126declare i8 @llvm.vector.reduce.smax.v16i8(<16 x i8>) 4127 4128define i8 @vreduce_smax_v16i8(ptr %x) { 4129; CHECK-LABEL: vreduce_smax_v16i8: 4130; CHECK: # %bb.0: 4131; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma 4132; CHECK-NEXT: vle8.v v8, (a0) 4133; CHECK-NEXT: vredmax.vs v8, v8, v8 4134; CHECK-NEXT: vmv.x.s a0, v8 4135; CHECK-NEXT: ret 4136 %v = load <16 x i8>, ptr %x 4137 %red = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> %v) 4138 ret i8 %red 4139} 4140 4141declare i8 @llvm.vector.reduce.smax.v32i8(<32 x i8>) 4142 4143define i8 @vreduce_smax_v32i8(ptr %x) { 4144; CHECK-LABEL: vreduce_smax_v32i8: 4145; CHECK: # %bb.0: 4146; CHECK-NEXT: li a1, 32 4147; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma 4148; CHECK-NEXT: vle8.v v8, (a0) 4149; CHECK-NEXT: 
vredmax.vs v8, v8, v8 4150; CHECK-NEXT: vmv.x.s a0, v8 4151; CHECK-NEXT: ret 4152 %v = load <32 x i8>, ptr %x 4153 %red = call i8 @llvm.vector.reduce.smax.v32i8(<32 x i8> %v) 4154 ret i8 %red 4155} 4156 4157declare i8 @llvm.vector.reduce.smax.v64i8(<64 x i8>) 4158 4159define i8 @vreduce_smax_v64i8(ptr %x) { 4160; CHECK-LABEL: vreduce_smax_v64i8: 4161; CHECK: # %bb.0: 4162; CHECK-NEXT: li a1, 64 4163; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma 4164; CHECK-NEXT: vle8.v v8, (a0) 4165; CHECK-NEXT: vredmax.vs v8, v8, v8 4166; CHECK-NEXT: vmv.x.s a0, v8 4167; CHECK-NEXT: ret 4168 %v = load <64 x i8>, ptr %x 4169 %red = call i8 @llvm.vector.reduce.smax.v64i8(<64 x i8> %v) 4170 ret i8 %red 4171} 4172 4173declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) 4174 4175define i8 @vreduce_smax_v128i8(ptr %x) { 4176; CHECK-LABEL: vreduce_smax_v128i8: 4177; CHECK: # %bb.0: 4178; CHECK-NEXT: li a1, 128 4179; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 4180; CHECK-NEXT: vle8.v v8, (a0) 4181; CHECK-NEXT: vredmax.vs v8, v8, v8 4182; CHECK-NEXT: vmv.x.s a0, v8 4183; CHECK-NEXT: ret 4184 %v = load <128 x i8>, ptr %x 4185 %red = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> %v) 4186 ret i8 %red 4187} 4188 4189declare i8 @llvm.vector.reduce.smax.v256i8(<256 x i8>) 4190 4191define i8 @vreduce_smax_v256i8(ptr %x) { 4192; CHECK-LABEL: vreduce_smax_v256i8: 4193; CHECK: # %bb.0: 4194; CHECK-NEXT: li a1, 128 4195; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 4196; CHECK-NEXT: vle8.v v8, (a0) 4197; CHECK-NEXT: addi a0, a0, 128 4198; CHECK-NEXT: vle8.v v16, (a0) 4199; CHECK-NEXT: vmax.vv v8, v8, v16 4200; CHECK-NEXT: vredmax.vs v8, v8, v8 4201; CHECK-NEXT: vmv.x.s a0, v8 4202; CHECK-NEXT: ret 4203 %v = load <256 x i8>, ptr %x 4204 %red = call i8 @llvm.vector.reduce.smax.v256i8(<256 x i8> %v) 4205 ret i8 %red 4206} 4207 4208declare i16 @llvm.vector.reduce.smax.v1i16(<1 x i16>) 4209 4210define i16 @vreduce_smax_v1i16(<1 x i16> %v) { 4211; CHECK-LABEL: vreduce_smax_v1i16: 4212; CHECK: # %bb.0: 4213; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma 4214; CHECK-NEXT: vmv.x.s a0, v8 4215; CHECK-NEXT: ret 4216 %red = call i16 @llvm.vector.reduce.smax.v1i16(<1 x i16> %v) 4217 ret i16 %red 4218} 4219 4220declare i16 @llvm.vector.reduce.smax.v2i16(<2 x i16>) 4221 4222define i16 @vreduce_smax_v2i16(ptr %x) { 4223; CHECK-LABEL: vreduce_smax_v2i16: 4224; CHECK: # %bb.0: 4225; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma 4226; CHECK-NEXT: vle16.v v8, (a0) 4227; CHECK-NEXT: vredmax.vs v8, v8, v8 4228; CHECK-NEXT: vmv.x.s a0, v8 4229; CHECK-NEXT: ret 4230 %v = load <2 x i16>, ptr %x 4231 %red = call i16 @llvm.vector.reduce.smax.v2i16(<2 x i16> %v) 4232 ret i16 %red 4233} 4234 4235declare i16 @llvm.vector.reduce.smax.v4i16(<4 x i16>) 4236 4237define i16 @vreduce_smax_v4i16(ptr %x) { 4238; CHECK-LABEL: vreduce_smax_v4i16: 4239; CHECK: # %bb.0: 4240; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma 4241; CHECK-NEXT: vle16.v v8, (a0) 4242; CHECK-NEXT: vredmax.vs v8, v8, v8 4243; CHECK-NEXT: vmv.x.s a0, v8 4244; CHECK-NEXT: ret 4245 %v = load <4 x i16>, ptr %x 4246 %red = call i16 @llvm.vector.reduce.smax.v4i16(<4 x i16> %v) 4247 ret i16 %red 4248} 4249 4250declare i16 @llvm.vector.reduce.smax.v8i16(<8 x i16>) 4251 4252define i16 @vreduce_smax_v8i16(ptr %x) { 4253; CHECK-LABEL: vreduce_smax_v8i16: 4254; CHECK: # %bb.0: 4255; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma 4256; CHECK-NEXT: vle16.v v8, (a0) 4257; CHECK-NEXT: vredmax.vs v8, v8, v8 4258; CHECK-NEXT: vmv.x.s a0, v8 4259; CHECK-NEXT: ret 4260 %v = load <8 x i16>, ptr %x 
4261 %red = call i16 @llvm.vector.reduce.smax.v8i16(<8 x i16> %v) 4262 ret i16 %red 4263} 4264 4265declare i16 @llvm.vector.reduce.smax.v16i16(<16 x i16>) 4266 4267define i16 @vreduce_smax_v16i16(ptr %x) { 4268; CHECK-LABEL: vreduce_smax_v16i16: 4269; CHECK: # %bb.0: 4270; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma 4271; CHECK-NEXT: vle16.v v8, (a0) 4272; CHECK-NEXT: vredmax.vs v8, v8, v8 4273; CHECK-NEXT: vmv.x.s a0, v8 4274; CHECK-NEXT: ret 4275 %v = load <16 x i16>, ptr %x 4276 %red = call i16 @llvm.vector.reduce.smax.v16i16(<16 x i16> %v) 4277 ret i16 %red 4278} 4279 4280declare i16 @llvm.vector.reduce.smax.v32i16(<32 x i16>) 4281 4282define i16 @vreduce_smax_v32i16(ptr %x) { 4283; CHECK-LABEL: vreduce_smax_v32i16: 4284; CHECK: # %bb.0: 4285; CHECK-NEXT: li a1, 32 4286; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 4287; CHECK-NEXT: vle16.v v8, (a0) 4288; CHECK-NEXT: vredmax.vs v8, v8, v8 4289; CHECK-NEXT: vmv.x.s a0, v8 4290; CHECK-NEXT: ret 4291 %v = load <32 x i16>, ptr %x 4292 %red = call i16 @llvm.vector.reduce.smax.v32i16(<32 x i16> %v) 4293 ret i16 %red 4294} 4295 4296declare i16 @llvm.vector.reduce.smax.v64i16(<64 x i16>) 4297 4298define i16 @vreduce_smax_v64i16(ptr %x) { 4299; CHECK-LABEL: vreduce_smax_v64i16: 4300; CHECK: # %bb.0: 4301; CHECK-NEXT: li a1, 64 4302; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 4303; CHECK-NEXT: vle16.v v8, (a0) 4304; CHECK-NEXT: vredmax.vs v8, v8, v8 4305; CHECK-NEXT: vmv.x.s a0, v8 4306; CHECK-NEXT: ret 4307 %v = load <64 x i16>, ptr %x 4308 %red = call i16 @llvm.vector.reduce.smax.v64i16(<64 x i16> %v) 4309 ret i16 %red 4310} 4311 4312declare i16 @llvm.vector.reduce.smax.v128i16(<128 x i16>) 4313 4314define i16 @vreduce_smax_v128i16(ptr %x) { 4315; CHECK-LABEL: vreduce_smax_v128i16: 4316; CHECK: # %bb.0: 4317; CHECK-NEXT: li a1, 64 4318; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 4319; CHECK-NEXT: vle16.v v8, (a0) 4320; CHECK-NEXT: addi a0, a0, 128 4321; CHECK-NEXT: vle16.v v16, (a0) 4322; CHECK-NEXT: vmax.vv v8, v8, v16 4323; CHECK-NEXT: vredmax.vs v8, v8, v8 4324; CHECK-NEXT: vmv.x.s a0, v8 4325; CHECK-NEXT: ret 4326 %v = load <128 x i16>, ptr %x 4327 %red = call i16 @llvm.vector.reduce.smax.v128i16(<128 x i16> %v) 4328 ret i16 %red 4329} 4330 4331declare i32 @llvm.vector.reduce.smax.v1i32(<1 x i32>) 4332 4333define i32 @vreduce_smax_v1i32(<1 x i32> %v) { 4334; CHECK-LABEL: vreduce_smax_v1i32: 4335; CHECK: # %bb.0: 4336; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma 4337; CHECK-NEXT: vmv.x.s a0, v8 4338; CHECK-NEXT: ret 4339 %red = call i32 @llvm.vector.reduce.smax.v1i32(<1 x i32> %v) 4340 ret i32 %red 4341} 4342 4343declare i32 @llvm.vector.reduce.smax.v2i32(<2 x i32>) 4344 4345define i32 @vreduce_smax_v2i32(ptr %x) { 4346; CHECK-LABEL: vreduce_smax_v2i32: 4347; CHECK: # %bb.0: 4348; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma 4349; CHECK-NEXT: vle32.v v8, (a0) 4350; CHECK-NEXT: vredmax.vs v8, v8, v8 4351; CHECK-NEXT: vmv.x.s a0, v8 4352; CHECK-NEXT: ret 4353 %v = load <2 x i32>, ptr %x 4354 %red = call i32 @llvm.vector.reduce.smax.v2i32(<2 x i32> %v) 4355 ret i32 %red 4356} 4357 4358declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) 4359 4360define i32 @vreduce_smax_v4i32(ptr %x) { 4361; CHECK-LABEL: vreduce_smax_v4i32: 4362; CHECK: # %bb.0: 4363; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma 4364; CHECK-NEXT: vle32.v v8, (a0) 4365; CHECK-NEXT: vredmax.vs v8, v8, v8 4366; CHECK-NEXT: vmv.x.s a0, v8 4367; CHECK-NEXT: ret 4368 %v = load <4 x i32>, ptr %x 4369 %red = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> %v) 4370 
ret i32 %red 4371} 4372 4373declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) 4374 4375define i32 @vreduce_smax_v8i32(ptr %x) { 4376; CHECK-LABEL: vreduce_smax_v8i32: 4377; CHECK: # %bb.0: 4378; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma 4379; CHECK-NEXT: vle32.v v8, (a0) 4380; CHECK-NEXT: vredmax.vs v8, v8, v8 4381; CHECK-NEXT: vmv.x.s a0, v8 4382; CHECK-NEXT: ret 4383 %v = load <8 x i32>, ptr %x 4384 %red = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> %v) 4385 ret i32 %red 4386} 4387 4388declare i32 @llvm.vector.reduce.smax.v16i32(<16 x i32>) 4389 4390define i32 @vreduce_smax_v16i32(ptr %x) { 4391; CHECK-LABEL: vreduce_smax_v16i32: 4392; CHECK: # %bb.0: 4393; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma 4394; CHECK-NEXT: vle32.v v8, (a0) 4395; CHECK-NEXT: vredmax.vs v8, v8, v8 4396; CHECK-NEXT: vmv.x.s a0, v8 4397; CHECK-NEXT: ret 4398 %v = load <16 x i32>, ptr %x 4399 %red = call i32 @llvm.vector.reduce.smax.v16i32(<16 x i32> %v) 4400 ret i32 %red 4401} 4402 4403declare i32 @llvm.vector.reduce.smax.v32i32(<32 x i32>) 4404 4405define i32 @vreduce_smax_v32i32(ptr %x) { 4406; CHECK-LABEL: vreduce_smax_v32i32: 4407; CHECK: # %bb.0: 4408; CHECK-NEXT: li a1, 32 4409; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 4410; CHECK-NEXT: vle32.v v8, (a0) 4411; CHECK-NEXT: vredmax.vs v8, v8, v8 4412; CHECK-NEXT: vmv.x.s a0, v8 4413; CHECK-NEXT: ret 4414 %v = load <32 x i32>, ptr %x 4415 %red = call i32 @llvm.vector.reduce.smax.v32i32(<32 x i32> %v) 4416 ret i32 %red 4417} 4418 4419declare i32 @llvm.vector.reduce.smax.v64i32(<64 x i32>) 4420 4421define i32 @vreduce_smax_v64i32(ptr %x) { 4422; CHECK-LABEL: vreduce_smax_v64i32: 4423; CHECK: # %bb.0: 4424; CHECK-NEXT: li a1, 32 4425; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 4426; CHECK-NEXT: vle32.v v8, (a0) 4427; CHECK-NEXT: addi a0, a0, 128 4428; CHECK-NEXT: vle32.v v16, (a0) 4429; CHECK-NEXT: vmax.vv v8, v8, v16 4430; CHECK-NEXT: vredmax.vs v8, v8, v8 4431; CHECK-NEXT: vmv.x.s a0, v8 4432; CHECK-NEXT: ret 4433 %v = load <64 x i32>, ptr %x 4434 %red = call i32 @llvm.vector.reduce.smax.v64i32(<64 x i32> %v) 4435 ret i32 %red 4436} 4437 4438declare i64 @llvm.vector.reduce.smax.v1i64(<1 x i64>) 4439 4440define i64 @vreduce_smax_v1i64(<1 x i64> %v) { 4441; RV32-LABEL: vreduce_smax_v1i64: 4442; RV32: # %bb.0: 4443; RV32-NEXT: li a0, 32 4444; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 4445; RV32-NEXT: vsrl.vx v9, v8, a0 4446; RV32-NEXT: vmv.x.s a1, v9 4447; RV32-NEXT: vmv.x.s a0, v8 4448; RV32-NEXT: ret 4449; 4450; RV64-LABEL: vreduce_smax_v1i64: 4451; RV64: # %bb.0: 4452; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma 4453; RV64-NEXT: vmv.x.s a0, v8 4454; RV64-NEXT: ret 4455 %red = call i64 @llvm.vector.reduce.smax.v1i64(<1 x i64> %v) 4456 ret i64 %red 4457} 4458 4459declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) 4460 4461define i64 @vreduce_smax_v2i64(ptr %x) { 4462; RV32-LABEL: vreduce_smax_v2i64: 4463; RV32: # %bb.0: 4464; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma 4465; RV32-NEXT: vle64.v v8, (a0) 4466; RV32-NEXT: li a0, 32 4467; RV32-NEXT: vredmax.vs v8, v8, v8 4468; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 4469; RV32-NEXT: vsrl.vx v9, v8, a0 4470; RV32-NEXT: vmv.x.s a1, v9 4471; RV32-NEXT: vmv.x.s a0, v8 4472; RV32-NEXT: ret 4473; 4474; RV64-LABEL: vreduce_smax_v2i64: 4475; RV64: # %bb.0: 4476; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma 4477; RV64-NEXT: vle64.v v8, (a0) 4478; RV64-NEXT: vredmax.vs v8, v8, v8 4479; RV64-NEXT: vmv.x.s a0, v8 4480; RV64-NEXT: ret 4481 %v = load <2 x i64>, ptr %x 4482 %red = call i64 
@llvm.vector.reduce.smax.v2i64(<2 x i64> %v) 4483 ret i64 %red 4484} 4485 4486declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) 4487 4488define i64 @vreduce_smax_v4i64(ptr %x) { 4489; RV32-LABEL: vreduce_smax_v4i64: 4490; RV32: # %bb.0: 4491; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma 4492; RV32-NEXT: vle64.v v8, (a0) 4493; RV32-NEXT: li a1, 32 4494; RV32-NEXT: vredmax.vs v8, v8, v8 4495; RV32-NEXT: vmv.x.s a0, v8 4496; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 4497; RV32-NEXT: vsrl.vx v8, v8, a1 4498; RV32-NEXT: vmv.x.s a1, v8 4499; RV32-NEXT: ret 4500; 4501; RV64-LABEL: vreduce_smax_v4i64: 4502; RV64: # %bb.0: 4503; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma 4504; RV64-NEXT: vle64.v v8, (a0) 4505; RV64-NEXT: vredmax.vs v8, v8, v8 4506; RV64-NEXT: vmv.x.s a0, v8 4507; RV64-NEXT: ret 4508 %v = load <4 x i64>, ptr %x 4509 %red = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v) 4510 ret i64 %red 4511} 4512 4513declare i64 @llvm.vector.reduce.smax.v8i64(<8 x i64>) 4514 4515define i64 @vreduce_smax_v8i64(ptr %x) { 4516; RV32-LABEL: vreduce_smax_v8i64: 4517; RV32: # %bb.0: 4518; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma 4519; RV32-NEXT: vle64.v v8, (a0) 4520; RV32-NEXT: li a1, 32 4521; RV32-NEXT: vredmax.vs v8, v8, v8 4522; RV32-NEXT: vmv.x.s a0, v8 4523; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 4524; RV32-NEXT: vsrl.vx v8, v8, a1 4525; RV32-NEXT: vmv.x.s a1, v8 4526; RV32-NEXT: ret 4527; 4528; RV64-LABEL: vreduce_smax_v8i64: 4529; RV64: # %bb.0: 4530; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma 4531; RV64-NEXT: vle64.v v8, (a0) 4532; RV64-NEXT: vredmax.vs v8, v8, v8 4533; RV64-NEXT: vmv.x.s a0, v8 4534; RV64-NEXT: ret 4535 %v = load <8 x i64>, ptr %x 4536 %red = call i64 @llvm.vector.reduce.smax.v8i64(<8 x i64> %v) 4537 ret i64 %red 4538} 4539 4540declare i64 @llvm.vector.reduce.smax.v16i64(<16 x i64>) 4541 4542define i64 @vreduce_smax_v16i64(ptr %x) { 4543; RV32-LABEL: vreduce_smax_v16i64: 4544; RV32: # %bb.0: 4545; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 4546; RV32-NEXT: vle64.v v8, (a0) 4547; RV32-NEXT: li a1, 32 4548; RV32-NEXT: vredmax.vs v8, v8, v8 4549; RV32-NEXT: vmv.x.s a0, v8 4550; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 4551; RV32-NEXT: vsrl.vx v8, v8, a1 4552; RV32-NEXT: vmv.x.s a1, v8 4553; RV32-NEXT: ret 4554; 4555; RV64-LABEL: vreduce_smax_v16i64: 4556; RV64: # %bb.0: 4557; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 4558; RV64-NEXT: vle64.v v8, (a0) 4559; RV64-NEXT: vredmax.vs v8, v8, v8 4560; RV64-NEXT: vmv.x.s a0, v8 4561; RV64-NEXT: ret 4562 %v = load <16 x i64>, ptr %x 4563 %red = call i64 @llvm.vector.reduce.smax.v16i64(<16 x i64> %v) 4564 ret i64 %red 4565} 4566 4567declare i64 @llvm.vector.reduce.smax.v32i64(<32 x i64>) 4568 4569define i64 @vreduce_smax_v32i64(ptr %x) { 4570; RV32-LABEL: vreduce_smax_v32i64: 4571; RV32: # %bb.0: 4572; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 4573; RV32-NEXT: vle64.v v8, (a0) 4574; RV32-NEXT: addi a0, a0, 128 4575; RV32-NEXT: vle64.v v16, (a0) 4576; RV32-NEXT: li a1, 32 4577; RV32-NEXT: vmax.vv v8, v8, v16 4578; RV32-NEXT: vredmax.vs v8, v8, v8 4579; RV32-NEXT: vmv.x.s a0, v8 4580; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 4581; RV32-NEXT: vsrl.vx v8, v8, a1 4582; RV32-NEXT: vmv.x.s a1, v8 4583; RV32-NEXT: ret 4584; 4585; RV64-LABEL: vreduce_smax_v32i64: 4586; RV64: # %bb.0: 4587; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 4588; RV64-NEXT: vle64.v v8, (a0) 4589; RV64-NEXT: addi a0, a0, 128 4590; RV64-NEXT: vle64.v v16, (a0) 4591; RV64-NEXT: vmax.vv v8, v8, v16 4592; RV64-NEXT: vredmax.vs v8, 
v8, v8 4593; RV64-NEXT: vmv.x.s a0, v8 4594; RV64-NEXT: ret 4595 %v = load <32 x i64>, ptr %x 4596 %red = call i64 @llvm.vector.reduce.smax.v32i64(<32 x i64> %v) 4597 ret i64 %red 4598} 4599 4600declare i64 @llvm.vector.reduce.smax.v64i64(<64 x i64>) 4601 4602define i64 @vreduce_smax_v64i64(ptr %x) nounwind { 4603; RV32-LABEL: vreduce_smax_v64i64: 4604; RV32: # %bb.0: 4605; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 4606; RV32-NEXT: vle64.v v8, (a0) 4607; RV32-NEXT: addi a1, a0, 384 4608; RV32-NEXT: vle64.v v16, (a1) 4609; RV32-NEXT: addi a1, a0, 256 4610; RV32-NEXT: addi a0, a0, 128 4611; RV32-NEXT: vle64.v v0, (a0) 4612; RV32-NEXT: vle64.v v24, (a1) 4613; RV32-NEXT: li a1, 32 4614; RV32-NEXT: vmax.vv v16, v0, v16 4615; RV32-NEXT: vmax.vv v8, v8, v24 4616; RV32-NEXT: vmax.vv v8, v8, v16 4617; RV32-NEXT: vredmax.vs v8, v8, v8 4618; RV32-NEXT: vmv.x.s a0, v8 4619; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 4620; RV32-NEXT: vsrl.vx v8, v8, a1 4621; RV32-NEXT: vmv.x.s a1, v8 4622; RV32-NEXT: ret 4623; 4624; RV64-LABEL: vreduce_smax_v64i64: 4625; RV64: # %bb.0: 4626; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 4627; RV64-NEXT: vle64.v v8, (a0) 4628; RV64-NEXT: addi a1, a0, 384 4629; RV64-NEXT: vle64.v v16, (a1) 4630; RV64-NEXT: addi a1, a0, 256 4631; RV64-NEXT: addi a0, a0, 128 4632; RV64-NEXT: vle64.v v24, (a0) 4633; RV64-NEXT: vle64.v v0, (a1) 4634; RV64-NEXT: vmax.vv v16, v24, v16 4635; RV64-NEXT: vmax.vv v8, v8, v0 4636; RV64-NEXT: vmax.vv v8, v8, v16 4637; RV64-NEXT: vredmax.vs v8, v8, v8 4638; RV64-NEXT: vmv.x.s a0, v8 4639; RV64-NEXT: ret 4640 %v = load <64 x i64>, ptr %x 4641 %red = call i64 @llvm.vector.reduce.smax.v64i64(<64 x i64> %v) 4642 ret i64 %red 4643} 4644 4645declare i8 @llvm.vector.reduce.umin.v1i8(<1 x i8>) 4646 4647define i8 @vreduce_umin_v1i8(<1 x i8> %v) { 4648; CHECK-LABEL: vreduce_umin_v1i8: 4649; CHECK: # %bb.0: 4650; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma 4651; CHECK-NEXT: vmv.x.s a0, v8 4652; CHECK-NEXT: ret 4653 %red = call i8 @llvm.vector.reduce.umin.v1i8(<1 x i8> %v) 4654 ret i8 %red 4655} 4656 4657declare i8 @llvm.vector.reduce.umin.v2i8(<2 x i8>) 4658 4659define i8 @vreduce_umin_v2i8(ptr %x) { 4660; CHECK-LABEL: vreduce_umin_v2i8: 4661; CHECK: # %bb.0: 4662; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma 4663; CHECK-NEXT: vle8.v v8, (a0) 4664; CHECK-NEXT: vredminu.vs v8, v8, v8 4665; CHECK-NEXT: vmv.x.s a0, v8 4666; CHECK-NEXT: ret 4667 %v = load <2 x i8>, ptr %x 4668 %red = call i8 @llvm.vector.reduce.umin.v2i8(<2 x i8> %v) 4669 ret i8 %red 4670} 4671 4672declare i8 @llvm.vector.reduce.umin.v3i8(<3 x i8>) 4673 4674define i8 @vreduce_umin_v3i8(ptr %x) { 4675; CHECK-LABEL: vreduce_umin_v3i8: 4676; CHECK: # %bb.0: 4677; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma 4678; CHECK-NEXT: vle8.v v8, (a0) 4679; CHECK-NEXT: li a0, -1 4680; CHECK-NEXT: vmv.s.x v9, a0 4681; CHECK-NEXT: vredminu.vs v8, v8, v9 4682; CHECK-NEXT: vmv.x.s a0, v8 4683; CHECK-NEXT: ret 4684 %v = load <3 x i8>, ptr %x 4685 %red = call i8 @llvm.vector.reduce.umin.v3i8(<3 x i8> %v) 4686 ret i8 %red 4687} 4688 4689declare i8 @llvm.vector.reduce.umin.v4i8(<4 x i8>) 4690 4691define i8 @vreduce_umin_v4i8(ptr %x) { 4692; CHECK-LABEL: vreduce_umin_v4i8: 4693; CHECK: # %bb.0: 4694; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma 4695; CHECK-NEXT: vle8.v v8, (a0) 4696; CHECK-NEXT: vredminu.vs v8, v8, v8 4697; CHECK-NEXT: vmv.x.s a0, v8 4698; CHECK-NEXT: ret 4699 %v = load <4 x i8>, ptr %x 4700 %red = call i8 @llvm.vector.reduce.umin.v4i8(<4 x i8> %v) 4701 ret i8 %red 4702} 4703 4704declare i8 
@llvm.vector.reduce.umin.v8i8(<8 x i8>) 4705 4706define i8 @vreduce_umin_v8i8(ptr %x) { 4707; CHECK-LABEL: vreduce_umin_v8i8: 4708; CHECK: # %bb.0: 4709; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma 4710; CHECK-NEXT: vle8.v v8, (a0) 4711; CHECK-NEXT: vredminu.vs v8, v8, v8 4712; CHECK-NEXT: vmv.x.s a0, v8 4713; CHECK-NEXT: ret 4714 %v = load <8 x i8>, ptr %x 4715 %red = call i8 @llvm.vector.reduce.umin.v8i8(<8 x i8> %v) 4716 ret i8 %red 4717} 4718 4719declare i8 @llvm.vector.reduce.umin.v16i8(<16 x i8>) 4720 4721define i8 @vreduce_umin_v16i8(ptr %x) { 4722; CHECK-LABEL: vreduce_umin_v16i8: 4723; CHECK: # %bb.0: 4724; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma 4725; CHECK-NEXT: vle8.v v8, (a0) 4726; CHECK-NEXT: vredminu.vs v8, v8, v8 4727; CHECK-NEXT: vmv.x.s a0, v8 4728; CHECK-NEXT: ret 4729 %v = load <16 x i8>, ptr %x 4730 %red = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> %v) 4731 ret i8 %red 4732} 4733 4734declare i8 @llvm.vector.reduce.umin.v32i8(<32 x i8>) 4735 4736define i8 @vreduce_umin_v32i8(ptr %x) { 4737; CHECK-LABEL: vreduce_umin_v32i8: 4738; CHECK: # %bb.0: 4739; CHECK-NEXT: li a1, 32 4740; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma 4741; CHECK-NEXT: vle8.v v8, (a0) 4742; CHECK-NEXT: vredminu.vs v8, v8, v8 4743; CHECK-NEXT: vmv.x.s a0, v8 4744; CHECK-NEXT: ret 4745 %v = load <32 x i8>, ptr %x 4746 %red = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> %v) 4747 ret i8 %red 4748} 4749 4750declare i8 @llvm.vector.reduce.umin.v64i8(<64 x i8>) 4751 4752define i8 @vreduce_umin_v64i8(ptr %x) { 4753; CHECK-LABEL: vreduce_umin_v64i8: 4754; CHECK: # %bb.0: 4755; CHECK-NEXT: li a1, 64 4756; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma 4757; CHECK-NEXT: vle8.v v8, (a0) 4758; CHECK-NEXT: vredminu.vs v8, v8, v8 4759; CHECK-NEXT: vmv.x.s a0, v8 4760; CHECK-NEXT: ret 4761 %v = load <64 x i8>, ptr %x 4762 %red = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> %v) 4763 ret i8 %red 4764} 4765 4766declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) 4767 4768define i8 @vreduce_umin_v128i8(ptr %x) { 4769; CHECK-LABEL: vreduce_umin_v128i8: 4770; CHECK: # %bb.0: 4771; CHECK-NEXT: li a1, 128 4772; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 4773; CHECK-NEXT: vle8.v v8, (a0) 4774; CHECK-NEXT: vredminu.vs v8, v8, v8 4775; CHECK-NEXT: vmv.x.s a0, v8 4776; CHECK-NEXT: ret 4777 %v = load <128 x i8>, ptr %x 4778 %red = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> %v) 4779 ret i8 %red 4780} 4781 4782declare i8 @llvm.vector.reduce.umin.v256i8(<256 x i8>) 4783 4784define i8 @vreduce_umin_v256i8(ptr %x) { 4785; CHECK-LABEL: vreduce_umin_v256i8: 4786; CHECK: # %bb.0: 4787; CHECK-NEXT: li a1, 128 4788; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 4789; CHECK-NEXT: vle8.v v8, (a0) 4790; CHECK-NEXT: addi a0, a0, 128 4791; CHECK-NEXT: vle8.v v16, (a0) 4792; CHECK-NEXT: vminu.vv v8, v8, v16 4793; CHECK-NEXT: vredminu.vs v8, v8, v8 4794; CHECK-NEXT: vmv.x.s a0, v8 4795; CHECK-NEXT: ret 4796 %v = load <256 x i8>, ptr %x 4797 %red = call i8 @llvm.vector.reduce.umin.v256i8(<256 x i8> %v) 4798 ret i8 %red 4799} 4800 4801declare i16 @llvm.vector.reduce.umin.v1i16(<1 x i16>) 4802 4803define i16 @vreduce_umin_v1i16(<1 x i16> %v) { 4804; CHECK-LABEL: vreduce_umin_v1i16: 4805; CHECK: # %bb.0: 4806; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma 4807; CHECK-NEXT: vmv.x.s a0, v8 4808; CHECK-NEXT: ret 4809 %red = call i16 @llvm.vector.reduce.umin.v1i16(<1 x i16> %v) 4810 ret i16 %red 4811} 4812 4813declare i16 @llvm.vector.reduce.umin.v2i16(<2 x i16>) 4814 4815define i16 @vreduce_umin_v2i16(ptr %x) { 4816; 
CHECK-LABEL: vreduce_umin_v2i16: 4817; CHECK: # %bb.0: 4818; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma 4819; CHECK-NEXT: vle16.v v8, (a0) 4820; CHECK-NEXT: vredminu.vs v8, v8, v8 4821; CHECK-NEXT: vmv.x.s a0, v8 4822; CHECK-NEXT: ret 4823 %v = load <2 x i16>, ptr %x 4824 %red = call i16 @llvm.vector.reduce.umin.v2i16(<2 x i16> %v) 4825 ret i16 %red 4826} 4827 4828declare i16 @llvm.vector.reduce.umin.v4i16(<4 x i16>) 4829 4830define i16 @vreduce_umin_v4i16(ptr %x) { 4831; CHECK-LABEL: vreduce_umin_v4i16: 4832; CHECK: # %bb.0: 4833; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma 4834; CHECK-NEXT: vle16.v v8, (a0) 4835; CHECK-NEXT: vredminu.vs v8, v8, v8 4836; CHECK-NEXT: vmv.x.s a0, v8 4837; CHECK-NEXT: ret 4838 %v = load <4 x i16>, ptr %x 4839 %red = call i16 @llvm.vector.reduce.umin.v4i16(<4 x i16> %v) 4840 ret i16 %red 4841} 4842 4843declare i16 @llvm.vector.reduce.umin.v8i16(<8 x i16>) 4844 4845define i16 @vreduce_umin_v8i16(ptr %x) { 4846; CHECK-LABEL: vreduce_umin_v8i16: 4847; CHECK: # %bb.0: 4848; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma 4849; CHECK-NEXT: vle16.v v8, (a0) 4850; CHECK-NEXT: vredminu.vs v8, v8, v8 4851; CHECK-NEXT: vmv.x.s a0, v8 4852; CHECK-NEXT: ret 4853 %v = load <8 x i16>, ptr %x 4854 %red = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %v) 4855 ret i16 %red 4856} 4857 4858declare i16 @llvm.vector.reduce.umin.v16i16(<16 x i16>) 4859 4860define i16 @vreduce_umin_v16i16(ptr %x) { 4861; CHECK-LABEL: vreduce_umin_v16i16: 4862; CHECK: # %bb.0: 4863; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma 4864; CHECK-NEXT: vle16.v v8, (a0) 4865; CHECK-NEXT: vredminu.vs v8, v8, v8 4866; CHECK-NEXT: vmv.x.s a0, v8 4867; CHECK-NEXT: ret 4868 %v = load <16 x i16>, ptr %x 4869 %red = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> %v) 4870 ret i16 %red 4871} 4872 4873declare i16 @llvm.vector.reduce.umin.v32i16(<32 x i16>) 4874 4875define i16 @vreduce_umin_v32i16(ptr %x) { 4876; CHECK-LABEL: vreduce_umin_v32i16: 4877; CHECK: # %bb.0: 4878; CHECK-NEXT: li a1, 32 4879; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 4880; CHECK-NEXT: vle16.v v8, (a0) 4881; CHECK-NEXT: vredminu.vs v8, v8, v8 4882; CHECK-NEXT: vmv.x.s a0, v8 4883; CHECK-NEXT: ret 4884 %v = load <32 x i16>, ptr %x 4885 %red = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> %v) 4886 ret i16 %red 4887} 4888 4889declare i16 @llvm.vector.reduce.umin.v64i16(<64 x i16>) 4890 4891define i16 @vreduce_umin_v64i16(ptr %x) { 4892; CHECK-LABEL: vreduce_umin_v64i16: 4893; CHECK: # %bb.0: 4894; CHECK-NEXT: li a1, 64 4895; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 4896; CHECK-NEXT: vle16.v v8, (a0) 4897; CHECK-NEXT: vredminu.vs v8, v8, v8 4898; CHECK-NEXT: vmv.x.s a0, v8 4899; CHECK-NEXT: ret 4900 %v = load <64 x i16>, ptr %x 4901 %red = call i16 @llvm.vector.reduce.umin.v64i16(<64 x i16> %v) 4902 ret i16 %red 4903} 4904 4905declare i16 @llvm.vector.reduce.umin.v128i16(<128 x i16>) 4906 4907define i16 @vreduce_umin_v128i16(ptr %x) { 4908; CHECK-LABEL: vreduce_umin_v128i16: 4909; CHECK: # %bb.0: 4910; CHECK-NEXT: li a1, 64 4911; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 4912; CHECK-NEXT: vle16.v v8, (a0) 4913; CHECK-NEXT: addi a0, a0, 128 4914; CHECK-NEXT: vle16.v v16, (a0) 4915; CHECK-NEXT: vminu.vv v8, v8, v16 4916; CHECK-NEXT: vredminu.vs v8, v8, v8 4917; CHECK-NEXT: vmv.x.s a0, v8 4918; CHECK-NEXT: ret 4919 %v = load <128 x i16>, ptr %x 4920 %red = call i16 @llvm.vector.reduce.umin.v128i16(<128 x i16> %v) 4921 ret i16 %red 4922} 4923 4924declare i32 @llvm.vector.reduce.umin.v1i32(<1 x i32>) 4925 4926define 
i32 @vreduce_umin_v1i32(<1 x i32> %v) { 4927; CHECK-LABEL: vreduce_umin_v1i32: 4928; CHECK: # %bb.0: 4929; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma 4930; CHECK-NEXT: vmv.x.s a0, v8 4931; CHECK-NEXT: ret 4932 %red = call i32 @llvm.vector.reduce.umin.v1i32(<1 x i32> %v) 4933 ret i32 %red 4934} 4935 4936declare i32 @llvm.vector.reduce.umin.v2i32(<2 x i32>) 4937 4938define i32 @vreduce_umin_v2i32(ptr %x) { 4939; CHECK-LABEL: vreduce_umin_v2i32: 4940; CHECK: # %bb.0: 4941; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma 4942; CHECK-NEXT: vle32.v v8, (a0) 4943; CHECK-NEXT: vredminu.vs v8, v8, v8 4944; CHECK-NEXT: vmv.x.s a0, v8 4945; CHECK-NEXT: ret 4946 %v = load <2 x i32>, ptr %x 4947 %red = call i32 @llvm.vector.reduce.umin.v2i32(<2 x i32> %v) 4948 ret i32 %red 4949} 4950 4951declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) 4952 4953define i32 @vreduce_umin_v4i32(ptr %x) { 4954; CHECK-LABEL: vreduce_umin_v4i32: 4955; CHECK: # %bb.0: 4956; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma 4957; CHECK-NEXT: vle32.v v8, (a0) 4958; CHECK-NEXT: vredminu.vs v8, v8, v8 4959; CHECK-NEXT: vmv.x.s a0, v8 4960; CHECK-NEXT: ret 4961 %v = load <4 x i32>, ptr %x 4962 %red = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> %v) 4963 ret i32 %red 4964} 4965 4966declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) 4967 4968define i32 @vreduce_umin_v8i32(ptr %x) { 4969; CHECK-LABEL: vreduce_umin_v8i32: 4970; CHECK: # %bb.0: 4971; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma 4972; CHECK-NEXT: vle32.v v8, (a0) 4973; CHECK-NEXT: vredminu.vs v8, v8, v8 4974; CHECK-NEXT: vmv.x.s a0, v8 4975; CHECK-NEXT: ret 4976 %v = load <8 x i32>, ptr %x 4977 %red = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> %v) 4978 ret i32 %red 4979} 4980 4981declare i32 @llvm.vector.reduce.umin.v16i32(<16 x i32>) 4982 4983define i32 @vreduce_umin_v16i32(ptr %x) { 4984; CHECK-LABEL: vreduce_umin_v16i32: 4985; CHECK: # %bb.0: 4986; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma 4987; CHECK-NEXT: vle32.v v8, (a0) 4988; CHECK-NEXT: vredminu.vs v8, v8, v8 4989; CHECK-NEXT: vmv.x.s a0, v8 4990; CHECK-NEXT: ret 4991 %v = load <16 x i32>, ptr %x 4992 %red = call i32 @llvm.vector.reduce.umin.v16i32(<16 x i32> %v) 4993 ret i32 %red 4994} 4995 4996declare i32 @llvm.vector.reduce.umin.v32i32(<32 x i32>) 4997 4998define i32 @vreduce_umin_v32i32(ptr %x) { 4999; CHECK-LABEL: vreduce_umin_v32i32: 5000; CHECK: # %bb.0: 5001; CHECK-NEXT: li a1, 32 5002; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 5003; CHECK-NEXT: vle32.v v8, (a0) 5004; CHECK-NEXT: vredminu.vs v8, v8, v8 5005; CHECK-NEXT: vmv.x.s a0, v8 5006; CHECK-NEXT: ret 5007 %v = load <32 x i32>, ptr %x 5008 %red = call i32 @llvm.vector.reduce.umin.v32i32(<32 x i32> %v) 5009 ret i32 %red 5010} 5011 5012declare i32 @llvm.vector.reduce.umin.v64i32(<64 x i32>) 5013 5014define i32 @vreduce_umin_v64i32(ptr %x) { 5015; CHECK-LABEL: vreduce_umin_v64i32: 5016; CHECK: # %bb.0: 5017; CHECK-NEXT: li a1, 32 5018; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma 5019; CHECK-NEXT: vle32.v v8, (a0) 5020; CHECK-NEXT: addi a0, a0, 128 5021; CHECK-NEXT: vle32.v v16, (a0) 5022; CHECK-NEXT: vminu.vv v8, v8, v16 5023; CHECK-NEXT: vredminu.vs v8, v8, v8 5024; CHECK-NEXT: vmv.x.s a0, v8 5025; CHECK-NEXT: ret 5026 %v = load <64 x i32>, ptr %x 5027 %red = call i32 @llvm.vector.reduce.umin.v64i32(<64 x i32> %v) 5028 ret i32 %red 5029} 5030 5031declare i64 @llvm.vector.reduce.umin.v1i64(<1 x i64>) 5032 5033define i64 @vreduce_umin_v1i64(<1 x i64> %v) { 5034; RV32-LABEL: vreduce_umin_v1i64: 5035; RV32: # %bb.0: 5036; 
RV32-NEXT: li a0, 32 5037; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 5038; RV32-NEXT: vsrl.vx v9, v8, a0 5039; RV32-NEXT: vmv.x.s a1, v9 5040; RV32-NEXT: vmv.x.s a0, v8 5041; RV32-NEXT: ret 5042; 5043; RV64-LABEL: vreduce_umin_v1i64: 5044; RV64: # %bb.0: 5045; RV64-NEXT: vsetivli zero, 1, e64, m1, ta, ma 5046; RV64-NEXT: vmv.x.s a0, v8 5047; RV64-NEXT: ret 5048 %red = call i64 @llvm.vector.reduce.umin.v1i64(<1 x i64> %v) 5049 ret i64 %red 5050} 5051 5052declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) 5053 5054define i64 @vreduce_umin_v2i64(ptr %x) { 5055; RV32-LABEL: vreduce_umin_v2i64: 5056; RV32: # %bb.0: 5057; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma 5058; RV32-NEXT: vle64.v v8, (a0) 5059; RV32-NEXT: li a0, 32 5060; RV32-NEXT: vredminu.vs v8, v8, v8 5061; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 5062; RV32-NEXT: vsrl.vx v9, v8, a0 5063; RV32-NEXT: vmv.x.s a1, v9 5064; RV32-NEXT: vmv.x.s a0, v8 5065; RV32-NEXT: ret 5066; 5067; RV64-LABEL: vreduce_umin_v2i64: 5068; RV64: # %bb.0: 5069; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma 5070; RV64-NEXT: vle64.v v8, (a0) 5071; RV64-NEXT: vredminu.vs v8, v8, v8 5072; RV64-NEXT: vmv.x.s a0, v8 5073; RV64-NEXT: ret 5074 %v = load <2 x i64>, ptr %x 5075 %red = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> %v) 5076 ret i64 %red 5077} 5078 5079declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) 5080 5081define i64 @vreduce_umin_v4i64(ptr %x) { 5082; RV32-LABEL: vreduce_umin_v4i64: 5083; RV32: # %bb.0: 5084; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma 5085; RV32-NEXT: vle64.v v8, (a0) 5086; RV32-NEXT: li a1, 32 5087; RV32-NEXT: vredminu.vs v8, v8, v8 5088; RV32-NEXT: vmv.x.s a0, v8 5089; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 5090; RV32-NEXT: vsrl.vx v8, v8, a1 5091; RV32-NEXT: vmv.x.s a1, v8 5092; RV32-NEXT: ret 5093; 5094; RV64-LABEL: vreduce_umin_v4i64: 5095; RV64: # %bb.0: 5096; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma 5097; RV64-NEXT: vle64.v v8, (a0) 5098; RV64-NEXT: vredminu.vs v8, v8, v8 5099; RV64-NEXT: vmv.x.s a0, v8 5100; RV64-NEXT: ret 5101 %v = load <4 x i64>, ptr %x 5102 %red = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v) 5103 ret i64 %red 5104} 5105 5106declare i64 @llvm.vector.reduce.umin.v8i64(<8 x i64>) 5107 5108define i64 @vreduce_umin_v8i64(ptr %x) { 5109; RV32-LABEL: vreduce_umin_v8i64: 5110; RV32: # %bb.0: 5111; RV32-NEXT: vsetivli zero, 8, e64, m4, ta, ma 5112; RV32-NEXT: vle64.v v8, (a0) 5113; RV32-NEXT: li a1, 32 5114; RV32-NEXT: vredminu.vs v8, v8, v8 5115; RV32-NEXT: vmv.x.s a0, v8 5116; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 5117; RV32-NEXT: vsrl.vx v8, v8, a1 5118; RV32-NEXT: vmv.x.s a1, v8 5119; RV32-NEXT: ret 5120; 5121; RV64-LABEL: vreduce_umin_v8i64: 5122; RV64: # %bb.0: 5123; RV64-NEXT: vsetivli zero, 8, e64, m4, ta, ma 5124; RV64-NEXT: vle64.v v8, (a0) 5125; RV64-NEXT: vredminu.vs v8, v8, v8 5126; RV64-NEXT: vmv.x.s a0, v8 5127; RV64-NEXT: ret 5128 %v = load <8 x i64>, ptr %x 5129 %red = call i64 @llvm.vector.reduce.umin.v8i64(<8 x i64> %v) 5130 ret i64 %red 5131} 5132 5133declare i64 @llvm.vector.reduce.umin.v16i64(<16 x i64>) 5134 5135define i64 @vreduce_umin_v16i64(ptr %x) { 5136; RV32-LABEL: vreduce_umin_v16i64: 5137; RV32: # %bb.0: 5138; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 5139; RV32-NEXT: vle64.v v8, (a0) 5140; RV32-NEXT: li a1, 32 5141; RV32-NEXT: vredminu.vs v8, v8, v8 5142; RV32-NEXT: vmv.x.s a0, v8 5143; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 5144; RV32-NEXT: vsrl.vx v8, v8, a1 5145; RV32-NEXT: vmv.x.s a1, v8 5146; RV32-NEXT: ret 5147; 5148; 
RV64-LABEL: vreduce_umin_v16i64: 5149; RV64: # %bb.0: 5150; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 5151; RV64-NEXT: vle64.v v8, (a0) 5152; RV64-NEXT: vredminu.vs v8, v8, v8 5153; RV64-NEXT: vmv.x.s a0, v8 5154; RV64-NEXT: ret 5155 %v = load <16 x i64>, ptr %x 5156 %red = call i64 @llvm.vector.reduce.umin.v16i64(<16 x i64> %v) 5157 ret i64 %red 5158} 5159 5160declare i64 @llvm.vector.reduce.umin.v32i64(<32 x i64>) 5161 5162define i64 @vreduce_umin_v32i64(ptr %x) { 5163; RV32-LABEL: vreduce_umin_v32i64: 5164; RV32: # %bb.0: 5165; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 5166; RV32-NEXT: vle64.v v8, (a0) 5167; RV32-NEXT: addi a0, a0, 128 5168; RV32-NEXT: vle64.v v16, (a0) 5169; RV32-NEXT: li a1, 32 5170; RV32-NEXT: vminu.vv v8, v8, v16 5171; RV32-NEXT: vredminu.vs v8, v8, v8 5172; RV32-NEXT: vmv.x.s a0, v8 5173; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 5174; RV32-NEXT: vsrl.vx v8, v8, a1 5175; RV32-NEXT: vmv.x.s a1, v8 5176; RV32-NEXT: ret 5177; 5178; RV64-LABEL: vreduce_umin_v32i64: 5179; RV64: # %bb.0: 5180; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 5181; RV64-NEXT: vle64.v v8, (a0) 5182; RV64-NEXT: addi a0, a0, 128 5183; RV64-NEXT: vle64.v v16, (a0) 5184; RV64-NEXT: vminu.vv v8, v8, v16 5185; RV64-NEXT: vredminu.vs v8, v8, v8 5186; RV64-NEXT: vmv.x.s a0, v8 5187; RV64-NEXT: ret 5188 %v = load <32 x i64>, ptr %x 5189 %red = call i64 @llvm.vector.reduce.umin.v32i64(<32 x i64> %v) 5190 ret i64 %red 5191} 5192 5193declare i64 @llvm.vector.reduce.umin.v64i64(<64 x i64>) 5194 5195define i64 @vreduce_umin_v64i64(ptr %x) nounwind { 5196; RV32-LABEL: vreduce_umin_v64i64: 5197; RV32: # %bb.0: 5198; RV32-NEXT: vsetivli zero, 16, e64, m8, ta, ma 5199; RV32-NEXT: vle64.v v8, (a0) 5200; RV32-NEXT: addi a1, a0, 384 5201; RV32-NEXT: vle64.v v16, (a1) 5202; RV32-NEXT: addi a1, a0, 256 5203; RV32-NEXT: addi a0, a0, 128 5204; RV32-NEXT: vle64.v v0, (a0) 5205; RV32-NEXT: vle64.v v24, (a1) 5206; RV32-NEXT: li a1, 32 5207; RV32-NEXT: vminu.vv v16, v0, v16 5208; RV32-NEXT: vminu.vv v8, v8, v24 5209; RV32-NEXT: vminu.vv v8, v8, v16 5210; RV32-NEXT: vredminu.vs v8, v8, v8 5211; RV32-NEXT: vmv.x.s a0, v8 5212; RV32-NEXT: vsetivli zero, 1, e64, m1, ta, ma 5213; RV32-NEXT: vsrl.vx v8, v8, a1 5214; RV32-NEXT: vmv.x.s a1, v8 5215; RV32-NEXT: ret 5216; 5217; RV64-LABEL: vreduce_umin_v64i64: 5218; RV64: # %bb.0: 5219; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma 5220; RV64-NEXT: vle64.v v8, (a0) 5221; RV64-NEXT: addi a1, a0, 384 5222; RV64-NEXT: vle64.v v16, (a1) 5223; RV64-NEXT: addi a1, a0, 256 5224; RV64-NEXT: addi a0, a0, 128 5225; RV64-NEXT: vle64.v v24, (a0) 5226; RV64-NEXT: vle64.v v0, (a1) 5227; RV64-NEXT: vminu.vv v16, v24, v16 5228; RV64-NEXT: vminu.vv v8, v8, v0 5229; RV64-NEXT: vminu.vv v8, v8, v16 5230; RV64-NEXT: vredminu.vs v8, v8, v8 5231; RV64-NEXT: vmv.x.s a0, v8 5232; RV64-NEXT: ret 5233 %v = load <64 x i64>, ptr %x 5234 %red = call i64 @llvm.vector.reduce.umin.v64i64(<64 x i64> %v) 5235 ret i64 %red 5236} 5237 5238declare i8 @llvm.vector.reduce.umax.v1i8(<1 x i8>) 5239 5240define i8 @vreduce_umax_v1i8(<1 x i8> %v) { 5241; CHECK-LABEL: vreduce_umax_v1i8: 5242; CHECK: # %bb.0: 5243; CHECK-NEXT: vsetivli zero, 1, e8, m1, ta, ma 5244; CHECK-NEXT: vmv.x.s a0, v8 5245; CHECK-NEXT: ret 5246 %red = call i8 @llvm.vector.reduce.umax.v1i8(<1 x i8> %v) 5247 ret i8 %red 5248} 5249 5250declare i8 @llvm.vector.reduce.umax.v2i8(<2 x i8>) 5251 5252define i8 @vreduce_umax_v2i8(ptr %x) { 5253; CHECK-LABEL: vreduce_umax_v2i8: 5254; CHECK: # %bb.0: 5255; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma 
5256; CHECK-NEXT: vle8.v v8, (a0) 5257; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5258; CHECK-NEXT: vmv.x.s a0, v8 5259; CHECK-NEXT: ret 5260 %v = load <2 x i8>, ptr %x 5261 %red = call i8 @llvm.vector.reduce.umax.v2i8(<2 x i8> %v) 5262 ret i8 %red 5263} 5264 5265declare i8 @llvm.vector.reduce.umax.v3i8(<3 x i8>) 5266 5267define i8 @vreduce_umax_v3i8(ptr %x) { 5268; CHECK-LABEL: vreduce_umax_v3i8: 5269; CHECK: # %bb.0: 5270; CHECK-NEXT: vsetivli zero, 3, e8, mf4, ta, ma 5271; CHECK-NEXT: vle8.v v8, (a0) 5272; CHECK-NEXT: vmv.s.x v9, zero 5273; CHECK-NEXT: vredmaxu.vs v8, v8, v9 5274; CHECK-NEXT: vmv.x.s a0, v8 5275; CHECK-NEXT: ret 5276 %v = load <3 x i8>, ptr %x 5277 %red = call i8 @llvm.vector.reduce.umax.v3i8(<3 x i8> %v) 5278 ret i8 %red 5279} 5280 5281declare i8 @llvm.vector.reduce.umax.v4i8(<4 x i8>) 5282 5283define i8 @vreduce_umax_v4i8(ptr %x) { 5284; CHECK-LABEL: vreduce_umax_v4i8: 5285; CHECK: # %bb.0: 5286; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma 5287; CHECK-NEXT: vle8.v v8, (a0) 5288; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5289; CHECK-NEXT: vmv.x.s a0, v8 5290; CHECK-NEXT: ret 5291 %v = load <4 x i8>, ptr %x 5292 %red = call i8 @llvm.vector.reduce.umax.v4i8(<4 x i8> %v) 5293 ret i8 %red 5294} 5295 5296declare i8 @llvm.vector.reduce.umax.v8i8(<8 x i8>) 5297 5298define i8 @vreduce_umax_v8i8(ptr %x) { 5299; CHECK-LABEL: vreduce_umax_v8i8: 5300; CHECK: # %bb.0: 5301; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma 5302; CHECK-NEXT: vle8.v v8, (a0) 5303; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5304; CHECK-NEXT: vmv.x.s a0, v8 5305; CHECK-NEXT: ret 5306 %v = load <8 x i8>, ptr %x 5307 %red = call i8 @llvm.vector.reduce.umax.v8i8(<8 x i8> %v) 5308 ret i8 %red 5309} 5310 5311declare i8 @llvm.vector.reduce.umax.v16i8(<16 x i8>) 5312 5313define i8 @vreduce_umax_v16i8(ptr %x) { 5314; CHECK-LABEL: vreduce_umax_v16i8: 5315; CHECK: # %bb.0: 5316; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma 5317; CHECK-NEXT: vle8.v v8, (a0) 5318; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5319; CHECK-NEXT: vmv.x.s a0, v8 5320; CHECK-NEXT: ret 5321 %v = load <16 x i8>, ptr %x 5322 %red = call i8 @llvm.vector.reduce.umax.v16i8(<16 x i8> %v) 5323 ret i8 %red 5324} 5325 5326declare i8 @llvm.vector.reduce.umax.v32i8(<32 x i8>) 5327 5328define i8 @vreduce_umax_v32i8(ptr %x) { 5329; CHECK-LABEL: vreduce_umax_v32i8: 5330; CHECK: # %bb.0: 5331; CHECK-NEXT: li a1, 32 5332; CHECK-NEXT: vsetvli zero, a1, e8, m2, ta, ma 5333; CHECK-NEXT: vle8.v v8, (a0) 5334; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5335; CHECK-NEXT: vmv.x.s a0, v8 5336; CHECK-NEXT: ret 5337 %v = load <32 x i8>, ptr %x 5338 %red = call i8 @llvm.vector.reduce.umax.v32i8(<32 x i8> %v) 5339 ret i8 %red 5340} 5341 5342declare i8 @llvm.vector.reduce.umax.v64i8(<64 x i8>) 5343 5344define i8 @vreduce_umax_v64i8(ptr %x) { 5345; CHECK-LABEL: vreduce_umax_v64i8: 5346; CHECK: # %bb.0: 5347; CHECK-NEXT: li a1, 64 5348; CHECK-NEXT: vsetvli zero, a1, e8, m4, ta, ma 5349; CHECK-NEXT: vle8.v v8, (a0) 5350; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5351; CHECK-NEXT: vmv.x.s a0, v8 5352; CHECK-NEXT: ret 5353 %v = load <64 x i8>, ptr %x 5354 %red = call i8 @llvm.vector.reduce.umax.v64i8(<64 x i8> %v) 5355 ret i8 %red 5356} 5357 5358declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) 5359 5360define i8 @vreduce_umax_v128i8(ptr %x) { 5361; CHECK-LABEL: vreduce_umax_v128i8: 5362; CHECK: # %bb.0: 5363; CHECK-NEXT: li a1, 128 5364; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 5365; CHECK-NEXT: vle8.v v8, (a0) 5366; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5367; CHECK-NEXT: vmv.x.s a0, v8 5368; CHECK-NEXT: ret 
5369 %v = load <128 x i8>, ptr %x 5370 %red = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> %v) 5371 ret i8 %red 5372} 5373 5374declare i8 @llvm.vector.reduce.umax.v256i8(<256 x i8>) 5375 5376define i8 @vreduce_umax_v256i8(ptr %x) { 5377; CHECK-LABEL: vreduce_umax_v256i8: 5378; CHECK: # %bb.0: 5379; CHECK-NEXT: li a1, 128 5380; CHECK-NEXT: vsetvli zero, a1, e8, m8, ta, ma 5381; CHECK-NEXT: vle8.v v8, (a0) 5382; CHECK-NEXT: addi a0, a0, 128 5383; CHECK-NEXT: vle8.v v16, (a0) 5384; CHECK-NEXT: vmaxu.vv v8, v8, v16 5385; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5386; CHECK-NEXT: vmv.x.s a0, v8 5387; CHECK-NEXT: ret 5388 %v = load <256 x i8>, ptr %x 5389 %red = call i8 @llvm.vector.reduce.umax.v256i8(<256 x i8> %v) 5390 ret i8 %red 5391} 5392 5393declare i16 @llvm.vector.reduce.umax.v1i16(<1 x i16>) 5394 5395define i16 @vreduce_umax_v1i16(<1 x i16> %v) { 5396; CHECK-LABEL: vreduce_umax_v1i16: 5397; CHECK: # %bb.0: 5398; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma 5399; CHECK-NEXT: vmv.x.s a0, v8 5400; CHECK-NEXT: ret 5401 %red = call i16 @llvm.vector.reduce.umax.v1i16(<1 x i16> %v) 5402 ret i16 %red 5403} 5404 5405declare i16 @llvm.vector.reduce.umax.v2i16(<2 x i16>) 5406 5407define i16 @vreduce_umax_v2i16(ptr %x) { 5408; CHECK-LABEL: vreduce_umax_v2i16: 5409; CHECK: # %bb.0: 5410; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma 5411; CHECK-NEXT: vle16.v v8, (a0) 5412; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5413; CHECK-NEXT: vmv.x.s a0, v8 5414; CHECK-NEXT: ret 5415 %v = load <2 x i16>, ptr %x 5416 %red = call i16 @llvm.vector.reduce.umax.v2i16(<2 x i16> %v) 5417 ret i16 %red 5418} 5419 5420declare i16 @llvm.vector.reduce.umax.v4i16(<4 x i16>) 5421 5422define i16 @vreduce_umax_v4i16(ptr %x) { 5423; CHECK-LABEL: vreduce_umax_v4i16: 5424; CHECK: # %bb.0: 5425; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma 5426; CHECK-NEXT: vle16.v v8, (a0) 5427; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5428; CHECK-NEXT: vmv.x.s a0, v8 5429; CHECK-NEXT: ret 5430 %v = load <4 x i16>, ptr %x 5431 %red = call i16 @llvm.vector.reduce.umax.v4i16(<4 x i16> %v) 5432 ret i16 %red 5433} 5434 5435declare i16 @llvm.vector.reduce.umax.v8i16(<8 x i16>) 5436 5437define i16 @vreduce_umax_v8i16(ptr %x) { 5438; CHECK-LABEL: vreduce_umax_v8i16: 5439; CHECK: # %bb.0: 5440; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma 5441; CHECK-NEXT: vle16.v v8, (a0) 5442; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5443; CHECK-NEXT: vmv.x.s a0, v8 5444; CHECK-NEXT: ret 5445 %v = load <8 x i16>, ptr %x 5446 %red = call i16 @llvm.vector.reduce.umax.v8i16(<8 x i16> %v) 5447 ret i16 %red 5448} 5449 5450declare i16 @llvm.vector.reduce.umax.v16i16(<16 x i16>) 5451 5452define i16 @vreduce_umax_v16i16(ptr %x) { 5453; CHECK-LABEL: vreduce_umax_v16i16: 5454; CHECK: # %bb.0: 5455; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma 5456; CHECK-NEXT: vle16.v v8, (a0) 5457; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5458; CHECK-NEXT: vmv.x.s a0, v8 5459; CHECK-NEXT: ret 5460 %v = load <16 x i16>, ptr %x 5461 %red = call i16 @llvm.vector.reduce.umax.v16i16(<16 x i16> %v) 5462 ret i16 %red 5463} 5464 5465declare i16 @llvm.vector.reduce.umax.v32i16(<32 x i16>) 5466 5467define i16 @vreduce_umax_v32i16(ptr %x) { 5468; CHECK-LABEL: vreduce_umax_v32i16: 5469; CHECK: # %bb.0: 5470; CHECK-NEXT: li a1, 32 5471; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma 5472; CHECK-NEXT: vle16.v v8, (a0) 5473; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5474; CHECK-NEXT: vmv.x.s a0, v8 5475; CHECK-NEXT: ret 5476 %v = load <32 x i16>, ptr %x 5477 %red = call i16 @llvm.vector.reduce.umax.v32i16(<32 x i16> %v) 5478 ret i16 
%red 5479} 5480 5481declare i16 @llvm.vector.reduce.umax.v64i16(<64 x i16>) 5482 5483define i16 @vreduce_umax_v64i16(ptr %x) { 5484; CHECK-LABEL: vreduce_umax_v64i16: 5485; CHECK: # %bb.0: 5486; CHECK-NEXT: li a1, 64 5487; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 5488; CHECK-NEXT: vle16.v v8, (a0) 5489; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5490; CHECK-NEXT: vmv.x.s a0, v8 5491; CHECK-NEXT: ret 5492 %v = load <64 x i16>, ptr %x 5493 %red = call i16 @llvm.vector.reduce.umax.v64i16(<64 x i16> %v) 5494 ret i16 %red 5495} 5496 5497declare i16 @llvm.vector.reduce.umax.v128i16(<128 x i16>) 5498 5499define i16 @vreduce_umax_v128i16(ptr %x) { 5500; CHECK-LABEL: vreduce_umax_v128i16: 5501; CHECK: # %bb.0: 5502; CHECK-NEXT: li a1, 64 5503; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma 5504; CHECK-NEXT: vle16.v v8, (a0) 5505; CHECK-NEXT: addi a0, a0, 128 5506; CHECK-NEXT: vle16.v v16, (a0) 5507; CHECK-NEXT: vmaxu.vv v8, v8, v16 5508; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5509; CHECK-NEXT: vmv.x.s a0, v8 5510; CHECK-NEXT: ret 5511 %v = load <128 x i16>, ptr %x 5512 %red = call i16 @llvm.vector.reduce.umax.v128i16(<128 x i16> %v) 5513 ret i16 %red 5514} 5515 5516declare i32 @llvm.vector.reduce.umax.v1i32(<1 x i32>) 5517 5518define i32 @vreduce_umax_v1i32(<1 x i32> %v) { 5519; CHECK-LABEL: vreduce_umax_v1i32: 5520; CHECK: # %bb.0: 5521; CHECK-NEXT: vsetivli zero, 1, e32, m1, ta, ma 5522; CHECK-NEXT: vmv.x.s a0, v8 5523; CHECK-NEXT: ret 5524 %red = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %v) 5525 ret i32 %red 5526} 5527 5528declare i32 @llvm.vector.reduce.umax.v2i32(<2 x i32>) 5529 5530define i32 @vreduce_umax_v2i32(ptr %x) { 5531; CHECK-LABEL: vreduce_umax_v2i32: 5532; CHECK: # %bb.0: 5533; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma 5534; CHECK-NEXT: vle32.v v8, (a0) 5535; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5536; CHECK-NEXT: vmv.x.s a0, v8 5537; CHECK-NEXT: ret 5538 %v = load <2 x i32>, ptr %x 5539 %red = call i32 @llvm.vector.reduce.umax.v2i32(<2 x i32> %v) 5540 ret i32 %red 5541} 5542 5543declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) 5544 5545define i32 @vreduce_umax_v4i32(ptr %x) { 5546; CHECK-LABEL: vreduce_umax_v4i32: 5547; CHECK: # %bb.0: 5548; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma 5549; CHECK-NEXT: vle32.v v8, (a0) 5550; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5551; CHECK-NEXT: vmv.x.s a0, v8 5552; CHECK-NEXT: ret 5553 %v = load <4 x i32>, ptr %x 5554 %red = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> %v) 5555 ret i32 %red 5556} 5557 5558declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) 5559 5560define i32 @vreduce_umax_v8i32(ptr %x) { 5561; CHECK-LABEL: vreduce_umax_v8i32: 5562; CHECK: # %bb.0: 5563; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma 5564; CHECK-NEXT: vle32.v v8, (a0) 5565; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5566; CHECK-NEXT: vmv.x.s a0, v8 5567; CHECK-NEXT: ret 5568 %v = load <8 x i32>, ptr %x 5569 %red = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> %v) 5570 ret i32 %red 5571} 5572 5573declare i32 @llvm.vector.reduce.umax.v16i32(<16 x i32>) 5574 5575define i32 @vreduce_umax_v16i32(ptr %x) { 5576; CHECK-LABEL: vreduce_umax_v16i32: 5577; CHECK: # %bb.0: 5578; CHECK-NEXT: vsetivli zero, 16, e32, m4, ta, ma 5579; CHECK-NEXT: vle32.v v8, (a0) 5580; CHECK-NEXT: vredmaxu.vs v8, v8, v8 5581; CHECK-NEXT: vmv.x.s a0, v8 5582; CHECK-NEXT: ret 5583 %v = load <16 x i32>, ptr %x 5584 %red = call i32 @llvm.vector.reduce.umax.v16i32(<16 x i32> %v) 5585 ret i32 %red 5586} 5587 5588declare i32 @llvm.vector.reduce.umax.v32i32(<32 x i32>) 5589 5590define i32 
define i32 @vreduce_umax_v32i32(ptr %x) {
; CHECK-LABEL: vreduce_umax_v32i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.umax.v32i32(<32 x i32> %v)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.umax.v64i32(<64 x i32>)

define i32 @vreduce_umax_v64i32(ptr %x) {
; CHECK-LABEL: vreduce_umax_v64i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle32.v v16, (a0)
; CHECK-NEXT:    vmaxu.vv v8, v8, v16
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.umax.v64i32(<64 x i32> %v)
  ret i32 %red
}

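; Note: for the i64 reductions below, RV32 returns the 64-bit scalar result in
; the a0/a1 register pair, so the RV32 checks additionally extract the upper
; word with a 32-bit vsrl.vx before reading it with vmv.x.s.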
declare i64 @llvm.vector.reduce.umax.v1i64(<1 x i64>)

define i64 @vreduce_umax_v1i64(<1 x i64> %v) {
; RV32-LABEL: vreduce_umax_v1i64:
; RV32:       # %bb.0:
; RV32-NEXT:    li a0, 32
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v9, v8, a0
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_umax_v1i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %red = call i64 @llvm.vector.reduce.umax.v1i64(<1 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>)

define i64 @vreduce_umax_v2i64(ptr %x) {
; RV32-LABEL: vreduce_umax_v2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    li a0, 32
; RV32-NEXT:    vredmaxu.vs v8, v8, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v9, v8, a0
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_umax_v2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vredmaxu.vs v8, v8, v8
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <2 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)

define i64 @vreduce_umax_v4i64(ptr %x) {
; RV32-LABEL: vreduce_umax_v4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vredmaxu.vs v8, v8, v8
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_umax_v4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vredmaxu.vs v8, v8, v8
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <4 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.umax.v8i64(<8 x i64>)

define i64 @vreduce_umax_v8i64(ptr %x) {
; RV32-LABEL: vreduce_umax_v8i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vredmaxu.vs v8, v8, v8
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_umax_v8i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vredmaxu.vs v8, v8, v8
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <8 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.umax.v8i64(<8 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.umax.v16i64(<16 x i64>)

define i64 @vreduce_umax_v16i64(ptr %x) {
; RV32-LABEL: vreduce_umax_v16i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vredmaxu.vs v8, v8, v8
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_umax_v16i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vredmaxu.vs v8, v8, v8
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <16 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.umax.v16i64(<16 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.umax.v32i64(<32 x i64>)

define i64 @vreduce_umax_v32i64(ptr %x) {
; RV32-LABEL: vreduce_umax_v32i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a0, a0, 128
; RV32-NEXT:    vle64.v v16, (a0)
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vmaxu.vv v8, v8, v16
; RV32-NEXT:    vredmaxu.vs v8, v8, v8
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_umax_v32i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    addi a0, a0, 128
; RV64-NEXT:    vle64.v v16, (a0)
; RV64-NEXT:    vmaxu.vv v8, v8, v16
; RV64-NEXT:    vredmaxu.vs v8, v8, v8
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <32 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.umax.v32i64(<32 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.umax.v64i64(<64 x i64>)

define i64 @vreduce_umax_v64i64(ptr %x) nounwind {
; RV32-LABEL: vreduce_umax_v64i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a1, a0, 384
; RV32-NEXT:    vle64.v v16, (a1)
; RV32-NEXT:    addi a1, a0, 256
; RV32-NEXT:    addi a0, a0, 128
; RV32-NEXT:    vle64.v v0, (a0)
; RV32-NEXT:    vle64.v v24, (a1)
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vmaxu.vv v16, v0, v16
; RV32-NEXT:    vmaxu.vv v8, v8, v24
; RV32-NEXT:    vmaxu.vv v8, v8, v16
; RV32-NEXT:    vredmaxu.vs v8, v8, v8
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_umax_v64i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    addi a1, a0, 384
; RV64-NEXT:    vle64.v v16, (a1)
; RV64-NEXT:    addi a1, a0, 256
; RV64-NEXT:    addi a0, a0, 128
; RV64-NEXT:    vle64.v v24, (a0)
; RV64-NEXT:    vle64.v v0, (a1)
; RV64-NEXT:    vmaxu.vv v16, v24, v16
; RV64-NEXT:    vmaxu.vv v8, v8, v0
; RV64-NEXT:    vmaxu.vv v8, v8, v16
; RV64-NEXT:    vredmaxu.vs v8, v8, v8
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <64 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.umax.v64i64(<64 x i64> %v)
  ret i64 %red
}

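; Note: the V extension has no multiply reduction instruction, so the mul
; reductions below are lowered to a log2-depth shuffle-and-multiply tree
; (vslidedown/vrgather + vmul), with the result read from element 0.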
declare i8 @llvm.vector.reduce.mul.v1i8(<1 x i8>)

define i8 @vreduce_mul_v1i8(<1 x i8> %v) {
; CHECK-LABEL: vreduce_mul_v1i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e8, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %red = call i8 @llvm.vector.reduce.mul.v1i8(<1 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>)

define i8 @vreduce_mul_v2i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v2i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    lbu a0, 1(a0)
; CHECK-NEXT:    vmul.vx v8, v8, a0
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v3i8(<3 x i8>)

define i8 @vreduce_mul_v3i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v3i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 3, e8, mf4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vmv.v.i v9, 1
; CHECK-NEXT:    vslideup.vi v8, v9, 3
; CHECK-NEXT:    vsetivli zero, 2, e8, mf4, ta, ma
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 1
; CHECK-NEXT:    vsetivli zero, 1, e8, mf8, ta, ma
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <3 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.mul.v3i8(<3 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>)

define i8 @vreduce_mul_v4i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v4i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vrgather.vi v9, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>)

define i8 @vreduce_mul_v8i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v8i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e8, mf2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v9, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vrgather.vi v9, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>)

define i8 @vreduce_mul_v16i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v16i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v9, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vrgather.vi v9, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v32i8(<32 x i8>)

define i8 @vreduce_mul_v32i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v32i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e8, m2, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v10, v8, 16
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vslidedown.vi v10, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vslidedown.vi v10, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vslidedown.vi v10, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vrgather.vi v10, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.mul.v32i8(<32 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v64i8(<64 x i8>)

define i8 @vreduce_mul_v64i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v64i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e8, m4, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vslidedown.vx v12, v8, a0
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vslidedown.vi v12, v8, 16
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vslidedown.vi v12, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vslidedown.vi v12, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vslidedown.vi v12, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vrgather.vi v12, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.mul.v64i8(<64 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>)

define i8 @vreduce_mul_v128i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v128i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 128
; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    li a0, 64
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 16
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vrgather.vi v16, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <128 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> %v)
  ret i8 %red
}

declare i8 @llvm.vector.reduce.mul.v256i8(<256 x i8>)

define i8 @vreduce_mul_v256i8(ptr %x) {
; CHECK-LABEL: vreduce_mul_v256i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 128
; CHECK-NEXT:    vsetvli zero, a1, e8, m8, ta, ma
; CHECK-NEXT:    vle8.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle8.v v16, (a0)
; CHECK-NEXT:    li a0, 64
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 16
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vrgather.vi v16, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <256 x i8>, ptr %x
  %red = call i8 @llvm.vector.reduce.mul.v256i8(<256 x i8> %v)
  ret i8 %red
}

declare i16 @llvm.vector.reduce.mul.v1i16(<1 x i16>)

define i16 @vreduce_mul_v1i16(<1 x i16> %v) {
; CHECK-LABEL: vreduce_mul_v1i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e16, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %red = call i16 @llvm.vector.reduce.mul.v1i16(<1 x i16> %v)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>)

define i16 @vreduce_mul_v2i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v2i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e16, mf4, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    lh a0, 2(a0)
; CHECK-NEXT:    vmul.vx v8, v8, a0
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> %v)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>)

define i16 @vreduce_mul_v4i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v4i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vrgather.vi v9, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> %v)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>)

define i16 @vreduce_mul_v8i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v8i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v9, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vrgather.vi v9, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> %v)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>)

define i16 @vreduce_mul_v16i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v16i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m2, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v10, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vslidedown.vi v10, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vslidedown.vi v10, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vrgather.vi v10, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> %v)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v32i16(<32 x i16>)

define i16 @vreduce_mul_v32i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v32i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e16, m4, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v12, v8, 16
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vslidedown.vi v12, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vslidedown.vi v12, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vslidedown.vi v12, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vrgather.vi v12, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.mul.v32i16(<32 x i16> %v)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v64i16(<64 x i16>)

define i16 @vreduce_mul_v64i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v64i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 16
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vrgather.vi v16, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.mul.v64i16(<64 x i16> %v)
  ret i16 %red
}

declare i16 @llvm.vector.reduce.mul.v128i16(<128 x i16>)

define i16 @vreduce_mul_v128i16(ptr %x) {
; CHECK-LABEL: vreduce_mul_v128i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 64
; CHECK-NEXT:    vsetvli zero, a1, e16, m8, ta, ma
; CHECK-NEXT:    vle16.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle16.v v16, (a0)
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vslidedown.vx v16, v8, a0
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 16
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vrgather.vi v16, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <128 x i16>, ptr %x
  %red = call i16 @llvm.vector.reduce.mul.v128i16(<128 x i16> %v)
  ret i16 %red
}

declare i32 @llvm.vector.reduce.mul.v1i32(<1 x i32>)

define i32 @vreduce_mul_v1i32(<1 x i32> %v) {
; CHECK-LABEL: vreduce_mul_v1i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %red = call i32 @llvm.vector.reduce.mul.v1i32(<1 x i32> %v)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>)

define i32 @vreduce_mul_v2i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v2i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e32, mf2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    lw a0, 4(a0)
; CHECK-NEXT:    vmul.vx v8, v8, a0
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <2 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> %v)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>)

define i32 @vreduce_mul_v4i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v4i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v9, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vrgather.vi v9, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <4 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> %v)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>)

define i32 @vreduce_mul_v8i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v8i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v10, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vslidedown.vi v10, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vrgather.vi v10, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <8 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> %v)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>)

define i32 @vreduce_mul_v16i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v16i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e32, m4, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v12, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vslidedown.vi v12, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vslidedown.vi v12, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vrgather.vi v12, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v12
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <16 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> %v)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v32i32(<32 x i32>)

define i32 @vreduce_mul_v32i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v32i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    vslidedown.vi v16, v8, 16
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vrgather.vi v16, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <32 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.mul.v32i32(<32 x i32> %v)
  ret i32 %red
}

declare i32 @llvm.vector.reduce.mul.v64i32(<64 x i32>)

define i32 @vreduce_mul_v64i32(ptr %x) {
; CHECK-LABEL: vreduce_mul_v64i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    li a1, 32
; CHECK-NEXT:    vsetvli zero, a1, e32, m8, ta, ma
; CHECK-NEXT:    vle32.v v8, (a0)
; CHECK-NEXT:    addi a0, a0, 128
; CHECK-NEXT:    vle32.v v16, (a0)
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 16
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 8
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 4
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vslidedown.vi v16, v8, 2
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vrgather.vi v16, v8, 1
; CHECK-NEXT:    vmul.vv v8, v8, v16
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %v = load <64 x i32>, ptr %x
  %red = call i32 @llvm.vector.reduce.mul.v64i32(<64 x i32> %v)
  ret i32 %red
}

declare i64 @llvm.vector.reduce.mul.v1i64(<1 x i64>)

define i64 @vreduce_mul_v1i64(<1 x i64> %v) {
; RV32-LABEL: vreduce_mul_v1i64:
; RV32:       # %bb.0:
; RV32-NEXT:    li a0, 32
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v9, v8, a0
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_mul_v1i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %red = call i64 @llvm.vector.reduce.mul.v1i64(<1 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>)

define i64 @vreduce_mul_v2i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v2i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a0, a0, 8
; RV32-NEXT:    vlse64.v v9, (a0), zero
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vmul.vv v8, v8, v9
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_mul_v2i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    ld a0, 8(a0)
; RV64-NEXT:    vmul.vx v8, v8, a0
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <2 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>)

define i64 @vreduce_mul_v4i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v4i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vslidedown.vi v10, v8, 2
; RV32-NEXT:    vmul.vv v8, v8, v10
; RV32-NEXT:    vrgather.vi v10, v8, 1
; RV32-NEXT:    vmul.vv v8, v8, v10
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_mul_v4i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vslidedown.vi v10, v8, 2
; RV64-NEXT:    vmul.vv v8, v8, v10
; RV64-NEXT:    vrgather.vi v10, v8, 1
; RV64-NEXT:    vmul.vv v8, v8, v10
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <4 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>)

define i64 @vreduce_mul_v8i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v8i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vslidedown.vi v12, v8, 4
; RV32-NEXT:    vmul.vv v8, v8, v12
; RV32-NEXT:    vslidedown.vi v12, v8, 2
; RV32-NEXT:    vmul.vv v8, v8, v12
; RV32-NEXT:    vrgather.vi v12, v8, 1
; RV32-NEXT:    vmul.vv v8, v8, v12
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m4, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_mul_v8i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 8, e64, m4, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vslidedown.vi v12, v8, 4
; RV64-NEXT:    vmul.vv v8, v8, v12
; RV64-NEXT:    vslidedown.vi v12, v8, 2
; RV64-NEXT:    vmul.vv v8, v8, v12
; RV64-NEXT:    vrgather.vi v12, v8, 1
; RV64-NEXT:    vmul.vv v8, v8, v12
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <8 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>)

define i64 @vreduce_mul_v16i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v16i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vslidedown.vi v16, v8, 8
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vslidedown.vi v16, v8, 4
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vslidedown.vi v16, v8, 2
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vrgather.vi v16, v8, 1
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m8, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_mul_v16i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    vslidedown.vi v16, v8, 8
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vslidedown.vi v16, v8, 4
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vslidedown.vi v16, v8, 2
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vrgather.vi v16, v8, 1
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <16 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v32i64(<32 x i64>)

define i64 @vreduce_mul_v32i64(ptr %x) {
; RV32-LABEL: vreduce_mul_v32i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a0, a0, 128
; RV32-NEXT:    vle64.v v16, (a0)
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vslidedown.vi v16, v8, 8
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vslidedown.vi v16, v8, 4
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vslidedown.vi v16, v8, 2
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vrgather.vi v16, v8, 1
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vslidedown.vi v8, v8, 1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_mul_v32i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    addi a0, a0, 128
; RV64-NEXT:    vle64.v v16, (a0)
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vslidedown.vi v16, v8, 8
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vslidedown.vi v16, v8, 4
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vslidedown.vi v16, v8, 2
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vrgather.vi v16, v8, 1
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <32 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.mul.v32i64(<32 x i64> %v)
  ret i64 %red
}

declare i64 @llvm.vector.reduce.mul.v64i64(<64 x i64>)

define i64 @vreduce_mul_v64i64(ptr %x) nounwind {
; RV32-LABEL: vreduce_mul_v64i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV32-NEXT:    vle64.v v8, (a0)
; RV32-NEXT:    addi a1, a0, 384
; RV32-NEXT:    vle64.v v16, (a1)
; RV32-NEXT:    addi a1, a0, 256
; RV32-NEXT:    addi a0, a0, 128
; RV32-NEXT:    vle64.v v24, (a0)
; RV32-NEXT:    vle64.v v0, (a1)
; RV32-NEXT:    vmul.vv v16, v24, v16
; RV32-NEXT:    vmul.vv v8, v8, v0
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vslidedown.vi v16, v8, 8
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vslidedown.vi v16, v8, 4
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vslidedown.vi v16, v8, 2
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vrgather.vi v16, v8, 1
; RV32-NEXT:    vmul.vv v8, v8, v16
; RV32-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vslidedown.vi v8, v8, 1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:    ret
;
; RV64-LABEL: vreduce_mul_v64i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 16, e64, m8, ta, ma
; RV64-NEXT:    vle64.v v8, (a0)
; RV64-NEXT:    addi a1, a0, 384
; RV64-NEXT:    vle64.v v16, (a1)
; RV64-NEXT:    addi a1, a0, 256
; RV64-NEXT:    addi a0, a0, 128
; RV64-NEXT:    vle64.v v24, (a0)
; RV64-NEXT:    vle64.v v0, (a1)
; RV64-NEXT:    vmul.vv v16, v24, v16
; RV64-NEXT:    vmul.vv v8, v8, v0
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vslidedown.vi v16, v8, 8
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vslidedown.vi v16, v8, 4
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vslidedown.vi v16, v8, 2
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vrgather.vi v16, v8, 1
; RV64-NEXT:    vmul.vv v8, v8, v16
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    ret
  %v = load <64 x i64>, ptr %x
  %red = call i64 @llvm.vector.reduce.mul.v64i64(<64 x i64> %v)
  ret i64 %red
}