; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv64 -mattr=+d,+zvfh,+v,+zbb -target-abi=lp64d -verify-machineinstrs < %s | FileCheck %s

; These tests cover folding a scalar binary operation on the result of a
; vector reduction into the reduction's scalar start operand, where possible.

define i64 @reduce_add(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_add:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %res = add i64 %rdx, %x
  ret i64 %res
}

define i64 @reduce_add2(<4 x i64> %v) {
; CHECK-LABEL: reduce_add2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; CHECK-NEXT:    vmv.v.i v10, 8
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %res = add i64 %rdx, 8
  ret i64 %res
}

define i64 @reduce_and(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_and:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    and a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)
  %res = and i64 %rdx, %x
  ret i64 %res
}

define i64 @reduce_and2(<4 x i64> %v) {
; CHECK-LABEL: reduce_and2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    andi a0, a0, 8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.and.v4i64(<4 x i64> %v)
  %res = and i64 %rdx, 8
  ret i64 %res
}

define i64 @reduce_or(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_or:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    or a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)
  %res = or i64 %rdx, %x
  ret i64 %res
}

define i64 @reduce_or2(<4 x i64> %v) {
; CHECK-LABEL: reduce_or2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ori a0, a0, 8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> %v)
  %res = or i64 %rdx, 8
  ret i64 %res
}

define i64 @reduce_xor(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_xor:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredxor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
  %res = xor i64 %rdx, %x
  ret i64 %res
}

define i64 @reduce_xor2(<4 x i64> %v) {
; CHECK-LABEL: reduce_xor2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    andi a0, a0, 8
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
  %res = and i64 %rdx, 8
  ret i64 %res
}

define i64 @reduce_umax(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_umax:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    maxu a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.umax.i64(i64 %rdx, i64 %x)
  ret i64 %res
}

define i64 @reduce_umax2(<4 x i64> %v) {
; CHECK-LABEL: reduce_umax2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    li a1, 8
; CHECK-NEXT:    maxu a0, a0, a1
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.umax.i64(i64 %rdx, i64 8)
  ret i64 %res
}

define i64 @reduce_umin(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_umin:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    minu a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.umin.i64(i64 %rdx, i64 %x)
  ret i64 %res
}

define i64 @reduce_umin2(<4 x i64> %v) {
; CHECK-LABEL: reduce_umin2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    li a1, 8
; CHECK-NEXT:    minu a0, a0, a1
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.umin.i64(i64 %rdx, i64 8)
  ret i64 %res
}

define i64 @reduce_smax(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_smax:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    max a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.smax.i64(i64 %rdx, i64 %x)
  ret i64 %res
}

define i64 @reduce_smax2(<4 x i64> %v) {
; CHECK-LABEL: reduce_smax2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    li a1, 8
; CHECK-NEXT:    max a0, a0, a1
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.smax.i64(i64 %rdx, i64 8)
  ret i64 %res
}

define i64 @reduce_smin(i64 %x, <4 x i64> %v) {
; CHECK-LABEL: reduce_smin:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a1, v8
; CHECK-NEXT:    min a0, a1, a0
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.smin.i64(i64 %rdx, i64 %x)
  ret i64 %res
}

define i64 @reduce_smin2(<4 x i64> %v) {
; CHECK-LABEL: reduce_smin2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    li a1, 8
; CHECK-NEXT:    min a0, a0, a1
; CHECK-NEXT:    ret
entry:
  %rdx = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> %v)
  %res = call i64 @llvm.smin.i64(i64 %rdx, i64 8)
  ret i64 %res
}

define float @reduce_fadd(float %x, <4 x float> %v) {
; CHECK-LABEL: reduce_fadd:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfmv.s.f v9, fa0
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float %x, <4 x float> %v)
  ret float %rdx
}

define float @reduce_fadd2(float %x, <4 x float> %v) {
; CHECK-LABEL: reduce_fadd2:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfmv.s.f v9, fa0
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
entry:
  %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> %v)
  %res = fadd fast float %rdx, %x
  ret float %res
}

; The reduction result is also stored, so the scalar fadd is kept separate.
define float @reduce_fadd3(float %x, <4 x float> %v, ptr %rdxptr) {
; CHECK-LABEL: reduce_fadd3:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    fadd.s fa0, fa5, fa0
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vse32.v v8, (a0)
; CHECK-NEXT:    ret
entry:
  %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %v)
  %res = fadd fast float %rdx, %x
  store float %rdx, ptr %rdxptr
  ret float %res
}

define float @reduce_fadd4(float %x, float %y, <4 x float> %v, <4 x float> %w) {
; CHECK-LABEL: reduce_fadd4:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfmv.s.f v10, fa0
; CHECK-NEXT:    vfredusum.vs v8, v8, v10
; CHECK-NEXT:    vfmv.s.f v10, fa1
; CHECK-NEXT:    vfredusum.vs v9, v9, v10
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    fdiv.s fa0, fa5, fa4
; CHECK-NEXT:    ret
entry:
  %rdx = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %v)
  %rdx2 = call fast float @llvm.vector.reduce.fadd.v4f32(float -0.0, <4 x float> %w)
  %res = fadd fast float %rdx, %x
  %res2 = fadd fast float %rdx2, %y
  %div = fdiv fast float %res, %res2
  ret float %div
}

define float @reduce_fmax(float %x, <4 x float> %v) {
; CHECK-LABEL: reduce_fmax:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfredmax.vs v8, v8, v8
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    fmax.s fa0, fa0, fa5
; CHECK-NEXT:    ret
entry:
  %rdx = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> %v)
  %res = call float @llvm.maxnum.f32(float %x, float %rdx)
  ret float %res
}

define float @reduce_fmin(float %x, <4 x float> %v) {
; CHECK-LABEL: reduce_fmin:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfredmin.vs v8, v8, v8
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    fmin.s fa0, fa0, fa5
; CHECK-NEXT:    ret
entry:
  %rdx = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> %v)
  %res = call float @llvm.minnum.f32(float %x, float %rdx)
  ret float %res
}

; Function Attrs: nofree nosync nounwind readnone willreturn
declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.and.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.or.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.xor.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>)
declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>)
declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare i64 @llvm.umax.i64(i64, i64)
declare i64 @llvm.umin.i64(i64, i64)
declare i64 @llvm.smax.i64(i64, i64)
declare i64 @llvm.smin.i64(i64, i64)
declare float @llvm.maxnum.f32(float, float)
declare float @llvm.minnum.f32(float, float)

; Reduced regression test for a crash.
define void @crash(<2 x i32> %0) {
; CHECK-LABEL: crash:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
; CHECK-NEXT:    vmv.v.i v8, 0
; CHECK-NEXT:    vmv.s.x v9, a0
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    sb a0, 0(zero)
; CHECK-NEXT:    ret
entry:
  %1 = extractelement <2 x i32> %0, i64 0
  %2 = tail call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> zeroinitializer)
  %3 = zext i16 %2 to i32
  %op.rdx = add i32 %1, %3
  %conv18.us = trunc i32 %op.rdx to i8
  store i8 %conv18.us, ptr null, align 1
  ret void
}
declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

define i64 @op_then_reduce(<4 x i64> %v, <4 x i64> %v2) {
; CHECK-LABEL: op_then_reduce:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vadd.vv v8, v8, v10
; CHECK-NEXT:    vmv.s.x v10, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %rdx2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v2)
  %res = add i64 %rdx1, %rdx2
  ret i64 %res
}

; The first reduction's result stays in a vector register and feeds the
; second reduction's start operand directly.
define i64 @two_reduce_scalar_bypass(<4 x i64> %v, <4 x i64> %v2) {
; CHECK-LABEL: two_reduce_scalar_bypass:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v12, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v12
; CHECK-NEXT:    vredsum.vs v8, v10, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx1 = call i64 @llvm.vector.reduce.xor.v4i64(<4 x i64> %v)
  %rdx2 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v2)
  %res = add i64 %rdx1, %rdx2
  ret i64 %res
}

define i64 @two_reduce_scalar_bypass_zext(<4 x i64> %v, <4 x i32> %v2) {
; CHECK-LABEL: two_reduce_scalar_bypass_zext:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v11, zero
; CHECK-NEXT:    vredsum.vs v10, v10, v11
; CHECK-NEXT:    vmv.x.s a0, v10
; CHECK-NEXT:    slli a0, a0, 32
; CHECK-NEXT:    srli a0, a0, 32
; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %rdx2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v2)
  %rdx2.zext = zext i32 %rdx2 to i64
  %res = add i64 %rdx1, %rdx2.zext
  ret i64 %res
}

define i64 @two_reduce_scalar_bypass_sext(<4 x i64> %v, <4 x i32> %v2) {
; CHECK-LABEL: two_reduce_scalar_bypass_sext:
; CHECK:       # %bb.0: # %entry
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v11, zero
; CHECK-NEXT:    vredsum.vs v10, v10, v11
; CHECK-NEXT:    vmv.x.s a0, v10
; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v10, a0
; CHECK-NEXT:    vredsum.vs v8, v8, v10
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
entry:
  %rdx1 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %v)
  %rdx2 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %v2)
  %rdx2.sext = sext i32 %rdx2 to i64
  %res = add i64 %rdx1, %rdx2.sext
  ret i64 %res
}

declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>)