; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \
; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV32
; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
; RUN:   -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,RV64

define float @add_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: add_f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfadd.vv v8, v8, v9
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vfredusum.vs v8, v8, v9
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %b)
  %r = fadd fast float %r1, %r2
  ret float %r
}

define float @fmul_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmul_f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vslidedown.vi v10, v8, 2
; CHECK-NEXT:    vfmul.vv v8, v8, v10
; CHECK-NEXT:    vslidedown.vi v10, v9, 2
; CHECK-NEXT:    vfmul.vv v9, v9, v10
; CHECK-NEXT:    vrgather.vi v10, v8, 1
; CHECK-NEXT:    vfmul.vv v8, v8, v10
; CHECK-NEXT:    vrgather.vi v10, v9, 1
; CHECK-NEXT:    vfmul.vv v9, v9, v10
; CHECK-NEXT:    vfmv.f.s fa5, v8
; CHECK-NEXT:    vfmv.f.s fa4, v9
; CHECK-NEXT:    fmul.s fa0, fa5, fa4
; CHECK-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %b)
  %r = fmul fast float %r1, %r2
  ret float %r
}

define float @fmin_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmin_f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfmin.vv v8, v8, v9
; CHECK-NEXT:    vfredmin.vs v8, v8, v8
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> %b)
  %r = call float @llvm.minnum.f32(float %r1, float %r2)
  ret float %r
}

define float @fmax_f32(<4 x float> %a, <4 x float> %b) {
; CHECK-LABEL: fmax_f32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vfmax.vv v8, v8, v9
; CHECK-NEXT:    vfredmax.vs v8, v8, v8
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:    ret
  %r1 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %a)
  %r2 = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %b)
  %r = call float @llvm.maxnum.f32(float %r1, float %r2)
  ret float %r
}


define i32 @add_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: add_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vadd.vv v8, v8, v9
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredsum.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32> %b)
  %r = add i32 %r1, %r2
  ret i32 %r
}

define i16 @add_ext_i16(<16 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: add_ext_i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vwaddu.vv v10, v8, v9
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vmv.s.x v8, zero
; CHECK-NEXT:    vredsum.vs v8, v10, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %ae = zext <16 x i8> %a to <16 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

define i16 @add_ext_v32i16(<32 x i8> %a, <16 x i8> %b) {
; CHECK-LABEL: add_ext_v32i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e16, m1, ta, ma
; CHECK-NEXT:    vmv.s.x v11, zero
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, ma
; CHECK-NEXT:    vwredsumu.vs v10, v10, v11
; CHECK-NEXT:    li a0, 32
; CHECK-NEXT:    vsetvli zero, a0, e8, m2, ta, ma
; CHECK-NEXT:    vwredsumu.vs v8, v8, v10
; CHECK-NEXT:    vsetvli zero, zero, e16, m4, ta, ma
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %ae = zext <32 x i8> %a to <32 x i16>
  %be = zext <16 x i8> %b to <16 x i16>
  %r1 = call i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16> %ae)
  %r2 = call i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16> %be)
  %r = add i16 %r1, %r2
  ret i16 %r
}

define i32 @mul_i32(<4 x i32> %a, <4 x i32> %b) {
; RV32-LABEL: mul_i32:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV32-NEXT:    vslidedown.vi v10, v8, 2
; RV32-NEXT:    vmul.vv v8, v8, v10
; RV32-NEXT:    vslidedown.vi v10, v9, 2
; RV32-NEXT:    vmul.vv v9, v9, v10
; RV32-NEXT:    vrgather.vi v10, v8, 1
; RV32-NEXT:    vmul.vv v8, v8, v10
; RV32-NEXT:    vrgather.vi v10, v9, 1
; RV32-NEXT:    vmul.vv v9, v9, v10
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vmv.x.s a1, v9
; RV32-NEXT:    mul a0, a0, a1
; RV32-NEXT:    ret
;
; RV64-LABEL: mul_i32:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; RV64-NEXT:    vslidedown.vi v10, v8, 2
; RV64-NEXT:    vmul.vv v8, v8, v10
; RV64-NEXT:    vslidedown.vi v10, v9, 2
; RV64-NEXT:    vmul.vv v9, v9, v10
; RV64-NEXT:    vrgather.vi v10, v8, 1
; RV64-NEXT:    vmul.vv v8, v8, v10
; RV64-NEXT:    vrgather.vi v10, v9, 1
; RV64-NEXT:    vmul.vv v9, v9, v10
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:    vmv.x.s a1, v9
; RV64-NEXT:    mulw a0, a0, a1
; RV64-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32> %b)
  %r = mul i32 %r1, %r2
  ret i32 %r
}

define i32 @and_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: and_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vand.vv v8, v8, v9
; CHECK-NEXT:    vredand.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32> %b)
  %r = and i32 %r1, %r2
  ret i32 %r
}

define i32 @or_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: or_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vor.vv v8, v8, v9
; CHECK-NEXT:    vredor.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32> %b)
  %r = or i32 %r1, %r2
  ret i32 %r
}

define i32 @xor_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: xor_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vxor.vv v8, v8, v9
; CHECK-NEXT:    vmv.s.x v9, zero
; CHECK-NEXT:    vredxor.vs v8, v8, v9
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32> %b)
  %r = xor i32 %r1, %r2
  ret i32 %r
}

define i32 @umin_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umin_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vminu.vv v8, v8, v9
; CHECK-NEXT:    vredminu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @umax_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: umax_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmaxu.vv v8, v8, v9
; CHECK-NEXT:    vredmaxu.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.umax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smin_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smin_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmin.vv v8, v8, v9
; CHECK-NEXT:    vredmin.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smin.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

define i32 @smax_i32(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: smax_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmax.vv v8, v8, v9
; CHECK-NEXT:    vredmax.vs v8, v8, v8
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:    ret
  %r1 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
  %r2 = call i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32> %b)
  %r = call i32 @llvm.smax.i32(i32 %r1, i32 %r2)
  ret i32 %r
}

declare float @llvm.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>)
declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>)
declare i32 @llvm.vector.reduce.add.i32.v4i32(<4 x i32>)
declare i16 @llvm.vector.reduce.add.i16.v32i16(<32 x i16>)
declare i16 @llvm.vector.reduce.add.i16.v16i16(<16 x i16>)
declare i32 @llvm.vector.reduce.mul.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.and.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.or.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.xor.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.umax.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smin.i32.v4i32(<4 x i32>)
declare i32 @llvm.vector.reduce.smax.i32.v4i32(<4 x i32>)
declare float @llvm.minnum.f32(float, float)
declare float @llvm.maxnum.f32(float, float)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)