; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
; RUN: llc < %s -mtriple=riscv32 -mattr=+v,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV32
; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zvfh -verify-machineinstrs | FileCheck %s -check-prefixes=CHECK,RV64

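; The tests below exercise lowering of the llvm.experimental.vector.extract.last.active
; intrinsic. In outline, the checked sequence builds a masked step vector with vid.v,
; reduces it with vredmaxu.vs to find the index of the last active lane, masks that
; index to 255 with andi, and extracts the element via vslidedown.vx plus vmv.x.s
; (vfmv.f.s for floating-point results). vcpop.m feeding beqz skips the extraction and
; leaves the passthru value in the result register when no lane is active. On RV32,
; i64 results are additionally split into two GPRs using vsrl.vx by 32.
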
define i8 @extract_last_i8(<16 x i8> %data, <16 x i8> %mask, i8 %passthru) {
; CHECK-LABEL: extract_last_i8:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 16, e8, m1, ta, mu
; CHECK-NEXT:    vmsne.vi v0, v9, 0
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vcpop.m a1, v0
; CHECK-NEXT:    vid.v v9, v0.t
; CHECK-NEXT:    beqz a1, .LBB0_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
; CHECK-NEXT:    vmv.x.s a0, v9
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:  .LBB0_2:
; CHECK-NEXT:    ret
  %notzero = icmp ne <16 x i8> %mask, zeroinitializer
  %res = call i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8> %data, <16 x i1> %notzero, i8 %passthru)
  ret i8 %res
}

define i16 @extract_last_i16(<8 x i16> %data, <8 x i16> %mask, i16 %passthru) {
; CHECK-LABEL: extract_last_i16:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 8, e16, m1, ta, ma
; CHECK-NEXT:    vmsne.vi v0, v9, 0
; CHECK-NEXT:    vsetvli zero, zero, e8, mf2, ta, mu
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vcpop.m a1, v0
; CHECK-NEXT:    vid.v v9, v0.t
; CHECK-NEXT:    beqz a1, .LBB1_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
; CHECK-NEXT:    vmv.x.s a0, v9
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vsetvli zero, zero, e16, m1, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:  .LBB1_2:
; CHECK-NEXT:    ret
  %notzero = icmp ne <8 x i16> %mask, zeroinitializer
  %res = call i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16> %data, <8 x i1> %notzero, i16 %passthru)
  ret i16 %res
}

define i32 @extract_last_i32(<4 x i32> %data, <4 x i32> %mask, i32 %passthru) {
; CHECK-LABEL: extract_last_i32:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmsne.vi v0, v9, 0
; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, mu
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vcpop.m a1, v0
; CHECK-NEXT:    vid.v v9, v0.t
; CHECK-NEXT:    beqz a1, .LBB2_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
; CHECK-NEXT:    vmv.x.s a0, v9
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:  .LBB2_2:
; CHECK-NEXT:    ret
  %notzero = icmp ne <4 x i32> %mask, zeroinitializer
  %res = call i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32> %data, <4 x i1> %notzero, i32 %passthru)
  ret i32 %res
}

define i64 @extract_last_i64(<2 x i64> %data, <2 x i64> %mask, i64 %passthru) {
; RV32-LABEL: extract_last_i64:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT:    vmsne.vi v0, v9, 0
; RV32-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
; RV32-NEXT:    vmv.v.i v9, 0
; RV32-NEXT:    vcpop.m a2, v0
; RV32-NEXT:    vid.v v9, v0.t
; RV32-NEXT:    beqz a2, .LBB3_2
; RV32-NEXT:  # %bb.1:
; RV32-NEXT:    vredmaxu.vs v9, v9, v9
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vmv.x.s a0, v9
; RV32-NEXT:    andi a0, a0, 255
; RV32-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
; RV32-NEXT:    vslidedown.vx v8, v8, a0
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m1, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:  .LBB3_2:
; RV32-NEXT:    ret
;
; RV64-LABEL: extract_last_i64:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT:    vmsne.vi v0, v9, 0
; RV64-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
; RV64-NEXT:    vmv.v.i v9, 0
; RV64-NEXT:    vcpop.m a1, v0
; RV64-NEXT:    vid.v v9, v0.t
; RV64-NEXT:    beqz a1, .LBB3_2
; RV64-NEXT:  # %bb.1:
; RV64-NEXT:    vredmaxu.vs v9, v9, v9
; RV64-NEXT:    vmv.x.s a0, v9
; RV64-NEXT:    andi a0, a0, 255
; RV64-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
; RV64-NEXT:    vslidedown.vx v8, v8, a0
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:  .LBB3_2:
; RV64-NEXT:    ret
  %notzero = icmp ne <2 x i64> %mask, zeroinitializer
  %res = call i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64> %data, <2 x i1> %notzero, i64 %passthru)
  ret i64 %res
}

define float @extract_last_float(<4 x float> %data, <4 x i32> %mask, float %passthru) {
; CHECK-LABEL: extract_last_float:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; CHECK-NEXT:    vmsne.vi v0, v9, 0
; CHECK-NEXT:    vsetvli zero, zero, e8, mf4, ta, mu
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vcpop.m a0, v0
; CHECK-NEXT:    vid.v v9, v0.t
; CHECK-NEXT:    beqz a0, .LBB4_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
; CHECK-NEXT:    vmv.x.s a0, v9
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vsetvli zero, zero, e32, m1, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:  .LBB4_2:
; CHECK-NEXT:    ret
  %notzero = icmp ne <4 x i32> %mask, zeroinitializer
  %res = call float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float> %data, <4 x i1> %notzero, float %passthru)
  ret float %res
}

define double @extract_last_double(<2 x double> %data, <2 x i64> %mask, double %passthru) {
; CHECK-LABEL: extract_last_double:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
; CHECK-NEXT:    vmsne.vi v0, v9, 0
; CHECK-NEXT:    vsetvli zero, zero, e8, mf8, ta, mu
; CHECK-NEXT:    vmv.v.i v9, 0
; CHECK-NEXT:    vcpop.m a0, v0
; CHECK-NEXT:    vid.v v9, v0.t
; CHECK-NEXT:    beqz a0, .LBB5_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v9, v9, v9
; CHECK-NEXT:    vmv.x.s a0, v9
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vsetvli zero, zero, e64, m1, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:  .LBB5_2:
; CHECK-NEXT:    ret
  %notzero = icmp ne <2 x i64> %mask, zeroinitializer
  %res = call double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double> %data, <2 x i1> %notzero, double %passthru)
  ret double %res
}

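; The scalable-vector variants below pass an i1 mask directly (in v0), so no vmsne.vi
; compare is needed; the index vector is still built at e8 under that mask and masked
; to 255 with andi, while the final vslidedown.vx and scalar extract operate at LMUL=2.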
define i8 @extract_last_i8_scalable(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru) {
; CHECK-LABEL: extract_last_i8_scalable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a1, zero, e8, m2, ta, mu
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vcpop.m a1, v0
; CHECK-NEXT:    vid.v v10, v0.t
; CHECK-NEXT:    beqz a1, .LBB6_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
; CHECK-NEXT:    vmv.x.s a0, v10
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vsetvli zero, zero, e8, m2, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:  .LBB6_2:
; CHECK-NEXT:    ret
  %res = call i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8> %data, <vscale x 16 x i1> %mask, i8 %passthru)
  ret i8 %res
}

define i16 @extract_last_i16_scalable(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru) {
; CHECK-LABEL: extract_last_i16_scalable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a1, zero, e8, m1, ta, mu
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vcpop.m a1, v0
; CHECK-NEXT:    vid.v v10, v0.t
; CHECK-NEXT:    beqz a1, .LBB7_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
; CHECK-NEXT:    vmv.x.s a0, v10
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vsetvli zero, zero, e16, m2, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:  .LBB7_2:
; CHECK-NEXT:    ret
  %res = call i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16> %data, <vscale x 8 x i1> %mask, i16 %passthru)
  ret i16 %res
}

define i32 @extract_last_i32_scalable(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru) {
; CHECK-LABEL: extract_last_i32_scalable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a1, zero, e8, mf2, ta, mu
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vcpop.m a1, v0
; CHECK-NEXT:    vid.v v10, v0.t
; CHECK-NEXT:    beqz a1, .LBB8_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
; CHECK-NEXT:    vmv.x.s a0, v10
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vmv.x.s a0, v8
; CHECK-NEXT:  .LBB8_2:
; CHECK-NEXT:    ret
  %res = call i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32> %data, <vscale x 4 x i1> %mask, i32 %passthru)
  ret i32 %res
}

define i64 @extract_last_i64_scalable(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru) {
; RV32-LABEL: extract_last_i64_scalable:
; RV32:       # %bb.0:
; RV32-NEXT:    vsetvli a2, zero, e8, mf4, ta, mu
; RV32-NEXT:    vmv.v.i v10, 0
; RV32-NEXT:    vcpop.m a2, v0
; RV32-NEXT:    vid.v v10, v0.t
; RV32-NEXT:    beqz a2, .LBB9_2
; RV32-NEXT:  # %bb.1:
; RV32-NEXT:    vredmaxu.vs v10, v10, v10
; RV32-NEXT:    li a1, 32
; RV32-NEXT:    vmv.x.s a0, v10
; RV32-NEXT:    andi a0, a0, 255
; RV32-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; RV32-NEXT:    vslidedown.vx v8, v8, a0
; RV32-NEXT:    vmv.x.s a0, v8
; RV32-NEXT:    vsetivli zero, 1, e64, m2, ta, ma
; RV32-NEXT:    vsrl.vx v8, v8, a1
; RV32-NEXT:    vmv.x.s a1, v8
; RV32-NEXT:  .LBB9_2:
; RV32-NEXT:    ret
;
; RV64-LABEL: extract_last_i64_scalable:
; RV64:       # %bb.0:
; RV64-NEXT:    vsetvli a1, zero, e8, mf4, ta, mu
; RV64-NEXT:    vmv.v.i v10, 0
; RV64-NEXT:    vcpop.m a1, v0
; RV64-NEXT:    vid.v v10, v0.t
; RV64-NEXT:    beqz a1, .LBB9_2
; RV64-NEXT:  # %bb.1:
; RV64-NEXT:    vredmaxu.vs v10, v10, v10
; RV64-NEXT:    vmv.x.s a0, v10
; RV64-NEXT:    andi a0, a0, 255
; RV64-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; RV64-NEXT:    vslidedown.vx v8, v8, a0
; RV64-NEXT:    vmv.x.s a0, v8
; RV64-NEXT:  .LBB9_2:
; RV64-NEXT:    ret
  %res = call i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64> %data, <vscale x 2 x i1> %mask, i64 %passthru)
  ret i64 %res
}

define float @extract_last_float_scalable(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru) {
; CHECK-LABEL: extract_last_float_scalable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, mf2, ta, mu
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vcpop.m a0, v0
; CHECK-NEXT:    vid.v v10, v0.t
; CHECK-NEXT:    beqz a0, .LBB10_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
; CHECK-NEXT:    vmv.x.s a0, v10
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vsetvli zero, zero, e32, m2, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:  .LBB10_2:
; CHECK-NEXT:    ret
  %res = call float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float> %data, <vscale x 4 x i1> %mask, float %passthru)
  ret float %res
}

define double @extract_last_double_scalable(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru) {
; CHECK-LABEL: extract_last_double_scalable:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vsetvli a0, zero, e8, mf4, ta, mu
; CHECK-NEXT:    vmv.v.i v10, 0
; CHECK-NEXT:    vcpop.m a0, v0
; CHECK-NEXT:    vid.v v10, v0.t
; CHECK-NEXT:    beqz a0, .LBB11_2
; CHECK-NEXT:  # %bb.1:
; CHECK-NEXT:    vredmaxu.vs v10, v10, v10
; CHECK-NEXT:    vmv.x.s a0, v10
; CHECK-NEXT:    andi a0, a0, 255
; CHECK-NEXT:    vsetvli zero, zero, e64, m2, ta, ma
; CHECK-NEXT:    vslidedown.vx v8, v8, a0
; CHECK-NEXT:    vfmv.f.s fa0, v8
; CHECK-NEXT:  .LBB11_2:
; CHECK-NEXT:    ret
  %res = call double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double> %data, <vscale x 2 x i1> %mask, double %passthru)
  ret double %res
}

declare i8 @llvm.experimental.vector.extract.last.active.v16i8(<16 x i8>, <16 x i1>, i8)
declare i16 @llvm.experimental.vector.extract.last.active.v8i16(<8 x i16>, <8 x i1>, i16)
declare i32 @llvm.experimental.vector.extract.last.active.v4i32(<4 x i32>, <4 x i1>, i32)
declare i64 @llvm.experimental.vector.extract.last.active.v2i64(<2 x i64>, <2 x i1>, i64)
declare float @llvm.experimental.vector.extract.last.active.v4f32(<4 x float>, <4 x i1>, float)
declare double @llvm.experimental.vector.extract.last.active.v2f64(<2 x double>, <2 x i1>, double)
declare i8 @llvm.experimental.vector.extract.last.active.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
declare i16 @llvm.experimental.vector.extract.last.active.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
declare i32 @llvm.experimental.vector.extract.last.active.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, i32)
declare i64 @llvm.experimental.vector.extract.last.active.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, i64)
declare float @llvm.experimental.vector.extract.last.active.nxv4f32(<vscale x 4 x float>, <vscale x 4 x i1>, float)
declare double @llvm.experimental.vector.extract.last.active.nxv2f64(<vscale x 2 x double>, <vscale x 2 x i1>, double)