; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+f -target-abi=lp64f \
; RUN:     -mattr=+no-sink-splat-operands -riscv-v-vector-bits-min=128 \
; RUN:     | FileCheck -check-prefix=NO-SINK %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+f -target-abi=lp64f \
; RUN:     -mattr=-no-sink-splat-operands -riscv-v-vector-bits-min=128 \
; RUN:     | FileCheck -check-prefix=SINK %s
; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+v,+f -target-abi=lp64f \
; RUN:     -riscv-v-vector-bits-min=128 \
; RUN:     | FileCheck -check-prefix=DEFAULT %s

; Test that we don't sink splat operands when compiling with
; no-sink-splat-operands. Each scalar register access requires an S2V transfer
; buffer entry. Using too many limits performance.
; FIXME: This is potentially bad for register pressure. Need a better heuristic.
;
; Three configurations are exercised: the feature explicitly enabled (NO-SINK
; prefix), explicitly disabled (SINK prefix), and left unspecified (DEFAULT
; prefix). In every function below the DEFAULT expectations are the same as the
; SINK ones.
;
; In each function the splat is built outside the vector loop and consumed by
; an arithmetic operation inside it. With sinking the expected code uses the
; scalar-operand form (.vx / .vf) in the loop; without sinking the splat is
; materialized once before the loop (vmv.v.x / vfmv.v.f) and the loop uses the
; vector-vector form (.vv).

; Fixed-length <4 x i32> add of a splat of %x over 1024 elements, four per
; iteration.
define void @sink_splat_add(ptr nocapture %a, i32 signext %x) {
; NO-SINK-LABEL: sink_splat_add:
; NO-SINK:       # %bb.0: # %entry
; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; NO-SINK-NEXT:    vmv.v.x v8, a1
; NO-SINK-NEXT:    lui a1, 1
; NO-SINK-NEXT:    add a1, a0, a1
; NO-SINK-NEXT:  .LBB0_1: # %vector.body
; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; NO-SINK-NEXT:    vle32.v v9, (a0)
; NO-SINK-NEXT:    vadd.vv v9, v9, v8
; NO-SINK-NEXT:    vse32.v v9, (a0)
; NO-SINK-NEXT:    addi a0, a0, 16
; NO-SINK-NEXT:    bne a0, a1, .LBB0_1
; NO-SINK-NEXT:  # %bb.2: # %for.cond.cleanup
; NO-SINK-NEXT:    ret
;
; SINK-LABEL: sink_splat_add:
; SINK:       # %bb.0: # %entry
; SINK-NEXT:    lui a2, 1
; SINK-NEXT:    add a2, a0, a2
; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; SINK-NEXT:  .LBB0_1: # %vector.body
; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; SINK-NEXT:    vle32.v v8, (a0)
; SINK-NEXT:    vadd.vx v8, v8, a1
; SINK-NEXT:    vse32.v v8, (a0)
; SINK-NEXT:    addi a0, a0, 16
; SINK-NEXT:    bne a0, a2, .LBB0_1
; SINK-NEXT:  # %bb.2: # %for.cond.cleanup
; SINK-NEXT:    ret
;
; DEFAULT-LABEL: sink_splat_add:
; DEFAULT:       # %bb.0: # %entry
; DEFAULT-NEXT:    lui a2, 1
; DEFAULT-NEXT:    add a2, a0, a2
; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; DEFAULT-NEXT:  .LBB0_1: # %vector.body
; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
; DEFAULT-NEXT:    vle32.v v8, (a0)
; DEFAULT-NEXT:    vadd.vx v8, v8, a1
; DEFAULT-NEXT:    vse32.v v8, (a0)
; DEFAULT-NEXT:    addi a0, a0, 16
; DEFAULT-NEXT:    bne a0, a2, .LBB0_1
; DEFAULT-NEXT:  # %bb.2: # %for.cond.cleanup
; DEFAULT-NEXT:    ret
entry:
  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %a, i64 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = add <4 x i32> %wide.load, %broadcast.splat
  store <4 x i32> %1, ptr %0, align 4
  %index.next = add nuw i64 %index, 4
  %2 = icmp eq i64 %index.next, 1024
  br i1 %2, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

declare i64 @llvm.vscale.i64()

; Scalable <vscale x 4 x i32> variant: the splat is defined in vector.ph and
; used by the add in vector.body. A scalar remainder loop (for.body) handles
; the elements not covered by the vector loop.
define void @sink_splat_add_scalable(ptr nocapture %a, i32 signext %x) {
; NO-SINK-LABEL: sink_splat_add_scalable:
; NO-SINK:       # %bb.0: # %entry
; NO-SINK-NEXT:    csrr a5, vlenb
; NO-SINK-NEXT:    srli a3, a5, 1
; NO-SINK-NEXT:    li a2, 1024
; NO-SINK-NEXT:    bgeu a2, a3, .LBB1_2
; NO-SINK-NEXT:  # %bb.1:
; NO-SINK-NEXT:    li a2, 0
; NO-SINK-NEXT:    j .LBB1_5
; NO-SINK-NEXT:  .LBB1_2: # %vector.ph
; NO-SINK-NEXT:    addi a2, a3, -1
; NO-SINK-NEXT:    andi a4, a2, 1024
; NO-SINK-NEXT:    xori a2, a4, 1024
; NO-SINK-NEXT:    vsetvli a6, zero, e32, m2, ta, ma
; NO-SINK-NEXT:    vmv.v.x v8, a1
; NO-SINK-NEXT:    slli a5, a5, 1
; NO-SINK-NEXT:    mv a6, a0
; NO-SINK-NEXT:    mv a7, a2
; NO-SINK-NEXT:  .LBB1_3: # %vector.body
; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; NO-SINK-NEXT:    vl2re32.v v10, (a6)
; NO-SINK-NEXT:    sub a7, a7, a3
; NO-SINK-NEXT:    vadd.vv v10, v10, v8
; NO-SINK-NEXT:    vs2r.v v10, (a6)
; NO-SINK-NEXT:    add a6, a6, a5
; NO-SINK-NEXT:    bnez a7, .LBB1_3
; NO-SINK-NEXT:  # %bb.4: # %middle.block
; NO-SINK-NEXT:    beqz a4, .LBB1_7
; NO-SINK-NEXT:  .LBB1_5: # %for.body.preheader
; NO-SINK-NEXT:    slli a2, a2, 2
; NO-SINK-NEXT:    lui a3, 1
; NO-SINK-NEXT:    add a2, a0, a2
; NO-SINK-NEXT:    add a0, a0, a3
; NO-SINK-NEXT:  .LBB1_6: # %for.body
; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; NO-SINK-NEXT:    lw a3, 0(a2)
; NO-SINK-NEXT:    add a3, a3, a1
; NO-SINK-NEXT:    sw a3, 0(a2)
; NO-SINK-NEXT:    addi a2, a2, 4
; NO-SINK-NEXT:    bne a2, a0, .LBB1_6
; NO-SINK-NEXT:  .LBB1_7: # %for.cond.cleanup
; NO-SINK-NEXT:    ret
;
; SINK-LABEL: sink_splat_add_scalable:
; SINK:       # %bb.0: # %entry
; SINK-NEXT:    csrr a5, vlenb
; SINK-NEXT:    srli a3, a5, 1
; SINK-NEXT:    li a2, 1024
; SINK-NEXT:    bgeu a2, a3, .LBB1_2
; SINK-NEXT:  # %bb.1:
; SINK-NEXT:    li a2, 0
; SINK-NEXT:    j .LBB1_5
; SINK-NEXT:  .LBB1_2: # %vector.ph
; SINK-NEXT:    addi a2, a3, -1
; SINK-NEXT:    andi a4, a2, 1024
; SINK-NEXT:    xori a2, a4, 1024
; SINK-NEXT:    slli a5, a5, 1
; SINK-NEXT:    mv a6, a0
; SINK-NEXT:    mv a7, a2
; SINK-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
; SINK-NEXT:  .LBB1_3: # %vector.body
; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; SINK-NEXT:    vl2re32.v v8, (a6)
; SINK-NEXT:    sub a7, a7, a3
; SINK-NEXT:    vadd.vx v8, v8, a1
; SINK-NEXT:    vs2r.v v8, (a6)
; SINK-NEXT:    add a6, a6, a5
; SINK-NEXT:    bnez a7, .LBB1_3
; SINK-NEXT:  # %bb.4: # %middle.block
; SINK-NEXT:    beqz a4, .LBB1_7
; SINK-NEXT:  .LBB1_5: # %for.body.preheader
; SINK-NEXT:    slli a2, a2, 2
; SINK-NEXT:    lui a3, 1
; SINK-NEXT:    add a2, a0, a2
; SINK-NEXT:    add a0, a0, a3
; SINK-NEXT:  .LBB1_6: # %for.body
; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; SINK-NEXT:    lw a3, 0(a2)
; SINK-NEXT:    add a3, a3, a1
; SINK-NEXT:    sw a3, 0(a2)
; SINK-NEXT:    addi a2, a2, 4
; SINK-NEXT:    bne a2, a0, .LBB1_6
; SINK-NEXT:  .LBB1_7: # %for.cond.cleanup
; SINK-NEXT:    ret
;
; DEFAULT-LABEL: sink_splat_add_scalable:
; DEFAULT:       # %bb.0: # %entry
; DEFAULT-NEXT:    csrr a5, vlenb
; DEFAULT-NEXT:    srli a3, a5, 1
; DEFAULT-NEXT:    li a2, 1024
; DEFAULT-NEXT:    bgeu a2, a3, .LBB1_2
; DEFAULT-NEXT:  # %bb.1:
; DEFAULT-NEXT:    li a2, 0
; DEFAULT-NEXT:    j .LBB1_5
; DEFAULT-NEXT:  .LBB1_2: # %vector.ph
; DEFAULT-NEXT:    addi a2, a3, -1
; DEFAULT-NEXT:    andi a4, a2, 1024
; DEFAULT-NEXT:    xori a2, a4, 1024
; DEFAULT-NEXT:    slli a5, a5, 1
; DEFAULT-NEXT:    mv a6, a0
; DEFAULT-NEXT:    mv a7, a2
; DEFAULT-NEXT:    vsetvli t0, zero, e32, m2, ta, ma
; DEFAULT-NEXT:  .LBB1_3: # %vector.body
; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
; DEFAULT-NEXT:    vl2re32.v v8, (a6)
; DEFAULT-NEXT:    sub a7, a7, a3
; DEFAULT-NEXT:    vadd.vx v8, v8, a1
; DEFAULT-NEXT:    vs2r.v v8, (a6)
; DEFAULT-NEXT:    add a6, a6, a5
; DEFAULT-NEXT:    bnez a7, .LBB1_3
; DEFAULT-NEXT:  # %bb.4: # %middle.block
; DEFAULT-NEXT:    beqz a4, .LBB1_7
; DEFAULT-NEXT:  .LBB1_5: # %for.body.preheader
; DEFAULT-NEXT:    slli a2, a2, 2
; DEFAULT-NEXT:    lui a3, 1
; DEFAULT-NEXT:    add a2, a0, a2
; DEFAULT-NEXT:    add a0, a0, a3
; DEFAULT-NEXT:  .LBB1_6: # %for.body
; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
; DEFAULT-NEXT:    lw a3, 0(a2)
; DEFAULT-NEXT:    add a3, a3, a1
; DEFAULT-NEXT:    sw a3, 0(a2)
; DEFAULT-NEXT:    addi a2, a2, 4
; DEFAULT-NEXT:    bne a2, a0, .LBB1_6
; DEFAULT-NEXT:  .LBB1_7: # %for.cond.cleanup
; DEFAULT-NEXT:    ret
entry:
  %0 = call i64 @llvm.vscale.i64()
  %1 = shl i64 %0, 2
  %min.iters.check = icmp ugt i64 %1, 1024
  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph

vector.ph:                                        ; preds = %entry
  %2 = call i64 @llvm.vscale.i64()
  %3 = shl i64 %2, 2
  %n.mod.vf = urem i64 1024, %3
  %n.vec = sub nsw i64 1024, %n.mod.vf
  %broadcast.splatinsert = insertelement <vscale x 4 x i32> poison, i32 %x, i32 0
  %broadcast.splat = shufflevector <vscale x 4 x i32> %broadcast.splatinsert, <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
  %4 = call i64 @llvm.vscale.i64()
  %5 = shl i64 %4, 2
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = getelementptr inbounds i32, ptr %a, i64 %index
  %wide.load = load <vscale x 4 x i32>, ptr %6, align 4
  %7 = add <vscale x 4 x i32> %wide.load, %broadcast.splat
  store <vscale x 4 x i32> %7, ptr %6, align 4
  %index.next = add nuw i64 %index, %5
  %8 = icmp eq i64 %index.next, %n.vec
  br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.mod.vf, 0
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %entry ], [ %n.vec, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, ptr %a, i64 %indvars.iv
  %9 = load i32, ptr %arrayidx, align 4
  %add = add i32 %9, %x
  store i32 %add, ptr %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %cmp.not = icmp eq i64 %indvars.iv.next, 1024
  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
}

declare <4 x i32> @llvm.vp.add.v4i32(<4 x i32>, <4 x i32>, <4 x i1>, i32)

; Same loop shape as sink_splat_add, but the add is the llvm.vp.add intrinsic
; taking the mask %m and the explicit vector length %vl.
define void @sink_splat_vp_add(ptr nocapture %a, i32 signext %x, <4 x i1> %m, i32 zeroext %vl) {
; NO-SINK-LABEL: sink_splat_vp_add:
; NO-SINK:       # %bb.0: # %entry
; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; NO-SINK-NEXT:    vmv.v.x v8, a1
; NO-SINK-NEXT:    lui a1, 1
; NO-SINK-NEXT:    add a1, a0, a1
; NO-SINK-NEXT:  .LBB2_1: # %vector.body
; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; NO-SINK-NEXT:    vle32.v v9, (a0)
; NO-SINK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
; NO-SINK-NEXT:    vadd.vv v9, v9, v8, v0.t
; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; NO-SINK-NEXT:    vse32.v v9, (a0)
; NO-SINK-NEXT:    addi a0, a0, 16
; NO-SINK-NEXT:    bne a0, a1, .LBB2_1
; NO-SINK-NEXT:  # %bb.2: # %for.cond.cleanup
; NO-SINK-NEXT:    ret
;
; SINK-LABEL: sink_splat_vp_add:
; SINK:       # %bb.0: # %entry
; SINK-NEXT:    lui a3, 1
; SINK-NEXT:    add a3, a0, a3
; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; SINK-NEXT:  .LBB2_1: # %vector.body
; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; SINK-NEXT:    vle32.v v8, (a0)
; SINK-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
; SINK-NEXT:    vadd.vx v8, v8, a1, v0.t
; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; SINK-NEXT:    vse32.v v8, (a0)
; SINK-NEXT:    addi a0, a0, 16
; SINK-NEXT:    bne a0, a3, .LBB2_1
; SINK-NEXT:  # %bb.2: # %for.cond.cleanup
; SINK-NEXT:    ret
;
; DEFAULT-LABEL: sink_splat_vp_add:
; DEFAULT:       # %bb.0: # %entry
; DEFAULT-NEXT:    lui a3, 1
; DEFAULT-NEXT:    add a3, a0, a3
; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; DEFAULT-NEXT:  .LBB2_1: # %vector.body
; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
; DEFAULT-NEXT:    vle32.v v8, (a0)
; DEFAULT-NEXT:    vsetvli zero, a2, e32, m1, ta, ma
; DEFAULT-NEXT:    vadd.vx v8, v8, a1, v0.t
; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; DEFAULT-NEXT:    vse32.v v8, (a0)
; DEFAULT-NEXT:    addi a0, a0, 16
; DEFAULT-NEXT:    bne a0, a3, .LBB2_1
; DEFAULT-NEXT:  # %bb.2: # %for.cond.cleanup
; DEFAULT-NEXT:    ret
entry:
  %broadcast.splatinsert = insertelement <4 x i32> poison, i32 %x, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds i32, ptr %a, i64 %index
  %wide.load = load <4 x i32>, ptr %0, align 4
  %1 = call <4 x i32> @llvm.vp.add.v4i32(<4 x i32> %wide.load, <4 x i32> %broadcast.splat, <4 x i1> %m, i32 %vl)
  store <4 x i32> %1, ptr %0, align 4
  %index.next = add nuw i64 %index, 4
  %2 = icmp eq i64 %index.next, 1024
  br i1 %2, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Floating-point counterpart of sink_splat_add: fixed-length <4 x float> fadd
; of a splat of %x over 1024 elements, four per iteration.
define void @sink_splat_fadd(ptr nocapture %a, float %x) {
; NO-SINK-LABEL: sink_splat_fadd:
; NO-SINK:       # %bb.0: # %entry
; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; NO-SINK-NEXT:    vfmv.v.f v8, fa0
; NO-SINK-NEXT:    lui a1, 1
; NO-SINK-NEXT:    add a1, a0, a1
; NO-SINK-NEXT:  .LBB3_1: # %vector.body
; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; NO-SINK-NEXT:    vle32.v v9, (a0)
; NO-SINK-NEXT:    vfadd.vv v9, v9, v8
; NO-SINK-NEXT:    vse32.v v9, (a0)
; NO-SINK-NEXT:    addi a0, a0, 16
; NO-SINK-NEXT:    bne a0, a1, .LBB3_1
; NO-SINK-NEXT:  # %bb.2: # %for.cond.cleanup
; NO-SINK-NEXT:    ret
;
; SINK-LABEL: sink_splat_fadd:
; SINK:       # %bb.0: # %entry
; SINK-NEXT:    lui a1, 1
; SINK-NEXT:    add a1, a0, a1
; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; SINK-NEXT:  .LBB3_1: # %vector.body
; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; SINK-NEXT:    vle32.v v8, (a0)
; SINK-NEXT:    vfadd.vf v8, v8, fa0
; SINK-NEXT:    vse32.v v8, (a0)
; SINK-NEXT:    addi a0, a0, 16
; SINK-NEXT:    bne a0, a1, .LBB3_1
; SINK-NEXT:  # %bb.2: # %for.cond.cleanup
; SINK-NEXT:    ret
;
; DEFAULT-LABEL: sink_splat_fadd:
; DEFAULT:       # %bb.0: # %entry
; DEFAULT-NEXT:    lui a1, 1
; DEFAULT-NEXT:    add a1, a0, a1
; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; DEFAULT-NEXT:  .LBB3_1: # %vector.body
; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
; DEFAULT-NEXT:    vle32.v v8, (a0)
; DEFAULT-NEXT:    vfadd.vf v8, v8, fa0
; DEFAULT-NEXT:    vse32.v v8, (a0)
; DEFAULT-NEXT:    addi a0, a0, 16
; DEFAULT-NEXT:    bne a0, a1, .LBB3_1
; DEFAULT-NEXT:  # %bb.2: # %for.cond.cleanup
; DEFAULT-NEXT:    ret
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %a, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = fadd <4 x float> %wide.load, %broadcast.splat
  store <4 x float> %1, ptr %0, align 4
  %index.next = add nuw i64 %index, 4
  %2 = icmp eq i64 %index.next, 1024
  br i1 %2, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}

; Scalable <vscale x 2 x float> variant: the splat is defined in vector.ph and
; used by the fadd in vector.body. A scalar remainder loop (for.body) handles
; the elements not covered by the vector loop.
define void @sink_splat_fadd_scalable(ptr nocapture %a, float %x) {
; NO-SINK-LABEL: sink_splat_fadd_scalable:
; NO-SINK:       # %bb.0: # %entry
; NO-SINK-NEXT:    csrr a1, vlenb
; NO-SINK-NEXT:    srli a3, a1, 2
; NO-SINK-NEXT:    li a2, 1024
; NO-SINK-NEXT:    bgeu a2, a3, .LBB4_2
; NO-SINK-NEXT:  # %bb.1:
; NO-SINK-NEXT:    li a2, 0
; NO-SINK-NEXT:    j .LBB4_5
; NO-SINK-NEXT:  .LBB4_2: # %vector.ph
; NO-SINK-NEXT:    addi a2, a3, -1
; NO-SINK-NEXT:    andi a4, a2, 1024
; NO-SINK-NEXT:    xori a2, a4, 1024
; NO-SINK-NEXT:    vsetvli a5, zero, e32, m1, ta, ma
; NO-SINK-NEXT:    vfmv.v.f v8, fa0
; NO-SINK-NEXT:    mv a5, a0
; NO-SINK-NEXT:    mv a6, a2
; NO-SINK-NEXT:  .LBB4_3: # %vector.body
; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; NO-SINK-NEXT:    vl1re32.v v9, (a5)
; NO-SINK-NEXT:    sub a6, a6, a3
; NO-SINK-NEXT:    vfadd.vv v9, v9, v8
; NO-SINK-NEXT:    vs1r.v v9, (a5)
; NO-SINK-NEXT:    add a5, a5, a1
; NO-SINK-NEXT:    bnez a6, .LBB4_3
; NO-SINK-NEXT:  # %bb.4: # %middle.block
; NO-SINK-NEXT:    beqz a4, .LBB4_7
; NO-SINK-NEXT:  .LBB4_5: # %for.body.preheader
; NO-SINK-NEXT:    slli a1, a2, 2
; NO-SINK-NEXT:    lui a2, 1
; NO-SINK-NEXT:    add a1, a0, a1
; NO-SINK-NEXT:    add a0, a0, a2
; NO-SINK-NEXT:  .LBB4_6: # %for.body
; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; NO-SINK-NEXT:    flw fa5, 0(a1)
; NO-SINK-NEXT:    fadd.s fa5, fa5, fa0
; NO-SINK-NEXT:    fsw fa5, 0(a1)
; NO-SINK-NEXT:    addi a1, a1, 4
; NO-SINK-NEXT:    bne a1, a0, .LBB4_6
; NO-SINK-NEXT:  .LBB4_7: # %for.cond.cleanup
; NO-SINK-NEXT:    ret
;
; SINK-LABEL: sink_splat_fadd_scalable:
; SINK:       # %bb.0: # %entry
; SINK-NEXT:    csrr a1, vlenb
; SINK-NEXT:    srli a3, a1, 2
; SINK-NEXT:    li a2, 1024
; SINK-NEXT:    bgeu a2, a3, .LBB4_2
; SINK-NEXT:  # %bb.1:
; SINK-NEXT:    li a2, 0
; SINK-NEXT:    j .LBB4_5
; SINK-NEXT:  .LBB4_2: # %vector.ph
; SINK-NEXT:    addi a2, a3, -1
; SINK-NEXT:    andi a4, a2, 1024
; SINK-NEXT:    xori a2, a4, 1024
; SINK-NEXT:    mv a5, a0
; SINK-NEXT:    mv a6, a2
; SINK-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
; SINK-NEXT:  .LBB4_3: # %vector.body
; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; SINK-NEXT:    vl1re32.v v8, (a5)
; SINK-NEXT:    sub a6, a6, a3
; SINK-NEXT:    vfadd.vf v8, v8, fa0
; SINK-NEXT:    vs1r.v v8, (a5)
; SINK-NEXT:    add a5, a5, a1
; SINK-NEXT:    bnez a6, .LBB4_3
; SINK-NEXT:  # %bb.4: # %middle.block
; SINK-NEXT:    beqz a4, .LBB4_7
; SINK-NEXT:  .LBB4_5: # %for.body.preheader
; SINK-NEXT:    slli a1, a2, 2
; SINK-NEXT:    lui a2, 1
; SINK-NEXT:    add a1, a0, a1
; SINK-NEXT:    add a0, a0, a2
; SINK-NEXT:  .LBB4_6: # %for.body
; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; SINK-NEXT:    flw fa5, 0(a1)
; SINK-NEXT:    fadd.s fa5, fa5, fa0
; SINK-NEXT:    fsw fa5, 0(a1)
; SINK-NEXT:    addi a1, a1, 4
; SINK-NEXT:    bne a1, a0, .LBB4_6
; SINK-NEXT:  .LBB4_7: # %for.cond.cleanup
; SINK-NEXT:    ret
;
; DEFAULT-LABEL: sink_splat_fadd_scalable:
; DEFAULT:       # %bb.0: # %entry
; DEFAULT-NEXT:    csrr a1, vlenb
; DEFAULT-NEXT:    srli a3, a1, 2
; DEFAULT-NEXT:    li a2, 1024
; DEFAULT-NEXT:    bgeu a2, a3, .LBB4_2
; DEFAULT-NEXT:  # %bb.1:
; DEFAULT-NEXT:    li a2, 0
; DEFAULT-NEXT:    j .LBB4_5
; DEFAULT-NEXT:  .LBB4_2: # %vector.ph
; DEFAULT-NEXT:    addi a2, a3, -1
; DEFAULT-NEXT:    andi a4, a2, 1024
; DEFAULT-NEXT:    xori a2, a4, 1024
; DEFAULT-NEXT:    mv a5, a0
; DEFAULT-NEXT:    mv a6, a2
; DEFAULT-NEXT:    vsetvli a7, zero, e32, m1, ta, ma
; DEFAULT-NEXT:  .LBB4_3: # %vector.body
; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
; DEFAULT-NEXT:    vl1re32.v v8, (a5)
; DEFAULT-NEXT:    sub a6, a6, a3
; DEFAULT-NEXT:    vfadd.vf v8, v8, fa0
; DEFAULT-NEXT:    vs1r.v v8, (a5)
; DEFAULT-NEXT:    add a5, a5, a1
; DEFAULT-NEXT:    bnez a6, .LBB4_3
; DEFAULT-NEXT:  # %bb.4: # %middle.block
; DEFAULT-NEXT:    beqz a4, .LBB4_7
; DEFAULT-NEXT:  .LBB4_5: # %for.body.preheader
; DEFAULT-NEXT:    slli a1, a2, 2
; DEFAULT-NEXT:    lui a2, 1
; DEFAULT-NEXT:    add a1, a0, a1
; DEFAULT-NEXT:    add a0, a0, a2
; DEFAULT-NEXT:  .LBB4_6: # %for.body
; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
; DEFAULT-NEXT:    flw fa5, 0(a1)
; DEFAULT-NEXT:    fadd.s fa5, fa5, fa0
; DEFAULT-NEXT:    fsw fa5, 0(a1)
; DEFAULT-NEXT:    addi a1, a1, 4
; DEFAULT-NEXT:    bne a1, a0, .LBB4_6
; DEFAULT-NEXT:  .LBB4_7: # %for.cond.cleanup
; DEFAULT-NEXT:    ret
entry:
  %0 = call i64 @llvm.vscale.i64()
  %1 = shl i64 %0, 1
  %min.iters.check = icmp ugt i64 %1, 1024
  br i1 %min.iters.check, label %for.body.preheader, label %vector.ph

vector.ph:                                        ; preds = %entry
  %2 = call i64 @llvm.vscale.i64()
  %3 = shl i64 %2, 1
  %n.mod.vf = urem i64 1024, %3
  %n.vec = sub nsw i64 1024, %n.mod.vf
  %broadcast.splatinsert = insertelement <vscale x 2 x float> poison, float %x, i32 0
  %broadcast.splat = shufflevector <vscale x 2 x float> %broadcast.splatinsert, <vscale x 2 x float> poison, <vscale x 2 x i32> zeroinitializer
  %4 = call i64 @llvm.vscale.i64()
  %5 = shl i64 %4, 1
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %6 = getelementptr inbounds float, ptr %a, i64 %index
  %wide.load = load <vscale x 2 x float>, ptr %6, align 4
  %7 = fadd <vscale x 2 x float> %wide.load, %broadcast.splat
  store <vscale x 2 x float> %7, ptr %6, align 4
  %index.next = add nuw i64 %index, %5
  %8 = icmp eq i64 %index.next, %n.vec
  br i1 %8, label %middle.block, label %vector.body

middle.block:                                     ; preds = %vector.body
  %cmp.n = icmp eq i64 %n.mod.vf, 0
  br i1 %cmp.n, label %for.cond.cleanup, label %for.body.preheader

for.body.preheader:                               ; preds = %entry, %middle.block
  %indvars.iv.ph = phi i64 [ 0, %entry ], [ %n.vec, %middle.block ]
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %middle.block
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
  %arrayidx = getelementptr inbounds float, ptr %a, i64 %indvars.iv
  %9 = load float, ptr %arrayidx, align 4
  %add = fadd float %9, %x
  store float %add, ptr %arrayidx, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %cmp.not = icmp eq i64 %indvars.iv.next, 1024
  br i1 %cmp.not, label %for.cond.cleanup, label %for.body
}

; The overload suffix names the <4 x float> operand/result type.
declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

; Same loop shape as sink_splat_fadd, but the fadd is the llvm.vp.fadd
; intrinsic taking the mask %m and the explicit vector length %vl.
define void @sink_splat_vp_fadd(ptr nocapture %a, float %x, <4 x i1> %m, i32 zeroext %vl) {
; NO-SINK-LABEL: sink_splat_vp_fadd:
; NO-SINK:       # %bb.0: # %entry
; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; NO-SINK-NEXT:    vfmv.v.f v8, fa0
; NO-SINK-NEXT:    lui a2, 1
; NO-SINK-NEXT:    add a2, a0, a2
; NO-SINK-NEXT:  .LBB5_1: # %vector.body
; NO-SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; NO-SINK-NEXT:    vle32.v v9, (a0)
; NO-SINK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
; NO-SINK-NEXT:    vfadd.vv v9, v9, v8, v0.t
; NO-SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; NO-SINK-NEXT:    vse32.v v9, (a0)
; NO-SINK-NEXT:    addi a0, a0, 16
; NO-SINK-NEXT:    bne a0, a2, .LBB5_1
; NO-SINK-NEXT:  # %bb.2: # %for.cond.cleanup
; NO-SINK-NEXT:    ret
;
; SINK-LABEL: sink_splat_vp_fadd:
; SINK:       # %bb.0: # %entry
; SINK-NEXT:    lui a2, 1
; SINK-NEXT:    add a2, a0, a2
; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; SINK-NEXT:  .LBB5_1: # %vector.body
; SINK-NEXT:    # =>This Inner Loop Header: Depth=1
; SINK-NEXT:    vle32.v v8, (a0)
; SINK-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
; SINK-NEXT:    vfadd.vf v8, v8, fa0, v0.t
; SINK-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; SINK-NEXT:    vse32.v v8, (a0)
; SINK-NEXT:    addi a0, a0, 16
; SINK-NEXT:    bne a0, a2, .LBB5_1
; SINK-NEXT:  # %bb.2: # %for.cond.cleanup
; SINK-NEXT:    ret
;
; DEFAULT-LABEL: sink_splat_vp_fadd:
; DEFAULT:       # %bb.0: # %entry
; DEFAULT-NEXT:    lui a2, 1
; DEFAULT-NEXT:    add a2, a0, a2
; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; DEFAULT-NEXT:  .LBB5_1: # %vector.body
; DEFAULT-NEXT:    # =>This Inner Loop Header: Depth=1
; DEFAULT-NEXT:    vle32.v v8, (a0)
; DEFAULT-NEXT:    vsetvli zero, a1, e32, m1, ta, ma
; DEFAULT-NEXT:    vfadd.vf v8, v8, fa0, v0.t
; DEFAULT-NEXT:    vsetivli zero, 4, e32, m1, ta, ma
; DEFAULT-NEXT:    vse32.v v8, (a0)
; DEFAULT-NEXT:    addi a0, a0, 16
; DEFAULT-NEXT:    bne a0, a2, .LBB5_1
; DEFAULT-NEXT:  # %bb.2: # %for.cond.cleanup
; DEFAULT-NEXT:    ret
entry:
  %broadcast.splatinsert = insertelement <4 x float> poison, float %x, i32 0
  %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %entry
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %a, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  %1 = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %wide.load, <4 x float> %broadcast.splat, <4 x i1> %m, i32 %vl)
  store <4 x float> %1, ptr %0, align 4
  %index.next = add nuw i64 %index, 4
  %2 = icmp eq i64 %index.next, 1024
  br i1 %2, label %for.cond.cleanup, label %vector.body

for.cond.cleanup:                                 ; preds = %vector.body
  ret void
}