; This is the loop in C++ being vectorized in this file with
; vector.reverse:
;   #pragma clang loop vectorize_width(4, scalable)
;   for (int i = N-1; i >= 0; --i)
;     a[i] = b[i] + 1.0;

; REQUIRES: asserts
; RUN: opt -passes=loop-vectorize,dce,instcombine -mtriple riscv64-linux-gnu \
; RUN:   -mattr=+v -debug-only=loop-vectorize -scalable-vectorization=on \
; RUN:   -riscv-v-vector-bits-min=128 -disable-output < %s 2>&1 | FileCheck %s

; i32 element type: checks the vectorizer's debug output, initial/final VPlans,
; cost model, and register-usage estimates for a reversed (downward-counting) loop.
define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) {
; CHECK-LABEL: 'vector_reverse_i64'
; CHECK-NEXT: LV: Loop hints: force=enabled width=vscale x 4 interleave=0
; CHECK-NEXT: LV: Found a loop: for.body
; CHECK-NEXT: LV: Found an induction variable.
; CHECK-NEXT: LV: Found an induction variable.
; CHECK-NEXT: LV: Did not find one integer induction var.
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Found trip count: 0
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
; CHECK-NEXT: LV: Scalable vectorization is available
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV: Using user VF vscale x 4.
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
; CHECK-NEXT:   IR %0 = zext i32 %n to i64
; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
; CHECK-NEXT: Successor(s): vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT:   vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT:   vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT:   vector.body:
; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT:     vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
; CHECK-NEXT:     CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
; CHECK-NEXT:     CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
; CHECK-NEXT:     WIDEN ir<%1> = load vp<[[VEC_PTR]]>
; CHECK-NEXT:     WIDEN ir<%add9> = add ir<%1>, ir<1>
; CHECK-NEXT:     CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
; CHECK-NEXT:     vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT:   No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
; CHECK-NEXT:   IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
; CHECK-NEXT:   IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
; CHECK:        IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load i32, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 2 for VF vscale x 4 For instruction: %add9 = add i32 %1, 1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store i32 %add9, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV(REG): Calculating max register usage:
; CHECK-NEXT: LV(REG): At #0 Interval # 0
; CHECK-NEXT: LV(REG): At #1 Interval # 1
; CHECK-NEXT: LV(REG): At #2 Interval # 2
; CHECK-NEXT: LV(REG): At #3 Interval # 2
; CHECK-NEXT: LV(REG): At #4 Interval # 2
; CHECK-NEXT: LV(REG): At #5 Interval # 3
; CHECK-NEXT: LV(REG): At #6 Interval # 3
; CHECK-NEXT: LV(REG): At #7 Interval # 3
; CHECK-NEXT: LV(REG): At #9 Interval # 1
; CHECK-NEXT: LV(REG): At #10 Interval # 2
; CHECK-NEXT: LV(REG): VF = vscale x 4
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Loop cost is 32
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
; CHECK-NEXT:   IR %0 = zext i32 %n to i64
; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.scevcheck>:
; CHECK-NEXT:   IR %3 = add nsw i64 %0, -1
; CHECK-NEXT:   IR %4 = add i32 %n, -1
; CHECK-NEXT:   IR %5 = trunc i64 %3 to i32
; CHECK-NEXT:   IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
; CHECK-NEXT:   IR %mul.result = extractvalue { i32, i1 } %mul, 0
; CHECK-NEXT:   IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
; CHECK-NEXT:   IR %6 = sub i32 %4, %mul.result
; CHECK-NEXT:   IR %7 = icmp ugt i32 %6, %4
; CHECK-NEXT:   IR %8 = or i1 %7, %mul.overflow
; CHECK-NEXT:   IR %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT:   IR %10 = or i1 %8, %9
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.memcheck>:
; CHECK-NEXT:   IR %11 = call i64 @llvm.vscale.i64()
; CHECK-NEXT:   IR %12 = mul i64 %11, 4
; CHECK-NEXT:   IR %13 = mul i64 %12, 4
; CHECK-NEXT:   IR %14 = sub i64 %B1, %A2
; CHECK-NEXT:   IR %diff.check = icmp ult i64 %14, %13
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.ph>:
; CHECK-NEXT:   IR %15 = call i64 @llvm.vscale.i64()
; CHECK-NEXT:   IR %16 = mul i64 %15, 4
; CHECK-NEXT:   IR %n.mod.vf = urem i64 %0, %16
; CHECK-NEXT:   IR %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT:   IR %17 = call i64 @llvm.vscale.i64()
; CHECK-NEXT:   IR %18 = mul i64 %17, 4
; CHECK-NEXT:   vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT:   vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT:   vector.body:
; CHECK-NEXT:     SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT:     vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
; CHECK-NEXT:     CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
; CHECK-NEXT:     CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
; CHECK-NEXT:     WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
; CHECK-NEXT:     WIDEN ir<%add9> = add ir<[[L]]>, ir<1>
; CHECK-NEXT:     CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
; CHECK-NEXT:     vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR2]]>, ir<%add9>
; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT:   No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): ir-bb<middle.block>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<middle.block>:
; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VEC_TC]]>
; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<scalar.ph>:
; CHECK-NEXT:   EMIT vp<[[RESUME_1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
; CHECK-NEXT:   EMIT vp<[[RESUME_2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
; CHECK-NEXT:   IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME_1]]> from ir-bb<scalar.ph>)
; CHECK-NEXT:   IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME_2]]>.1 from ir-bb<scalar.ph>)
; CHECK:        IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %0 = zext i32 %n to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
  %i.0 = add nsw i32 %i.0.in8, -1
  %idxprom = zext i32 %i.0 to i64
  %arrayidx = getelementptr inbounds i32, ptr %B, i64 %idxprom
  %1 = load i32, ptr %arrayidx, align 4
  %add9 = add i32 %1, 1
  %arrayidx3 = getelementptr inbounds i32, ptr %A, i64 %idxprom
  store i32 %add9, ptr %arrayidx3, align 4
  %cmp = icmp ugt i64 %indvars.iv, 1
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
}

; float element type: same reversed loop; additionally exercises the
; "FP op with unsafe algebra" path and the float cost model (fadd cost 4,
; total loop cost 34 vs. 32 for the i32 version).
define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocapture noundef readonly %B, i32 noundef signext %n) {
; CHECK-LABEL: 'vector_reverse_f32'
; CHECK-NEXT: LV: Loop hints: force=enabled width=vscale x 4 interleave=0
; CHECK-NEXT: LV: Found a loop: for.body
; CHECK-NEXT: LV: Found an induction variable.
; CHECK-NEXT: LV: Found an induction variable.
; CHECK-NEXT: LV: Found FP op with unsafe algebra.
; CHECK-NEXT: LV: Did not find one integer induction var.
; CHECK-NEXT: LV: We can vectorize this loop (with a runtime bound check)!
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Found trip count: 0
; CHECK-NEXT: LV: Found maximum trip count: 4294967295
; CHECK-NEXT: LV: Scalable vectorization is available
; CHECK-NEXT: LV: The max safe fixed VF is: 67108864.
; CHECK-NEXT: LV: The max safe scalable VF is: vscale x 4294967295.
; CHECK-NEXT: LV: Found uniform instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found uniform instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found uniform instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found uniform instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found uniform instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
; CHECK-NEXT: LV: Found uniform instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV: Using user VF vscale x 4.
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Scalarizing: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Scalarizing: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Scalarizing: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Scalarizing: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Scalarizing: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: VPlan 'Initial VPlan for VF={vscale x 4},UF>=1' {
; CHECK-NEXT: Live-in vp<[[VF:%.+]]> = VF
; CHECK-NEXT: Live-in vp<[[VFxUF:%.+]]> = VF * UF
; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
; CHECK-NEXT:   IR %0 = zext i32 %n to i64
; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
; CHECK-NEXT: Successor(s): vector.ph
; CHECK-EMPTY:
; CHECK-NEXT: vector.ph:
; CHECK-NEXT:   vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + vp<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT:   vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + vp<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT:   vector.body:
; CHECK-NEXT:     EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
; CHECK-NEXT:     vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
; CHECK-NEXT:     CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
; CHECK-NEXT:     CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, vp<[[VF]]>
; CHECK-NEXT:     WIDEN ir<%1> = load vp<[[VEC_PTR]]>
; CHECK-NEXT:     WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00>
; CHECK-NEXT:     CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
; CHECK-NEXT:     vp<[[VEC_PTR2:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, vp<[[VF]]>
; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1>
; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT:%.+]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]>
; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
; CHECK-NEXT:   No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): middle.block
; CHECK-EMPTY:
; CHECK-NEXT: middle.block:
; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, vp<[[VEC_TC]]>
; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, scalar.ph
; CHECK-EMPTY:
; CHECK-NEXT: scalar.ph:
; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
; CHECK-NEXT:   IR %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from scalar.ph)
; CHECK-NEXT:   IR %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from scalar.ph)
; CHECK:        IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
; CHECK-NEXT: No successors
; CHECK-NEXT: }
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %i.0 = add nsw i32 %i.0.in8, -1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %idxprom = zext i32 %i.0 to i64
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: %1 = load float, ptr %arrayidx, align 4
; CHECK-NEXT: LV: Found an estimated cost of 4 for VF vscale x 4 For instruction: %conv1 = fadd float %1, 1.000000e+00
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
; CHECK-NEXT: LV: Found an estimated cost of 13 for VF vscale x 4 For instruction: store float %conv1, ptr %arrayidx3, align 4
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %cmp = icmp ugt i64 %indvars.iv, 1
; CHECK-NEXT: LV: Found an estimated cost of 1 for VF vscale x 4 For instruction: %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: LV: Found an estimated cost of 0 for VF vscale x 4 For instruction: br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
; CHECK-NEXT: LV(REG): Calculating max register usage:
; CHECK-NEXT: LV(REG): At #0 Interval # 0
; CHECK-NEXT: LV(REG): At #1 Interval # 1
; CHECK-NEXT: LV(REG): At #2 Interval # 2
; CHECK-NEXT: LV(REG): At #3 Interval # 2
; CHECK-NEXT: LV(REG): At #4 Interval # 2
; CHECK-NEXT: LV(REG): At #5 Interval # 3
; CHECK-NEXT: LV(REG): At #6 Interval # 3
; CHECK-NEXT: LV(REG): At #7 Interval # 3
; CHECK-NEXT: LV(REG): At #9 Interval # 1
; CHECK-NEXT: LV(REG): At #10 Interval # 2
; CHECK-NEXT: LV(REG): VF = vscale x 4
; CHECK-NEXT: LV(REG): Found max usage: 2 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 3 registers
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::VRRC, 2 registers
; CHECK-NEXT: LV(REG): Found invariant usage: 1 item
; CHECK-NEXT: LV(REG): RegisterClass: RISCV::GPRRC, 1 registers
; CHECK-NEXT: LV: The target has 31 registers of RISCV::GPRRC register class
; CHECK-NEXT: LV: The target has 32 registers of RISCV::VRRC register class
; CHECK-NEXT: LV: Loop does not require scalar epilogue
; CHECK-NEXT: LV: Loop cost is 34
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in <stdin>
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
; CHECK: Executing best plan with VF=vscale x 4, UF=1
; CHECK-NEXT: VPlan 'Final VPlan for VF={vscale x 4},UF={1}' {
; CHECK-NEXT: Live-in ir<[[VF:%.+]]> = VF
; CHECK-NEXT: Live-in ir<[[VFxUF:%.+]]>.1 = VF * UF
; CHECK-NEXT: Live-in ir<[[VEC_TC:%.+]]> = vector-trip-count
; CHECK-NEXT: vp<[[TC:%.+]]> = original trip-count
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body.preheader>:
; CHECK-NEXT:   IR %0 = zext i32 %n to i64
; CHECK-NEXT:   EMIT vp<[[TC]]> = EXPAND SCEV (zext i32 %n to i64)
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.scevcheck>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.scevcheck>:
; CHECK-NEXT:   IR %3 = add nsw i64 %0, -1
; CHECK-NEXT:   IR %4 = add i32 %n, -1
; CHECK-NEXT:   IR %5 = trunc i64 %3 to i32
; CHECK-NEXT:   IR %mul = call { i32, i1 } @llvm.umul.with.overflow.i32(i32 1, i32 %5)
; CHECK-NEXT:   IR %mul.result = extractvalue { i32, i1 } %mul, 0
; CHECK-NEXT:   IR %mul.overflow = extractvalue { i32, i1 } %mul, 1
; CHECK-NEXT:   IR %6 = sub i32 %4, %mul.result
; CHECK-NEXT:   IR %7 = icmp ugt i32 %6, %4
; CHECK-NEXT:   IR %8 = or i1 %7, %mul.overflow
; CHECK-NEXT:   IR %9 = icmp ugt i64 %3, 4294967295
; CHECK-NEXT:   IR %10 = or i1 %8, %9
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.memcheck>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.memcheck>:
; CHECK-NEXT:   IR %11 = call i64 @llvm.vscale.i64()
; CHECK-NEXT:   IR %12 = mul i64 %11, 4
; CHECK-NEXT:   IR %13 = mul i64 %12, 4
; CHECK-NEXT:   IR %14 = sub i64 %B1, %A2
; CHECK-NEXT:   IR %diff.check = icmp ult i64 %14, %13
; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, ir-bb<vector.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<vector.ph>:
; CHECK-NEXT:   IR %15 = call i64 @llvm.vscale.i64()
; CHECK-NEXT:   IR %16 = mul i64 %15, 4
; CHECK-NEXT:   IR %n.mod.vf = urem i64 %0, %16
; CHECK-NEXT:   IR %n.vec = sub i64 %0, %n.mod.vf
; CHECK-NEXT:   IR %17 = call i64 @llvm.vscale.i64()
; CHECK-NEXT:   IR %18 = mul i64 %17, 4
; CHECK-NEXT:   vp<[[END1:%.+]]> = DERIVED-IV ir<%0> + ir<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT:   vp<[[END2:%.+]]> = DERIVED-IV ir<%n> + ir<[[VEC_TC]]> * ir<-1>
; CHECK-NEXT: Successor(s): vector loop
; CHECK-EMPTY:
; CHECK-NEXT: <x1> vector loop: {
; CHECK-NEXT:   vector.body:
; CHECK-NEXT:     SCALAR-PHI vp<[[CAN_IV:%.+]]> = phi ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
; CHECK-NEXT:     vp<[[DEV_IV:%.+]]> = DERIVED-IV ir<%n> + vp<[[CAN_IV]]> * ir<-1>
; CHECK-NEXT:     vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[DEV_IV]]>, ir<-1>
; CHECK-NEXT:     CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1>
; CHECK-NEXT:     CLONE ir<%idxprom> = zext ir<%i.0>
; CHECK-NEXT:     CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom>
; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx>, ir<[[VF]]>
; CHECK-NEXT:     WIDEN ir<[[L:%.+]]> = load vp<[[VEC_PTR]]>
; CHECK-NEXT:     WIDEN ir<%conv1> = fadd ir<[[L]]>, ir<1.000000e+00>
; CHECK-NEXT:     CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom>
; CHECK-NEXT:     vp<[[VEC_PTR:%.+]]> = reverse-vector-pointer inbounds ir<%arrayidx3>, ir<[[VF]]>
; CHECK-NEXT:     WIDEN store vp<[[VEC_PTR]]>, ir<%conv1>
; CHECK-NEXT:     EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, ir<[[VFxUF]]>.1
; CHECK-NEXT:     EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, ir<[[VEC_TC]]>
; CHECK-NEXT:   No successors
; CHECK-NEXT: }
; CHECK-NEXT: Successor(s): ir-bb<middle.block>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<middle.block>:
; CHECK-NEXT:   EMIT vp<[[CMP:%.+]]> = icmp eq vp<[[TC]]>, ir<[[VEC_TC]]>
; CHECK-NEXT:   EMIT branch-on-cond vp<[[CMP]]>
; CHECK-NEXT: Successor(s): ir-bb<for.cond.cleanup.loopexit>, ir-bb<scalar.ph>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.cond.cleanup.loopexit>:
; CHECK-NEXT: No successors
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<scalar.ph>:
; CHECK-NEXT:   EMIT vp<[[RESUME1:%.+]]> = resume-phi vp<[[END1]]>, ir<%0>
; CHECK-NEXT:   EMIT vp<[[RESUME2:%.+]]>.1 = resume-phi vp<[[END2]]>, ir<%n>
; CHECK-NEXT: Successor(s): ir-bb<for.body>
; CHECK-EMPTY:
; CHECK-NEXT: ir-bb<for.body>:
; CHECK-NEXT:   IR %indvars.iv = phi i64 [ %0, %scalar.ph ], [ %indvars.iv.next, %for.body ] (extra operand: vp<[[RESUME1]]> from ir-bb<scalar.ph>)
; CHECK-NEXT:   IR %i.0.in8 = phi i32 [ %n, %scalar.ph ], [ %i.0, %for.body ] (extra operand: vp<[[RESUME2]]>.1 from ir-bb<scalar.ph>)
; CHECK:        IR %indvars.iv.next = add nsw i64 %indvars.iv, -1
; CHECK-NEXT: No successors
; CHECK-NEXT: }
;
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %0 = zext i32 %n to i64
  br label %for.body

for.cond.cleanup:                                 ; preds = %for.body, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %indvars.iv = phi i64 [ %0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
  %i.0.in8 = phi i32 [ %n, %for.body.preheader ], [ %i.0, %for.body ]
  %i.0 = add nsw i32 %i.0.in8, -1
  %idxprom = zext i32 %i.0 to i64
  %arrayidx = getelementptr inbounds float, ptr %B, i64 %idxprom
  %1 = load float, ptr %arrayidx, align 4
  %conv1 = fadd float %1, 1.000000e+00
  %arrayidx3 = getelementptr inbounds float, ptr %A, i64 %idxprom
  store float %conv1, ptr %arrayidx3, align 4
  %cmp = icmp ugt i64 %indvars.iv, 1
  %indvars.iv.next = add nsw i64 %indvars.iv, -1
  br i1 %cmp, label %for.body, label %for.cond.cleanup, !llvm.loop !0
}

; Loop metadata: forces scalable vectorization with width 4 (matches the
; "#pragma clang loop vectorize_width(4, scalable)" in the header comment).
!0 = distinct !{!0, !1, !2, !3, !4}
!1 = !{!"llvm.loop.mustprogress"}
!2 = !{!"llvm.loop.vectorize.width", i32 4}
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
!4 = !{!"llvm.loop.vectorize.enable", i1 true}