; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -passes=loop-vectorize < %s -S -o - | FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
target triple = "thumbv8.1m.main-arm-none-eabi"

; This test could produce gather/scatter or predicated scalar load/stores. It
; should never choose scalar load/store, and the cost of gather/scatter may be
; high enough to make vectorization unwarranted.
;
; @nested is a three-deep loop nest over i (outer), j (middle) and k (inner)
; that reads %pA and reads/writes %pG; the %ii argument is not used by the
; body. The autogenerated assertions below contain only scalar float loads and
; stores and no vector blocks, i.e. every loop in the nest is expected to be
; left unvectorized.

define i32 @nested(ptr nocapture %pG, ptr nocapture readonly %pA, i32 %n, i32 %ii) #0 {
; CHECK-LABEL: @nested(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP66:%.*]] = icmp sgt i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP66]], label [[FOR_BODY4_LR_PH_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.body4.lr.ph.preheader:
; CHECK-NEXT:    br label [[FOR_BODY4_LR_PH:%.*]]
; CHECK:       for.body4.lr.ph:
; CHECK-NEXT:    [[I_067:%.*]] = phi i32 [ [[INC29:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ]
; CHECK-NEXT:    [[CMP962_NOT:%.*]] = icmp eq i32 [[I_067]], 0
; CHECK-NEXT:    [[MUL15:%.*]] = mul nsw i32 [[I_067]], [[N]]
; CHECK-NEXT:    br i1 [[CMP962_NOT]], label [[FOR_BODY4_PREHEADER:%.*]], label [[FOR_BODY4_US_PREHEADER:%.*]]
; CHECK:       for.body4.us.preheader:
; CHECK-NEXT:    br label [[FOR_BODY4_US:%.*]]
; CHECK:       for.body4.preheader:
; CHECK-NEXT:    br label [[FOR_BODY4:%.*]]
; CHECK:       for.body4.us:
; CHECK-NEXT:    [[J_065_US:%.*]] = phi i32 [ [[INC26_US:%.*]], [[FOR_COND8_FOR_COND_CLEANUP10_CRIT_EDGE_US:%.*]] ], [ [[I_067]], [[FOR_BODY4_US_PREHEADER]] ]
; CHECK-NEXT:    [[MUL_US:%.*]] = mul nsw i32 [[J_065_US]], [[N]]
; CHECK-NEXT:    [[ADD_US:%.*]] = add nsw i32 [[MUL_US]], [[I_067]]
; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds float, ptr [[PA:%.*]], i32 [[ADD_US]]
; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[ARRAYIDX_US]], align 4
; CHECK-NEXT:    [[ARRAYIDX7_US:%.*]] = getelementptr inbounds float, ptr [[PG:%.*]], i32 [[ADD_US]]
; CHECK-NEXT:    store float [[TMP0]], ptr [[ARRAYIDX7_US]], align 4
; CHECK-NEXT:    br label [[FOR_BODY11_US:%.*]]
; CHECK:       for.body11.us:
; CHECK-NEXT:    [[TMP1:%.*]] = phi float [ [[TMP0]], [[FOR_BODY4_US]] ], [ [[SUB_US:%.*]], [[FOR_BODY11_US]] ]
; CHECK-NEXT:    [[K_063_US:%.*]] = phi i32 [ 0, [[FOR_BODY4_US]] ], [ [[INC_US:%.*]], [[FOR_BODY11_US]] ]
; CHECK-NEXT:    [[ADD16_US:%.*]] = add nsw i32 [[K_063_US]], [[MUL15]]
; CHECK-NEXT:    [[ARRAYIDX17_US:%.*]] = getelementptr inbounds float, ptr [[PG]], i32 [[ADD16_US]]
; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr [[ARRAYIDX17_US]], align 4
; CHECK-NEXT:    [[ADD19_US:%.*]] = add nsw i32 [[K_063_US]], [[MUL_US]]
; CHECK-NEXT:    [[ARRAYIDX20_US:%.*]] = getelementptr inbounds float, ptr [[PG]], i32 [[ADD19_US]]
; CHECK-NEXT:    [[TMP3:%.*]] = load float, ptr [[ARRAYIDX20_US]], align 4
; CHECK-NEXT:    [[MUL21_US:%.*]] = fmul fast float [[TMP3]], [[TMP2]]
; CHECK-NEXT:    [[SUB_US]] = fsub fast float [[TMP1]], [[MUL21_US]]
; CHECK-NEXT:    store float [[SUB_US]], ptr [[ARRAYIDX7_US]], align 4
; CHECK-NEXT:    [[INC_US]] = add nuw nsw i32 [[K_063_US]], 1
; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC_US]], [[I_067]]
; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND8_FOR_COND_CLEANUP10_CRIT_EDGE_US]], label [[FOR_BODY11_US]]
; CHECK:       for.cond8.for.cond.cleanup10_crit_edge.us:
; CHECK-NEXT:    [[INC26_US]] = add nuw nsw i32 [[J_065_US]], 1
; CHECK-NEXT:    [[EXITCOND71_NOT:%.*]] = icmp eq i32 [[INC26_US]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND71_NOT]], label [[FOR_COND_CLEANUP3_LOOPEXIT1:%.*]], label [[FOR_BODY4_US]]
; CHECK:       for.cond.cleanup.loopexit:
; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret i32 0
; CHECK:       for.cond.cleanup3.loopexit:
; CHECK-NEXT:    br label [[FOR_COND_CLEANUP3]]
; CHECK:       for.cond.cleanup3.loopexit1:
; CHECK-NEXT:    br label [[FOR_COND_CLEANUP3]]
; CHECK:       for.cond.cleanup3:
; CHECK-NEXT:    [[INC29]] = add nuw nsw i32 [[I_067]], 1
; CHECK-NEXT:    [[EXITCOND73_NOT:%.*]] = icmp eq i32 [[INC29]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND73_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY4_LR_PH]]
; CHECK:       for.body4:
; CHECK-NEXT:    [[J_065:%.*]] = phi i32 [ [[INC26:%.*]], [[FOR_BODY4]] ], [ 0, [[FOR_BODY4_PREHEADER]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[J_065]], [[N]]
; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[PA]], i32 [[MUL]]
; CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr [[ARRAYIDX]], align 4
; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds float, ptr [[PG]], i32 [[MUL]]
; CHECK-NEXT:    store float [[TMP4]], ptr [[ARRAYIDX7]], align 4
; CHECK-NEXT:    [[INC26]] = add nuw nsw i32 [[J_065]], 1
; CHECK-NEXT:    [[EXITCOND72_NOT:%.*]] = icmp eq i32 [[INC26]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND72_NOT]], label [[FOR_COND_CLEANUP3_LOOPEXIT:%.*]], label [[FOR_BODY4]]
;
; Skip the whole nest when n <= 0.
entry:
  %cmp66 = icmp sgt i32 %n, 0
  br i1 %cmp66, label %for.body4.lr.ph, label %for.cond.cleanup

; Outer loop header: i counts 0 .. n-1. The i == 0 iteration takes the
; for.body4 variant (no inner k loop); every other iteration takes the ".us"
; variant. %mul15 = i * n is the row offset reused by the inner loop.
for.body4.lr.ph:                                  ; preds = %entry, %for.cond.cleanup3
  %i.067 = phi i32 [ %inc29, %for.cond.cleanup3 ], [ 0, %entry ]
  %cmp962.not = icmp eq i32 %i.067, 0
  %mul15 = mul nsw i32 %i.067, %n
  br i1 %cmp962.not, label %for.body4, label %for.body4.us

; Middle loop (i != 0): j counts i .. n-1. Copies pA[j*n + i] into
; pG[j*n + i] before entering the inner loop.
for.body4.us:                                     ; preds = %for.body4.lr.ph, %for.cond8.for.cond.cleanup10_crit_edge.us
  %j.065.us = phi i32 [ %inc26.us, %for.cond8.for.cond.cleanup10_crit_edge.us ], [ %i.067, %for.body4.lr.ph ]
  %mul.us = mul nsw i32 %j.065.us, %n
  %add.us = add nsw i32 %mul.us, %i.067
  %arrayidx.us = getelementptr inbounds float, ptr %pA, i32 %add.us
  %0 = load float, ptr %arrayidx.us, align 4
  %arrayidx7.us = getelementptr inbounds float, ptr %pG, i32 %add.us
  store float %0, ptr %arrayidx7.us, align 4
  br label %for.body11.us

; Inner loop: k counts 0 .. i-1. Each iteration subtracts
; pG[i*n + k] * pG[j*n + k] from the running value %1 and stores the updated
; value back to the loop-invariant address pG[j*n + i].
for.body11.us:                                    ; preds = %for.body4.us, %for.body11.us
  %1 = phi float [ %0, %for.body4.us ], [ %sub.us, %for.body11.us ]
  %k.063.us = phi i32 [ 0, %for.body4.us ], [ %inc.us, %for.body11.us ]
  %add16.us = add nsw i32 %k.063.us, %mul15
  %arrayidx17.us = getelementptr inbounds float, ptr %pG, i32 %add16.us
  %2 = load float, ptr %arrayidx17.us, align 4
  %add19.us = add nsw i32 %k.063.us, %mul.us
  %arrayidx20.us = getelementptr inbounds float, ptr %pG, i32 %add19.us
  %3 = load float, ptr %arrayidx20.us, align 4
  %mul21.us = fmul fast float %3, %2
  %sub.us = fsub fast float %1, %mul21.us
  store float %sub.us, ptr %arrayidx7.us, align 4
  %inc.us = add nuw nsw i32 %k.063.us, 1
  %exitcond.not = icmp eq i32 %inc.us, %i.067
  br i1 %exitcond.not, label %for.cond8.for.cond.cleanup10_crit_edge.us, label %for.body11.us

; Middle loop latch for the ".us" variant: advance j and exit once j == n.
for.cond8.for.cond.cleanup10_crit_edge.us:        ; preds = %for.body11.us
  %inc26.us = add nuw nsw i32 %j.065.us, 1
  %exitcond71.not = icmp eq i32 %inc26.us, %n
  br i1 %exitcond71.not, label %for.cond.cleanup3, label %for.body4.us

; Function exit; the return value is always 0.
for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
  ret i32 0

; Outer loop latch: advance i and exit once i == n.
for.cond.cleanup3:                                ; preds = %for.cond8.for.cond.cleanup10_crit_edge.us, %for.body4
  %inc29 = add nuw nsw i32 %i.067, 1
  %exitcond73.not = icmp eq i32 %inc29, %n
  br i1 %exitcond73.not, label %for.cond.cleanup, label %for.body4.lr.ph

; Middle loop for the i == 0 iteration: j counts 0 .. n-1 and copies
; pA[j*n] into pG[j*n] (a stride-n access pattern) with no inner loop.
for.body4:                                        ; preds = %for.body4.lr.ph, %for.body4
  %j.065 = phi i32 [ %inc26, %for.body4 ], [ 0, %for.body4.lr.ph ]
  %mul = mul nsw i32 %j.065, %n
  %arrayidx = getelementptr inbounds float, ptr %pA, i32 %mul
  %4 = load float, ptr %arrayidx, align 4
  %arrayidx7 = getelementptr inbounds float, ptr %pG, i32 %mul
  store float %4, ptr %arrayidx7, align 4
  %inc26 = add nuw nsw i32 %j.065, 1
  %exitcond72.not = icmp eq i32 %inc26, %n
  br i1 %exitcond72.not, label %for.cond.cleanup3, label %for.body4
}

; Enables the MVE target feature for @nested.
attributes #0 = { "target-features"="+mve" }