; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s

; Basic case: the lane mask is computed from %index, which starts at 0 and is
; incremented by 4 each iteration, and the constant 32003. The assertions below
; expect get.active.lane.mask to be replaced by @llvm.arm.mve.vctp32, fed by a
; new phi that starts at 32003 and is decremented by 4 on every iteration.
define dso_local void @foo(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
; CHECK-NEXT:    [[TMP3]] = sub i32 [[TMP1]], 4
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP2]])
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Silly test case: the loop count is constant and a multiple of the vectorisation
; factor. So, the vectoriser should not produce masked loads/stores and there's
; nothing to tail-predicate here, just checking.
define dso_local void @foo2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo2(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[LSR_IV]], align 4
; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[LSR_IV11]], align 4
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD]]
; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[LSR_IV14]], align 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 2000)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ]
  %wide.load = load <4 x i32>, ptr %lsr.iv, align 4
  %wide.load9 = load <4 x i32>, ptr %lsr.iv11, align 4
  %1 = add nsw <4 x i32> %wide.load9, %wide.load
  store <4 x i32> %1, ptr %lsr.iv14, align 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %3 = icmp ne i32 %2, 0
  br i1 %3, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; The mask here is computed with an explicit vector icmp that uses ugt rather
; than ult. The assertions below expect the compare and the masked loads/stores
; to be left exactly as they are, i.e. no vctp is generated.
define dso_local void @foo3(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo3(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i32> [[INDUCTION]], splat (i32 32002)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>

; UGT here:
  %1 = icmp ugt <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>

  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; The mask is an icmp ult, but its right-hand side is the non-splat constant
; vector <0, 3200, 32002, 32002>. The assertions below expect the compare and
; the masked loads/stores to be left as they are.
define dso_local void @foo5(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @foo5(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], <i32 0, i32 3200, i32 32002, i32 32002>
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  %1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; start.loop.iterations is given 8001 while get.active.lane.mask is given
; 4294967295 (printed back as i32 -1 in the expected output). The assertions
; below expect the get.active.lane.mask call to be left in place.
define dso_local void @inconsistent_tripcounts(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @inconsistent_tripcounts(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 -1)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; start.loop.iterations is given 1073741824 while get.active.lane.mask uses
; 32003. The assertions below expect the get.active.lane.mask call to be left
; in place.
; NOTE(review): going by the function name, this presumably exercises an
; overflow check on a subtraction inside the pass - confirm against the pass.
define dso_local void @overflow_in_sub(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @overflow_in_sub(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 32003)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}


; The first operand of get.active.lane.mask is the function argument %N instead
; of the loop's %index phi. The assertions below expect the call to be left in
; place.
define dso_local void @IV_not_an_induction(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @IV_not_an_induction(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[N:%.*]], i32 32003)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
; The index operand passed here is %N, a function argument, not an induction
; variable of this loop:
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
  %index.next = add i32 %index, 4
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; The index operand of get.active.lane.mask advances by 3 per iteration while
; the loads/stores are <4 x i32>. The assertions below expect the call to be
; left in place.
define dso_local void @IV_wrong_step(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @IV_wrong_step(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 32003)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 3
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)

; %index is incremented by 3 and not by 4, which is the vectorisation factor
; that we expect here:
  %index.next = add i32 %index, 3

  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; The index operand of get.active.lane.mask advances by the function argument
; %N per iteration rather than by a constant. The assertions below expect the
; call to be left in place.
define dso_local void @IV_step_not_constant(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @IV_step_not_constant(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 32003)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[N:%.*]]
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
;
entry:
  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
  br label %vector.body

vector.body:
  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)

; %index is incremented by some runtime value, i.e. not a constant:
  %index.next = add i32 %index, %N

  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %4 = icmp ne i32 %3, 0
  br i1 %4, label %vector.body, label %for.cond.cleanup

for.cond.cleanup:
  ret void
}

; Nested loop: the inner loop's get.active.lane.mask takes %j.025, a phi that is
; defined in the outer loop header (vector.ph), as its index operand. The
; assertions below expect the call to be left in place.
define dso_local void @outerloop_phi(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @outerloop_phi(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP24:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP24]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH_PREHEADER:%.*]]
; CHECK:       vector.ph.preheader:
; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[LSR_IV36:%.*]] = phi ptr [ [[B:%.*]], [[VECTOR_PH_PREHEADER]] ], [ [[SCEVGEP37:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
; CHECK-NEXT:    [[LSR_IV31:%.*]] = phi ptr [ [[C:%.*]], [[VECTOR_PH_PREHEADER]] ], [ [[SCEVGEP32:%.*]], [[FOR_COND_CLEANUP3]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[A:%.*]], [[VECTOR_PH_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[FOR_COND_CLEANUP3]] ]
; CHECK-NEXT:    [[J_025:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3]] ], [ 0, [[VECTOR_PH_PREHEADER]] ]
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 1025)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV38:%.*]] = phi ptr [ [[SCEVGEP39:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV36]], [[VECTOR_PH]] ]
; CHECK-NEXT:    [[LSR_IV33:%.*]] = phi ptr [ [[SCEVGEP34:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV31]], [[VECTOR_PH]] ]
; CHECK-NEXT:    [[LSR_IV28:%.*]] = phi ptr [ [[SCEVGEP29:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV]], [[VECTOR_PH]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[J_025]], i32 4096)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV38]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV33]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP1]], ptr [[LSR_IV28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP29]] = getelementptr i32, ptr [[LSR_IV28]], i32 4
; CHECK-NEXT:    [[SCEVGEP34]] = getelementptr i32, ptr [[LSR_IV33]], i32 4
; CHECK-NEXT:    [[SCEVGEP39]] = getelementptr i32, ptr [[LSR_IV38]], i32 4
; CHECK-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP3]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.cond.cleanup3:
; CHECK-NEXT:    [[INC11]] = add nuw i32 [[J_025]], 1
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 1
; CHECK-NEXT:    [[SCEVGEP32]] = getelementptr i32, ptr [[LSR_IV31]], i32 1
; CHECK-NEXT:    [[SCEVGEP37]] = getelementptr i32, ptr [[LSR_IV36]], i32 1
; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i32 [[INC11]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND26]], label [[FOR_COND_CLEANUP]], label [[VECTOR_PH]]
;
entry:
  %cmp24 = icmp eq i32 %N, 0
  br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader

vector.ph.preheader:                              ; preds = %entry
  br label %vector.ph

vector.ph:                                        ; preds = %vector.ph.preheader, %for.cond.cleanup3
  %lsr.iv36 = phi ptr [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ]
  %lsr.iv31 = phi ptr [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
  %lsr.iv = phi ptr [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
  %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
  %start = call i32 @llvm.start.loop.iterations.i32(i32 1025)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv38 = phi ptr [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ]
  %lsr.iv33 = phi ptr [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
  %lsr.iv28 = phi ptr [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ]
; It's using %j.025, the induction variable from its outer loop:
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv38, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv33, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %1, ptr %lsr.iv28, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep29 = getelementptr i32, ptr %lsr.iv28, i32 4
  %scevgep34 = getelementptr i32, ptr %lsr.iv33, i32 4
  %scevgep39 = getelementptr i32, ptr %lsr.iv38, i32 4
  %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %3 = icmp ne i32 %2, 0
  br i1 %3, label %vector.body, label %for.cond.cleanup3

for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
  ret void

for.cond.cleanup3:
; preds = %vector.body 591 %inc11 = add nuw i32 %j.025, 1 592 %scevgep = getelementptr i32, ptr %lsr.iv, i32 1 593 %scevgep32 = getelementptr i32, ptr %lsr.iv31, i32 1 594 %scevgep37 = getelementptr i32, ptr %lsr.iv36, i32 1 595 %exitcond26 = icmp eq i32 %inc11, %N 596 br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph 597} 598 599 600declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #1 601declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2 602declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 ) 603declare i32 @llvm.start.loop.iterations.i32(i32) 604declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) 605