xref: /llvm-project/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll (revision 38fffa630ee80163dc65e759392ad29798905679)
1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
2; RUN: opt -mtriple=thumbv8.1m.main -mve-tail-predication -tail-predication=enabled -mattr=+mve %s -S -o - | FileCheck %s
3
; Positive case: the constant element count 32003 is consistent with the 8001
; hardware-loop iterations (ceil(32003 / 4) == 8001), so the CHECK lines expect
; the get.active.lane.mask below to be rewritten to @llvm.arm.mve.vctp32 fed by
; a decrementing element counter (sub i32 ..., 4).
4define dso_local void @foo(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
5; CHECK-LABEL: @foo(
6; CHECK-NEXT:  entry:
7; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
8; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
9; CHECK:       vector.body:
10; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
11; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
12; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
13; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
14; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 32003, [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
15; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.vctp32(i32 [[TMP1]])
16; CHECK-NEXT:    [[TMP3]] = sub i32 [[TMP1]], 4
17; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
18; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP2]], <4 x i32> undef)
19; CHECK-NEXT:    [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
20; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP2]])
21; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
22; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
23; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
24; CHECK-NEXT:    [[TMP5]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
25; CHECK-NEXT:    [[TMP6:%.*]] = icmp ne i32 [[TMP5]], 0
26; CHECK-NEXT:    br i1 [[TMP6]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
27; CHECK:       for.cond.cleanup:
28; CHECK-NEXT:    ret void
29;
30entry:
31  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
32  br label %vector.body
33
34vector.body:
35  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
36  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
37  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
38  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
39  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
  ; Constant element count (32003 = BTC + 1); per the CHECK lines above this
  ; lane mask is replaced by a vctp32 predicate.
40  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
41  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
42  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
43  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
44  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
45  %index.next = add i32 %index, 4
46  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
47  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
48  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
49  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
50  %4 = icmp ne i32 %3, 0
51  br i1 %4, label %vector.body, label %for.cond.cleanup
52
53for.cond.cleanup:
54  ret void
55}
56
57; Silly test case: the loop count is constant and a multiple of the vectorisation
58; factor. So, the vectoriser should not produce masked loads/stores and there's
59; nothing to tail-predicate here, just checking.
; There is no lane mask and no masked load/store in this loop, so there is
; nothing for the tail-predication pass to rewrite: the CHECK lines expect the
; plain vector load/store and the hardware-loop intrinsics to pass through
; unchanged.
60define dso_local void @foo2(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
61; CHECK-LABEL: @foo2(
62; CHECK-NEXT:  entry:
63; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 2000)
64; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
65; CHECK:       vector.body:
66; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
67; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
68; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
69; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
70; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[LSR_IV]], align 4
71; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[LSR_IV11]], align 4
72; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_LOAD9]], [[WIDE_LOAD]]
73; CHECK-NEXT:    store <4 x i32> [[TMP1]], ptr [[LSR_IV14]], align 4
74; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
75; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
76; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
77; CHECK-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
78; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
79; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
80; CHECK:       for.cond.cleanup:
81; CHECK-NEXT:    ret void
82;
83entry:
84  %start = call i32 @llvm.start.loop.iterations.i32(i32 2000)
85  br label %vector.body
86
87vector.body:
88  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
89  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
90  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
91  %0 = phi i32 [ %start, %entry ], [ %2, %vector.body ]
92  %wide.load = load <4 x i32>, ptr %lsr.iv, align 4
93  %wide.load9 = load <4 x i32>, ptr %lsr.iv11, align 4
94  %1 = add nsw <4 x i32> %wide.load9, %wide.load
95  store <4 x i32> %1, ptr %lsr.iv14, align 4
96  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
97  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
98  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
99  %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
100  %3 = icmp ne i32 %2, 0
101  br i1 %3, label %vector.body, label %for.cond.cleanup
102
103for.cond.cleanup:
104  ret void
105}
106
107; The tail-predication pass only recognises a ULT compare as the tail-fold
; predicate; the compare in this test is deliberately a UGT, so the loop must
; be left unchanged.
; Negative case: the predicate below is built with an UGT compare rather than
; the ULT form; the CHECK lines expect the icmp ugt to be retained and no
; vctp32 to be introduced, i.e. the loop is not tail-predicated.
108define dso_local void @foo3(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
109; CHECK-LABEL: @foo3(
110; CHECK-NEXT:  entry:
111; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
112; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
113; CHECK:       vector.body:
114; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
115; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
116; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
117; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
118; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
119; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
120; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
121; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
122; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt <4 x i32> [[INDUCTION]], splat (i32 32002)
123; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
124; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
125; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
126; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
127; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
128; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
129; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
130; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
131; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
132; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
133; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
134; CHECK:       for.cond.cleanup:
135; CHECK-NEXT:    ret void
136;
137entry:
138  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
139  br label %vector.body
140
141vector.body:
142  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
143  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
144  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
145  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
146  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
147  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
148  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
149  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
150
151; UGT here:
152  %1 = icmp ugt <4 x i32> %induction, <i32 32002, i32 32002, i32 32002, i32 32002>
153
154  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
155  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
156  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
157  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
158  %index.next = add i32 %index, 4
159  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
160  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
161  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
162  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
163  %4 = icmp ne i32 %3, 0
164  br i1 %4, label %vector.body, label %for.cond.cleanup
165
166for.cond.cleanup:
167  ret void
168}
169
; Negative case: the ULT compare is against the non-uniform constant vector
; <0, 3200, 32002, 32002> rather than a splat of the back-edge taken count, so
; the CHECK lines expect the icmp to be kept and the loop left untouched.
170define dso_local void @foo5(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
171; CHECK-LABEL: @foo5(
172; CHECK-NEXT:  entry:
173; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
174; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
175; CHECK:       vector.body:
176; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
177; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
178; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
179; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
180; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
181; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
182; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
183; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
184; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <4 x i32> [[INDUCTION]], <i32 0, i32 3200, i32 32002, i32 32002>
185; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
186; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
187; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
188; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
189; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
190; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
191; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
192; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
193; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
194; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
195; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
196; CHECK:       for.cond.cleanup:
197; CHECK-NEXT:    ret void
198;
199entry:
200  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
201  br label %vector.body
202
203vector.body:
204  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
205  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
206  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
207  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
208  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
209  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
210  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
211  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
  ; Non-splat RHS: not a recognisable tail-fold predicate pattern.
212  %1 = icmp ult <4 x i32> %induction, <i32 0, i32 3200, i32 32002, i32 32002>
213  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
214  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
215  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
216  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
217  %index.next = add i32 %index, 4
218  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
219  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
220  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
221  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
222  %4 = icmp ne i32 %3, 0
223  br i1 %4, label %vector.body, label %for.cond.cleanup
224
225for.cond.cleanup:
226  ret void
227}
228
229;
; Negative case: the lane-mask element count is UINT_MAX, which is inconsistent
; with the 8001-iteration hardware loop (and the scalar trip count BTC + 1
; would overflow i32); the CHECK lines expect get.active.lane.mask to be
; retained (printed as -1) and no vctp32 to be introduced.
230define dso_local void @inconsistent_tripcounts(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
231; CHECK-LABEL: @inconsistent_tripcounts(
232; CHECK-NEXT:  entry:
233; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
234; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
235; CHECK:       vector.body:
236; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
237; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
238; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
239; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
240; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
241; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 -1)
242; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
243; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
244; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
245; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
246; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
247; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
248; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
249; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
250; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
251; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
252; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
253; CHECK:       for.cond.cleanup:
254; CHECK-NEXT:    ret void
255;
256entry:
257  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
258  br label %vector.body
259
260vector.body:
261  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
262  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
263  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
264  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
265  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
266; BTC = UINT_MAX, and scalar trip count BTC + 1 would overflow:
267  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967295)
268  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
269  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
270  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
271  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
272  %index.next = add i32 %index, 4
273  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
274  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
275  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
276  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
277  %4 = icmp ne i32 %3, 0
278  br i1 %4, label %vector.body, label %for.cond.cleanup
279
280for.cond.cleanup:
281  ret void
282}
283
284;
; Negative case: 1073741824 (2^30) hardware-loop iterations with VF 4 means the
; element-count computation would overflow i32 (see the function name), so the
; CHECK lines expect the lane mask to be retained and no vctp32 introduced.
285define dso_local void @overflow_in_sub(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
286; CHECK-LABEL: @overflow_in_sub(
287; CHECK-NEXT:  entry:
288; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
289; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
290; CHECK:       vector.body:
291; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
292; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
293; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
294; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
295; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
296; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 32003)
297; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
298; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
299; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
300; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
301; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
302; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
303; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
304; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
305; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
306; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
307; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
308; CHECK:       for.cond.cleanup:
309; CHECK-NEXT:    ret void
310;
311entry:
312  %start = call i32 @llvm.start.loop.iterations.i32(i32 1073741824)
313  br label %vector.body
314
315vector.body:
316  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
317  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
318  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
319  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
320  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
321  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
322  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
323  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
324  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
325  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
326  %index.next = add i32 %index, 4
327  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
328  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
329  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
330  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
331  %4 = icmp ne i32 %3, 0
332  br i1 %4, label %vector.body, label %for.cond.cleanup
333
334for.cond.cleanup:
335  ret void
336}
337
338
339;
; Negative case: the first operand of get.active.lane.mask is the function
; argument %N rather than the loop induction variable %index, so the CHECK
; lines expect the lane mask to be retained and no vctp32 introduced.
340define dso_local void @IV_not_an_induction(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
341; CHECK-LABEL: @IV_not_an_induction(
342; CHECK-NEXT:  entry:
343; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
344; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
345; CHECK:       vector.body:
346; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
347; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
348; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
349; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
350; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
351; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[N:%.*]], i32 32003)
352; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
353; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
354; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
355; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
356; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
357; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
358; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
359; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
360; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
361; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
362; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
363; CHECK:       for.cond.cleanup:
364; CHECK-NEXT:    ret void
365;
366entry:
367  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
368  br label %vector.body
369
370vector.body:
371  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
372  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
373  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
374  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
375  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
376; The induction variable %N is not an IV:
377  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %N, i32 32003)
378  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
379  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
380  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
381  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
382  %index.next = add i32 %index, 4
383  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
384  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
385  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
386  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
387  %4 = icmp ne i32 %3, 0
388  br i1 %4, label %vector.body, label %for.cond.cleanup
389
390for.cond.cleanup:
391  ret void
392}
393
394;
; Negative case: the induction variable steps by 3 instead of the expected
; vectorisation factor of 4, so the CHECK lines expect the lane mask to be
; retained and no vctp32 introduced.
395define dso_local void @IV_wrong_step(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
396; CHECK-LABEL: @IV_wrong_step(
397; CHECK-NEXT:  entry:
398; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
399; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
400; CHECK:       vector.body:
401; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
402; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
403; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
404; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
405; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
406; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 32003)
407; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
408; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
409; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
410; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
411; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 3
412; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
413; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
414; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
415; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
416; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
417; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
418; CHECK:       for.cond.cleanup:
419; CHECK-NEXT:    ret void
420;
421entry:
422  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
423  br label %vector.body
424
425vector.body:
426  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
427  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
428  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
429  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
430  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
431  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
432  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
433  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
434  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
435  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
436
437; %index is incremented with 3 and not 4, which is the vectorisation factor
438; that we expect here:
439  %index.next = add i32 %index, 3
440
441  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
442  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
443  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
444  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
445  %4 = icmp ne i32 %3, 0
446  br i1 %4, label %vector.body, label %for.cond.cleanup
447
448for.cond.cleanup:
449  ret void
450}
451
452;
; Negative case: the induction variable steps by the runtime value %N rather
; than a constant equal to the vectorisation factor, so the CHECK lines expect
; the lane mask to be retained and no vctp32 introduced.
453define dso_local void @IV_step_not_constant(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, ptr noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
454; CHECK-LABEL: @IV_step_not_constant(
455; CHECK-NEXT:  entry:
456; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 8001)
457; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
458; CHECK:       vector.body:
459; CHECK-NEXT:    [[LSR_IV14:%.*]] = phi ptr [ [[SCEVGEP15:%.*]], [[VECTOR_BODY]] ], [ [[A:%.*]], [[ENTRY:%.*]] ]
460; CHECK-NEXT:    [[LSR_IV11:%.*]] = phi ptr [ [[SCEVGEP12:%.*]], [[VECTOR_BODY]] ], [ [[C:%.*]], [[ENTRY]] ]
461; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], [[VECTOR_BODY]] ], [ [[B:%.*]], [[ENTRY]] ]
462; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
463; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[ENTRY]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
464; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[INDEX]], i32 32003)
465; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
466; CHECK-NEXT:    [[WIDE_MASKED_LOAD9:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV11]], i32 4, <4 x i1> [[TMP1]], <4 x i32> undef)
467; CHECK-NEXT:    [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD]]
468; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP2]], ptr [[LSR_IV14]], i32 4, <4 x i1> [[TMP1]])
469; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], [[N:%.*]]
470; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 4
471; CHECK-NEXT:    [[SCEVGEP12]] = getelementptr i32, ptr [[LSR_IV11]], i32 4
472; CHECK-NEXT:    [[SCEVGEP15]] = getelementptr i32, ptr [[LSR_IV14]], i32 4
473; CHECK-NEXT:    [[TMP3]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
474; CHECK-NEXT:    [[TMP4:%.*]] = icmp ne i32 [[TMP3]], 0
475; CHECK-NEXT:    br i1 [[TMP4]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]]
476; CHECK:       for.cond.cleanup:
477; CHECK-NEXT:    ret void
478;
479entry:
480  %start = call i32 @llvm.start.loop.iterations.i32(i32 8001)
481  br label %vector.body
482
483vector.body:
484  %lsr.iv14 = phi ptr [ %scevgep15, %vector.body ], [ %A, %entry ]
485  %lsr.iv11 = phi ptr [ %scevgep12, %vector.body ], [ %C, %entry ]
486  %lsr.iv = phi ptr [ %scevgep, %vector.body ], [ %B, %entry ]
487  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
488  %0 = phi i32 [ %start, %entry ], [ %3, %vector.body ]
489  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32003)
490  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv, i32 4, <4 x i1> %1, <4 x i32> undef)
491  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv11, i32 4, <4 x i1> %1, <4 x i32> undef)
492  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
493  call void @llvm.masked.store.v4i32.p0(<4 x i32> %2, ptr %lsr.iv14, i32 4, <4 x i1> %1)
494
495; %index is incremented with some runtime value, i.e. not a constant:
496  %index.next = add i32 %index, %N
497
498  %scevgep = getelementptr i32, ptr %lsr.iv, i32 4
499  %scevgep12 = getelementptr i32, ptr %lsr.iv11, i32 4
500  %scevgep15 = getelementptr i32, ptr %lsr.iv14, i32 4
501  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
502  %4 = icmp ne i32 %3, 0
503  br i1 %4, label %vector.body, label %for.cond.cleanup
504
505for.cond.cleanup:
506  ret void
507}
508
509;
define dso_local void @outerloop_phi(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
; CHECK-LABEL: @outerloop_phi(
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CMP24:%.*]] = icmp eq i32 [[N:%.*]], 0
; CHECK-NEXT:    br i1 [[CMP24]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_PH_PREHEADER:%.*]]
; CHECK:       vector.ph.preheader:
; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[LSR_IV36:%.*]] = phi ptr [ [[B:%.*]], [[VECTOR_PH_PREHEADER]] ], [ [[SCEVGEP37:%.*]], [[FOR_COND_CLEANUP3:%.*]] ]
; CHECK-NEXT:    [[LSR_IV31:%.*]] = phi ptr [ [[C:%.*]], [[VECTOR_PH_PREHEADER]] ], [ [[SCEVGEP32:%.*]], [[FOR_COND_CLEANUP3]] ]
; CHECK-NEXT:    [[LSR_IV:%.*]] = phi ptr [ [[A:%.*]], [[VECTOR_PH_PREHEADER]] ], [ [[SCEVGEP:%.*]], [[FOR_COND_CLEANUP3]] ]
; CHECK-NEXT:    [[J_025:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3]] ], [ 0, [[VECTOR_PH_PREHEADER]] ]
; CHECK-NEXT:    [[START:%.*]] = call i32 @llvm.start.loop.iterations.i32(i32 1025)
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[LSR_IV38:%.*]] = phi ptr [ [[SCEVGEP39:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV36]], [[VECTOR_PH]] ]
; CHECK-NEXT:    [[LSR_IV33:%.*]] = phi ptr [ [[SCEVGEP34:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV31]], [[VECTOR_PH]] ]
; CHECK-NEXT:    [[LSR_IV28:%.*]] = phi ptr [ [[SCEVGEP29:%.*]], [[VECTOR_BODY]] ], [ [[LSR_IV]], [[VECTOR_PH]] ]
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[START]], [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[J_025]], i32 4096)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV38]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[LSR_IV33]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> undef)
; CHECK-NEXT:    [[TMP1:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_LOAD27]], [[WIDE_MASKED_LOAD]]
; CHECK-NEXT:    call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP1]], ptr [[LSR_IV28]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT:    [[SCEVGEP29]] = getelementptr i32, ptr [[LSR_IV28]], i32 4
; CHECK-NEXT:    [[SCEVGEP34]] = getelementptr i32, ptr [[LSR_IV33]], i32 4
; CHECK-NEXT:    [[SCEVGEP39]] = getelementptr i32, ptr [[LSR_IV38]], i32 4
; CHECK-NEXT:    [[TMP2]] = call i32 @llvm.loop.decrement.reg.i32(i32 [[TMP0]], i32 1)
; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP2]], 0
; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP3]]
; CHECK:       for.cond.cleanup:
; CHECK-NEXT:    ret void
; CHECK:       for.cond.cleanup3:
; CHECK-NEXT:    [[INC11]] = add nuw i32 [[J_025]], 1
; CHECK-NEXT:    [[SCEVGEP]] = getelementptr i32, ptr [[LSR_IV]], i32 1
; CHECK-NEXT:    [[SCEVGEP32]] = getelementptr i32, ptr [[LSR_IV31]], i32 1
; CHECK-NEXT:    [[SCEVGEP37]] = getelementptr i32, ptr [[LSR_IV36]], i32 1
; CHECK-NEXT:    [[EXITCOND26:%.*]] = icmp eq i32 [[INC11]], [[N]]
; CHECK-NEXT:    br i1 [[EXITCOND26]], label [[FOR_COND_CLEANUP]], label [[VECTOR_PH]]
;
; Negative test for tail-predication: a loop nest where the inner hardware
; loop (vector.body, trip count set up by llvm.start.loop.iterations with
; i32 1025) sits inside an outer loop (vector.ph -> for.cond.cleanup3) that
; runs %N times. The CHECK lines above still expect the
; llvm.get.active.lane.mask call in vector.body, i.e. the pass must leave
; the predicate untouched here.
entry:
  %cmp24 = icmp eq i32 %N, 0
  br i1 %cmp24, label %for.cond.cleanup, label %vector.ph.preheader

vector.ph.preheader:                              ; preds = %entry
  br label %vector.ph

vector.ph:                                        ; preds = %vector.ph.preheader, %for.cond.cleanup3
  %lsr.iv36 = phi ptr [ %B, %vector.ph.preheader ], [ %scevgep37, %for.cond.cleanup3 ]
  %lsr.iv31 = phi ptr [ %C, %vector.ph.preheader ], [ %scevgep32, %for.cond.cleanup3 ]
  %lsr.iv = phi ptr [ %A, %vector.ph.preheader ], [ %scevgep, %for.cond.cleanup3 ]
  %j.025 = phi i32 [ %inc11, %for.cond.cleanup3 ], [ 0, %vector.ph.preheader ]
  %start = call i32 @llvm.start.loop.iterations.i32(i32 1025)
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %lsr.iv38 = phi ptr [ %scevgep39, %vector.body ], [ %lsr.iv36, %vector.ph ]
  %lsr.iv33 = phi ptr [ %scevgep34, %vector.body ], [ %lsr.iv31, %vector.ph ]
  %lsr.iv28 = phi ptr [ %scevgep29, %vector.body ], [ %lsr.iv, %vector.ph ]
  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = phi i32 [ %start, %vector.ph ], [ %2, %vector.body ]
; It's using %j.025, the induction variable from its outer loop:
; (not %index, the inner loop's induction variable, so the lane mask does
; not track the inner loop's element count).
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %j.025, i32 4096)
  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv38, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %wide.masked.load27 = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr %lsr.iv33, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %1 = add nsw <4 x i32> %wide.masked.load27, %wide.masked.load
  call void @llvm.masked.store.v4i32.p0(<4 x i32> %1, ptr %lsr.iv28, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i32 %index, 4
  %scevgep29 = getelementptr i32, ptr %lsr.iv28, i32 4
  %scevgep34 = getelementptr i32, ptr %lsr.iv33, i32 4
  %scevgep39 = getelementptr i32, ptr %lsr.iv38, i32 4
; Inner hardware-loop counter: decrement and branch back while non-zero.
  %2 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
  %3 = icmp ne i32 %2, 0
  br i1 %3, label %vector.body, label %for.cond.cleanup3

for.cond.cleanup:                                 ; preds = %for.cond.cleanup3, %entry
  ret void

; Outer-loop latch: advance %j.025 and the pointer IVs, exit after %N trips.
for.cond.cleanup3:                                ; preds = %vector.body
  %inc11 = add nuw i32 %j.025, 1
  %scevgep = getelementptr i32, ptr %lsr.iv, i32 1
  %scevgep32 = getelementptr i32, ptr %lsr.iv31, i32 1
  %scevgep37 = getelementptr i32, ptr %lsr.iv36, i32 1
  %exitcond26 = icmp eq i32 %inc11, %N
  br i1 %exitcond26, label %for.cond.cleanup, label %vector.ph
}
598
599
; Declarations of the intrinsics used by the test functions above; signatures
; must match the calls exactly (masked load/store, hardware-loop counter
; setup/decrement, and the active-lane-mask predicate generator).
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr, i32 immarg, <4 x i1>, <4 x i32>) #1
declare void @llvm.masked.store.v4i32.p0(<4 x i32>, ptr, i32 immarg, <4 x i1>) #2
declare i32 @llvm.loop.decrement.reg.i32(i32 , i32 )
declare i32 @llvm.start.loop.iterations.i32(i32)
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32)
605