xref: /llvm-project/llvm/test/Transforms/LoopVectorize/runtime-checks-hoist.ll (revision 29441e4f5fa5f5c7709f7cf180815ba97f611297)
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
; REQUIRES: asserts
; RUN: opt < %s -p 'loop-vectorize' -force-vector-interleave=1 -S \
; RUN:   -force-vector-width=4 -debug-only=loop-accesses,loop-vectorize,loop-utils 2> %t | FileCheck %s
; RUN: cat %t | FileCheck %s --check-prefix=DEBUG

; Module data layout (little-endian, 64-bit pointers); required so SCEV/LAA
; cost and pointer arithmetic in this test behave deterministically.
target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"

; Equivalent example in C:
; void diff_checks(int32_t *dst, int32_t *src, int m, int n) {
;   for (int i = 0; i < m; i++) {
;     for (int j = 0; j < n; j++) {
;       dst[(i * (n + 1)) + j] = src[(i * n) + j];
;     }
;   }
; }
; NOTE: The strides of the starting address values in the inner loop differ, i.e.
; '(i * (n + 1))' vs '(i * n)'.

; DEBUG-LABEL: 'diff_checks'
; DEBUG:      LAA: Found an analyzable loop: inner.loop
; DEBUG:      LAA: Not creating diff runtime check, since these  cannot be hoisted out of the outer loop
; DEBUG:      LAA: Adding RT check for range:
; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
; DEBUG-NEXT: LAA: ... but need to check stride is positive: (4 * (sext i32 (1 + %n)<nuw> to i64))<nsw>
; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %n to i64))<nuw><nsw> + (4 * (sext i32 (1 + %n)<nuw> to i64) * (-1 + (zext i32 %m to i64))<nsw>) + %dst)
; DEBUG-NEXT: LAA: Adding RT check for range:
; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %src)

define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) {
; CHECK-LABEL: define void @diff_checks
; CHECK-SAME: (ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[ADD5:%.*]] = add nuw i32 [[N]], 1
; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[ADD5]] to i64
; CHECK-NEXT:    [[WIDE_M:%.*]] = zext i32 [[M]] to i64
; CHECK-NEXT:    [[WIDE_N:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[TMP2:%.*]] = add nsw i64 [[WIDE_M]], -1
; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP1]]
; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 2
; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i64 [[WIDE_N]], 2
; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[TMP4]], [[TMP5]]
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP7:%.*]] = shl nsw i64 [[TMP1]], 2
; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[WIDE_N]], [[WIDE_M]]
; CHECK-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP8]], 2
; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP9]]
; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
; CHECK:       outer.loop:
; CHECK-NEXT:    [[IV_OUTER:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_OUTER_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i64 [[IV_OUTER]], [[TMP0]]
; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw i64 [[IV_OUTER]], [[TMP1]]
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    [[STRIDE_CHECK:%.*]] = icmp slt i64 [[TMP7]], 0
; CHECK-NEXT:    [[TMP12:%.*]] = or i1 [[FOUND_CONFLICT]], [[STRIDE_CHECK]]
; CHECK-NEXT:    br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], [[TMP10]]
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP14]]
; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope [[META0:![0-9]+]]
; CHECK-NEXT:    [[TMP17:%.*]] = add nsw i64 [[TMP13]], [[TMP11]]
; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP17]]
; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0
; CHECK-NEXT:    store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
; CHECK:       inner.loop:
; CHECK-NEXT:    [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
; CHECK-NEXT:    [[TMP21:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP10]]
; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP21]]
; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
; CHECK-NEXT:    [[TMP23:%.*]] = add nsw i64 [[IV_INNER]], [[TMP11]]
; CHECK-NEXT:    [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP23]]
; CHECK-NEXT:    store i32 [[TMP22]], ptr [[ARRAYIDX9_US]], align 4
; CHECK-NEXT:    [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT:    [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
; CHECK-NEXT:    br i1 [[INNER_EXIT_COND]], label [[INNER_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK:       inner.exit:
; CHECK-NEXT:    [[IV_OUTER_NEXT]] = add nuw nsw i64 [[IV_OUTER]], 1
; CHECK-NEXT:    [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[IV_OUTER_NEXT]], [[WIDE_M]]
; CHECK-NEXT:    br i1 [[OUTER_EXIT_COND]], label [[OUTER_EXIT:%.*]], label [[OUTER_LOOP]]
; CHECK:       outer.exit:
; CHECK-NEXT:    ret void
;
; Hand-written input IR below; the CHECK lines above are autogenerated
; (see the update_test_checks.py NOTE at the top of the file).
entry:
  %add5 = add nuw i32 %n, 1
  %0 = zext i32 %n to i64
  %1 = sext i32 %add5 to i64
  %wide.m = zext i32 %m to i64
  %wide.n = zext i32 %n to i64
  br label %outer.loop

outer.loop:
  %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %inner.exit ]
  %2 = mul nsw i64 %iv.outer, %0
  %3 = mul nsw i64 %iv.outer, %1
  br label %inner.loop

inner.loop:
  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
  %4 = add nuw nsw i64 %iv.inner, %2
  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %4
  %5 = load i32, ptr %arrayidx.us, align 4
  %6 = add nsw i64 %iv.inner, %3
  %arrayidx9.us = getelementptr inbounds i32, ptr %dst, i64 %6
  store i32 %5, ptr %arrayidx9.us, align 4
  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
  %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n
  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop

inner.exit:
  %iv.outer.next = add nuw nsw i64 %iv.outer, 1
  %outer.exit.cond = icmp eq i64 %iv.outer.next, %wide.m
  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop

outer.exit:
  ret void
}


; Equivalent example in C:
; void full_checks(int32_t *dst, int32_t *src, int m, int n) {
;   for (int i = 0; i < m; i++) {
;     for (int j = 0; j < n; j++) {
;       dst[(i * n) + j] += src[(i * n) + j];
;     }
;   }
; }
; We decide to do full runtime checks here (as opposed to diff checks) due to
; the additional load of 'dst[(i * n) + j]' in the loop.

; DEBUG-LABEL: 'full_checks'
; DEBUG: LAA: Found an analyzable loop: inner.loop
; DEBUG-NOT: LAA: Creating diff runtime check for:
; DEBUG: LAA: Adding RT check for range:
; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %dst)
; DEBUG-NEXT: LAA: Adding RT check for range:
; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %src)

define void @full_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) {
; CHECK-LABEL: define void @full_checks
; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[WIDE_M:%.*]] = zext i32 [[M]] to i64
; CHECK-NEXT:    [[WIDE_N:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[WIDE_N]], [[WIDE_M]]
; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 2
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]]
; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
; CHECK:       outer.loop:
; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP0]]
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], [[TMP3]]
; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META9:![0-9]+]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP5]]
; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META12:![0-9]+]], !noalias [[META9]]
; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
; CHECK-NEXT:    store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, !alias.scope [[META12]], !noalias [[META9]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
; CHECK:       inner.loop:
; CHECK-NEXT:    [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP3]]
; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP12]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
; CHECK-NEXT:    [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP12]]
; CHECK-NEXT:    [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4
; CHECK-NEXT:    [[ADD9_US:%.*]] = add nsw i32 [[TMP14]], [[TMP13]]
; CHECK-NEXT:    store i32 [[ADD9_US]], ptr [[ARRAYIDX8_US]], align 4
; CHECK-NEXT:    [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT:    [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
; CHECK-NEXT:    br i1 [[INNER_EXIT_COND]], label [[INNER_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
; CHECK:       inner.exit:
; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1
; CHECK-NEXT:    [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_M]]
; CHECK-NEXT:    br i1 [[OUTER_EXIT_COND]], label [[OUTER_EXIT:%.*]], label [[OUTER_LOOP]]
; CHECK:       outer.exit:
; CHECK-NEXT:    ret void
;
; Hand-written input IR below; the CHECK lines above are autogenerated
; (see the update_test_checks.py NOTE at the top of the file).
entry:
  %0 = zext i32 %n to i64
  %wide.m = zext i32 %m to i64
  %wide.n = zext i32 %n to i64
  br label %outer.loop

outer.loop:
  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
  %1 = mul nsw i64 %outer.iv, %0
  br label %inner.loop

inner.loop:
  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
  %2 = add nuw nsw i64 %iv.inner, %1
  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %2
  %3 = load i32, ptr %arrayidx.us, align 4
  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %2
  %4 = load i32, ptr %arrayidx8.us, align 4
  %add9.us = add nsw i32 %4, %3
  store i32 %add9.us, ptr %arrayidx8.us, align 4
  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
  %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n
  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop

inner.exit:
  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
  %outer.exit.cond = icmp eq i64 %outer.iv.next, %wide.m
  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop

outer.exit:
  ret void
}


; Equivalent example in C:
; void full_checks_diff_strides(int32_t *dst, int32_t *src, int m, int n) {
;   for (int i = 0; i < m; i++) {
;     for (int j = 0; j < n; j++) {
;       dst[(i * (n + 1)) + j] += src[(i * n) + j];
;     }
;   }
; }
; We decide to do full runtime checks here (as opposed to diff checks) due to
; the additional load of 'dst[(i * n) + j]' in the loop.
; NOTE: This is different to the test above (@full_checks) because the dst array
; is accessed with a higher stride compared to src, and therefore the inner loop
; runtime checks will vary for each outer loop iteration.

; DEBUG-LABEL: 'full_checks_diff_strides'
; DEBUG: LAA: Found an analyzable loop: inner.loop
; DEBUG-NOT: LAA: Creating diff runtime check for:
; DEBUG: LAA: Adding RT check for range:
; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %n to i64))<nuw><nsw> + ((4 + (4 * (zext i32 %n to i64))<nuw><nsw>)<nuw><nsw> * (-1 + (zext i32 %m to i64))<nsw>) + %dst)
; DEBUG-NEXT: LAA: Adding RT check for range:
; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %src)

define void @full_checks_diff_strides(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) {
; CHECK-LABEL: define void @full_checks_diff_strides
; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[WIDE_M:%.*]] = zext i32 [[M]] to i64
; CHECK-NEXT:    [[WIDE_N:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[WIDE_M]], -1
; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[WIDE_N]], 2
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 4
; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP1]]
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]]
; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[WIDE_N]], [[WIDE_M]]
; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
; CHECK:       outer.loop:
; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
; CHECK-NEXT:    [[NPLUS1:%.*]] = add nuw nsw i32 [[N]], 1
; CHECK-NEXT:    [[WIDE_NPLUS1:%.*]] = zext i32 [[NPLUS1]] to i64
; CHECK-NEXT:    [[TMP7:%.*]] = mul nsw i64 [[OUTER_IV]], [[WIDE_N]]
; CHECK-NEXT:    [[TMP8:%.*]] = mul nsw i64 [[OUTER_IV]], [[WIDE_NPLUS1]]
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], [[TMP7]]
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP10]]
; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4, !alias.scope [[META16:![0-9]+]]
; CHECK-NEXT:    [[TMP13:%.*]] = add nuw nsw i64 [[TMP9]], [[TMP8]]
; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP13]]
; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META16]]
; CHECK-NEXT:    [[TMP16:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
; CHECK-NEXT:    store <4 x i32> [[TMP16]], ptr [[TMP15]], align 4, !alias.scope [[META19]], !noalias [[META16]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
; CHECK:       inner.loop:
; CHECK-NEXT:    [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
; CHECK-NEXT:    [[TMP18:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP7]]
; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]]
; CHECK-NEXT:    [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
; CHECK-NEXT:    [[TMP20:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP8]]
; CHECK-NEXT:    [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP20]]
; CHECK-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4
; CHECK-NEXT:    [[ADD9_US:%.*]] = add nsw i32 [[TMP21]], [[TMP19]]
; CHECK-NEXT:    store i32 [[ADD9_US]], ptr [[ARRAYIDX8_US]], align 4
; CHECK-NEXT:    [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT:    [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
; CHECK-NEXT:    br i1 [[INNER_EXIT_COND]], label [[INNER_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK:       inner.exit:
; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1
; CHECK-NEXT:    [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_M]]
; CHECK-NEXT:    br i1 [[OUTER_EXIT_COND]], label [[OUTER_EXIT:%.*]], label [[OUTER_LOOP]]
; CHECK:       outer.exit:
; CHECK-NEXT:    ret void
;
; Hand-written input IR below; the CHECK lines above are autogenerated
; (see the update_test_checks.py NOTE at the top of the file).
entry:
  %wide.m = zext i32 %m to i64
  %wide.n = zext i32 %n to i64
  br label %outer.loop

outer.loop:
  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
  %nplus1 = add nuw nsw i32 %n, 1
  %wide.nplus1 = zext i32 %nplus1 to i64
  %0 = mul nsw i64 %outer.iv, %wide.n
  %1 = mul nsw i64 %outer.iv, %wide.nplus1
  br label %inner.loop

inner.loop:
  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
  %2 = add nuw nsw i64 %iv.inner, %0
  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %2
  %3 = load i32, ptr %arrayidx.us, align 4
  %4 = add nuw nsw i64 %iv.inner, %1
  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %4
  %5 = load i32, ptr %arrayidx8.us, align 4
  %add9.us = add nsw i32 %5, %3
  store i32 %add9.us, ptr %arrayidx8.us, align 4
  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
  %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n
  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop

inner.exit:
  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
  %outer.exit.cond = icmp eq i64 %outer.iv.next, %wide.m
  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop

outer.exit:
  ret void
}


; Equivalent example in C:
; void diff_checks_src_start_invariant(int32_t *dst, int32_t *src, int m, int n) {
;   for (int i = 0; i < m; i++) {
;     for (int j = 0; j < n; j++) {
;       dst[(i * n) + j] = src[j];
;     }
;   }
; }

; DEBUG-LABEL: 'diff_checks_src_start_invariant'
; DEBUG: LAA: Found an analyzable loop: inner.loop
; DEBUG-NOT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting

define void @diff_checks_src_start_invariant(ptr nocapture noundef writeonly %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) {
; CHECK-LABEL: define void @diff_checks_src_start_invariant
; CHECK-SAME: (ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[WIDE_M:%.*]] = zext i32 [[M]] to i64
; CHECK-NEXT:    [[WIDE_N:%.*]] = zext i32 [[N]] to i64
; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[DST1]], [[SRC2]]
; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[WIDE_N]], 2
; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
; CHECK:       outer.loop:
; CHECK-NEXT:    [[IV_OUTER:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_OUTER_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ]
; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], [[IV_OUTER]]
; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]]
; CHECK-NEXT:    [[TMP5:%.*]] = mul nsw i64 [[IV_OUTER]], [[TMP0]]
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 16
; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP6]]
; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4
; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], [[TMP5]]
; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP9]]
; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0
; CHECK-NEXT:    store <4 x i32> [[WIDE_LOAD]], ptr [[TMP11]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
; CHECK:       inner.loop:
; CHECK-NEXT:    [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV_INNER]]
; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP5]]
; CHECK-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP14]]
; CHECK-NEXT:    store i32 [[TMP13]], ptr [[ARRAYIDX6_US]], align 4
; CHECK-NEXT:    [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT:    [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
; CHECK-NEXT:    br i1 [[INNER_EXIT_COND]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK:       inner.loop.exit:
; CHECK-NEXT:    [[IV_OUTER_NEXT]] = add nuw nsw i64 [[IV_OUTER]], 1
; CHECK-NEXT:    [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[IV_OUTER_NEXT]], [[WIDE_M]]
; CHECK-NEXT:    br i1 [[OUTER_EXIT_COND]], label [[OUTER_LOOP_EXIT:%.*]], label [[OUTER_LOOP]]
; CHECK:       outer.loop.exit:
; CHECK-NEXT:    ret void
;
; Hand-written input IR below; the CHECK lines above are autogenerated
; (see the update_test_checks.py NOTE at the top of the file).
entry:
  %0 = zext i32 %n to i64
  %wide.m = zext i32 %m to i64
  %wide.n = zext i32 %n to i64
  br label %outer.loop

outer.loop:
  %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %inner.loop.exit ]
  %1 = mul nsw i64 %iv.outer, %0
  br label %inner.loop

inner.loop:
  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %iv.inner
  %2 = load i32, ptr %arrayidx.us, align 4
  %3 = add nuw nsw i64 %iv.inner, %1
  %arrayidx6.us = getelementptr inbounds i32, ptr %dst, i64 %3
  store i32 %2, ptr %arrayidx6.us, align 4
  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
  %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n
  br i1 %inner.exit.cond, label %inner.loop.exit, label %inner.loop

inner.loop.exit:
  %iv.outer.next = add nuw nsw i64 %iv.outer, 1
  %outer.exit.cond = icmp eq i64 %iv.outer.next, %wide.m
  br i1 %outer.exit.cond, label %outer.loop.exit, label %outer.loop

outer.loop.exit:
  ret void
}


; Equivalent example in C:
; void full_checks_src_start_invariant(int32_t *dst, int32_t *src, int m, int n) {
;   for (int i = 0; i < m; i++) {
;     for (int j = 0; j < n; j++) {
;       dst[(i * n) + j] += src[j];
;     }
;   }
; }

; DEBUG-LABEL: 'full_checks_src_start_invariant'
; DEBUG:      LAA: Found an analyzable loop: inner.loop
; DEBUG:      LAA: Adding RT check for range:
; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %dst)
; DEBUG-NEXT: LAA: Adding RT check for range:
; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %n to i64))<nuw><nsw> + %src)

523define void @full_checks_src_start_invariant(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) {
524; CHECK-LABEL: define void @full_checks_src_start_invariant
525; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) {
526; CHECK-NEXT:  entry:
527; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
528; CHECK-NEXT:    [[WIDE_M:%.*]] = zext i32 [[M]] to i64
529; CHECK-NEXT:    [[WIDE_N:%.*]] = zext i32 [[N]] to i64
530; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[WIDE_N]], [[WIDE_M]]
531; CHECK-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP1]], 2
532; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]]
533; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i64 [[WIDE_N]], 2
534; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]]
535; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
536; CHECK:       outer.loop:
537; CHECK-NEXT:    [[IV_OUTER:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_OUTER_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ]
538; CHECK-NEXT:    [[TMP4:%.*]] = mul nsw i64 [[IV_OUTER]], [[TMP0]]
539; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4
540; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
541; CHECK:       vector.memcheck:
542; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
543; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
544; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
545; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
546; CHECK:       vector.ph:
547; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4
548; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]]
549; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
550; CHECK:       vector.body:
551; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
552; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[INDEX]], 0
553; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP5]]
554; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
555; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META25:![0-9]+]]
556; CHECK-NEXT:    [[TMP8:%.*]] = add nuw nsw i64 [[TMP5]], [[TMP4]]
557; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP8]]
558; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0
559; CHECK-NEXT:    [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META28:![0-9]+]], !noalias [[META25]]
560; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]]
561; CHECK-NEXT:    store <4 x i32> [[TMP11]], ptr [[TMP10]], align 4, !alias.scope [[META28]], !noalias [[META25]]
562; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
563; CHECK-NEXT:    [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
564; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
565; CHECK:       middle.block:
566; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]]
567; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
568; CHECK:       scalar.ph:
569; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
570; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
571; CHECK:       inner.loop:
572; CHECK-NEXT:    [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
573; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV_INNER]]
574; CHECK-NEXT:    [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
575; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP4]]
576; CHECK-NEXT:    [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP14]]
577; CHECK-NEXT:    [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX6_US]], align 4
578; CHECK-NEXT:    [[ADD7_US:%.*]] = add nsw i32 [[TMP15]], [[TMP13]]
579; CHECK-NEXT:    store i32 [[ADD7_US]], ptr [[ARRAYIDX6_US]], align 4
580; CHECK-NEXT:    [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
581; CHECK-NEXT:    [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]]
582; CHECK-NEXT:    br i1 [[INNER_EXIT_COND]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP31:![0-9]+]]
583; CHECK:       inner.loop.exit:
584; CHECK-NEXT:    [[IV_OUTER_NEXT]] = add nuw nsw i64 [[IV_OUTER]], 1
585; CHECK-NEXT:    [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[IV_OUTER_NEXT]], [[WIDE_M]]
586; CHECK-NEXT:    br i1 [[OUTER_EXIT_COND]], label [[OUTER_LOOP_EXIT:%.*]], label [[OUTER_LOOP]]
587; CHECK:       outer.loop.exit:
588; CHECK-NEXT:    ret void
589;
; Input IR: dst[i*n + j] += src[j]. The src access below uses only the inner
; IV, so its start address is invariant with respect to the outer loop; only
; the dst range depends on the outer iteration.
590entry:
591  %0 = zext i32 %n to i64
592  %wide.m = zext i32 %m to i64
593  %wide.n = zext i32 %n to i64
594  br label %outer.loop
595
596outer.loop:
597  %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %inner.loop.exit ]
  ; %1 = i * n, the dst row offset for this outer iteration (dst only).
598  %1 = mul nsw i64 %iv.outer, %0
599  br label %inner.loop
600
601inner.loop:
602  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
  ; src index is just j; no outer-loop term appears in this address.
603  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %iv.inner
604  %2 = load i32, ptr %arrayidx.us, align 4
  ; dst index = i*n + j.
605  %3 = add nuw nsw i64 %iv.inner, %1
606  %arrayidx6.us = getelementptr inbounds i32, ptr %dst, i64 %3
607  %4 = load i32, ptr %arrayidx6.us, align 4
608  %add7.us = add nsw i32 %4, %2
609  store i32 %add7.us, ptr %arrayidx6.us, align 4
610  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
611  %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n
612  br i1 %inner.exit.cond, label %inner.loop.exit, label %inner.loop
613
614inner.loop.exit:
615  %iv.outer.next = add nuw nsw i64 %iv.outer, 1
616  %outer.exit.cond = icmp eq i64 %iv.outer.next, %wide.m
617  br i1 %outer.exit.cond, label %outer.loop.exit, label %outer.loop
618
619outer.loop.exit:
620  ret void
621}
622
623
624; Equivalent example in C:
625; void triple_nested_loop_mixed_access(int *dst, int *src, int m, int n, int o) {
626;   for (int i = 0; i < m; i++) {
627;     for (int j = 0; j < n; j++) {
628;       for (int l = 0; l < o; l++) {
629;         dst[(i * n * (o + 1)) + (j * o) + l] += src[(i * n * o) + l];
630;       }
631;     }
632;   }
633; }
634; The 'src' access varies with the outermost loop, rather than the parent of the
635; innermost loop. Hence we don't expand `src`, although in theory we could do.
636
637; DEBUG-LABEL: 'triple_nested_loop_mixed_access'
638; DEBUG:      LAA: Found an analyzable loop: inner.loop
639; DEBUG-NOT:  LAA: Creating diff runtime check for:
640; DEBUG:      LAA: Adding RT check for range:
641; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
642; DEBUG-NEXT: Start: {%dst,+,(4 * (zext i32 (1 + %o)<nsw> to i64) * (zext i32 %n to i64))}<%outer.outer.loop> End: {((4 * (zext i32 %n to i64) * (zext i32 %o to i64)) + %dst),+,(4 * (zext i32 (1 + %o)<nsw> to i64) * (zext i32 %n to i64))}<%outer.outer.loop>
643; DEBUG-NEXT: LAA: Adding RT check for range:
644; DEBUG-NEXT: Start: {%src,+,(4 * (zext i32 %n to i64) * (zext i32 %o to i64))}<%outer.outer.loop> End: {((4 * (zext i32 %o to i64))<nuw><nsw> + %src),+,(4 * (zext i32 %n to i64) * (zext i32 %o to i64))}<%outer.outer.loop>
645
646define void @triple_nested_loop_mixed_access(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n, i32 noundef %o) {
647; CHECK-LABEL: define void @triple_nested_loop_mixed_access
648; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]], i32 noundef [[O:%.*]]) {
649; CHECK-NEXT:  entry:
650; CHECK-NEXT:    [[ADD11:%.*]] = add nsw i32 [[O]], 1
651; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[O]] to i64
652; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[N]] to i64
653; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[ADD11]] to i64
654; CHECK-NEXT:    [[WIDE_TRIP_COUNT68:%.*]] = zext i32 [[M]] to i64
655; CHECK-NEXT:    [[WIDE_TRIP_COUNT60:%.*]] = zext i32 [[N]] to i64
656; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[O]] to i64
657; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP1]], [[TMP2]]
658; CHECK-NEXT:    [[TMP4:%.*]] = shl i64 [[TMP3]], 2
659; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
660; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
661; CHECK-NEXT:    [[TMP7:%.*]] = mul i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
662; CHECK-NEXT:    [[TMP8:%.*]] = shl i64 [[TMP7]], 2
663; CHECK-NEXT:    [[TMP9:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2
664; CHECK-NEXT:    br label [[OUTER_OUTER_LOOP:%.*]]
665; CHECK:       outer.outer.loop:
666; CHECK-NEXT:    [[OUTER_OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_OUTER_IV_NEXT:%.*]], [[OUTER_LOOP_END:%.*]] ]
667; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP4]], [[OUTER_OUTER_IV]]
668; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]]
669; CHECK-NEXT:    [[TMP11:%.*]] = add i64 [[TMP6]], [[TMP10]]
670; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]]
671; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP8]], [[OUTER_OUTER_IV]]
672; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]]
673; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP9]], [[TMP12]]
674; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
675; CHECK-NEXT:    [[TMP14:%.*]] = mul nsw i64 [[OUTER_OUTER_IV]], [[TMP1]]
676; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw i64 [[TMP14]], [[TMP0]]
677; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw i64 [[TMP14]], [[TMP2]]
678; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
679; CHECK:       outer.loop:
680; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_END:%.*]] ], [ 0, [[OUTER_OUTER_LOOP]] ]
681; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP0]]
682; CHECK-NEXT:    [[TMP18:%.*]] = add nuw nsw i64 [[TMP17]], [[TMP16]]
683; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
684; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
685; CHECK:       vector.memcheck:
686; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]]
687; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]]
688; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
689; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
690; CHECK:       vector.ph:
691; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
692; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
693; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
694; CHECK:       vector.body:
695; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
696; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 0
697; CHECK-NEXT:    [[TMP20:%.*]] = add nuw nsw i64 [[TMP19]], [[TMP15]]
698; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
699; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
700; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META32:![0-9]+]]
701; CHECK-NEXT:    [[TMP23:%.*]] = add nuw nsw i64 [[TMP18]], [[TMP19]]
702; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP23]]
703; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
704; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META35:![0-9]+]], !noalias [[META32]]
705; CHECK-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]]
706; CHECK-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP25]], align 4, !alias.scope [[META35]], !noalias [[META32]]
707; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
708; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
709; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]]
710; CHECK:       middle.block:
711; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
712; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_LOOP_END]], label [[SCALAR_PH]]
713; CHECK:       scalar.ph:
714; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
715; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
716; CHECK:       inner.loop:
717; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
718; CHECK-NEXT:    [[TMP28:%.*]] = add nuw nsw i64 [[INNER_IV]], [[TMP15]]
719; CHECK-NEXT:    [[ARRAYIDX_US_US_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP28]]
720; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX_US_US_US]], align 4
721; CHECK-NEXT:    [[TMP30:%.*]] = add nuw nsw i64 [[TMP18]], [[INNER_IV]]
722; CHECK-NEXT:    [[ARRAYIDX17_US_US_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP30]]
723; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX17_US_US_US]], align 4
724; CHECK-NEXT:    [[ADD18_US_US_US:%.*]] = add nsw i32 [[TMP31]], [[TMP29]]
725; CHECK-NEXT:    store i32 [[ADD18_US_US_US]], ptr [[ARRAYIDX17_US_US_US]], align 4
726; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1
727; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]]
728; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[INNER_LOOP_END]], label [[INNER_LOOP]], !llvm.loop [[LOOP38:![0-9]+]]
729; CHECK:       inner.loop.end:
730; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1
731; CHECK-NEXT:    [[EXIT_OUTER:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_TRIP_COUNT60]]
732; CHECK-NEXT:    br i1 [[EXIT_OUTER]], label [[OUTER_LOOP_END]], label [[OUTER_LOOP]]
733; CHECK:       outer.loop.end:
734; CHECK-NEXT:    [[OUTER_OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_OUTER_IV]], 1
735; CHECK-NEXT:    [[EXIT_OUTER_OUTER:%.*]] = icmp eq i64 [[OUTER_OUTER_IV_NEXT]], [[WIDE_TRIP_COUNT68]]
736; CHECK-NEXT:    br i1 [[EXIT_OUTER_OUTER]], label [[EXIT:%.*]], label [[OUTER_OUTER_LOOP]]
737; CHECK:       exit:
738; CHECK-NEXT:    ret void
739;
; Input IR: dst[i*n*(o+1) + j*o + l] += src[i*n*o + l], iterating i over m,
; j over n, l over o. The src address depends on the outermost IV (i) but not
; on j, the parent of the innermost loop.
740entry:
741  %add11 = add nsw i32 %o, 1
742  %0 = zext i32 %o to i64
743  %1 = zext i32 %n to i64
744  %2 = zext i32 %add11 to i64
745  %wide.trip.count68 = zext i32 %m to i64
746  %wide.trip.count60 = zext i32 %n to i64
747  %wide.trip.count = zext i32 %o to i64
748  br label %outer.outer.loop
749
750outer.outer.loop:
751  %outer.outer.iv = phi i64 [ 0, %entry ], [ %outer.outer.iv.next, %outer.loop.end ]
  ; %3 = i*n; %4 = i*n*o (src base); %5 = i*n*(o+1) (dst base).
752  %3 = mul nsw i64 %outer.outer.iv, %1
753  %4 = mul nsw i64 %3, %0
754  %5 = mul nsw i64 %3, %2
755  br label %outer.loop
756
757outer.loop:
758  %outer.iv = phi i64 [ %outer.iv.next, %inner.loop.end ], [ 0, %outer.outer.loop ]
  ; %7 = j*o + i*n*(o+1), the dst offset before adding the innermost IV.
759  %6 = mul nsw i64 %outer.iv, %0
760  %7 = add nuw nsw i64 %6, %5
761  br label %inner.loop
762
763inner.loop:
764  %inner.iv = phi i64 [ %inner.iv.next, %inner.loop ], [ 0, %outer.loop ]
  ; src index = i*n*o + l.
765  %8 = add nuw nsw i64 %inner.iv, %4
766  %arrayidx.us.us.us = getelementptr inbounds i32, ptr %src, i64 %8
767  %9 = load i32, ptr %arrayidx.us.us.us, align 4
  ; dst index = i*n*(o+1) + j*o + l.
768  %10 = add nuw nsw i64 %7, %inner.iv
769  %arrayidx17.us.us.us = getelementptr inbounds i32, ptr %dst, i64 %10
770  %11 = load i32, ptr %arrayidx17.us.us.us, align 4
771  %add18.us.us.us = add nsw i32 %11, %9
772  store i32 %add18.us.us.us, ptr %arrayidx17.us.us.us, align 4
773  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
774  %exitcond.not = icmp eq i64 %inner.iv.next, %wide.trip.count
775  br i1 %exitcond.not, label %inner.loop.end, label %inner.loop
776
777inner.loop.end:
778  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
779  %exit.outer = icmp eq i64 %outer.iv.next, %wide.trip.count60
780  br i1 %exit.outer, label %outer.loop.end, label %outer.loop
781
782outer.loop.end:
783  %outer.outer.iv.next = add nuw nsw i64 %outer.outer.iv, 1
784  %exit.outer.outer = icmp eq i64 %outer.outer.iv.next, %wide.trip.count68
785  br i1 %exit.outer.outer, label %exit, label %outer.outer.loop
786
787exit:
788  ret void
789}
790
791
792; Equivalent example in C:
793; void uncomputable_outer_tc(int32_t *dst, int32_t *src, char *str, int n) {
794;   int i;
795;   while (str[i] != '\0') {
796;     for (int j = 0; j < n; j++) {
797;       dst[(i * (n + 1)) + j] += src[(i * n) + j];
798;     }
799;     i++;
800;   }
801; }
802; Outer loop trip count is uncomputable so we shouldn't expand the ranges.
803
804; DEBUG-LABEL: 'uncomputable_outer_tc'
805; DEBUG:      LAA: Found an analyzable loop: inner.loop
806; DEBUG:      LAA: Adding RT check for range:
807; DEBUG-NEXT: Start: {%dst,+,(4 * (zext i32 (1 + %n) to i64))<nuw><nsw>}<%outer.loop> End: {((4 * (zext i32 %n to i64))<nuw><nsw> + %dst),+,(4 * (zext i32 (1 + %n) to i64))<nuw><nsw>}<%outer.loop>
808; DEBUG-NEXT: LAA: Adding RT check for range:
809; DEBUG-NEXT: Start: {%src,+,(4 * (zext i32 %n to i64))<nuw><nsw>}<%outer.loop> End: {((4 * (zext i32 %n to i64))<nuw><nsw> + %src),+,(4 * (zext i32 %n to i64))<nuw><nsw>}<%outer.loop>
810
811define void @uncomputable_outer_tc(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, ptr nocapture noundef readonly %str, i32 noundef %n) {
812; CHECK-LABEL: define void @uncomputable_outer_tc
813; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef readonly captures(none) [[STR:%.*]], i32 noundef [[N:%.*]]) {
814; CHECK-NEXT:  entry:
815; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[STR]], align 1
816; CHECK-NEXT:    [[CMP_NOT23:%.*]] = icmp ne i8 [[TMP0]], 0
817; CHECK-NEXT:    [[CMP221:%.*]] = icmp sgt i32 [[N]], 0
818; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[CMP_NOT23]], [[CMP221]]
819; CHECK-NEXT:    br i1 [[OR_COND]], label [[OUTER_LOOP_PREHEADER:%.*]], label [[WHILE_END:%.*]]
820; CHECK:       outer.loop.preheader:
821; CHECK-NEXT:    [[ADD6:%.*]] = add nuw nsw i32 [[N]], 1
822; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[N]] to i64
823; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[ADD6]] to i64
824; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
825; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
826; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2
827; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[WIDE_TRIP_COUNT]], 2
828; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
829; CHECK:       outer.loop:
830; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[OUTER_LOOP_PREHEADER]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ]
831; CHECK-NEXT:    [[TMP6:%.*]] = mul i64 [[TMP3]], [[OUTER_IV]]
832; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
833; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP4]], [[TMP6]]
834; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]]
835; CHECK-NEXT:    [[TMP8:%.*]] = mul i64 [[TMP5]], [[OUTER_IV]]
836; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP8]]
837; CHECK-NEXT:    [[TMP9:%.*]] = add i64 [[TMP4]], [[TMP8]]
838; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP9]]
839; CHECK-NEXT:    [[TMP10:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP1]]
840; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP2]]
841; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
842; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
843; CHECK:       vector.memcheck:
844; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]]
845; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]]
846; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
847; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
848; CHECK:       vector.ph:
849; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
850; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
851; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
852; CHECK:       vector.body:
853; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
854; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[INDEX]], 0
855; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i64 [[TMP12]], [[TMP10]]
856; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP13]]
857; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0
858; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META39:![0-9]+]]
859; CHECK-NEXT:    [[TMP16:%.*]] = add nsw i64 [[TMP12]], [[TMP11]]
860; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP16]]
861; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0
862; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4, !alias.scope [[META42:![0-9]+]], !noalias [[META39]]
863; CHECK-NEXT:    [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]]
864; CHECK-NEXT:    store <4 x i32> [[TMP19]], ptr [[TMP18]], align 4, !alias.scope [[META42]], !noalias [[META39]]
865; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
866; CHECK-NEXT:    [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
867; CHECK-NEXT:    br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]]
868; CHECK:       middle.block:
869; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
870; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
871; CHECK:       scalar.ph:
872; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
873; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
874; CHECK:       inner.loop:
875; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ]
876; CHECK-NEXT:    [[TMP21:%.*]] = add nsw i64 [[INNER_IV]], [[TMP10]]
877; CHECK-NEXT:    [[ARRAYIDX5_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP21]]
878; CHECK-NEXT:    [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX5_US]], align 4
879; CHECK-NEXT:    [[TMP23:%.*]] = add nsw i64 [[INNER_IV]], [[TMP11]]
880; CHECK-NEXT:    [[ARRAYIDX10_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP23]]
881; CHECK-NEXT:    [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX10_US]], align 4
882; CHECK-NEXT:    [[ADD11_US:%.*]] = add nsw i32 [[TMP24]], [[TMP22]]
883; CHECK-NEXT:    store i32 [[ADD11_US]], ptr [[ARRAYIDX10_US]], align 4
884; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1
885; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]]
886; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP45:![0-9]+]]
887; CHECK:       inner.loop.exit:
888; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add i64 [[OUTER_IV]], 1
889; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i8, ptr [[STR]], i64 [[OUTER_IV_NEXT]]
890; CHECK-NEXT:    [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1
891; CHECK-NEXT:    [[CMP_NOT_US:%.*]] = icmp eq i8 [[TMP25]], 0
892; CHECK-NEXT:    br i1 [[CMP_NOT_US]], label [[WHILE_END_LOOPEXIT:%.*]], label [[OUTER_LOOP]]
893; CHECK:       while.end.loopexit:
894; CHECK-NEXT:    br label [[WHILE_END]]
895; CHECK:       while.end:
896; CHECK-NEXT:    ret void
897;
; Input IR: while (str[i] != 0) { dst[i*(n+1) + j] += src[i*n + j] for j < n }.
; The outer trip count depends on the loaded string contents, so it is not
; computable by SCEV.
898entry:
  ; Enter the loop nest only if str[0] != 0 and n > 0.
899  %0 = load i8, ptr %str, align 1
900  %cmp.not23 = icmp ne i8 %0, 0
901  %cmp221 = icmp sgt i32 %n, 0
902  %or.cond = and i1 %cmp.not23, %cmp221
903  br i1 %or.cond, label %outer.loop.preheader, label %while.end
904
905outer.loop.preheader:
906  %add6 = add nuw nsw i32 %n, 1
907  %1 = zext i32 %n to i64
908  %2 = zext i32 %add6 to i64
909  %wide.trip.count = zext i32 %n to i64
910  br label %outer.loop
911
912outer.loop:
913  %outer.iv = phi i64 [ 0, %outer.loop.preheader ], [ %outer.iv.next, %inner.loop.exit ]
  ; %3 = i*n (src row offset); %4 = i*(n+1) (dst row offset).
914  %3 = mul nsw i64 %outer.iv, %1
915  %4 = mul nsw i64 %outer.iv, %2
916  br label %inner.loop
917
918inner.loop:
919  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
920  %5 = add nsw i64 %inner.iv, %3
921  %arrayidx5.us = getelementptr inbounds i32, ptr %src, i64 %5
922  %6 = load i32, ptr %arrayidx5.us, align 4
923  %7 = add nsw i64 %inner.iv, %4
924  %arrayidx10.us = getelementptr inbounds i32, ptr %dst, i64 %7
925  %8 = load i32, ptr %arrayidx10.us, align 4
926  %add11.us = add nsw i32 %8, %6
927  store i32 %add11.us, ptr %arrayidx10.us, align 4
928  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
929  %exitcond.not = icmp eq i64 %inner.iv.next, %wide.trip.count
930  br i1 %exitcond.not, label %inner.loop.exit, label %inner.loop
931
932inner.loop.exit:
  ; Outer latch: continue while the next string character is non-zero.
933  %outer.iv.next = add i64 %outer.iv, 1
934  %arrayidx.us = getelementptr inbounds i8, ptr %str, i64 %outer.iv.next
935  %9 = load i8, ptr %arrayidx.us, align 1
936  %cmp.not.us = icmp eq i8 %9, 0
937  br i1 %cmp.not.us, label %while.end, label %outer.loop
938
939while.end:
940  ret void
941}
942
943
944; Equivalent example in C:
945; void decreasing_inner_iv(int32_t *dst, int32_t *src, int stride1, int stride2, int m, int n) {
946;   for (int i = 0; i < m; i++) {
947;     for (int j = n; j >= 0; j--) {
948;       dst[(i * stride1) + j] += src[(i * stride2) + j];
949;     }
950;   }
951; }
952; Inner IV is decreasing, but this isn't a problem and we can still expand the
953; runtime checks correctly to cover the whole loop.
954
955; DEBUG-LABEL: 'decreasing_inner_iv'
956; DEBUG:      LAA: Found an analyzable loop: inner.loop
957; DEBUG:      LAA: Adding RT check for range:
958; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
959; DEBUG-NEXT: LAA: ... but need to check stride is positive: (4 * (sext i32 %stride1 to i64))<nsw>
960; DEBUG-NEXT: Start: %dst End: (4 + (4 * (zext i32 %n to i64))<nuw><nsw> + (4 * (sext i32 %stride1 to i64) * (-1 + (zext i32 %m to i64))<nsw>) + %dst)
961; DEBUG-NEXT: LAA: Adding RT check for range:
962; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
963; DEBUG-NEXT: LAA: ... but need to check stride is positive: (4 * (sext i32 %stride2 to i64))<nsw>
964; DEBUG-NEXT: Start: %src End: (4 + (4 * (zext i32 %n to i64))<nuw><nsw> + (4 * (sext i32 %stride2 to i64) * (-1 + (zext i32 %m to i64))<nsw>) + %src)
965
966define void @decreasing_inner_iv(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %stride1, i32 noundef %stride2, i32 noundef %m, i32 noundef %n) {
967; CHECK-LABEL: define void @decreasing_inner_iv
968; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[STRIDE1:%.*]], i32 noundef [[STRIDE2:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) {
969; CHECK-NEXT:  entry:
970; CHECK-NEXT:    [[CMP20:%.*]] = icmp sgt i32 [[M]], 0
971; CHECK-NEXT:    [[CMP218:%.*]] = icmp sgt i32 [[N]], -1
972; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[CMP20]], [[CMP218]]
973; CHECK-NEXT:    br i1 [[OR_COND]], label [[OUTER_LOOP_PRE:%.*]], label [[EXIT:%.*]]
974; CHECK:       outer.loop.pre:
975; CHECK-NEXT:    [[TMP0:%.*]] = zext i32 [[N]] to i64
976; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[STRIDE2]] to i64
977; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[STRIDE1]] to i64
978; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[M]] to i64
979; CHECK-NEXT:    [[TMP3:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1
980; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP2]]
981; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
982; CHECK-NEXT:    [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 2
983; CHECK-NEXT:    [[TMP7:%.*]] = add i64 [[TMP5]], [[TMP6]]
984; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP7]], 4
985; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
986; CHECK-NEXT:    [[TMP9:%.*]] = shl nsw i64 [[TMP2]], 2
987; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP3]], [[TMP1]]
988; CHECK-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 2
989; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP11]], [[TMP6]]
990; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], 4
991; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
992; CHECK-NEXT:    [[TMP14:%.*]] = shl nsw i64 [[TMP1]], 2
993; CHECK-NEXT:    [[TMP15:%.*]] = add nuw nsw i64 [[TMP0]], 1
994; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
995; CHECK:       outer.loop:
996; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[OUTER_LOOP_PRE]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ]
997; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP1]]
998; CHECK-NEXT:    [[TMP17:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP2]]
999; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP15]], 4
1000; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1001; CHECK:       vector.memcheck:
1002; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]]
1003; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
1004; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1005; CHECK-NEXT:    [[STRIDE_CHECK:%.*]] = icmp slt i64 [[TMP9]], 0
1006; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[FOUND_CONFLICT]], [[STRIDE_CHECK]]
1007; CHECK-NEXT:    [[STRIDE_CHECK2:%.*]] = icmp slt i64 [[TMP14]], 0
1008; CHECK-NEXT:    [[TMP19:%.*]] = or i1 [[TMP18]], [[STRIDE_CHECK2]]
1009; CHECK-NEXT:    br i1 [[TMP19]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1010; CHECK:       vector.ph:
1011; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP15]], 4
1012; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP15]], [[N_MOD_VF]]
1013; CHECK-NEXT:    [[TMP20:%.*]] = sub i64 [[TMP0]], [[N_VEC]]
1014; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1015; CHECK:       vector.body:
1016; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1017; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]]
1018; CHECK-NEXT:    [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 0
1019; CHECK-NEXT:    [[TMP22:%.*]] = add nsw i64 [[TMP21]], [[TMP16]]
1020; CHECK-NEXT:    [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]]
1021; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0
1022; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 -3
1023; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META46:![0-9]+]]
1024; CHECK-NEXT:    [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1025; CHECK-NEXT:    [[TMP26:%.*]] = add nsw i64 [[TMP21]], [[TMP17]]
1026; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP26]]
1027; CHECK-NEXT:    [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0
1028; CHECK-NEXT:    [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 -3
1029; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP29]], align 4, !alias.scope [[META49:![0-9]+]], !noalias [[META46]]
1030; CHECK-NEXT:    [[REVERSE4:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1031; CHECK-NEXT:    [[TMP30:%.*]] = add nsw <4 x i32> [[REVERSE4]], [[REVERSE]]
1032; CHECK-NEXT:    [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP30]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
1033; CHECK-NEXT:    store <4 x i32> [[REVERSE5]], ptr [[TMP29]], align 4, !alias.scope [[META49]], !noalias [[META46]]
1034; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1035; CHECK-NEXT:    [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1036; CHECK-NEXT:    br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]]
1037; CHECK:       middle.block:
1038; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP15]], [[N_VEC]]
1039; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
1040; CHECK:       scalar.ph:
1041; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[OUTER_LOOP]] ]
1042; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
1043; CHECK:       inner.loop:
1044; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ]
1045; CHECK-NEXT:    [[TMP32:%.*]] = add nsw i64 [[INNER_IV]], [[TMP16]]
1046; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP32]]
1047; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
1048; CHECK-NEXT:    [[TMP34:%.*]] = add nsw i64 [[INNER_IV]], [[TMP17]]
1049; CHECK-NEXT:    [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP34]]
1050; CHECK-NEXT:    [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4
1051; CHECK-NEXT:    [[ADD9_US:%.*]] = add nsw i32 [[TMP35]], [[TMP33]]
1052; CHECK-NEXT:    store i32 [[ADD9_US]], ptr [[ARRAYIDX8_US]], align 4
1053; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nsw i64 [[INNER_IV]], -1
1054; CHECK-NEXT:    [[CMP2_US:%.*]] = icmp sgt i64 [[INNER_IV]], 0
1055; CHECK-NEXT:    br i1 [[CMP2_US]], label [[INNER_LOOP]], label [[INNER_LOOP_EXIT]], !llvm.loop [[LOOP52:![0-9]+]]
1056; CHECK:       inner.loop.exit:
1057; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1
1058; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_TRIP_COUNT]]
1059; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[OUTER_LOOP_EXIT:%.*]], label [[OUTER_LOOP]]
1060; CHECK:       outer.loop.exit:
1061; CHECK-NEXT:    br label [[EXIT]]
1062; CHECK:       exit:
1063; CHECK-NEXT:    ret void
1064;
; Input IR: dst[i*stride1 + j] += src[i*stride2 + j] with the inner IV j
; counting down from n to 0 (inclusive).
1065entry:
  ; Enter the loop nest only if m > 0 and n >= 0.
1066  %cmp20 = icmp sgt i32 %m, 0
1067  %cmp218 = icmp sgt i32 %n, -1
1068  %or.cond = and i1 %cmp20, %cmp218
1069  br i1 %or.cond, label %outer.loop.pre, label %exit
1070
1071outer.loop.pre:
1072  %0 = zext i32 %n to i64
1073  %1 = sext i32 %stride2 to i64
1074  %2 = sext i32 %stride1 to i64
1075  %wide.trip.count = zext i32 %m to i64
1076  br label %outer.loop
1077
1078outer.loop:
1079  %outer.iv = phi i64 [ 0, %outer.loop.pre ], [ %outer.iv.next, %inner.loop.exit ]
  ; %3 = i*stride2 (src row offset); %4 = i*stride1 (dst row offset).
1080  %3 = mul nsw i64 %outer.iv, %1
1081  %4 = mul nsw i64 %outer.iv, %2
1082  br label %inner.loop
1083
1084inner.loop:
  ; j starts at n and is decremented; the loop body runs for j = n..0.
1085  %inner.iv = phi i64 [ %0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
1086  %5 = add nsw i64 %inner.iv, %3
1087  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %5
1088  %6 = load i32, ptr %arrayidx.us, align 4
1089  %7 = add nsw i64 %inner.iv, %4
1090  %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %7
1091  %8 = load i32, ptr %arrayidx8.us, align 4
1092  %add9.us = add nsw i32 %8, %6
1093  store i32 %add9.us, ptr %arrayidx8.us, align 4
1094  %inner.iv.next = add nsw i64 %inner.iv, -1
1095  %cmp2.us = icmp sgt i64 %inner.iv, 0
1096  br i1 %cmp2.us, label %inner.loop, label %inner.loop.exit
1097
1098inner.loop.exit:
1099  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
1100  %exitcond.not = icmp eq i64 %outer.iv.next, %wide.trip.count
1101  br i1 %exitcond.not, label %outer.loop.exit, label %outer.loop
1102
1103outer.loop.exit:
1104  br label %exit
1105
1106exit:
1107  ret void
1108}
1109
1110
1111; Equivalent example in C:
1112; void decreasing_outer_iv(int32_t *dst, int32_t *src, int stride1, int stride2, int m, int n) {
1113;   for (int i = m - 1; i >= 0; i--) {
1114;     for (int j = 0; j <= n; j++) {
1115;       dst[(i * stride1) + j] += src[(i * stride2) + j];
1116;     }
1117;   }
1118; }
1119; Outer IV is decreasing, but the direction of memory accesses also depends
1120; upon the signedness of stride1 and stride2.
1121
1122; DEBUG-LABEL: 'decreasing_outer_iv'
1123; DEBUG:      LAA: Found an analyzable loop: inner.loop
1124; DEBUG:      LAA: Adding RT check for range:
1125; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
1126; DEBUG-NEXT: LAA: ... but need to check stride is positive: (-4 * (sext i32 %stride1 to i64))<nsw>
1127; DEBUG-NEXT: Start: ((4 * (zext i32 %m to i64) * (sext i32 %stride1 to i64)) + %dst) End: ((4 * (zext i32 (1 + %n) to i64))<nuw><nsw> + (4 * (sext i32 %stride1 to i64))<nsw> + %dst)
1128; DEBUG-NEXT: LAA: Adding RT check for range:
1129; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
1130; DEBUG-NEXT: LAA: ... but need to check stride is positive: (-4 * (sext i32 %stride2 to i64))<nsw>
1131; DEBUG-NEXT: Start: ((4 * (zext i32 %m to i64) * (sext i32 %stride2 to i64)) + %src) End: ((4 * (zext i32 (1 + %n) to i64))<nuw><nsw> + (4 * (sext i32 %stride2 to i64))<nsw> + %src)
1132
1133define void @decreasing_outer_iv(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %stride1, i32 noundef %stride2, i32 noundef %m, i32 noundef %n) {
1134; CHECK-LABEL: define void @decreasing_outer_iv
1135; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[STRIDE1:%.*]], i32 noundef [[STRIDE2:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) {
1136; CHECK-NEXT:  entry:
1137; CHECK-NEXT:    [[CMP21:%.*]] = icmp slt i32 [[M]], 1
1138; CHECK-NEXT:    [[CMP2_NOT18:%.*]] = icmp slt i32 [[N]], 0
1139; CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[CMP21]], [[CMP2_NOT18]]
1140; CHECK-NEXT:    br i1 [[OR_COND]], label [[EXIT:%.*]], label [[OUTER_LOOP_PRE:%.*]]
1141; CHECK:       outer.loop.pre:
1142; CHECK-NEXT:    [[TMP0:%.*]] = add nuw i32 [[N]], 1
1143; CHECK-NEXT:    [[TMP1:%.*]] = zext i32 [[M]] to i64
1144; CHECK-NEXT:    [[TMP2:%.*]] = sext i32 [[STRIDE1]] to i64
1145; CHECK-NEXT:    [[TMP3:%.*]] = sext i32 [[STRIDE2]] to i64
1146; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP0]] to i64
1147; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]]
1148; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
1149; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]]
1150; CHECK-NEXT:    [[TMP6:%.*]] = shl nsw i64 [[TMP2]], 2
1151; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2
1152; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP6]], [[TMP7]]
1153; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
1154; CHECK-NEXT:    [[TMP9:%.*]] = mul nsw i64 [[TMP2]], -4
1155; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP3]], [[TMP1]]
1156; CHECK-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 2
1157; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP11]]
1158; CHECK-NEXT:    [[TMP12:%.*]] = shl nsw i64 [[TMP3]], 2
1159; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP12]], [[TMP7]]
1160; CHECK-NEXT:    [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]]
1161; CHECK-NEXT:    [[TMP14:%.*]] = mul nsw i64 [[TMP3]], -4
1162; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
1163; CHECK:       outer.loop:
1164; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ [[TMP1]], [[OUTER_LOOP_PRE]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ]
1165; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nsw i64 [[OUTER_IV]], -1
1166; CHECK-NEXT:    [[TMP15:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP3]]
1167; CHECK-NEXT:    [[TMP16:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP2]]
1168; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
1169; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1170; CHECK:       vector.memcheck:
1171; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]]
1172; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]]
1173; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1174; CHECK-NEXT:    [[STRIDE_CHECK:%.*]] = icmp slt i64 [[TMP9]], 0
1175; CHECK-NEXT:    [[TMP17:%.*]] = or i1 [[FOUND_CONFLICT]], [[STRIDE_CHECK]]
1176; CHECK-NEXT:    [[STRIDE_CHECK4:%.*]] = icmp slt i64 [[TMP14]], 0
1177; CHECK-NEXT:    [[TMP18:%.*]] = or i1 [[TMP17]], [[STRIDE_CHECK4]]
1178; CHECK-NEXT:    br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1179; CHECK:       vector.ph:
1180; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
1181; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
1182; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1183; CHECK:       vector.body:
1184; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1185; CHECK-NEXT:    [[TMP19:%.*]] = add i64 [[INDEX]], 0
1186; CHECK-NEXT:    [[TMP20:%.*]] = add nsw i64 [[TMP19]], [[TMP15]]
1187; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]]
1188; CHECK-NEXT:    [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0
1189; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META53:![0-9]+]]
1190; CHECK-NEXT:    [[TMP23:%.*]] = add nsw i64 [[TMP19]], [[TMP16]]
1191; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP23]]
1192; CHECK-NEXT:    [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0
1193; CHECK-NEXT:    [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META56:![0-9]+]], !noalias [[META53]]
1194; CHECK-NEXT:    [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]]
1195; CHECK-NEXT:    store <4 x i32> [[TMP26]], ptr [[TMP25]], align 4, !alias.scope [[META56]], !noalias [[META53]]
1196; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1197; CHECK-NEXT:    [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1198; CHECK-NEXT:    br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]]
1199; CHECK:       middle.block:
1200; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
1201; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
1202; CHECK:       scalar.ph:
1203; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
1204; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
1205; CHECK:       inner.loop:
1206; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ]
1207; CHECK-NEXT:    [[TMP28:%.*]] = add nsw i64 [[INNER_IV]], [[TMP15]]
1208; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP28]]
1209; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
1210; CHECK-NEXT:    [[TMP30:%.*]] = add nsw i64 [[INNER_IV]], [[TMP16]]
1211; CHECK-NEXT:    [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP30]]
1212; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4
1213; CHECK-NEXT:    [[ADD9:%.*]] = add nsw i32 [[TMP31]], [[TMP29]]
1214; CHECK-NEXT:    store i32 [[ADD9]], ptr [[ARRAYIDX8]], align 4
1215; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1
1216; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]]
1217; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP59:![0-9]+]]
1218; CHECK:       inner.loop.exit:
1219; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i64 [[OUTER_IV]], 1
1220; CHECK-NEXT:    br i1 [[CMP]], label [[OUTER_LOOP]], label [[OUTER_LOOP_EXIT:%.*]]
1221; CHECK:       outer.loop.exit:
1222; CHECK-NEXT:    br label [[EXIT]]
1223; CHECK:       exit:
1224; CHECK-NEXT:    ret void
1225;
; IR under test: %outer.iv starts at %m (zero-extended) and counts down to 1,
; while the inner IV counts up from 0 to %wide.trip.count (= %n + 1).  Because
; the hoisted runtime checks are built from the strides, the memcheck above
; also tests that -4*%stride1 (TMP9) and -4*%stride2 (TMP14) are not negative,
; matching the "need to check stride is positive" DEBUG output.
1226entry:
1227  %cmp21 = icmp slt i32 %m, 1
1228  %cmp2.not18 = icmp slt i32 %n, 0
1229  %or.cond = or i1 %cmp21, %cmp2.not18
1230  br i1 %or.cond, label %exit, label %outer.loop.pre
1231
1232outer.loop.pre:
1233  %0 = add nuw i32 %n, 1
1234  %1 = zext i32 %m to i64
1235  %2 = sext i32 %stride1 to i64
1236  %3 = sext i32 %stride2 to i64
1237  %wide.trip.count = zext i32 %0 to i64
1238  br label %outer.loop
1239
; Outer loop: decrementing IV; %4/%5 are the per-iteration base offsets for
; the src/dst accesses (outer.iv * stride2 and outer.iv * stride1).
1240outer.loop:
1241  %outer.iv = phi i64 [ %1, %outer.loop.pre ], [ %outer.iv.next, %inner.loop.exit ]
1242  %outer.iv.next = add nsw i64 %outer.iv, -1
1243  %4 = mul nsw i64 %outer.iv, %3
1244  %5 = mul nsw i64 %outer.iv, %2
1245  br label %inner.loop
1246
; Inner loop (the vectorized one): dst[i*stride1 + j] += src[i*stride2 + j].
1247inner.loop:
1248  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
1249  %6 = add nsw i64 %inner.iv, %4
1250  %arrayidx = getelementptr inbounds i32, ptr %src, i64 %6
1251  %7 = load i32, ptr %arrayidx, align 4
1252  %8 = add nsw i64 %inner.iv, %5
1253  %arrayidx8 = getelementptr inbounds i32, ptr %dst, i64 %8
1254  %9 = load i32, ptr %arrayidx8, align 4
1255  %add9 = add nsw i32 %9, %7
1256  store i32 %add9, ptr %arrayidx8, align 4
1257  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
1258  %exitcond.not = icmp eq i64 %inner.iv.next, %wide.trip.count
1259  br i1 %exitcond.not, label %inner.loop.exit, label %inner.loop
1260
1261inner.loop.exit:
1262  %cmp = icmp sgt i64 %outer.iv, 1
1263  br i1 %cmp, label %outer.loop, label %outer.loop.exit
1264
1265outer.loop.exit:
1266  br label %exit
1267
1268exit:
1269  ret void
1270}
1271
1272
1273; Equivalent example in C:
1274; void unknown_inner_stride(int32_t *dst, int32_t *src, int stride1, int stride2, int m, int n) {
1275;   for (int i = 0; i < m; i++) {
1276;     for (int j = 0; j < n; j++) {
1277;       dst[(i * (n + 1)) + (j * stride1)] += src[(i * n) + (j * stride2)];
1278;     }
1279;   }
1280; }
1281
1282
1283; DEBUG-LABEL: 'unknown_inner_stride'
1284; DEBUG:      LAA: Found an analyzable loop: inner.loop
1285; DEBUG:      LAA: Adding RT check for range:
1286; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
1287; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %n to i64))<nuw><nsw> + (4 * (zext i32 (1 + %n) to i64) * (-1 + (zext i32 %m to i64))<nsw>) + %dst)
1288; DEBUG-NEXT: LAA: Adding RT check for range:
1289; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting
1290; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %src)
1291
1292define void @unknown_inner_stride(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %stride1, i32 noundef %stride2, i32 noundef %m, i32 noundef %n) {
1293; CHECK-LABEL: define void @unknown_inner_stride
1294; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[STRIDE1:%.*]], i32 noundef [[STRIDE2:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) {
1295; CHECK-NEXT:  entry:
1296; CHECK-NEXT:    [[CMP26:%.*]] = icmp sgt i32 [[M]], 0
1297; CHECK-NEXT:    [[CMP224:%.*]] = icmp sgt i32 [[N]], 0
1298; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[CMP26]], [[CMP224]]
1299; CHECK-NEXT:    br i1 [[OR_COND]], label [[OUTER_LOOP_PREHEADER:%.*]], label [[EXIT:%.*]]
1300; CHECK:       outer.loop.preheader:
1301; CHECK-NEXT:    [[ADD6:%.*]] = add nuw nsw i32 [[N]], 1
1302; CHECK-NEXT:    [[TMP0:%.*]] = sext i32 [[STRIDE2]] to i64
1303; CHECK-NEXT:    [[TMP1:%.*]] = sext i32 [[STRIDE1]] to i64
1304; CHECK-NEXT:    [[TMP2:%.*]] = zext i32 [[N]] to i64
1305; CHECK-NEXT:    [[TMP3:%.*]] = zext i32 [[ADD6]] to i64
1306; CHECK-NEXT:    [[WIDE_TRIP_COUNT39:%.*]] = zext i32 [[M]] to i64
1307; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
1308; CHECK-NEXT:    [[TMP4:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT39]], -1
1309; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP3]]
1310; CHECK-NEXT:    [[TMP6:%.*]] = shl i64 [[TMP5]], 2
1311; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2
1312; CHECK-NEXT:    [[TMP8:%.*]] = add i64 [[TMP6]], [[TMP7]]
1313; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
1314; CHECK-NEXT:    [[TMP9:%.*]] = mul i64 [[WIDE_TRIP_COUNT]], [[WIDE_TRIP_COUNT39]]
1315; CHECK-NEXT:    [[TMP10:%.*]] = shl i64 [[TMP9]], 2
1316; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP10]]
1317; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
1318; CHECK:       outer.loop:
1319; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[OUTER_LOOP_PREHEADER]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ]
1320; CHECK-NEXT:    [[TMP11:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP2]]
1321; CHECK-NEXT:    [[TMP12:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP3]]
1322; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4
1323; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]]
1324; CHECK:       vector.scevcheck:
1325; CHECK-NEXT:    [[IDENT_CHECK:%.*]] = icmp ne i32 [[STRIDE1]], 1
1326; CHECK-NEXT:    [[IDENT_CHECK1:%.*]] = icmp ne i32 [[STRIDE2]], 1
1327; CHECK-NEXT:    [[TMP13:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK1]]
1328; CHECK-NEXT:    br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]]
1329; CHECK:       vector.memcheck:
1330; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP2]]
1331; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]]
1332; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1333; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1334; CHECK:       vector.ph:
1335; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4
1336; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
1337; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1338; CHECK:       vector.body:
1339; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1340; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[INDEX]], 0
1341; CHECK-NEXT:    [[TMP15:%.*]] = add nsw i64 [[TMP14]], [[TMP11]]
1342; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP15]]
1343; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0
1344; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4, !alias.scope [[META60:![0-9]+]]
1345; CHECK-NEXT:    [[TMP18:%.*]] = add nsw i64 [[TMP14]], [[TMP12]]
1346; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP18]]
1347; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0
1348; CHECK-NEXT:    [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META63:![0-9]+]], !noalias [[META60]]
1349; CHECK-NEXT:    [[TMP21:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]]
1350; CHECK-NEXT:    store <4 x i32> [[TMP21]], ptr [[TMP20]], align 4, !alias.scope [[META63]], !noalias [[META60]]
1351; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1352; CHECK-NEXT:    [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1353; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP65:![0-9]+]]
1354; CHECK:       middle.block:
1355; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]]
1356; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]]
1357; CHECK:       scalar.ph:
1358; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[OUTER_LOOP]] ]
1359; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
1360; CHECK:       inner.loop:
1361; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ]
1362; CHECK-NEXT:    [[TMP23:%.*]] = mul nsw i64 [[INNER_IV]], [[TMP0]]
1363; CHECK-NEXT:    [[TMP24:%.*]] = add nsw i64 [[TMP23]], [[TMP11]]
1364; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]]
1365; CHECK-NEXT:    [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4
1366; CHECK-NEXT:    [[TMP26:%.*]] = mul nsw i64 [[INNER_IV]], [[TMP1]]
1367; CHECK-NEXT:    [[TMP27:%.*]] = add nsw i64 [[TMP26]], [[TMP12]]
1368; CHECK-NEXT:    [[ARRAYIDX11_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP27]]
1369; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX11_US]], align 4
1370; CHECK-NEXT:    [[ADD12_US:%.*]] = add nsw i32 [[TMP28]], [[TMP25]]
1371; CHECK-NEXT:    store i32 [[ADD12_US]], ptr [[ARRAYIDX11_US]], align 4
1372; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1
1373; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]]
1374; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP66:![0-9]+]]
1375; CHECK:       inner.loop.exit:
1376; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1
1377; CHECK-NEXT:    [[EXITCOND40_NOT:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_TRIP_COUNT39]]
1378; CHECK-NEXT:    br i1 [[EXITCOND40_NOT]], label [[EXIT_LOOPEXIT:%.*]], label [[OUTER_LOOP]]
1379; CHECK:       exit.loopexit:
1380; CHECK-NEXT:    br label [[EXIT]]
1381; CHECK:       exit:
1382; CHECK-NEXT:    ret void
1383;
; IR under test: the inner-loop strides %stride1/%stride2 are unknown at
; compile time, so the vectorized version is guarded by a vector.scevcheck
; (both strides must equal 1, see IDENT_CHECK/IDENT_CHECK1 above) before the
; hoisted overlap checks in vector.memcheck.
1384entry:
1385  %cmp26 = icmp sgt i32 %m, 0
1386  %cmp224 = icmp sgt i32 %n, 0
1387  %or.cond = and i1 %cmp26, %cmp224
1388  br i1 %or.cond, label %outer.loop.preheader, label %exit
1389
1390outer.loop.preheader:
1391  %add6 = add nuw nsw i32 %n, 1
1392  %0 = sext i32 %stride2 to i64
1393  %1 = sext i32 %stride1 to i64
1394  %2 = zext i32 %n to i64
1395  %3 = zext i32 %add6 to i64
1396  %wide.trip.count39 = zext i32 %m to i64
1397  %wide.trip.count = zext i32 %n to i64
1398  br label %outer.loop
1399
; Outer loop: row offsets i*n (src) and i*(n+1) (dst).
1400outer.loop:
1401  %outer.iv = phi i64 [ 0, %outer.loop.preheader ], [ %outer.iv.next, %inner.loop.exit ]
1402  %4 = mul nsw i64 %outer.iv, %2
1403  %5 = mul nsw i64 %outer.iv, %3
1404  br label %inner.loop
1405
; Inner loop: column offsets scale by the unknown strides (%0 = stride2 for
; src, %1 = stride1 for dst).
1406inner.loop:
1407  %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ]
1408  %6 = mul nsw i64 %inner.iv, %0
1409  %7 = add nsw i64 %6, %4
1410  %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %7
1411  %8 = load i32, ptr %arrayidx.us, align 4
1412  %9 = mul nsw i64 %inner.iv, %1
1413  %10 = add nsw i64 %9, %5
1414  %arrayidx11.us = getelementptr inbounds i32, ptr %dst, i64 %10
1415  %11 = load i32, ptr %arrayidx11.us, align 4
1416  %add12.us = add nsw i32 %11, %8
1417  store i32 %add12.us, ptr %arrayidx11.us, align 4
1418  %inner.iv.next = add nuw nsw i64 %inner.iv, 1
1419  %exitcond.not = icmp eq i64 %inner.iv.next, %wide.trip.count
1420  br i1 %exitcond.not, label %inner.loop.exit, label %inner.loop
1421
1422inner.loop.exit:
1423  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
1424  %exitcond40.not = icmp eq i64 %outer.iv.next, %wide.trip.count39
1425  br i1 %exitcond40.not, label %exit, label %outer.loop
1426
1427exit:
1428  ret void
1429}
1430
1431
1432; Test case where the AddRec for the pointers in the inner loop have the AddRec
1433; of the outer loop as start value. It is sufficient to subtract the start
1434; values (%dst, %src) of the outer AddRecs.
1435define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %m, i64 noundef %n) {
1436; CHECK-LABEL: define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec
1437; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i64 noundef [[M:%.*]], i64 noundef [[N:%.*]]) {
1438; CHECK-NEXT:  entry:
1439; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
1440; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
1441; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
1442; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
1443; CHECK:       outer.loop:
1444; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
1445; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N]]
1446; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
1447; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1448; CHECK:       vector.memcheck:
1449; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
1450; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1451; CHECK:       vector.ph:
1452; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
1453; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
1454; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1455; CHECK:       vector.body:
1456; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1457; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
1458; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], [[MUL]]
1459; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP2]]
1460; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
1461; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
1462; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP2]]
1463; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 10)
1464; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
1465; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP7]], align 4
1466; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
1467; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1468; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP67:![0-9]+]]
1469; CHECK:       middle.block:
1470; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
1471; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]]
1472; CHECK:       scalar.ph:
1473; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
1474; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
1475; CHECK:       inner.loop:
1476; CHECK-NEXT:    [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
1477; CHECK-NEXT:    [[IDX:%.*]] = add nuw nsw i64 [[IV_INNER]], [[MUL]]
1478; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IDX]]
1479; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
1480; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IDX]]
1481; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[L]], 10
1482; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_DST]], align 4
1483; CHECK-NEXT:    [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
1484; CHECK-NEXT:    [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[N]]
1485; CHECK-NEXT:    br i1 [[INNER_EXIT_COND]], label [[INNER_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP68:![0-9]+]]
1486; CHECK:       inner.exit:
1487; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1
1488; CHECK-NEXT:    [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[M]]
1489; CHECK-NEXT:    br i1 [[OUTER_EXIT_COND]], label [[OUTER_EXIT:%.*]], label [[OUTER_LOOP]]
1490; CHECK:       outer.exit:
1491; CHECK-NEXT:    ret void
1492;
; IR under test: both accesses use the same index %idx = %iv.inner + %outer.iv*%n,
; so the runtime check collapses to a single pointer-difference test
; (DIFF_CHECK above: %dst - %src < 16 bytes), computed once in entry and
; hoisted out of the outer loop.
1493entry:
1494  br label %outer.loop
1495
1496outer.loop:
1497  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
1498  %mul = mul nsw i64 %outer.iv, %n
1499  br label %inner.loop
1500
; Inner loop: dst[idx] = src[idx] + 10, with identical idx for load and store.
1501inner.loop:
1502  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
1503  %idx = add nuw nsw i64 %iv.inner, %mul
1504  %gep.src = getelementptr inbounds i32, ptr %src, i64 %idx
1505  %l = load i32, ptr %gep.src, align 4
1506  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %idx
1507  %add = add nsw i32 %l, 10
1508  store i32 %add, ptr %gep.dst, align 4
1509  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
1510  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
1511  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop
1512
1513inner.exit:
1514  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
1515  %outer.exit.cond = icmp eq i64 %outer.iv.next, %m
1516  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop
1517
1518outer.exit:
1519  ret void
1520}
1521
1522; The stride for the access in the inner loop is known to be non-negative via
1523; loop guards.
1524define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) {
1525; CHECK-LABEL: define void @stride_check_known_via_loop_guard
1526; CHECK-SAME: (ptr [[C:%.*]], ptr [[A:%.*]], i32 [[ACOLS:%.*]]) {
1527; CHECK-NEXT:  entry:
1528; CHECK-NEXT:    [[PRE_C:%.*]] = icmp ugt i32 [[ACOLS]], 0
1529; CHECK-NEXT:    br i1 [[PRE_C]], label [[EXIT:%.*]], label [[OUTER_HEADER_PREHEADER:%.*]]
1530; CHECK:       outer.header.preheader:
1531; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[C]], i64 8000
1532; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 8
1533; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
1534; CHECK:       outer.header:
1535; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i32 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[OUTER_HEADER_PREHEADER]] ]
1536; CHECK-NEXT:    [[MUL_US:%.*]] = mul i32 [[OUTER_IV]], [[ACOLS]]
1537; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr double, ptr [[A]], i32 [[MUL_US]]
1538; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
1539; CHECK:       vector.memcheck:
1540; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]]
1541; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
1542; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
1543; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
1544; CHECK:       vector.ph:
1545; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
1546; CHECK:       vector.body:
1547; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1548; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
1549; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[TMP0]]
1550; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]]
1551; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i64 0
1552; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
1553; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
1554; CHECK-NEXT:    store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]]
1555; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
1556; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
1557; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]]
1558; CHECK:       middle.block:
1559; CHECK-NEXT:    br i1 true, label [[OUTER_LATCH]], label [[SCALAR_PH]]
1560; CHECK:       scalar.ph:
1561; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_HEADER]] ]
1562; CHECK-NEXT:    br label [[INNER:%.*]]
1563; CHECK:       inner:
1564; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER]] ]
1565; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[INNER_IV]]
1566; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8
1567; CHECK-NEXT:    store double [[L]], ptr [[GEP_C]], align 8
1568; CHECK-NEXT:    [[INNER_IV_NEXT]] = add i32 [[INNER_IV]], 1
1569; CHECK-NEXT:    [[INNER_C:%.*]] = icmp eq i32 [[INNER_IV_NEXT]], 1000
1570; CHECK-NEXT:    br i1 [[INNER_C]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP75:![0-9]+]]
1571; CHECK:       outer.latch:
1572; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add i32 [[OUTER_IV]], 1
1573; CHECK-NEXT:    [[OUTER_C:%.*]] = icmp ult i32 [[OUTER_IV]], 128
1574; CHECK-NEXT:    br i1 [[OUTER_C]], label [[EXIT_LOOPEXIT:%.*]], label [[OUTER_HEADER]]
1575; CHECK:       exit.loopexit:
1576; CHECK-NEXT:    br label [[EXIT]]
1577; CHECK:       exit:
1578; CHECK-NEXT:    ret void
1579;
; IR under test: the guard in %entry branches straight to %exit when
; %Acols > 0 (unsigned), so the loop nest only runs with %Acols == 0 and the
; inner-loop offset %mul.us = %outer.iv * %Acols is known non-negative.
; Consequently the versioned loop above carries only the overlap check in
; vector.memcheck, with no additional stride check.
1580entry:
1581  %pre.c = icmp ugt i32 %Acols, 0
1582  br i1 %pre.c, label %exit, label %outer.header
1583
1584outer.header:
1585  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
1586  %mul.us = mul i32 %outer.iv, %Acols
1587  %arrayidx.us = getelementptr double, ptr %A, i32 %mul.us
1588  br label %inner
1589
; Inner loop: broadcasts the scalar A[mul.us] into C[0..999].
1590inner:
1591  %inner.iv = phi i32 [ 0, %outer.header ], [ %inner.iv.next, %inner ]
1592  %gep.C = getelementptr inbounds double, ptr %C, i32 %inner.iv
1593  %l = load double, ptr %arrayidx.us, align 8
1594  store double %l, ptr %gep.C, align 8
1595  %inner.iv.next = add i32 %inner.iv, 1
1596  %inner.c = icmp eq i32 %inner.iv.next, 1000
1597  br i1 %inner.c, label %outer.latch, label %inner
1598
1599outer.latch:
1600  %outer.iv.next = add i32 %outer.iv, 1
1601  %outer.c = icmp ult i32 %outer.iv, 128
1602  br i1 %outer.c, label %exit, label %outer.header
1603
1604exit:
1605  ret void
1606}
1607