1; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 2; REQUIRES: asserts 3; RUN: opt < %s -p 'loop-vectorize' -force-vector-interleave=1 -S \ 4; RUN: -force-vector-width=4 -debug-only=loop-accesses,loop-vectorize,loop-utils 2> %t | FileCheck %s 5; RUN: cat %t | FileCheck %s --check-prefix=DEBUG 6 7target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" 8 9; Equivalent example in C: 10; void diff_checks(int32_t *dst, int32_t *src, int m, int n) { 11; for (int i = 0; i < m; i++) { 12; for (int j = 0; j < n; j++) { 13; dst[(i * (n + 1)) + j] = src[(i * n) + j]; 14; } 15; } 16; } 17; NOTE: The strides of the starting address values in the inner loop differ, i.e. 18; '(i * (n + 1))' vs '(i * n)'. 19 20; DEBUG-LABEL: 'diff_checks' 21; DEBUG: LAA: Found an analyzable loop: inner.loop 22; DEBUG: LAA: Not creating diff runtime check, since these cannot be hoisted out of the outer loop 23; DEBUG: LAA: Adding RT check for range: 24; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 25; DEBUG-NEXT: LAA: ... 
but need to check stride is positive: (4 * (sext i32 (1 + %n)<nuw> to i64))<nsw> 26; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %n to i64))<nuw><nsw> + (4 * (sext i32 (1 + %n)<nuw> to i64) * (-1 + (zext i32 %m to i64))<nsw>) + %dst) 27; DEBUG-NEXT: LAA: Adding RT check for range: 28; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 29; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %src) 30 31define void @diff_checks(ptr nocapture noundef writeonly %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) { 32; CHECK-LABEL: define void @diff_checks 33; CHECK-SAME: (ptr noundef writeonly captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) { 34; CHECK-NEXT: entry: 35; CHECK-NEXT: [[ADD5:%.*]] = add nuw i32 [[N]], 1 36; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 37; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[ADD5]] to i64 38; CHECK-NEXT: [[WIDE_M:%.*]] = zext i32 [[M]] to i64 39; CHECK-NEXT: [[WIDE_N:%.*]] = zext i32 [[N]] to i64 40; CHECK-NEXT: [[TMP2:%.*]] = add nsw i64 [[WIDE_M]], -1 41; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], [[TMP1]] 42; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 2 43; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[WIDE_N]], 2 44; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP4]], [[TMP5]] 45; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] 46; CHECK-NEXT: [[TMP7:%.*]] = shl nsw i64 [[TMP1]], 2 47; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[WIDE_N]], [[WIDE_M]] 48; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 2 49; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP9]] 50; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 51; CHECK: outer.loop: 52; CHECK-NEXT: [[IV_OUTER:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_OUTER_NEXT:%.*]], [[INNER_EXIT:%.*]] ] 53; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i64 [[IV_OUTER]], [[TMP0]] 54; CHECK-NEXT: [[TMP11:%.*]] 
= mul nsw i64 [[IV_OUTER]], [[TMP1]] 55; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4 56; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] 57; CHECK: vector.memcheck: 58; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] 59; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] 60; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] 61; CHECK-NEXT: [[STRIDE_CHECK:%.*]] = icmp slt i64 [[TMP7]], 0 62; CHECK-NEXT: [[TMP12:%.*]] = or i1 [[FOUND_CONFLICT]], [[STRIDE_CHECK]] 63; CHECK-NEXT: br i1 [[TMP12]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 64; CHECK: vector.ph: 65; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4 66; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]] 67; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 68; CHECK: vector.body: 69; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 70; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX]], 0 71; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[TMP13]], [[TMP10]] 72; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP14]] 73; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 74; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4, !alias.scope [[META0:![0-9]+]] 75; CHECK-NEXT: [[TMP17:%.*]] = add nsw i64 [[TMP13]], [[TMP11]] 76; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP17]] 77; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0 78; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP19]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] 79; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 80; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 81; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] 82; CHECK: middle.block: 83; 
CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] 84; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]] 85; CHECK: scalar.ph: 86; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] 87; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 88; CHECK: inner.loop: 89; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] 90; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP10]] 91; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP21]] 92; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4 93; CHECK-NEXT: [[TMP23:%.*]] = add nsw i64 [[IV_INNER]], [[TMP11]] 94; CHECK-NEXT: [[ARRAYIDX9_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP23]] 95; CHECK-NEXT: store i32 [[TMP22]], ptr [[ARRAYIDX9_US]], align 4 96; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1 97; CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]] 98; CHECK-NEXT: br i1 [[INNER_EXIT_COND]], label [[INNER_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP8:![0-9]+]] 99; CHECK: inner.exit: 100; CHECK-NEXT: [[IV_OUTER_NEXT]] = add nuw nsw i64 [[IV_OUTER]], 1 101; CHECK-NEXT: [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[IV_OUTER_NEXT]], [[WIDE_M]] 102; CHECK-NEXT: br i1 [[OUTER_EXIT_COND]], label [[OUTER_EXIT:%.*]], label [[OUTER_LOOP]] 103; CHECK: outer.exit: 104; CHECK-NEXT: ret void 105; 106entry: 107 %add5 = add nuw i32 %n, 1 108 %0 = zext i32 %n to i64 109 %1 = sext i32 %add5 to i64 110 %wide.m = zext i32 %m to i64 111 %wide.n = zext i32 %n to i64 112 br label %outer.loop 113 114outer.loop: 115 %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %inner.exit ] 116 %2 = mul nsw i64 %iv.outer, %0 117 %3 = mul nsw i64 %iv.outer, %1 118 br label %inner.loop 119 120inner.loop: 121 %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, 
%inner.loop ] 122 %4 = add nuw nsw i64 %iv.inner, %2 123 %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %4 124 %5 = load i32, ptr %arrayidx.us, align 4 125 %6 = add nsw i64 %iv.inner, %3 126 %arrayidx9.us = getelementptr inbounds i32, ptr %dst, i64 %6 127 store i32 %5, ptr %arrayidx9.us, align 4 128 %iv.inner.next = add nuw nsw i64 %iv.inner, 1 129 %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n 130 br i1 %inner.exit.cond, label %inner.exit, label %inner.loop 131 132inner.exit: 133 %iv.outer.next = add nuw nsw i64 %iv.outer, 1 134 %outer.exit.cond = icmp eq i64 %iv.outer.next, %wide.m 135 br i1 %outer.exit.cond, label %outer.exit, label %outer.loop 136 137outer.exit: 138 ret void 139} 140 141 142; Equivalent example in C: 143; void full_checks(int32_t *dst, int32_t *src, int m, int n) { 144; for (int i = 0; i < m; i++) { 145; for (int j = 0; j < n; j++) { 146; dst[(i * n) + j] += src[(i * n) + j]; 147; } 148; } 149; } 150; We decide to do full runtime checks here (as opposed to diff checks) due to 151; the additional load of 'dst[(i * n) + j]' in the loop. 
152 153; DEBUG-LABEL: 'full_checks' 154; DEBUG: LAA: Found an analyzable loop: inner.loop 155; DEBUG-NOT: LAA: Creating diff runtime check for: 156; DEBUG: LAA: Adding RT check for range: 157; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 158; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %dst) 159; DEBUG-NEXT: LAA: Adding RT check for range: 160; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 161; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %src) 162 163define void @full_checks(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) { 164; CHECK-LABEL: define void @full_checks 165; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) { 166; CHECK-NEXT: entry: 167; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 168; CHECK-NEXT: [[WIDE_M:%.*]] = zext i32 [[M]] to i64 169; CHECK-NEXT: [[WIDE_N:%.*]] = zext i32 [[N]] to i64 170; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[WIDE_N]], [[WIDE_M]] 171; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2 172; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]] 173; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP2]] 174; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 175; CHECK: outer.loop: 176; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ] 177; CHECK-NEXT: [[TMP3:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP0]] 178; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4 179; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] 180; CHECK: vector.memcheck: 181; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] 182; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] 
183; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] 184; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 185; CHECK: vector.ph: 186; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4 187; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]] 188; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 189; CHECK: vector.body: 190; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 191; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 192; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i64 [[TMP4]], [[TMP3]] 193; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP5]] 194; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 195; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4, !alias.scope [[META9:![0-9]+]] 196; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP5]] 197; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 198; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4, !alias.scope [[META12:![0-9]+]], !noalias [[META9]] 199; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]] 200; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[TMP9]], align 4, !alias.scope [[META12]], !noalias [[META9]] 201; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 202; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 203; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] 204; CHECK: middle.block: 205; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] 206; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]] 207; CHECK: scalar.ph: 208; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] 209; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 210; CHECK: inner.loop: 
211; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] 212; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP3]] 213; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP12]] 214; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4 215; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP12]] 216; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4 217; CHECK-NEXT: [[ADD9_US:%.*]] = add nsw i32 [[TMP14]], [[TMP13]] 218; CHECK-NEXT: store i32 [[ADD9_US]], ptr [[ARRAYIDX8_US]], align 4 219; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1 220; CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]] 221; CHECK-NEXT: br i1 [[INNER_EXIT_COND]], label [[INNER_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP15:![0-9]+]] 222; CHECK: inner.exit: 223; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 224; CHECK-NEXT: [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_M]] 225; CHECK-NEXT: br i1 [[OUTER_EXIT_COND]], label [[OUTER_EXIT:%.*]], label [[OUTER_LOOP]] 226; CHECK: outer.exit: 227; CHECK-NEXT: ret void 228; 229entry: 230 %0 = zext i32 %n to i64 231 %wide.m = zext i32 %m to i64 232 %wide.n = zext i32 %n to i64 233 br label %outer.loop 234 235outer.loop: 236 %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ] 237 %1 = mul nsw i64 %outer.iv, %0 238 br label %inner.loop 239 240inner.loop: 241 %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ] 242 %2 = add nuw nsw i64 %iv.inner, %1 243 %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %2 244 %3 = load i32, ptr %arrayidx.us, align 4 245 %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %2 246 %4 = load i32, ptr %arrayidx8.us, align 4 247 %add9.us = add nsw i32 %4, %3 248 store i32 %add9.us, ptr %arrayidx8.us, align 4 249 
 %iv.inner.next = add nuw nsw i64 %iv.inner, 1 250 %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n 251 br i1 %inner.exit.cond, label %inner.exit, label %inner.loop 252 253inner.exit: 254 %outer.iv.next = add nuw nsw i64 %outer.iv, 1 255 %outer.exit.cond = icmp eq i64 %outer.iv.next, %wide.m 256 br i1 %outer.exit.cond, label %outer.exit, label %outer.loop 257 258outer.exit: 259 ret void 260} 261 262 263; Equivalent example in C: 264; void full_checks_diff_strides(int32_t *dst, int32_t *src, int m, int n) { 265; for (int i = 0; i < m; i++) { 266; for (int j = 0; j < n; j++) { 267; dst[(i * (n + 1)) + j] += src[(i * n) + j]; 268; } 269; } 270; } 271; We decide to do full runtime checks here (as opposed to diff checks) due to 272; the additional load of 'dst[(i * (n + 1)) + j]' in the loop. 273; NOTE: This is different to the test above (@full_checks) because the dst array 274; is accessed with a higher stride compared to src, and therefore the inner loop 275; runtime checks will vary for each outer loop iteration.
276 277; DEBUG-LABEL: 'full_checks_diff_strides' 278; DEBUG: LAA: Found an analyzable loop: inner.loop 279; DEBUG-NOT: LAA: Creating diff runtime check for: 280; DEBUG: LAA: Adding RT check for range: 281; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 282; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %n to i64))<nuw><nsw> + ((4 + (4 * (zext i32 %n to i64))<nuw><nsw>)<nuw><nsw> * (-1 + (zext i32 %m to i64))<nsw>) + %dst) 283; DEBUG-NEXT: LAA: Adding RT check for range: 284; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 285; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %src) 286 287define void @full_checks_diff_strides(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) { 288; CHECK-LABEL: define void @full_checks_diff_strides 289; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) { 290; CHECK-NEXT: entry: 291; CHECK-NEXT: [[WIDE_M:%.*]] = zext i32 [[M]] to i64 292; CHECK-NEXT: [[WIDE_N:%.*]] = zext i32 [[N]] to i64 293; CHECK-NEXT: [[TMP0:%.*]] = add nsw i64 [[WIDE_M]], -1 294; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[WIDE_N]], 2 295; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 4 296; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP0]], [[TMP2]] 297; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP3]], [[TMP1]] 298; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP4]] 299; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[WIDE_N]], [[WIDE_M]] 300; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 301; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP6]] 302; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 303; CHECK: outer.loop: 304; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ] 305; CHECK-NEXT: [[NPLUS1:%.*]] = add 
nuw nsw i32 [[N]], 1 306; CHECK-NEXT: [[WIDE_NPLUS1:%.*]] = zext i32 [[NPLUS1]] to i64 307; CHECK-NEXT: [[TMP7:%.*]] = mul nsw i64 [[OUTER_IV]], [[WIDE_N]] 308; CHECK-NEXT: [[TMP8:%.*]] = mul nsw i64 [[OUTER_IV]], [[WIDE_NPLUS1]] 309; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4 310; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] 311; CHECK: vector.memcheck: 312; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] 313; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] 314; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] 315; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 316; CHECK: vector.ph: 317; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4 318; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]] 319; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 320; CHECK: vector.body: 321; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 322; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0 323; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], [[TMP7]] 324; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP10]] 325; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i32 0 326; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4, !alias.scope [[META16:![0-9]+]] 327; CHECK-NEXT: [[TMP13:%.*]] = add nuw nsw i64 [[TMP9]], [[TMP8]] 328; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP13]] 329; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 330; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, !alias.scope [[META19:![0-9]+]], !noalias [[META16]] 331; CHECK-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]] 332; CHECK-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP15]], align 4, !alias.scope [[META19]], !noalias 
[[META16]] 333; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 334; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 335; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] 336; CHECK: middle.block: 337; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] 338; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]] 339; CHECK: scalar.ph: 340; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] 341; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 342; CHECK: inner.loop: 343; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] 344; CHECK-NEXT: [[TMP18:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP7]] 345; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP18]] 346; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4 347; CHECK-NEXT: [[TMP20:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP8]] 348; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP20]] 349; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4 350; CHECK-NEXT: [[ADD9_US:%.*]] = add nsw i32 [[TMP21]], [[TMP19]] 351; CHECK-NEXT: store i32 [[ADD9_US]], ptr [[ARRAYIDX8_US]], align 4 352; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1 353; CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]] 354; CHECK-NEXT: br i1 [[INNER_EXIT_COND]], label [[INNER_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP22:![0-9]+]] 355; CHECK: inner.exit: 356; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 357; CHECK-NEXT: [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_M]] 358; CHECK-NEXT: br i1 [[OUTER_EXIT_COND]], label [[OUTER_EXIT:%.*]], label [[OUTER_LOOP]] 359; CHECK: outer.exit: 360; CHECK-NEXT: ret void 361; 362entry: 363 
%wide.m = zext i32 %m to i64 364 %wide.n = zext i32 %n to i64 365 br label %outer.loop 366 367outer.loop: 368 %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ] 369 %nplus1 = add nuw nsw i32 %n, 1 370 %wide.nplus1 = zext i32 %nplus1 to i64 371 %0 = mul nsw i64 %outer.iv, %wide.n 372 %1 = mul nsw i64 %outer.iv, %wide.nplus1 373 br label %inner.loop 374 375inner.loop: 376 %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ] 377 %2 = add nuw nsw i64 %iv.inner, %0 378 %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %2 379 %3 = load i32, ptr %arrayidx.us, align 4 380 %4 = add nuw nsw i64 %iv.inner, %1 381 %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %4 382 %5 = load i32, ptr %arrayidx8.us, align 4 383 %add9.us = add nsw i32 %5, %3 384 store i32 %add9.us, ptr %arrayidx8.us, align 4 385 %iv.inner.next = add nuw nsw i64 %iv.inner, 1 386 %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n 387 br i1 %inner.exit.cond, label %inner.exit, label %inner.loop 388 389inner.exit: 390 %outer.iv.next = add nuw nsw i64 %outer.iv, 1 391 %outer.exit.cond = icmp eq i64 %outer.iv.next, %wide.m 392 br i1 %outer.exit.cond, label %outer.exit, label %outer.loop 393 394outer.exit: 395 ret void 396} 397 398 399; Equivalent example in C: 400; void diff_checks_src_start_invariant(int32_t *dst, int32_t *src, int m, int n) { 401; for (int i = 0; i < m; i++) { 402; for (int j = 0; j < n; j++) { 403; dst[(i * n) + j] = src[j]; 404; } 405; } 406; } 407 408; DEBUG-LABEL: 'diff_checks_src_start_invariant' 409; DEBUG: LAA: Found an analyzable loop: inner.loop 410; DEBUG-NOT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 411 412define void @diff_checks_src_start_invariant(ptr nocapture noundef writeonly %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) { 413; CHECK-LABEL: define void @diff_checks_src_start_invariant 414; CHECK-SAME: (ptr noundef writeonly captures(none) [[DST:%.*]], ptr 
noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) { 415; CHECK-NEXT: entry: 416; CHECK-NEXT: [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64 417; CHECK-NEXT: [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64 418; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 419; CHECK-NEXT: [[WIDE_M:%.*]] = zext i32 [[M]] to i64 420; CHECK-NEXT: [[WIDE_N:%.*]] = zext i32 [[N]] to i64 421; CHECK-NEXT: [[TMP1:%.*]] = sub i64 [[DST1]], [[SRC2]] 422; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[WIDE_N]], 2 423; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 424; CHECK: outer.loop: 425; CHECK-NEXT: [[IV_OUTER:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_OUTER_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ] 426; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], [[IV_OUTER]] 427; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP1]], [[TMP3]] 428; CHECK-NEXT: [[TMP5:%.*]] = mul nsw i64 [[IV_OUTER]], [[TMP0]] 429; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4 430; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] 431; CHECK: vector.memcheck: 432; CHECK-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP4]], 16 433; CHECK-NEXT: br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 434; CHECK: vector.ph: 435; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4 436; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]] 437; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 438; CHECK: vector.body: 439; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 440; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 441; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP6]] 442; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 443; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 4 444; CHECK-NEXT: [[TMP9:%.*]] = add nuw nsw i64 [[TMP6]], [[TMP5]] 445; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 
[[TMP9]] 446; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP10]], i32 0 447; CHECK-NEXT: store <4 x i32> [[WIDE_LOAD]], ptr [[TMP11]], align 4 448; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 449; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 450; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] 451; CHECK: middle.block: 452; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] 453; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] 454; CHECK: scalar.ph: 455; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] 456; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 457; CHECK: inner.loop: 458; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] 459; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV_INNER]] 460; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4 461; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP5]] 462; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP14]] 463; CHECK-NEXT: store i32 [[TMP13]], ptr [[ARRAYIDX6_US]], align 4 464; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1 465; CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]] 466; CHECK-NEXT: br i1 [[INNER_EXIT_COND]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP24:![0-9]+]] 467; CHECK: inner.loop.exit: 468; CHECK-NEXT: [[IV_OUTER_NEXT]] = add nuw nsw i64 [[IV_OUTER]], 1 469; CHECK-NEXT: [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[IV_OUTER_NEXT]], [[WIDE_M]] 470; CHECK-NEXT: br i1 [[OUTER_EXIT_COND]], label [[OUTER_LOOP_EXIT:%.*]], label [[OUTER_LOOP]] 471; CHECK: outer.loop.exit: 472; CHECK-NEXT: ret void 473; 474entry: 475 %0 = zext i32 %n to i64 476 %wide.m 
= zext i32 %m to i64 477 %wide.n = zext i32 %n to i64 478 br label %outer.loop 479 480outer.loop: 481 %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %inner.loop.exit ] 482 %1 = mul nsw i64 %iv.outer, %0 483 br label %inner.loop 484 485inner.loop: 486 %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ] 487 %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %iv.inner 488 %2 = load i32, ptr %arrayidx.us, align 4 489 %3 = add nuw nsw i64 %iv.inner, %1 490 %arrayidx6.us = getelementptr inbounds i32, ptr %dst, i64 %3 491 store i32 %2, ptr %arrayidx6.us, align 4 492 %iv.inner.next = add nuw nsw i64 %iv.inner, 1 493 %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n 494 br i1 %inner.exit.cond, label %inner.loop.exit, label %inner.loop 495 496inner.loop.exit: 497 %iv.outer.next = add nuw nsw i64 %iv.outer, 1 498 %outer.exit.cond = icmp eq i64 %iv.outer.next, %wide.m 499 br i1 %outer.exit.cond, label %outer.loop.exit, label %outer.loop 500 501outer.loop.exit: 502 ret void 503} 504 505 506; Equivalent example in C: 507; void full_checks_src_start_invariant(int32_t *dst, int32_t *src, int m, int n) { 508; for (int i = 0; i < m; i++) { 509; for (int j = 0; j < n; j++) { 510; dst[(i * n) + j] += src[j]; 511; } 512; } 513; } 514 515; DEBUG-LABEL: 'full_checks_src_start_invariant' 516; DEBUG: LAA: Found an analyzable loop: inner.loop 517; DEBUG: LAA: Adding RT check for range: 518; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 519; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %dst) 520; DEBUG-NEXT: LAA: Adding RT check for range: 521; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %n to i64))<nuw><nsw> + %src) 522 523define void @full_checks_src_start_invariant(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n) { 524; CHECK-LABEL: define void @full_checks_src_start_invariant 525; CHECK-SAME: (ptr noundef 
captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) { 526; CHECK-NEXT: entry: 527; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 528; CHECK-NEXT: [[WIDE_M:%.*]] = zext i32 [[M]] to i64 529; CHECK-NEXT: [[WIDE_N:%.*]] = zext i32 [[N]] to i64 530; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[WIDE_N]], [[WIDE_M]] 531; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2 532; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]] 533; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[WIDE_N]], 2 534; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP3]] 535; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 536; CHECK: outer.loop: 537; CHECK-NEXT: [[IV_OUTER:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_OUTER_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ] 538; CHECK-NEXT: [[TMP4:%.*]] = mul nsw i64 [[IV_OUTER]], [[TMP0]] 539; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_N]], 4 540; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] 541; CHECK: vector.memcheck: 542; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] 543; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] 544; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] 545; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 546; CHECK: vector.ph: 547; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_N]], 4 548; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_N]], [[N_MOD_VF]] 549; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 550; CHECK: vector.body: 551; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 552; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 553; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP5]] 554; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 555; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr 
[[TMP7]], align 4, !alias.scope [[META25:![0-9]+]] 556; CHECK-NEXT: [[TMP8:%.*]] = add nuw nsw i64 [[TMP5]], [[TMP4]] 557; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP8]] 558; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i32 0 559; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4, !alias.scope [[META28:![0-9]+]], !noalias [[META25]] 560; CHECK-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD2]], [[WIDE_LOAD]] 561; CHECK-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP10]], align 4, !alias.scope [[META28]], !noalias [[META25]] 562; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 563; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 564; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] 565; CHECK: middle.block: 566; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_N]], [[N_VEC]] 567; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] 568; CHECK: scalar.ph: 569; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] 570; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 571; CHECK: inner.loop: 572; CHECK-NEXT: [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ] 573; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IV_INNER]] 574; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4 575; CHECK-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[IV_INNER]], [[TMP4]] 576; CHECK-NEXT: [[ARRAYIDX6_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP14]] 577; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX6_US]], align 4 578; CHECK-NEXT: [[ADD7_US:%.*]] = add nsw i32 [[TMP15]], [[TMP13]] 579; CHECK-NEXT: store i32 [[ADD7_US]], ptr [[ARRAYIDX6_US]], align 4 580; CHECK-NEXT: [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1 581; 
CHECK-NEXT: [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[WIDE_N]] 582; CHECK-NEXT: br i1 [[INNER_EXIT_COND]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP31:![0-9]+]] 583; CHECK: inner.loop.exit: 584; CHECK-NEXT: [[IV_OUTER_NEXT]] = add nuw nsw i64 [[IV_OUTER]], 1 585; CHECK-NEXT: [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[IV_OUTER_NEXT]], [[WIDE_M]] 586; CHECK-NEXT: br i1 [[OUTER_EXIT_COND]], label [[OUTER_LOOP_EXIT:%.*]], label [[OUTER_LOOP]] 587; CHECK: outer.loop.exit: 588; CHECK-NEXT: ret void 589; 590entry: 591 %0 = zext i32 %n to i64 592 %wide.m = zext i32 %m to i64 593 %wide.n = zext i32 %n to i64 594 br label %outer.loop 595 596outer.loop: 597 %iv.outer = phi i64 [ 0, %entry ], [ %iv.outer.next, %inner.loop.exit ] 598 %1 = mul nsw i64 %iv.outer, %0 599 br label %inner.loop 600 601inner.loop: 602 %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ] 603 %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %iv.inner 604 %2 = load i32, ptr %arrayidx.us, align 4 605 %3 = add nuw nsw i64 %iv.inner, %1 606 %arrayidx6.us = getelementptr inbounds i32, ptr %dst, i64 %3 607 %4 = load i32, ptr %arrayidx6.us, align 4 608 %add7.us = add nsw i32 %4, %2 609 store i32 %add7.us, ptr %arrayidx6.us, align 4 610 %iv.inner.next = add nuw nsw i64 %iv.inner, 1 611 %inner.exit.cond = icmp eq i64 %iv.inner.next, %wide.n 612 br i1 %inner.exit.cond, label %inner.loop.exit, label %inner.loop 613 614inner.loop.exit: 615 %iv.outer.next = add nuw nsw i64 %iv.outer, 1 616 %outer.exit.cond = icmp eq i64 %iv.outer.next, %wide.m 617 br i1 %outer.exit.cond, label %outer.loop.exit, label %outer.loop 618 619outer.loop.exit: 620 ret void 621} 622 623 624; Equivalent example in C: 625; void triple_nested_loop_mixed_access(int *dst, int *src, int m, int n, int o) { 626; for (int i = 0; i < m; i++) { 627; for (int j = 0; j < n; j++) { 628; for (int l = 0; l < o; l++) { 629; dst[(i * n * (o + 1)) + (j * o) + l] += src[(i * n * o) + l]; 630; } 
631; } 632; } 633; } 634; The 'src' access varies with the outermost loop, rather than the parent of the 635; innermost loop. Hence we don't expand `src`, although in theory we could do. 636 637; DEBUG-LABEL: 'triple_nested_loop_mixed_access' 638; DEBUG: LAA: Found an analyzable loop: inner.loop 639; DEBUG-NOT: LAA: Creating diff runtime check for: 640; DEBUG: LAA: Adding RT check for range: 641; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 642; DEBUG-NEXT: Start: {%dst,+,(4 * (zext i32 (1 + %o)<nsw> to i64) * (zext i32 %n to i64))}<%outer.outer.loop> End: {((4 * (zext i32 %n to i64) * (zext i32 %o to i64)) + %dst),+,(4 * (zext i32 (1 + %o)<nsw> to i64) * (zext i32 %n to i64))}<%outer.outer.loop> 643; DEBUG-NEXT: LAA: Adding RT check for range: 644; DEBUG-NEXT: Start: {%src,+,(4 * (zext i32 %n to i64) * (zext i32 %o to i64))}<%outer.outer.loop> End: {((4 * (zext i32 %o to i64))<nuw><nsw> + %src),+,(4 * (zext i32 %n to i64) * (zext i32 %o to i64))}<%outer.outer.loop> 645 646define void @triple_nested_loop_mixed_access(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %m, i32 noundef %n, i32 noundef %o) { 647; CHECK-LABEL: define void @triple_nested_loop_mixed_access 648; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]], i32 noundef [[O:%.*]]) { 649; CHECK-NEXT: entry: 650; CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[O]], 1 651; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[O]] to i64 652; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[N]] to i64 653; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[ADD11]] to i64 654; CHECK-NEXT: [[WIDE_TRIP_COUNT68:%.*]] = zext i32 [[M]] to i64 655; CHECK-NEXT: [[WIDE_TRIP_COUNT60:%.*]] = zext i32 [[N]] to i64 656; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[O]] to i64 657; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP1]], [[TMP2]] 658; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[TMP3]], 2 659; 
CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[WIDE_TRIP_COUNT]], [[TMP1]] 660; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 661; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[WIDE_TRIP_COUNT]], [[TMP1]] 662; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 2 663; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2 664; CHECK-NEXT: br label [[OUTER_OUTER_LOOP:%.*]] 665; CHECK: outer.outer.loop: 666; CHECK-NEXT: [[OUTER_OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_OUTER_IV_NEXT:%.*]], [[OUTER_LOOP_END:%.*]] ] 667; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP4]], [[OUTER_OUTER_IV]] 668; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP10]] 669; CHECK-NEXT: [[TMP11:%.*]] = add i64 [[TMP6]], [[TMP10]] 670; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] 671; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP8]], [[OUTER_OUTER_IV]] 672; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP12]] 673; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP9]], [[TMP12]] 674; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] 675; CHECK-NEXT: [[TMP14:%.*]] = mul nsw i64 [[OUTER_OUTER_IV]], [[TMP1]] 676; CHECK-NEXT: [[TMP15:%.*]] = mul nsw i64 [[TMP14]], [[TMP0]] 677; CHECK-NEXT: [[TMP16:%.*]] = mul nsw i64 [[TMP14]], [[TMP2]] 678; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 679; CHECK: outer.loop: 680; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_END:%.*]] ], [ 0, [[OUTER_OUTER_LOOP]] ] 681; CHECK-NEXT: [[TMP17:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP0]] 682; CHECK-NEXT: [[TMP18:%.*]] = add nuw nsw i64 [[TMP17]], [[TMP16]] 683; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 684; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] 685; CHECK: vector.memcheck: 686; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]] 687; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]] 688; 
CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] 689; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 690; CHECK: vector.ph: 691; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 692; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] 693; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 694; CHECK: vector.body: 695; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 696; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 0 697; CHECK-NEXT: [[TMP20:%.*]] = add nuw nsw i64 [[TMP19]], [[TMP15]] 698; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] 699; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 700; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META32:![0-9]+]] 701; CHECK-NEXT: [[TMP23:%.*]] = add nuw nsw i64 [[TMP18]], [[TMP19]] 702; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP23]] 703; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0 704; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META35:![0-9]+]], !noalias [[META32]] 705; CHECK-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]] 706; CHECK-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP25]], align 4, !alias.scope [[META35]], !noalias [[META32]] 707; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 708; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 709; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP37:![0-9]+]] 710; CHECK: middle.block: 711; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] 712; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_END]], label [[SCALAR_PH]] 713; CHECK: scalar.ph: 714; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, 
[[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] 715; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 716; CHECK: inner.loop: 717; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] 718; CHECK-NEXT: [[TMP28:%.*]] = add nuw nsw i64 [[INNER_IV]], [[TMP15]] 719; CHECK-NEXT: [[ARRAYIDX_US_US_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP28]] 720; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX_US_US_US]], align 4 721; CHECK-NEXT: [[TMP30:%.*]] = add nuw nsw i64 [[TMP18]], [[INNER_IV]] 722; CHECK-NEXT: [[ARRAYIDX17_US_US_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP30]] 723; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX17_US_US_US]], align 4 724; CHECK-NEXT: [[ADD18_US_US_US:%.*]] = add nsw i32 [[TMP31]], [[TMP29]] 725; CHECK-NEXT: store i32 [[ADD18_US_US_US]], ptr [[ARRAYIDX17_US_US_US]], align 4 726; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1 727; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]] 728; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[INNER_LOOP_END]], label [[INNER_LOOP]], !llvm.loop [[LOOP38:![0-9]+]] 729; CHECK: inner.loop.end: 730; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 731; CHECK-NEXT: [[EXIT_OUTER:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_TRIP_COUNT60]] 732; CHECK-NEXT: br i1 [[EXIT_OUTER]], label [[OUTER_LOOP_END]], label [[OUTER_LOOP]] 733; CHECK: outer.loop.end: 734; CHECK-NEXT: [[OUTER_OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_OUTER_IV]], 1 735; CHECK-NEXT: [[EXIT_OUTER_OUTER:%.*]] = icmp eq i64 [[OUTER_OUTER_IV_NEXT]], [[WIDE_TRIP_COUNT68]] 736; CHECK-NEXT: br i1 [[EXIT_OUTER_OUTER]], label [[EXIT:%.*]], label [[OUTER_OUTER_LOOP]] 737; CHECK: exit: 738; CHECK-NEXT: ret void 739; 740entry: 741 %add11 = add nsw i32 %o, 1 742 %0 = zext i32 %o to i64 743 %1 = zext i32 %n to i64 744 %2 = zext i32 %add11 to i64 745 %wide.trip.count68 = zext i32 %m to i64 746 
%wide.trip.count60 = zext i32 %n to i64 747 %wide.trip.count = zext i32 %o to i64 748 br label %outer.outer.loop 749 750outer.outer.loop: 751 %outer.outer.iv = phi i64 [ 0, %entry ], [ %outer.outer.iv.next, %outer.loop.end ] 752 %3 = mul nsw i64 %outer.outer.iv, %1 753 %4 = mul nsw i64 %3, %0 754 %5 = mul nsw i64 %3, %2 755 br label %outer.loop 756 757outer.loop: 758 %outer.iv = phi i64 [ %outer.iv.next, %inner.loop.end ], [ 0, %outer.outer.loop ] 759 %6 = mul nsw i64 %outer.iv, %0 760 %7 = add nuw nsw i64 %6, %5 761 br label %inner.loop 762 763inner.loop: 764 %inner.iv = phi i64 [ %inner.iv.next, %inner.loop ], [ 0, %outer.loop ] 765 %8 = add nuw nsw i64 %inner.iv, %4 766 %arrayidx.us.us.us = getelementptr inbounds i32, ptr %src, i64 %8 767 %9 = load i32, ptr %arrayidx.us.us.us, align 4 768 %10 = add nuw nsw i64 %7, %inner.iv 769 %arrayidx17.us.us.us = getelementptr inbounds i32, ptr %dst, i64 %10 770 %11 = load i32, ptr %arrayidx17.us.us.us, align 4 771 %add18.us.us.us = add nsw i32 %11, %9 772 store i32 %add18.us.us.us, ptr %arrayidx17.us.us.us, align 4 773 %inner.iv.next = add nuw nsw i64 %inner.iv, 1 774 %exitcond.not = icmp eq i64 %inner.iv.next, %wide.trip.count 775 br i1 %exitcond.not, label %inner.loop.end, label %inner.loop 776 777inner.loop.end: 778 %outer.iv.next = add nuw nsw i64 %outer.iv, 1 779 %exit.outer = icmp eq i64 %outer.iv.next, %wide.trip.count60 780 br i1 %exit.outer, label %outer.loop.end, label %outer.loop 781 782outer.loop.end: 783 %outer.outer.iv.next = add nuw nsw i64 %outer.outer.iv, 1 784 %exit.outer.outer = icmp eq i64 %outer.outer.iv.next, %wide.trip.count68 785 br i1 %exit.outer.outer, label %exit, label %outer.outer.loop 786 787exit: 788 ret void 789} 790 791 792; Equivalent example in C: 793; void uncomputable_outer_tc(int32_t *dst, int32_t *src, char *str, int n) { 794; int i; 795; while (str[i] != '\0') { 796; for (int j = 0; j < n; j++) { 797; dst[(i * (n + 1)) + j] += src[(i * n) + j]; 798; } 799; i++; 800; } 801; } 802; 
Outer loop trip count is uncomputable so we shouldn't expand the ranges. 803 804; DEBUG-LABEL: 'uncomputable_outer_tc' 805; DEBUG: LAA: Found an analyzable loop: inner.loop 806; DEBUG: LAA: Adding RT check for range: 807; DEBUG-NEXT: Start: {%dst,+,(4 * (zext i32 (1 + %n) to i64))<nuw><nsw>}<%outer.loop> End: {((4 * (zext i32 %n to i64))<nuw><nsw> + %dst),+,(4 * (zext i32 (1 + %n) to i64))<nuw><nsw>}<%outer.loop> 808; DEBUG-NEXT: LAA: Adding RT check for range: 809; DEBUG-NEXT: Start: {%src,+,(4 * (zext i32 %n to i64))<nuw><nsw>}<%outer.loop> End: {((4 * (zext i32 %n to i64))<nuw><nsw> + %src),+,(4 * (zext i32 %n to i64))<nuw><nsw>}<%outer.loop> 810 811define void @uncomputable_outer_tc(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, ptr nocapture noundef readonly %str, i32 noundef %n) { 812; CHECK-LABEL: define void @uncomputable_outer_tc 813; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], ptr noundef readonly captures(none) [[STR:%.*]], i32 noundef [[N:%.*]]) { 814; CHECK-NEXT: entry: 815; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[STR]], align 1 816; CHECK-NEXT: [[CMP_NOT23:%.*]] = icmp ne i8 [[TMP0]], 0 817; CHECK-NEXT: [[CMP221:%.*]] = icmp sgt i32 [[N]], 0 818; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP_NOT23]], [[CMP221]] 819; CHECK-NEXT: br i1 [[OR_COND]], label [[OUTER_LOOP_PREHEADER:%.*]], label [[WHILE_END:%.*]] 820; CHECK: outer.loop.preheader: 821; CHECK-NEXT: [[ADD6:%.*]] = add nuw nsw i32 [[N]], 1 822; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[N]] to i64 823; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[ADD6]] to i64 824; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 825; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 826; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2 827; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[WIDE_TRIP_COUNT]], 2 828; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 829; CHECK: outer.loop: 830; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, 
[[OUTER_LOOP_PREHEADER]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ] 831; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP3]], [[OUTER_IV]] 832; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] 833; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP4]], [[TMP6]] 834; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP7]] 835; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP5]], [[OUTER_IV]] 836; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP8]] 837; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP4]], [[TMP8]] 838; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP9]] 839; CHECK-NEXT: [[TMP10:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP1]] 840; CHECK-NEXT: [[TMP11:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP2]] 841; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 842; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] 843; CHECK: vector.memcheck: 844; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]] 845; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]] 846; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] 847; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 848; CHECK: vector.ph: 849; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 850; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] 851; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 852; CHECK: vector.body: 853; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 854; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[INDEX]], 0 855; CHECK-NEXT: [[TMP13:%.*]] = add nsw i64 [[TMP12]], [[TMP10]] 856; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP13]] 857; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 858; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP15]], align 4, 
!alias.scope [[META39:![0-9]+]] 859; CHECK-NEXT: [[TMP16:%.*]] = add nsw i64 [[TMP12]], [[TMP11]] 860; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP16]] 861; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 862; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP18]], align 4, !alias.scope [[META42:![0-9]+]], !noalias [[META39]] 863; CHECK-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[WIDE_LOAD4]], [[WIDE_LOAD]] 864; CHECK-NEXT: store <4 x i32> [[TMP19]], ptr [[TMP18]], align 4, !alias.scope [[META42]], !noalias [[META39]] 865; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 866; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 867; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP44:![0-9]+]] 868; CHECK: middle.block: 869; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] 870; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] 871; CHECK: scalar.ph: 872; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] 873; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 874; CHECK: inner.loop: 875; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ] 876; CHECK-NEXT: [[TMP21:%.*]] = add nsw i64 [[INNER_IV]], [[TMP10]] 877; CHECK-NEXT: [[ARRAYIDX5_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP21]] 878; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[ARRAYIDX5_US]], align 4 879; CHECK-NEXT: [[TMP23:%.*]] = add nsw i64 [[INNER_IV]], [[TMP11]] 880; CHECK-NEXT: [[ARRAYIDX10_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP23]] 881; CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr [[ARRAYIDX10_US]], align 4 882; CHECK-NEXT: [[ADD11_US:%.*]] = add nsw i32 [[TMP24]], [[TMP22]] 883; CHECK-NEXT: store i32 [[ADD11_US]], ptr [[ARRAYIDX10_US]], align 4 884; 
CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1 885; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]] 886; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP45:![0-9]+]] 887; CHECK: inner.loop.exit: 888; CHECK-NEXT: [[OUTER_IV_NEXT]] = add i64 [[OUTER_IV]], 1 889; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i8, ptr [[STR]], i64 [[OUTER_IV_NEXT]] 890; CHECK-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAYIDX_US]], align 1 891; CHECK-NEXT: [[CMP_NOT_US:%.*]] = icmp eq i8 [[TMP25]], 0 892; CHECK-NEXT: br i1 [[CMP_NOT_US]], label [[WHILE_END_LOOPEXIT:%.*]], label [[OUTER_LOOP]] 893; CHECK: while.end.loopexit: 894; CHECK-NEXT: br label [[WHILE_END]] 895; CHECK: while.end: 896; CHECK-NEXT: ret void 897; 898entry: 899 %0 = load i8, ptr %str, align 1 900 %cmp.not23 = icmp ne i8 %0, 0 901 %cmp221 = icmp sgt i32 %n, 0 902 %or.cond = and i1 %cmp.not23, %cmp221 903 br i1 %or.cond, label %outer.loop.preheader, label %while.end 904 905outer.loop.preheader: 906 %add6 = add nuw nsw i32 %n, 1 907 %1 = zext i32 %n to i64 908 %2 = zext i32 %add6 to i64 909 %wide.trip.count = zext i32 %n to i64 910 br label %outer.loop 911 912outer.loop: 913 %outer.iv = phi i64 [ 0, %outer.loop.preheader ], [ %outer.iv.next, %inner.loop.exit ] 914 %3 = mul nsw i64 %outer.iv, %1 915 %4 = mul nsw i64 %outer.iv, %2 916 br label %inner.loop 917 918inner.loop: 919 %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] 920 %5 = add nsw i64 %inner.iv, %3 921 %arrayidx5.us = getelementptr inbounds i32, ptr %src, i64 %5 922 %6 = load i32, ptr %arrayidx5.us, align 4 923 %7 = add nsw i64 %inner.iv, %4 924 %arrayidx10.us = getelementptr inbounds i32, ptr %dst, i64 %7 925 %8 = load i32, ptr %arrayidx10.us, align 4 926 %add11.us = add nsw i32 %8, %6 927 store i32 %add11.us, ptr %arrayidx10.us, align 4 928 %inner.iv.next = add nuw nsw i64 %inner.iv, 1 929 %exitcond.not = icmp eq i64 
%inner.iv.next, %wide.trip.count 930 br i1 %exitcond.not, label %inner.loop.exit, label %inner.loop 931 932inner.loop.exit: 933 %outer.iv.next = add i64 %outer.iv, 1 934 %arrayidx.us = getelementptr inbounds i8, ptr %str, i64 %outer.iv.next 935 %9 = load i8, ptr %arrayidx.us, align 1 936 %cmp.not.us = icmp eq i8 %9, 0 937 br i1 %cmp.not.us, label %while.end, label %outer.loop 938 939while.end: 940 ret void 941} 942 943 944; Equivalent example in C: 945; void decreasing_inner_iv(int32_t *dst, int32_t *src, int stride1, int stride2, int m, int n) { 946; for (int i = 0; i < m; i++) { 947; for (int j = n; j >= 0; j--) { 948; dst[(i * stride1) + j] += src[(i * stride2) + j]; 949; } 950; } 951; } 952; Inner IV is decreasing, but this isn't a problem and we can still expand the 953; runtime checks correctly to cover the whole loop. 954 955; DEBUG-LABEL: 'decreasing_inner_iv' 956; DEBUG: LAA: Found an analyzable loop: inner.loop 957; DEBUG: LAA: Adding RT check for range: 958; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 959; DEBUG-NEXT: LAA: ... but need to check stride is positive: (4 * (sext i32 %stride1 to i64))<nsw> 960; DEBUG-NEXT: Start: %dst End: (4 + (4 * (zext i32 %n to i64))<nuw><nsw> + (4 * (sext i32 %stride1 to i64) * (-1 + (zext i32 %m to i64))<nsw>) + %dst) 961; DEBUG-NEXT: LAA: Adding RT check for range: 962; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 963; DEBUG-NEXT: LAA: ... 
but need to check stride is positive: (4 * (sext i32 %stride2 to i64))<nsw> 964; DEBUG-NEXT: Start: %src End: (4 + (4 * (zext i32 %n to i64))<nuw><nsw> + (4 * (sext i32 %stride2 to i64) * (-1 + (zext i32 %m to i64))<nsw>) + %src) 965 966define void @decreasing_inner_iv(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %stride1, i32 noundef %stride2, i32 noundef %m, i32 noundef %n) { 967; CHECK-LABEL: define void @decreasing_inner_iv 968; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[STRIDE1:%.*]], i32 noundef [[STRIDE2:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) { 969; CHECK-NEXT: entry: 970; CHECK-NEXT: [[CMP20:%.*]] = icmp sgt i32 [[M]], 0 971; CHECK-NEXT: [[CMP218:%.*]] = icmp sgt i32 [[N]], -1 972; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP20]], [[CMP218]] 973; CHECK-NEXT: br i1 [[OR_COND]], label [[OUTER_LOOP_PRE:%.*]], label [[EXIT:%.*]] 974; CHECK: outer.loop.pre: 975; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 976; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[STRIDE2]] to i64 977; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[STRIDE1]] to i64 978; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[M]] to i64 979; CHECK-NEXT: [[TMP3:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1 980; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], [[TMP2]] 981; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 982; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP0]], 2 983; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP5]], [[TMP6]] 984; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 4 985; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]] 986; CHECK-NEXT: [[TMP9:%.*]] = shl nsw i64 [[TMP2]], 2 987; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP3]], [[TMP1]] 988; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 2 989; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[TMP6]] 990; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], 4 991; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, 
ptr [[SRC]], i64 [[TMP13]] 992; CHECK-NEXT: [[TMP14:%.*]] = shl nsw i64 [[TMP1]], 2 993; CHECK-NEXT: [[TMP15:%.*]] = add nuw nsw i64 [[TMP0]], 1 994; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 995; CHECK: outer.loop: 996; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, [[OUTER_LOOP_PRE]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ] 997; CHECK-NEXT: [[TMP16:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP1]] 998; CHECK-NEXT: [[TMP17:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP2]] 999; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP15]], 4 1000; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] 1001; CHECK: vector.memcheck: 1002; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP1]] 1003; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] 1004; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] 1005; CHECK-NEXT: [[STRIDE_CHECK:%.*]] = icmp slt i64 [[TMP9]], 0 1006; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[FOUND_CONFLICT]], [[STRIDE_CHECK]] 1007; CHECK-NEXT: [[STRIDE_CHECK2:%.*]] = icmp slt i64 [[TMP14]], 0 1008; CHECK-NEXT: [[TMP19:%.*]] = or i1 [[TMP18]], [[STRIDE_CHECK2]] 1009; CHECK-NEXT: br i1 [[TMP19]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 1010; CHECK: vector.ph: 1011; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP15]], 4 1012; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP15]], [[N_MOD_VF]] 1013; CHECK-NEXT: [[TMP20:%.*]] = sub i64 [[TMP0]], [[N_VEC]] 1014; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 1015; CHECK: vector.body: 1016; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 1017; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 [[TMP0]], [[INDEX]] 1018; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[OFFSET_IDX]], 0 1019; CHECK-NEXT: [[TMP22:%.*]] = add nsw i64 [[TMP21]], [[TMP16]] 1020; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP22]] 1021; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 
1022; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 -3 1023; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META46:![0-9]+]] 1024; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 1025; CHECK-NEXT: [[TMP26:%.*]] = add nsw i64 [[TMP21]], [[TMP17]] 1026; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP26]] 1027; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP27]], i32 0 1028; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP28]], i32 -3 1029; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP29]], align 4, !alias.scope [[META49:![0-9]+]], !noalias [[META46]] 1030; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD3]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 1031; CHECK-NEXT: [[TMP30:%.*]] = add nsw <4 x i32> [[REVERSE4]], [[REVERSE]] 1032; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i32> [[TMP30]], <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0> 1033; CHECK-NEXT: store <4 x i32> [[REVERSE5]], ptr [[TMP29]], align 4, !alias.scope [[META49]], !noalias [[META46]] 1034; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 1035; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 1036; CHECK-NEXT: br i1 [[TMP31]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP51:![0-9]+]] 1037; CHECK: middle.block: 1038; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP15]], [[N_VEC]] 1039; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] 1040; CHECK: scalar.ph: 1041; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[TMP20]], [[MIDDLE_BLOCK]] ], [ [[TMP0]], [[VECTOR_MEMCHECK]] ], [ [[TMP0]], [[OUTER_LOOP]] ] 1042; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 1043; CHECK: inner.loop: 1044; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ 
[[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ] 1045; CHECK-NEXT: [[TMP32:%.*]] = add nsw i64 [[INNER_IV]], [[TMP16]] 1046; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP32]] 1047; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4 1048; CHECK-NEXT: [[TMP34:%.*]] = add nsw i64 [[INNER_IV]], [[TMP17]] 1049; CHECK-NEXT: [[ARRAYIDX8_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP34]] 1050; CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[ARRAYIDX8_US]], align 4 1051; CHECK-NEXT: [[ADD9_US:%.*]] = add nsw i32 [[TMP35]], [[TMP33]] 1052; CHECK-NEXT: store i32 [[ADD9_US]], ptr [[ARRAYIDX8_US]], align 4 1053; CHECK-NEXT: [[INNER_IV_NEXT]] = add nsw i64 [[INNER_IV]], -1 1054; CHECK-NEXT: [[CMP2_US:%.*]] = icmp sgt i64 [[INNER_IV]], 0 1055; CHECK-NEXT: br i1 [[CMP2_US]], label [[INNER_LOOP]], label [[INNER_LOOP_EXIT]], !llvm.loop [[LOOP52:![0-9]+]] 1056; CHECK: inner.loop.exit: 1057; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 1058; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_TRIP_COUNT]] 1059; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[OUTER_LOOP_EXIT:%.*]], label [[OUTER_LOOP]] 1060; CHECK: outer.loop.exit: 1061; CHECK-NEXT: br label [[EXIT]] 1062; CHECK: exit: 1063; CHECK-NEXT: ret void 1064; 1065entry: 1066 %cmp20 = icmp sgt i32 %m, 0 1067 %cmp218 = icmp sgt i32 %n, -1 1068 %or.cond = and i1 %cmp20, %cmp218 1069 br i1 %or.cond, label %outer.loop.pre, label %exit 1070 1071outer.loop.pre: 1072 %0 = zext i32 %n to i64 1073 %1 = sext i32 %stride2 to i64 1074 %2 = sext i32 %stride1 to i64 1075 %wide.trip.count = zext i32 %m to i64 1076 br label %outer.loop 1077 1078outer.loop: 1079 %outer.iv = phi i64 [ 0, %outer.loop.pre ], [ %outer.iv.next, %inner.loop.exit ] 1080 %3 = mul nsw i64 %outer.iv, %1 1081 %4 = mul nsw i64 %outer.iv, %2 1082 br label %inner.loop 1083 1084inner.loop: 1085 %inner.iv = phi i64 [ %0, %outer.loop ], [ %inner.iv.next, %inner.loop ] 1086 %5 = add nsw 
i64 %inner.iv, %3 1087 %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %5 1088 %6 = load i32, ptr %arrayidx.us, align 4 1089 %7 = add nsw i64 %inner.iv, %4 1090 %arrayidx8.us = getelementptr inbounds i32, ptr %dst, i64 %7 1091 %8 = load i32, ptr %arrayidx8.us, align 4 1092 %add9.us = add nsw i32 %8, %6 1093 store i32 %add9.us, ptr %arrayidx8.us, align 4 1094 %inner.iv.next = add nsw i64 %inner.iv, -1 1095 %cmp2.us = icmp sgt i64 %inner.iv, 0 1096 br i1 %cmp2.us, label %inner.loop, label %inner.loop.exit 1097 1098inner.loop.exit: 1099 %outer.iv.next = add nuw nsw i64 %outer.iv, 1 1100 %exitcond.not = icmp eq i64 %outer.iv.next, %wide.trip.count 1101 br i1 %exitcond.not, label %outer.loop.exit, label %outer.loop 1102 1103outer.loop.exit: 1104 br label %exit 1105 1106exit: 1107 ret void 1108} 1109 1110 1111; Equivalent example in C: 1112; void decreasing_outer_iv(int32_t *dst, int32_t *src, int stride1, int stride2, int m, int n) { 1113; for (int i = m - 1; i >= 0; i--) { 1114; for (int j = 0; j <= n; j++) { 1115; dst[(i * stride1) + j] += src[(i * stride2) + j]; 1116; } 1117; } 1118; } 1119; Outer IV is decreasing, but the direction of memory accesses also depends 1120; upon the signedness of stride1. 1121 1122; DEBUG-LABEL: 'decreasing_outer_iv' 1123; DEBUG: LAA: Found an analyzable loop: inner.loop 1124; DEBUG: LAA: Adding RT check for range: 1125; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 1126; DEBUG-NEXT: LAA: ... but need to check stride is positive: (-4 * (sext i32 %stride1 to i64))<nsw> 1127; DEBUG-NEXT: Start: ((4 * (zext i32 %m to i64) * (sext i32 %stride1 to i64)) + %dst) End: ((4 * (zext i32 (1 + %n) to i64))<nuw><nsw> + (4 * (sext i32 %stride1 to i64))<nsw> + %dst) 1128; DEBUG-NEXT: LAA: Adding RT check for range: 1129; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 1130; DEBUG-NEXT: LAA: ... 
but need to check stride is positive: (-4 * (sext i32 %stride2 to i64))<nsw> 1131; DEBUG-NEXT: Start: ((4 * (zext i32 %m to i64) * (sext i32 %stride2 to i64)) + %src) End: ((4 * (zext i32 (1 + %n) to i64))<nuw><nsw> + (4 * (sext i32 %stride2 to i64))<nsw> + %src) 1132 1133define void @decreasing_outer_iv(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %stride1, i32 noundef %stride2, i32 noundef %m, i32 noundef %n) { 1134; CHECK-LABEL: define void @decreasing_outer_iv 1135; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[STRIDE1:%.*]], i32 noundef [[STRIDE2:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) { 1136; CHECK-NEXT: entry: 1137; CHECK-NEXT: [[CMP21:%.*]] = icmp slt i32 [[M]], 1 1138; CHECK-NEXT: [[CMP2_NOT18:%.*]] = icmp slt i32 [[N]], 0 1139; CHECK-NEXT: [[OR_COND:%.*]] = or i1 [[CMP21]], [[CMP2_NOT18]] 1140; CHECK-NEXT: br i1 [[OR_COND]], label [[EXIT:%.*]], label [[OUTER_LOOP_PRE:%.*]] 1141; CHECK: outer.loop.pre: 1142; CHECK-NEXT: [[TMP0:%.*]] = add nuw i32 [[N]], 1 1143; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[M]] to i64 1144; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[STRIDE1]] to i64 1145; CHECK-NEXT: [[TMP3:%.*]] = sext i32 [[STRIDE2]] to i64 1146; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[TMP0]] to i64 1147; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP2]], [[TMP1]] 1148; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 1149; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP5]] 1150; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[TMP2]], 2 1151; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2 1152; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP6]], [[TMP7]] 1153; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]] 1154; CHECK-NEXT: [[TMP9:%.*]] = mul nsw i64 [[TMP2]], -4 1155; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP3]], [[TMP1]] 1156; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[TMP10]], 2 1157; CHECK-NEXT: 
[[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP11]] 1158; CHECK-NEXT: [[TMP12:%.*]] = shl nsw i64 [[TMP3]], 2 1159; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[TMP12]], [[TMP7]] 1160; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] 1161; CHECK-NEXT: [[TMP14:%.*]] = mul nsw i64 [[TMP3]], -4 1162; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 1163; CHECK: outer.loop: 1164; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ [[TMP1]], [[OUTER_LOOP_PRE]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ] 1165; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nsw i64 [[OUTER_IV]], -1 1166; CHECK-NEXT: [[TMP15:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP3]] 1167; CHECK-NEXT: [[TMP16:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP2]] 1168; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 1169; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] 1170; CHECK: vector.memcheck: 1171; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[SCEVGEP]], [[SCEVGEP3]] 1172; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SCEVGEP2]], [[SCEVGEP1]] 1173; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] 1174; CHECK-NEXT: [[STRIDE_CHECK:%.*]] = icmp slt i64 [[TMP9]], 0 1175; CHECK-NEXT: [[TMP17:%.*]] = or i1 [[FOUND_CONFLICT]], [[STRIDE_CHECK]] 1176; CHECK-NEXT: [[STRIDE_CHECK4:%.*]] = icmp slt i64 [[TMP14]], 0 1177; CHECK-NEXT: [[TMP18:%.*]] = or i1 [[TMP17]], [[STRIDE_CHECK4]] 1178; CHECK-NEXT: br i1 [[TMP18]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 1179; CHECK: vector.ph: 1180; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 1181; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] 1182; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 1183; CHECK: vector.body: 1184; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 1185; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX]], 0 1186; CHECK-NEXT: [[TMP20:%.*]] = add nsw i64 [[TMP19]], [[TMP15]] 1187; 
CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP20]] 1188; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i32 0 1189; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4, !alias.scope [[META53:![0-9]+]] 1190; CHECK-NEXT: [[TMP23:%.*]] = add nsw i64 [[TMP19]], [[TMP16]] 1191; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP23]] 1192; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i32 0 1193; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4, !alias.scope [[META56:![0-9]+]], !noalias [[META53]] 1194; CHECK-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD5]], [[WIDE_LOAD]] 1195; CHECK-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP25]], align 4, !alias.scope [[META56]], !noalias [[META53]] 1196; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 1197; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 1198; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP58:![0-9]+]] 1199; CHECK: middle.block: 1200; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] 1201; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], label [[SCALAR_PH]] 1202; CHECK: scalar.ph: 1203; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ] 1204; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 1205; CHECK: inner.loop: 1206; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ] 1207; CHECK-NEXT: [[TMP28:%.*]] = add nsw i64 [[INNER_IV]], [[TMP15]] 1208; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP28]] 1209; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 1210; CHECK-NEXT: [[TMP30:%.*]] = add nsw i64 [[INNER_IV]], [[TMP16]] 1211; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, 
ptr [[DST]], i64 [[TMP30]] 1212; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 1213; CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP31]], [[TMP29]] 1214; CHECK-NEXT: store i32 [[ADD9]], ptr [[ARRAYIDX8]], align 4 1215; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1 1216; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]] 1217; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP59:![0-9]+]] 1218; CHECK: inner.loop.exit: 1219; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i64 [[OUTER_IV]], 1 1220; CHECK-NEXT: br i1 [[CMP]], label [[OUTER_LOOP]], label [[OUTER_LOOP_EXIT:%.*]] 1221; CHECK: outer.loop.exit: 1222; CHECK-NEXT: br label [[EXIT]] 1223; CHECK: exit: 1224; CHECK-NEXT: ret void 1225; 1226entry: 1227 %cmp21 = icmp slt i32 %m, 1 1228 %cmp2.not18 = icmp slt i32 %n, 0 1229 %or.cond = or i1 %cmp21, %cmp2.not18 1230 br i1 %or.cond, label %exit, label %outer.loop.pre 1231 1232outer.loop.pre: 1233 %0 = add nuw i32 %n, 1 1234 %1 = zext i32 %m to i64 1235 %2 = sext i32 %stride1 to i64 1236 %3 = sext i32 %stride2 to i64 1237 %wide.trip.count = zext i32 %0 to i64 1238 br label %outer.loop 1239 1240outer.loop: 1241 %outer.iv = phi i64 [ %1, %outer.loop.pre ], [ %outer.iv.next, %inner.loop.exit ] 1242 %outer.iv.next = add nsw i64 %outer.iv, -1 1243 %4 = mul nsw i64 %outer.iv, %3 1244 %5 = mul nsw i64 %outer.iv, %2 1245 br label %inner.loop 1246 1247inner.loop: 1248 %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] 1249 %6 = add nsw i64 %inner.iv, %4 1250 %arrayidx = getelementptr inbounds i32, ptr %src, i64 %6 1251 %7 = load i32, ptr %arrayidx, align 4 1252 %8 = add nsw i64 %inner.iv, %5 1253 %arrayidx8 = getelementptr inbounds i32, ptr %dst, i64 %8 1254 %9 = load i32, ptr %arrayidx8, align 4 1255 %add9 = add nsw i32 %9, %7 1256 store i32 %add9, ptr %arrayidx8, align 4 1257 %inner.iv.next = add nuw nsw i64 %inner.iv, 1 1258 %exitcond.not = icmp 
eq i64 %inner.iv.next, %wide.trip.count 1259 br i1 %exitcond.not, label %inner.loop.exit, label %inner.loop 1260 1261inner.loop.exit: 1262 %cmp = icmp sgt i64 %outer.iv, 1 1263 br i1 %cmp, label %outer.loop, label %outer.loop.exit 1264 1265outer.loop.exit: 1266 br label %exit 1267 1268exit: 1269 ret void 1270} 1271 1272 1273; Equivalent example in C: 1274; void foo(int32_t *dst, int32_t *src, int stride1, int stride2, int m, int n) { 1275; for (int i = 0; i < m; i++) { 1276; for (int j = 0; j < n; j++) { 1277; dst[(i * (n + 1)) + (j * stride1)] += src[(i * n) + (j * stride2)]; 1278; } 1279; } 1280; } 1281 1282 1283; DEBUG-LABEL: 'unknown_inner_stride' 1284; DEBUG: LAA: Found an analyzable loop: inner.loop 1285; DEBUG: LAA: Adding RT check for range: 1286; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 1287; DEBUG-NEXT: Start: %dst End: ((4 * (zext i32 %n to i64))<nuw><nsw> + (4 * (zext i32 (1 + %n) to i64) * (-1 + (zext i32 %m to i64))<nsw>) + %dst) 1288; DEBUG-NEXT: LAA: Adding RT check for range: 1289; DEBUG-NEXT: LAA: Expanded RT check for range to include outer loop in order to permit hoisting 1290; DEBUG-NEXT: Start: %src End: ((4 * (zext i32 %m to i64) * (zext i32 %n to i64)) + %src) 1291 1292define void @unknown_inner_stride(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i32 noundef %stride1, i32 noundef %stride2, i32 noundef %m, i32 noundef %n) { 1293; CHECK-LABEL: define void @unknown_inner_stride 1294; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i32 noundef [[STRIDE1:%.*]], i32 noundef [[STRIDE2:%.*]], i32 noundef [[M:%.*]], i32 noundef [[N:%.*]]) { 1295; CHECK-NEXT: entry: 1296; CHECK-NEXT: [[CMP26:%.*]] = icmp sgt i32 [[M]], 0 1297; CHECK-NEXT: [[CMP224:%.*]] = icmp sgt i32 [[N]], 0 1298; CHECK-NEXT: [[OR_COND:%.*]] = and i1 [[CMP26]], [[CMP224]] 1299; CHECK-NEXT: br i1 [[OR_COND]], label [[OUTER_LOOP_PREHEADER:%.*]], label 
[[EXIT:%.*]] 1300; CHECK: outer.loop.preheader: 1301; CHECK-NEXT: [[ADD6:%.*]] = add nuw nsw i32 [[N]], 1 1302; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[STRIDE2]] to i64 1303; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[STRIDE1]] to i64 1304; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[N]] to i64 1305; CHECK-NEXT: [[TMP3:%.*]] = zext i32 [[ADD6]] to i64 1306; CHECK-NEXT: [[WIDE_TRIP_COUNT39:%.*]] = zext i32 [[M]] to i64 1307; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64 1308; CHECK-NEXT: [[TMP4:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT39]], -1 1309; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], [[TMP3]] 1310; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 1311; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[WIDE_TRIP_COUNT]], 2 1312; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP6]], [[TMP7]] 1313; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]] 1314; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[WIDE_TRIP_COUNT]], [[WIDE_TRIP_COUNT39]] 1315; CHECK-NEXT: [[TMP10:%.*]] = shl i64 [[TMP9]], 2 1316; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP10]] 1317; CHECK-NEXT: br label [[OUTER_LOOP:%.*]] 1318; CHECK: outer.loop: 1319; CHECK-NEXT: [[OUTER_IV:%.*]] = phi i64 [ 0, [[OUTER_LOOP_PREHEADER]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_LOOP_EXIT:%.*]] ] 1320; CHECK-NEXT: [[TMP11:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP2]] 1321; CHECK-NEXT: [[TMP12:%.*]] = mul nsw i64 [[OUTER_IV]], [[TMP3]] 1322; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 4 1323; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_SCEVCHECK:%.*]] 1324; CHECK: vector.scevcheck: 1325; CHECK-NEXT: [[IDENT_CHECK:%.*]] = icmp ne i32 [[STRIDE1]], 1 1326; CHECK-NEXT: [[IDENT_CHECK1:%.*]] = icmp ne i32 [[STRIDE2]], 1 1327; CHECK-NEXT: [[TMP13:%.*]] = or i1 [[IDENT_CHECK]], [[IDENT_CHECK1]] 1328; CHECK-NEXT: br i1 [[TMP13]], label [[SCALAR_PH]], label [[VECTOR_MEMCHECK:%.*]] 1329; CHECK: vector.memcheck: 1330; CHECK-NEXT: [[BOUND0:%.*]] 
= icmp ult ptr [[DST]], [[SCEVGEP2]] 1331; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] 1332; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] 1333; CHECK-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] 1334; CHECK: vector.ph: 1335; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 4 1336; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] 1337; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] 1338; CHECK: vector.body: 1339; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] 1340; CHECK-NEXT: [[TMP14:%.*]] = add i64 [[INDEX]], 0 1341; CHECK-NEXT: [[TMP15:%.*]] = add nsw i64 [[TMP14]], [[TMP11]] 1342; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP15]] 1343; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 1344; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP17]], align 4, !alias.scope [[META60:![0-9]+]] 1345; CHECK-NEXT: [[TMP18:%.*]] = add nsw i64 [[TMP14]], [[TMP12]] 1346; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP18]] 1347; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 1348; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP20]], align 4, !alias.scope [[META63:![0-9]+]], !noalias [[META60]] 1349; CHECK-NEXT: [[TMP21:%.*]] = add nsw <4 x i32> [[WIDE_LOAD3]], [[WIDE_LOAD]] 1350; CHECK-NEXT: store <4 x i32> [[TMP21]], ptr [[TMP20]], align 4, !alias.scope [[META63]], !noalias [[META60]] 1351; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 1352; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] 1353; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP65:![0-9]+]] 1354; CHECK: middle.block: 1355; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] 1356; CHECK-NEXT: br i1 [[CMP_N]], label [[INNER_LOOP_EXIT]], 
label [[SCALAR_PH]] 1357; CHECK: scalar.ph: 1358; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[VECTOR_SCEVCHECK]] ], [ 0, [[OUTER_LOOP]] ] 1359; CHECK-NEXT: br label [[INNER_LOOP:%.*]] 1360; CHECK: inner.loop: 1361; CHECK-NEXT: [[INNER_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_LOOP]] ] 1362; CHECK-NEXT: [[TMP23:%.*]] = mul nsw i64 [[INNER_IV]], [[TMP0]] 1363; CHECK-NEXT: [[TMP24:%.*]] = add nsw i64 [[TMP23]], [[TMP11]] 1364; CHECK-NEXT: [[ARRAYIDX_US:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP24]] 1365; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[ARRAYIDX_US]], align 4 1366; CHECK-NEXT: [[TMP26:%.*]] = mul nsw i64 [[INNER_IV]], [[TMP1]] 1367; CHECK-NEXT: [[TMP27:%.*]] = add nsw i64 [[TMP26]], [[TMP12]] 1368; CHECK-NEXT: [[ARRAYIDX11_US:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP27]] 1369; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[ARRAYIDX11_US]], align 4 1370; CHECK-NEXT: [[ADD12_US:%.*]] = add nsw i32 [[TMP28]], [[TMP25]] 1371; CHECK-NEXT: store i32 [[ADD12_US]], ptr [[ARRAYIDX11_US]], align 4 1372; CHECK-NEXT: [[INNER_IV_NEXT]] = add nuw nsw i64 [[INNER_IV]], 1 1373; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INNER_IV_NEXT]], [[WIDE_TRIP_COUNT]] 1374; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[INNER_LOOP_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP66:![0-9]+]] 1375; CHECK: inner.loop.exit: 1376; CHECK-NEXT: [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1 1377; CHECK-NEXT: [[EXITCOND40_NOT:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[WIDE_TRIP_COUNT39]] 1378; CHECK-NEXT: br i1 [[EXITCOND40_NOT]], label [[EXIT_LOOPEXIT:%.*]], label [[OUTER_LOOP]] 1379; CHECK: exit.loopexit: 1380; CHECK-NEXT: br label [[EXIT]] 1381; CHECK: exit: 1382; CHECK-NEXT: ret void 1383; 1384entry: 1385 %cmp26 = icmp sgt i32 %m, 0 1386 %cmp224 = icmp sgt i32 %n, 0 1387 %or.cond = and i1 %cmp26, %cmp224 1388 br i1 %or.cond, label 
%outer.loop.preheader, label %exit 1389 1390outer.loop.preheader: 1391 %add6 = add nuw nsw i32 %n, 1 1392 %0 = sext i32 %stride2 to i64 1393 %1 = sext i32 %stride1 to i64 1394 %2 = zext i32 %n to i64 1395 %3 = zext i32 %add6 to i64 1396 %wide.trip.count39 = zext i32 %m to i64 1397 %wide.trip.count = zext i32 %n to i64 1398 br label %outer.loop 1399 1400outer.loop: 1401 %outer.iv = phi i64 [ 0, %outer.loop.preheader ], [ %outer.iv.next, %inner.loop.exit ] 1402 %4 = mul nsw i64 %outer.iv, %2 1403 %5 = mul nsw i64 %outer.iv, %3 1404 br label %inner.loop 1405 1406inner.loop: 1407 %inner.iv = phi i64 [ 0, %outer.loop ], [ %inner.iv.next, %inner.loop ] 1408 %6 = mul nsw i64 %inner.iv, %0 1409 %7 = add nsw i64 %6, %4 1410 %arrayidx.us = getelementptr inbounds i32, ptr %src, i64 %7 1411 %8 = load i32, ptr %arrayidx.us, align 4 1412 %9 = mul nsw i64 %inner.iv, %1 1413 %10 = add nsw i64 %9, %5 1414 %arrayidx11.us = getelementptr inbounds i32, ptr %dst, i64 %10 1415 %11 = load i32, ptr %arrayidx11.us, align 4 1416 %add12.us = add nsw i32 %11, %8 1417 store i32 %add12.us, ptr %arrayidx11.us, align 4 1418 %inner.iv.next = add nuw nsw i64 %inner.iv, 1 1419 %exitcond.not = icmp eq i64 %inner.iv.next, %wide.trip.count 1420 br i1 %exitcond.not, label %inner.loop.exit, label %inner.loop 1421 1422inner.loop.exit: 1423 %outer.iv.next = add nuw nsw i64 %outer.iv, 1 1424 %exitcond40.not = icmp eq i64 %outer.iv.next, %wide.trip.count39 1425 br i1 %exitcond40.not, label %exit, label %outer.loop 1426 1427exit: 1428 ret void 1429} 1430 1431 1432; Test case where the AddRec for the pointers in the inner loop have the AddRec 1433; of the outer loop as start value. It is sufficient to subtract the start 1434; values (%dst, %src) of the outer AddRecs. 
define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec(ptr nocapture noundef %dst, ptr nocapture noundef readonly %src, i64 noundef %m, i64 noundef %n) {
; CHECK-LABEL: define void @nested_loop_start_of_inner_ptr_addrec_is_same_outer_addrec
; CHECK-SAME: (ptr noundef captures(none) [[DST:%.*]], ptr noundef readonly captures(none) [[SRC:%.*]], i64 noundef [[M:%.*]], i64 noundef [[N:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[SRC2:%.*]] = ptrtoint ptr [[SRC]] to i64
; CHECK-NEXT:    [[DST1:%.*]] = ptrtoint ptr [[DST]] to i64
; CHECK-NEXT:    [[TMP0:%.*]] = sub i64 [[DST1]], [[SRC2]]
; CHECK-NEXT:    br label [[OUTER_LOOP:%.*]]
; CHECK:       outer.loop:
; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[INNER_EXIT:%.*]] ]
; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[OUTER_IV]], [[N]]
; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], 4
; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
; CHECK-NEXT:    br i1 [[DIFF_CHECK]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N]], 4
; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], [[MUL]]
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP2]]
; CHECK-NEXT:    [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], splat (i32 10)
; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[TMP7]], align 4
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
; CHECK-NEXT:    [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP67:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT:    br i1 [[CMP_N]], label [[INNER_EXIT]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_LOOP]] ]
; CHECK-NEXT:    br label [[INNER_LOOP:%.*]]
; CHECK:       inner.loop:
; CHECK-NEXT:    [[IV_INNER:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_INNER_NEXT:%.*]], [[INNER_LOOP]] ]
; CHECK-NEXT:    [[IDX:%.*]] = add nuw nsw i64 [[IV_INNER]], [[MUL]]
; CHECK-NEXT:    [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[IDX]]
; CHECK-NEXT:    [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4
; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IDX]]
; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[L]], 10
; CHECK-NEXT:    store i32 [[ADD]], ptr [[GEP_DST]], align 4
; CHECK-NEXT:    [[IV_INNER_NEXT]] = add nuw nsw i64 [[IV_INNER]], 1
; CHECK-NEXT:    [[INNER_EXIT_COND:%.*]] = icmp eq i64 [[IV_INNER_NEXT]], [[N]]
; CHECK-NEXT:    br i1 [[INNER_EXIT_COND]], label [[INNER_EXIT]], label [[INNER_LOOP]], !llvm.loop [[LOOP68:![0-9]+]]
; CHECK:       inner.exit:
; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i64 [[OUTER_IV]], 1
; CHECK-NEXT:    [[OUTER_EXIT_COND:%.*]] = icmp eq i64 [[OUTER_IV_NEXT]], [[M]]
; CHECK-NEXT:    br i1 [[OUTER_EXIT_COND]], label [[OUTER_EXIT:%.*]], label [[OUTER_LOOP]]
; CHECK:       outer.exit:
; CHECK-NEXT:    ret void
;
entry:
  br label %outer.loop

outer.loop:
  %outer.iv = phi i64 [ 0, %entry ], [ %outer.iv.next, %inner.exit ]
  ; Start offset of this outer iteration: %outer.iv * %n. The inner-loop
  ; pointers are AddRecs whose start value is this outer-loop AddRec, so the
  ; runtime check reduces to a single %dst - %src distance check (see the
  ; DIFF_CHECK in the CHECK lines above).
  %mul = mul nsw i64 %outer.iv, %n
  br label %inner.loop

inner.loop:
  %iv.inner = phi i64 [ 0, %outer.loop ], [ %iv.inner.next, %inner.loop ]
  ; dst[%idx] = src[%idx] + 10, with %idx = %iv.inner + %mul.
  %idx = add nuw nsw i64 %iv.inner, %mul
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %idx
  %l = load i32, ptr %gep.src, align 4
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %idx
  %add = add nsw i32 %l, 10
  store i32 %add, ptr %gep.dst, align 4
  %iv.inner.next = add nuw nsw i64 %iv.inner, 1
  %inner.exit.cond = icmp eq i64 %iv.inner.next, %n
  br i1 %inner.exit.cond, label %inner.exit, label %inner.loop

inner.exit:
  %outer.iv.next = add nuw nsw i64 %outer.iv, 1
  %outer.exit.cond = icmp eq i64 %outer.iv.next, %m
  br i1 %outer.exit.cond, label %outer.exit, label %outer.loop

outer.exit:
  ret void
}

; The stride for the access in the inner loop is known to be non-negative via
; loop guards.
define void @stride_check_known_via_loop_guard(ptr %C, ptr %A, i32 %Acols) {
; CHECK-LABEL: define void @stride_check_known_via_loop_guard
; CHECK-SAME: (ptr [[C:%.*]], ptr [[A:%.*]], i32 [[ACOLS:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[PRE_C:%.*]] = icmp ugt i32 [[ACOLS]], 0
; CHECK-NEXT:    br i1 [[PRE_C]], label [[EXIT:%.*]], label [[OUTER_HEADER_PREHEADER:%.*]]
; CHECK:       outer.header.preheader:
; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, ptr [[C]], i64 8000
; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, ptr [[A]], i64 8
; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
; CHECK:       outer.header:
; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i32 [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH:%.*]] ], [ 0, [[OUTER_HEADER_PREHEADER]] ]
; CHECK-NEXT:    [[MUL_US:%.*]] = mul i32 [[OUTER_IV]], [[ACOLS]]
; CHECK-NEXT:    [[ARRAYIDX_US:%.*]] = getelementptr double, ptr [[A]], i32 [[MUL_US]]
; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
; CHECK:       vector.memcheck:
; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ult ptr [[C]], [[SCEVGEP1]]
; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ult ptr [[A]], [[SCEVGEP]]
; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]]
; CHECK-NEXT:    br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
; CHECK:       vector.ph:
; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
; CHECK:       vector.body:
; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[TMP0]]
; CHECK-NEXT:    [[TMP2:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8, !alias.scope [[META69:![0-9]+]]
; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x double> poison, double [[TMP2]], i64 0
; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x double> [[BROADCAST_SPLATINSERT]], <4 x double> poison, <4 x i32> zeroinitializer
; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i32 0
; CHECK-NEXT:    store <4 x double> [[BROADCAST_SPLAT]], ptr [[TMP3]], align 8, !alias.scope [[META72:![0-9]+]], !noalias [[META69]]
; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP74:![0-9]+]]
; CHECK:       middle.block:
; CHECK-NEXT:    br i1 true, label [[OUTER_LATCH]], label [[SCALAR_PH]]
; CHECK:       scalar.ph:
; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ 1000, [[MIDDLE_BLOCK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[OUTER_HEADER]] ]
; CHECK-NEXT:    br label [[INNER:%.*]]
; CHECK:       inner:
; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER]] ]
; CHECK-NEXT:    [[GEP_C:%.*]] = getelementptr inbounds double, ptr [[C]], i32 [[INNER_IV]]
; CHECK-NEXT:    [[L:%.*]] = load double, ptr [[ARRAYIDX_US]], align 8
; CHECK-NEXT:    store double [[L]], ptr [[GEP_C]], align 8
; CHECK-NEXT:    [[INNER_IV_NEXT]] = add i32 [[INNER_IV]], 1
; CHECK-NEXT:    [[INNER_C:%.*]] = icmp eq i32 [[INNER_IV_NEXT]], 1000
; CHECK-NEXT:    br i1 [[INNER_C]], label [[OUTER_LATCH]], label [[INNER]], !llvm.loop [[LOOP75:![0-9]+]]
; CHECK:       outer.latch:
; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add i32 [[OUTER_IV]], 1
; CHECK-NEXT:    [[OUTER_C:%.*]] = icmp ult i32 [[OUTER_IV]], 128
; CHECK-NEXT:    br i1 [[OUTER_C]], label [[EXIT_LOOPEXIT:%.*]], label [[OUTER_HEADER]]
; CHECK:       exit.loopexit:
; CHECK-NEXT:    br label [[EXIT]]
; CHECK:       exit:
; CHECK-NEXT:    ret void
;
entry:
  ; Loop nest is only entered when the 'ugt 0' guard is false, i.e. when
  ; %Acols == 0; that guard is what lets LAA know the %mul.us-based stride is
  ; non-negative without emitting a runtime stride check (no STRIDE_CHECK
  ; appears in the CHECK lines above, only the plain memcheck bounds).
  %pre.c = icmp ugt i32 %Acols, 0
  br i1 %pre.c, label %exit, label %outer.header

outer.header:
  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv.next, %outer.latch ]
  %mul.us = mul i32 %outer.iv, %Acols
  ; %arrayidx.us is invariant in the inner loop; its load is broadcast across
  ; the vector lanes (BROADCAST_SPLAT in the CHECK lines).
  %arrayidx.us = getelementptr double, ptr %A, i32 %mul.us
  br label %inner

inner:
  ; C[0..999] = A[%mul.us]: loop-invariant scalar load, unit-stride store.
  %inner.iv = phi i32 [ 0, %outer.header ], [ %inner.iv.next, %inner ]
  %gep.C = getelementptr inbounds double, ptr %C, i32 %inner.iv
  %l = load double, ptr %arrayidx.us, align 8
  store double %l, ptr %gep.C, align 8
  %inner.iv.next = add i32 %inner.iv, 1
  %inner.c = icmp eq i32 %inner.iv.next, 1000
  br i1 %inner.c, label %outer.latch, label %inner

outer.latch:
  %outer.iv.next = add i32 %outer.iv, 1
  ; NOTE(review): this exits when %outer.iv < 128 — the polarity looks
  ; inverted relative to a typical latch, but the autogenerated CHECK lines
  ; match it verbatim, so it appears intentional for the test; confirm before
  ; regenerating.
  %outer.c = icmp ult i32 %outer.iv, 128
  br i1 %outer.c, label %exit, label %outer.header

exit:
  ret void
}