; RUN: opt < %s -passes=loop-vectorize,dce -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -S \
; RUN:   | FileCheck %s --check-prefix=CHECK-VECTOR
; RUN: opt < %s -passes=loop-vectorize,dce -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-interleave=0 -S \
; RUN:   | FileCheck %s --check-prefix=CHECK-SCALAR

; This test pins down how many times the loop vectorizer interleaves
; ("unrolls") small loops on x86 (corei7-avx). It is run in two configurations:
;   * vector width forced to 4, checked with the CHECK-VECTOR prefix;
;   * vector width forced to 1, checked with the CHECK-SCALAR prefix.
; The interleave count is left at 0 in both runs so that the vectorizer picks it
; itself. Each function below runs the same "A[i] += 6" style loop with a
; different trip count (TC), and the checks count the loads/stores that remain
; in the output to infer the chosen interleave count (IC).

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"

; We don't unroll this loop because it has a small constant trip count
; that is not profitable for generating a scalar epilogue
;
; CHECK-VECTOR-LABEL: @foo_trip_count_8(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_8(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
define void @foo_trip_count_8(ptr nocapture %A) nounwind uwtable ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %1 = load i32, ptr %0, align 4
  %2 = add nsw i32 %1, 6
  store i32 %2, ptr %0, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 8
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; We should unroll this loop 4 times since TC being a multiple of VF means
; that the epilogue loop may not need to run, making it profitable for
; the vector loop to run even once
;
; CHECK-VECTOR-LABEL: @foo_trip_count_16(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_16(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
define void @foo_trip_count_16(ptr nocapture %A) nounwind uwtable ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %1 = load i32, ptr %0, align 4
  %2 = add nsw i32 %1, 6
  store i32 %2, ptr %0, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 16
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; We should unroll this loop four times since unrolling it twice
; will produce the same epilogue TC of 1, making larger unroll count
; more profitable
;
; CHECK-VECTOR-LABEL: @foo_trip_count_17(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_17(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
define void @foo_trip_count_17(ptr nocapture %A) nounwind uwtable ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %1 = load i32, ptr %0, align 4
  %2 = add nsw i32 %1, 6
  store i32 %2, ptr %0, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 17
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; We should unroll this loop twice since unrolling four times will
; create an epilogue loop of TC 8, while unrolling it twice will
; eliminate the epilogue loop altogether
;
; CHECK-VECTOR-LABEL: @foo_trip_count_24(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_24(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
define void @foo_trip_count_24(ptr nocapture %A) nounwind uwtable ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %1 = load i32, ptr %0, align 4
  %2 = add nsw i32 %1, 6
  store i32 %2, ptr %0, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 24
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; We should unroll this loop twice since TC not being a multiple of VF may require
; the epilogue loop to run, making it profitable when the vector loop runs
; at least twice.
;
; CHECK-VECTOR-LABEL: @foo_trip_count_25(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_25(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
define void @foo_trip_count_25(ptr nocapture %A) nounwind uwtable ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %1 = load i32, ptr %0, align 4
  %2 = add nsw i32 %1, 6
  store i32 %2, ptr %0, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 25
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; We should unroll this loop 4 times since TC not being a multiple of VF may require
; the epilogue loop to run, making it profitable when the vector loop runs
; at least twice.
;
; CHECK-VECTOR-LABEL: @foo_trip_count_33(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_33(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
define void @foo_trip_count_33(ptr nocapture %A) nounwind uwtable ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %1 = load i32, ptr %0, align 4
  %2 = add nsw i32 %1, 6
  store i32 %2, ptr %0, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 33
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; We should unroll this loop 4 times since TC not being a multiple of VF may require
; the epilogue loop to run, making it profitable when the vector loop runs
; at least twice. The IC is restricted to 4 since that is the maximum supported
; for the target.
;
; CHECK-VECTOR-LABEL: @foo_trip_count_101(
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR: load <4 x i32>
; CHECK-VECTOR-NOT: load <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR-NOT: store <4 x i32>
; CHECK-VECTOR: ret
;
; CHECK-SCALAR-LABEL: @foo_trip_count_101(
; CHECK-SCALAR: load i32, ptr
; CHECK-SCALAR-NOT: load i32, ptr
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
define void @foo_trip_count_101(ptr nocapture %A) nounwind uwtable ssp {
entry:
  br label %for.body

for.body:                                         ; preds = %for.body, %entry
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %1 = load i32, ptr %0, align 4
  %2 = add nsw i32 %1, 6
  store i32 %2, ptr %0, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, 101
  br i1 %exitcond, label %for.end, label %for.body

for.end:                                          ; preds = %for.body
  ret void
}

; But this is a good small loop to unroll as we don't know of a bound on its
; trip count.
;
; CHECK-VECTOR-LABEL: @bar(
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: store <4 x i32>
; CHECK-VECTOR: ret
;
; For x86, loop unroll in loop vectorizer is disabled when VF==1.
;
; CHECK-SCALAR-LABEL: @bar(
; CHECK-SCALAR: store i32
; CHECK-SCALAR-NOT: store i32
; CHECK-SCALAR: ret
define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {
  %1 = icmp sgt i32 %n, 0
  br i1 %1, label %.lr.ph, label %._crit_edge

.lr.ph:                                           ; preds = %0, %.lr.ph
  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
  %3 = load i32, ptr %2, align 4
  %4 = add nsw i32 %3, 6
  store i32 %4, ptr %2, align 4
  %indvars.iv.next = add i64 %indvars.iv, 1
  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
  %exitcond = icmp eq i32 %lftr.wideiv, %n
  br i1 %exitcond, label %._crit_edge, label %.lr.ph

._crit_edge:                                      ; preds = %.lr.ph, %0
  ret void
}

; Also unroll if we need a runtime check but it was going to be added for
; vectorization anyways.
; CHECK-VECTOR-LABEL: @runtime_chk(
; CHECK-VECTOR: store <4 x float>
; CHECK-VECTOR: store <4 x float>
;
; But not if the unrolling would introduce the runtime check.
; CHECK-SCALAR-LABEL: @runtime_chk(
; CHECK-SCALAR: store float
; CHECK-SCALAR-NOT: store float
define void @runtime_chk(ptr %A, ptr %B, float %N) {
entry:
  br label %for.body

for.body:
  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
  %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv
  %0 = load float, ptr %arrayidx, align 4
  %mul = fmul float %0, %N
  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
  store float %mul, ptr %arrayidx2, align 4
  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
  %exitcond = icmp eq i64 %indvars.iv.next, 256
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}