xref: /llvm-project/llvm/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll (revision c1c5b854adc9414ee3d8c55ddd07bdb4cc5b7171)
1; RUN: opt < %s -passes=loop-vectorize,dce -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-interleave=0 -S \
2; RUN:   | FileCheck %s --check-prefix=CHECK-VECTOR
3; RUN: opt < %s -passes=loop-vectorize,dce -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=1 -force-vector-interleave=0 -S \
4; RUN:   | FileCheck %s --check-prefix=CHECK-SCALAR
5
6target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
7target triple = "x86_64-apple-macosx10.8.0"
8
9; We don't unroll this loop because it has a small constant trip count
10; that is not profitable for generating a scalar epilogue
11;
12; CHECK-VECTOR-LABEL: @foo_trip_count_8(
13; CHECK-VECTOR: load <4 x i32>
14; CHECK-VECTOR-NOT: load <4 x i32>
15; CHECK-VECTOR: store <4 x i32>
16; CHECK-VECTOR-NOT: store <4 x i32>
17; CHECK-VECTOR: ret
18;
19; CHECK-SCALAR-LABEL: @foo_trip_count_8(
20; CHECK-SCALAR: load i32, ptr
21; CHECK-SCALAR-NOT: load i32, ptr
22; CHECK-SCALAR: store i32
23; CHECK-SCALAR-NOT: store i32
24; CHECK-SCALAR: ret
25define void @foo_trip_count_8(ptr nocapture %A) nounwind uwtable ssp {  ; adds 6 in place to A[0] .. A[7]
26entry:
27  br label %for.body
28
29for.body:                                       ; preds = %for.body, %entry
30  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i64 induction variable, starts at 0
31  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
32  %1 = load i32, ptr %0, align 4
33  %2 = add nsw i32 %1, 6
34  store i32 %2, ptr %0, align 4  ; written back to the same element that was loaded
35  %indvars.iv.next = add i64 %indvars.iv, 1
36  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; the exit test uses the 32-bit truncation of the counter
37  %exitcond = icmp eq i32 %lftr.wideiv, 8  ; constant bound: the loop body runs 8 times
38  br i1 %exitcond, label %for.end, label %for.body
39
40for.end:                                       ; preds = %for.body
41  ret void
42}
43
44; We should unroll this loop 4 times since TC being a multiple of VF means
45; that the epilogue loop may not need to run, making it profitable for
46; the vector loop to run even once
47;
48; CHECK-VECTOR-LABEL: @foo_trip_count_16(
49; CHECK-VECTOR: load <4 x i32>
50; CHECK-VECTOR: load <4 x i32>
51; CHECK-VECTOR: load <4 x i32>
52; CHECK-VECTOR: load <4 x i32>
53; CHECK-VECTOR-NOT: load <4 x i32>
54; CHECK-VECTOR: store <4 x i32>
55; CHECK-VECTOR: store <4 x i32>
56; CHECK-VECTOR: store <4 x i32>
57; CHECK-VECTOR: store <4 x i32>
58; CHECK-VECTOR-NOT: store <4 x i32>
59; CHECK-VECTOR: ret
60;
61; CHECK-SCALAR-LABEL: @foo_trip_count_16(
62; CHECK-SCALAR: load i32, ptr
63; CHECK-SCALAR-NOT: load i32, ptr
64; CHECK-SCALAR: store i32
65; CHECK-SCALAR-NOT: store i32
66; CHECK-SCALAR: ret
67define void @foo_trip_count_16(ptr nocapture %A) nounwind uwtable ssp {  ; adds 6 in place to A[0] .. A[15]
68entry:
69  br label %for.body
70
71for.body:                                       ; preds = %for.body, %entry
72  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i64 induction variable, starts at 0
73  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
74  %1 = load i32, ptr %0, align 4
75  %2 = add nsw i32 %1, 6
76  store i32 %2, ptr %0, align 4  ; written back to the same element that was loaded
77  %indvars.iv.next = add i64 %indvars.iv, 1
78  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; the exit test uses the 32-bit truncation of the counter
79  %exitcond = icmp eq i32 %lftr.wideiv, 16  ; constant bound: the loop body runs 16 times
80  br i1 %exitcond, label %for.end, label %for.body
81
82for.end:                                       ; preds = %for.body
83  ret void
84}
85
86; We should unroll this loop four times since unrolling it twice
87; will produce the same epilogue TC of 1, making larger unroll count
88; more profitable
89;
90; CHECK-VECTOR-LABEL: @foo_trip_count_17(
91; CHECK-VECTOR: load <4 x i32>
92; CHECK-VECTOR: load <4 x i32>
93; CHECK-VECTOR: load <4 x i32>
94; CHECK-VECTOR: load <4 x i32>
95; CHECK-VECTOR-NOT: load <4 x i32>
96; CHECK-VECTOR: store <4 x i32>
97; CHECK-VECTOR: store <4 x i32>
98; CHECK-VECTOR: store <4 x i32>
99; CHECK-VECTOR: store <4 x i32>
100; CHECK-VECTOR-NOT: store <4 x i32>
101; CHECK-VECTOR: ret
102;
103; CHECK-SCALAR-LABEL: @foo_trip_count_17(
104; CHECK-SCALAR: load i32, ptr
105; CHECK-SCALAR-NOT: load i32, ptr
106; CHECK-SCALAR: store i32
107; CHECK-SCALAR-NOT: store i32
108; CHECK-SCALAR: ret
109define void @foo_trip_count_17(ptr nocapture %A) nounwind uwtable ssp {  ; adds 6 in place to A[0] .. A[16]
110entry:
111  br label %for.body
112
113for.body:                                       ; preds = %for.body, %entry
114  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i64 induction variable, starts at 0
115  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
116  %1 = load i32, ptr %0, align 4
117  %2 = add nsw i32 %1, 6
118  store i32 %2, ptr %0, align 4  ; written back to the same element that was loaded
119  %indvars.iv.next = add i64 %indvars.iv, 1
120  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; the exit test uses the 32-bit truncation of the counter
121  %exitcond = icmp eq i32 %lftr.wideiv, 17  ; constant bound: the loop body runs 17 times
122  br i1 %exitcond, label %for.end, label %for.body
123
124for.end:                                       ; preds = %for.body
125  ret void
126}
127
128; We should unroll this loop twice since unrolling four times will
129; create an epilogue loop of TC 8, while unrolling it twice will
130; eliminate the epilogue loop altogether
131;
132; CHECK-VECTOR-LABEL: @foo_trip_count_24(
133; CHECK-VECTOR: load <4 x i32>
134; CHECK-VECTOR: load <4 x i32>
135; CHECK-VECTOR-NOT: load <4 x i32>
136; CHECK-VECTOR: store <4 x i32>
137; CHECK-VECTOR: store <4 x i32>
138; CHECK-VECTOR-NOT: store <4 x i32>
139; CHECK-VECTOR: ret
140;
141; CHECK-SCALAR-LABEL: @foo_trip_count_24(
142; CHECK-SCALAR: load i32, ptr
143; CHECK-SCALAR-NOT: load i32, ptr
144; CHECK-SCALAR: store i32
145; CHECK-SCALAR-NOT: store i32
146; CHECK-SCALAR: ret
147define void @foo_trip_count_24(ptr nocapture %A) nounwind uwtable ssp {  ; adds 6 in place to A[0] .. A[23]
148entry:
149  br label %for.body
150
151for.body:                                       ; preds = %for.body, %entry
152  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i64 induction variable, starts at 0
153  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
154  %1 = load i32, ptr %0, align 4
155  %2 = add nsw i32 %1, 6
156  store i32 %2, ptr %0, align 4  ; written back to the same element that was loaded
157  %indvars.iv.next = add i64 %indvars.iv, 1
158  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; the exit test uses the 32-bit truncation of the counter
159  %exitcond = icmp eq i32 %lftr.wideiv, 24  ; constant bound: the loop body runs 24 times
160  br i1 %exitcond, label %for.end, label %for.body
161
162for.end:                                       ; preds = %for.body
163  ret void
164}
165
166; We should unroll this loop twice since TC not being a multiple of VF may require
167; the epilogue loop to run, making it profitable when the vector loop runs
168; at least twice.
169;
170; CHECK-VECTOR-LABEL: @foo_trip_count_25(
171; CHECK-VECTOR: load <4 x i32>
172; CHECK-VECTOR: load <4 x i32>
173; CHECK-VECTOR-NOT: load <4 x i32>
174; CHECK-VECTOR: store <4 x i32>
175; CHECK-VECTOR: store <4 x i32>
176; CHECK-VECTOR-NOT: store <4 x i32>
177; CHECK-VECTOR: ret
178;
179; CHECK-SCALAR-LABEL: @foo_trip_count_25(
180; CHECK-SCALAR: load i32, ptr
181; CHECK-SCALAR-NOT: load i32, ptr
182; CHECK-SCALAR: store i32
183; CHECK-SCALAR-NOT: store i32
184; CHECK-SCALAR: ret
185define void @foo_trip_count_25(ptr nocapture %A) nounwind uwtable ssp {  ; adds 6 in place to A[0] .. A[24]
186entry:
187  br label %for.body
188
189for.body:                                       ; preds = %for.body, %entry
190  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i64 induction variable, starts at 0
191  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
192  %1 = load i32, ptr %0, align 4
193  %2 = add nsw i32 %1, 6
194  store i32 %2, ptr %0, align 4  ; written back to the same element that was loaded
195  %indvars.iv.next = add i64 %indvars.iv, 1
196  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; the exit test uses the 32-bit truncation of the counter
197  %exitcond = icmp eq i32 %lftr.wideiv, 25  ; constant bound: the loop body runs 25 times
198  br i1 %exitcond, label %for.end, label %for.body
199
200for.end:                                       ; preds = %for.body
201  ret void
202}
203
204; We should unroll this loop 4 times since TC not being a multiple of VF may require
205; the epilogue loop to run, making it profitable when the vector loop runs
206; at least twice.
207;
208; CHECK-VECTOR-LABEL: @foo_trip_count_33(
209; CHECK-VECTOR: load <4 x i32>
210; CHECK-VECTOR: load <4 x i32>
211; CHECK-VECTOR: load <4 x i32>
212; CHECK-VECTOR: load <4 x i32>
213; CHECK-VECTOR-NOT: load <4 x i32>
214; CHECK-VECTOR: store <4 x i32>
215; CHECK-VECTOR: store <4 x i32>
216; CHECK-VECTOR: store <4 x i32>
217; CHECK-VECTOR: store <4 x i32>
218; CHECK-VECTOR-NOT: store <4 x i32>
219; CHECK-VECTOR: ret
220;
221; CHECK-SCALAR-LABEL: @foo_trip_count_33(
222; CHECK-SCALAR: load i32, ptr
223; CHECK-SCALAR-NOT: load i32, ptr
224; CHECK-SCALAR: store i32
225; CHECK-SCALAR-NOT: store i32
226; CHECK-SCALAR: ret
227define void @foo_trip_count_33(ptr nocapture %A) nounwind uwtable ssp {  ; adds 6 in place to A[0] .. A[32]
228entry:
229  br label %for.body
230
231for.body:                                       ; preds = %for.body, %entry
232  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i64 induction variable, starts at 0
233  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
234  %1 = load i32, ptr %0, align 4
235  %2 = add nsw i32 %1, 6
236  store i32 %2, ptr %0, align 4  ; written back to the same element that was loaded
237  %indvars.iv.next = add i64 %indvars.iv, 1
238  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; the exit test uses the 32-bit truncation of the counter
239  %exitcond = icmp eq i32 %lftr.wideiv, 33  ; constant bound: the loop body runs 33 times
240  br i1 %exitcond, label %for.end, label %for.body
241
242for.end:                                       ; preds = %for.body
243  ret void
244}
245
246; We should unroll this loop 4 times since TC not being a multiple of VF may require
247; the epilogue loop to run, making it profitable when the vector loop runs
248; at least twice. The IC is restricted to 4 since that is the maximum supported
249; for the target.
250;
251; CHECK-VECTOR-LABEL: @foo_trip_count_101(
252; CHECK-VECTOR: load <4 x i32>
253; CHECK-VECTOR: load <4 x i32>
254; CHECK-VECTOR: load <4 x i32>
255; CHECK-VECTOR: load <4 x i32>
256; CHECK-VECTOR-NOT: load <4 x i32>
257; CHECK-VECTOR: store <4 x i32>
258; CHECK-VECTOR: store <4 x i32>
259; CHECK-VECTOR: store <4 x i32>
260; CHECK-VECTOR: store <4 x i32>
261; CHECK-VECTOR-NOT: store <4 x i32>
262; CHECK-VECTOR: ret
263;
264; CHECK-SCALAR-LABEL: @foo_trip_count_101(
265; CHECK-SCALAR: load i32, ptr
266; CHECK-SCALAR-NOT: load i32, ptr
267; CHECK-SCALAR: store i32
268; CHECK-SCALAR-NOT: store i32
269; CHECK-SCALAR: ret
270define void @foo_trip_count_101(ptr nocapture %A) nounwind uwtable ssp {  ; adds 6 in place to A[0] .. A[100]
271entry:
272  br label %for.body
273
274for.body:                                       ; preds = %for.body, %entry
275  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i64 induction variable, starts at 0
276  %0 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
277  %1 = load i32, ptr %0, align 4
278  %2 = add nsw i32 %1, 6
279  store i32 %2, ptr %0, align 4  ; written back to the same element that was loaded
280  %indvars.iv.next = add i64 %indvars.iv, 1
281  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; the exit test uses the 32-bit truncation of the counter
282  %exitcond = icmp eq i32 %lftr.wideiv, 101  ; constant bound: the loop body runs 101 times
283  br i1 %exitcond, label %for.end, label %for.body
284
285for.end:                                       ; preds = %for.body
286  ret void
287}
288
289; But this is a good small loop to unroll as we don't know of a bound on its
290; trip count.
291;
292; CHECK-VECTOR-LABEL: @bar(
293; CHECK-VECTOR: store <4 x i32>
294; CHECK-VECTOR: store <4 x i32>
295; CHECK-VECTOR: ret
296;
297; For x86, loop unrolling in the loop vectorizer is disabled when VF==1.
298;
299; CHECK-SCALAR-LABEL: @bar(
300; CHECK-SCALAR: store i32
301; CHECK-SCALAR-NOT: store i32
302; CHECK-SCALAR: ret
303define void @bar(ptr nocapture %A, i32 %n) nounwind uwtable ssp {  ; adds 6 in place to A[0] .. A[n-1]; trip count is the runtime value %n
304  %1 = icmp sgt i32 %n, 0  ; guard: the loop is skipped entirely when n <= 0
305  br i1 %1, label %.lr.ph, label %._crit_edge
306
307.lr.ph:                                           ; preds = %0, %.lr.ph
308  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]  ; i64 induction variable, starts at 0
309  %2 = getelementptr inbounds i32, ptr %A, i64 %indvars.iv
310  %3 = load i32, ptr %2, align 4
311  %4 = add nsw i32 %3, 6
312  store i32 %4, ptr %2, align 4  ; written back to the same element that was loaded
313  %indvars.iv.next = add i64 %indvars.iv, 1
314  %lftr.wideiv = trunc i64 %indvars.iv.next to i32  ; truncated so it can be compared with the i32 bound %n
315  %exitcond = icmp eq i32 %lftr.wideiv, %n  ; bound is an argument, not a constant
316  br i1 %exitcond, label %._crit_edge, label %.lr.ph
317
318._crit_edge:                                      ; preds = %.lr.ph, %0
319  ret void
320}
321
322; Also unroll if we need a runtime check but it was going to be added for
323; vectorization anyways.
324; CHECK-VECTOR-LABEL: @runtime_chk(
325; CHECK-VECTOR: store <4 x float>
326; CHECK-VECTOR: store <4 x float>
327;
328; But not if the unrolling would introduce the runtime check.
329; CHECK-SCALAR-LABEL: @runtime_chk(
330; CHECK-SCALAR: store float
331; CHECK-SCALAR-NOT: store float
332define void @runtime_chk(ptr %A, ptr %B, float %N) {  ; A[i] = B[i] * N for i = 0 .. 255; %A and %B carry no noalias attribute
333entry:
334  br label %for.body
335
336for.body:
337  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]  ; i64 induction variable, starts at 0
338  %arrayidx = getelementptr inbounds float, ptr %B, i64 %indvars.iv
339  %0 = load float, ptr %arrayidx, align 4  ; read from %B
340  %mul = fmul float %0, %N
341  %arrayidx2 = getelementptr inbounds float, ptr %A, i64 %indvars.iv
342  store float %mul, ptr %arrayidx2, align 4  ; write to %A; NOTE(review): presumably this load/store pair on two unrelated pointers is what forces the runtime check -- confirm
343  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
344  %exitcond = icmp eq i64 %indvars.iv.next, 256  ; constant bound compared directly in i64: the loop body runs 256 times
345  br i1 %exitcond, label %for.end, label %for.body
346
347for.end:
348  ret void
349}
350