xref: /llvm-project/llvm/test/Transforms/LoopVectorize/pr38697.ll (revision 46f42d4db91ea89046981010aebbe501eae5db74)
1; RUN: opt -passes=loop-vectorize -force-vector-width=2 -S < %s 2>&1 | FileCheck %s
2; RUN: opt -passes=indvars -S < %s 2>&1 | FileCheck %s -check-prefix=INDVARCHECK
3
4target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
5target triple = "x86_64-unknown-linux-gnu"
6
7; Produced from test-case:
8;
9; void testCountIncrLoop(unsigned char *ptr, int lim, int count, int val)
10; {
11;   int inx = 0;
12;   for (int outer_i = 0; outer_i < lim; ++outer_i) {
13;     if (count > 0) { // At runtime, 'count' is 0, so the following code is dead.
14;       int result = val;
15;       int tmp = count;
16;
17;       while (tmp < 8) {
18;         result += val >> tmp;
19;         tmp += count;
20;       }
21;
22;       ptr[inx++] = (unsigned char) result;
23;     }
24;   }
25; }
26;
27; No explicit division appears in the input, but a division is generated during
28; vectorization, and that division is a division-by-0 when the input 'count'
29; is 0, so it cannot be hoisted above the guard of 'count > 0'.
30
31; Verify that a 'udiv' does not appear in the 'loop1.preheader' block, and that
32; a 'udiv' has been inserted at the top of the 'while.body.preheader' block.
33define void @testCountIncrLoop(ptr %ptr, i32 %lim, i32 %count, i32 %val) mustprogress {
34; CHECK-LABEL: @testCountIncrLoop(
35; CHECK-NEXT:  entry:
36; CHECK:       loop1.preheader:
37; CHECK-NOT:     udiv
38; CHECK:       loop1.body:
39; CHECK:       while.cond.preheader:
40; CHECK:       while.body.preheader:
41; CHECK-NEXT:    [[TMP1:%.*]] = udiv i32 [[TMP0:%.*]], [[COUNT:%.*]]
42; CHECK:       vector.ph:
43; CHECK:       exit:
44; CHECK:         ret void
45;
46entry:
47  %cmp1 = icmp sgt i32 %lim, 0
48  br i1 %cmp1, label %loop1.preheader, label %exit
49
50loop1.preheader:                                  ; preds = %entry
51  %cmp2 = icmp sgt i32 %count, 0
52  %cmp4 = icmp slt i32 %count, 8
53  br label %loop1.body
54
55loop1.body:                                       ; preds = %loop1.inc, %loop1.preheader
56  %outer_i = phi i32 [ 0, %loop1.preheader ], [ %outer_i.1, %loop1.inc ]
57  %inx.1 = phi i32 [ 0, %loop1.preheader ], [ %inx.2, %loop1.inc ]
58  br i1 %cmp2, label %while.cond.preheader, label %loop1.inc
59
60while.cond.preheader:                             ; preds = %loop1.body
61  br i1 %cmp4, label %while.body, label %while.end
62
63while.body:                                       ; preds = %while.body, %while.cond.preheader
64  %tmp = phi i32 [ %add3, %while.body ], [ %count, %while.cond.preheader ]
65  %result.1 = phi i32 [ %add, %while.body ], [ %val, %while.cond.preheader ]
66  %shr = ashr i32 %val, %tmp
67  %add = add nsw i32 %shr, %result.1
68  %add3 = add nsw i32 %tmp, %count
69  %cmp3 = icmp slt i32 %add3, 8
70  br i1 %cmp3, label %while.body, label %while.end
71
72while.end:                                        ; preds = %while.body, %while.cond.preheader
73  %result.0.lcssa = phi i32 [ %val, %while.cond.preheader ], [ %add, %while.body ]
74  %conv = trunc i32 %result.0.lcssa to i8
75  %inc = add nsw i32 %inx.1, 1
76  %idxprom = sext i32 %inx.1 to i64
77  %arrayidx = getelementptr inbounds i8, ptr %ptr, i64 %idxprom
78  store i8 %conv, ptr %arrayidx, align 1
79  br label %loop1.inc
80
81loop1.inc:                                        ; preds = %while.end, %loop1.body
82  %inx.2 = phi i32 [ %inc, %while.end ], [ %inx.1, %loop1.body ]
83  %outer_i.1 = add nuw nsw i32 %outer_i, 1
84  %exitcond = icmp eq i32 %outer_i.1, %lim
85  br i1 %exitcond, label %exit, label %loop1.body
86
87exit:                                             ; preds = %loop1.inc, %entry
88  ret void
89}
90
91; These next tests are all based on the following source code, with slight
92; variations on the calculation of 'incr' (all of which are loop-invariant
93; divisions, but only some of which can be safely hoisted):
94;
95; uint32_t foo(uint32_t *ptr, uint32_t start1, uint32_t start2) {
96;   uint32_t counter1, counter2;
97;   uint32_t val = start1;
98;   for (counter1 = 1; counter1 < 100; ++counter1) {
99;     uint32_t index = 0;
100;     val += ptr[index];
101;     for (counter2 = start2; counter2 < 10; ++counter2) {
102;       // Division is loop invariant, and denominator is guaranteed non-zero:
103;       // Safe to hoist it out of the inner loop.
104;       uint32_t incr = 16 / counter1;
105;       index += incr;
106;       val += ptr[index];
107;     }
108;   }
109;   return val;
110; }
111
112; This version is as written above, where 'incr' is '16/counter1', and it is
113; guaranted that 'counter1' is always non-zero.  So it is safe to hoist the
114; division from the inner loop to the preheader.
115;
116; Verify that the 'udiv' is hoisted to the preheader, and is not in the loop body.
117define i32 @NonZeroDivHoist(ptr nocapture readonly %ptr, i32 %start1, i32 %start2) {
118; INDVARCHECK-LABEL: @NonZeroDivHoist(
119; INDVARCHECK-NEXT:  entry:
120; INDVARCHECK:       for.body3.lr.ph:
121; INDVARCHECK-NEXT:    [[TMP0:%.*]] = udiv i64 16, [[INDVARS_IV:%.*]]
122; INDVARCHECK-NEXT:    br label [[FOR_BODY3:%.*]]
123; INDVARCHECK:       for.body3:
124; INDVARCHECK-NOT:     udiv
125; INDVARCHECK:       for.end10:
126;
127entry:
128  br label %for.cond
129
130for.cond:                                         ; preds = %for.end, %entry
131  %val.0 = phi i32 [ %start1, %entry ], [ %val.1.lcssa, %for.end ]
132  %counter1.0 = phi i32 [ 1, %entry ], [ %inc9, %for.end ]
133  %cmp = icmp ult i32 %counter1.0, 100
134  br i1 %cmp, label %for.body, label %for.end10
135
136for.body:                                         ; preds = %for.cond
137  %tmp = load i32, ptr %ptr, align 4
138  %add = add i32 %tmp, %val.0
139  %cmp224 = icmp ult i32 %start2, 10
140  br i1 %cmp224, label %for.body3.lr.ph, label %for.end
141
142for.body3.lr.ph:                                  ; preds = %for.body
143  br label %for.body3
144
145for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
146  %index.027 = phi i32 [ 0, %for.body3.lr.ph ], [ %add4, %for.body3 ]
147  %val.126 = phi i32 [ %add, %for.body3.lr.ph ], [ %add7, %for.body3 ]
148  %counter2.025 = phi i32 [ %start2, %for.body3.lr.ph ], [ %inc, %for.body3 ]
149  %div = udiv i32 16, %counter1.0
150  %add4 = add i32 %div, %index.027
151  %idxprom5 = zext i32 %add4 to i64
152  %arrayidx6 = getelementptr inbounds i32, ptr %ptr, i64 %idxprom5
153  %tmp1 = load i32, ptr %arrayidx6, align 4
154  %add7 = add i32 %tmp1, %val.126
155  %inc = add i32 %counter2.025, 1
156  %cmp2 = icmp ult i32 %inc, 10
157  br i1 %cmp2, label %for.body3, label %for.cond1.for.end_crit_edge
158
159for.cond1.for.end_crit_edge:                      ; preds = %for.body3
160  %split = phi i32 [ %add7, %for.body3 ]
161  br label %for.end
162
163for.end:                                          ; preds = %for.cond1.for.end_crit_edge, %for.body
164  %val.1.lcssa = phi i32 [ %split, %for.cond1.for.end_crit_edge ], [ %add, %for.body ]
165  %inc9 = add i32 %counter1.0, 1
166  br label %for.cond
167
168for.end10:                                        ; preds = %for.cond
169  %val.0.lcssa = phi i32 [ %val.0, %for.cond ]
170  ret i32 %val.0.lcssa
171}
172
173; This version is identical to the above 'NonZeroDivHoist' case, except the
174; outer ('counter1') loop starts at the unknown value of 'start1' rather than 1,
175; and so it is illegal to hoist the division because if 'start1' is 0, hoisting
176; it would incorrectly cause a divide-by-zero trap.
177;
178; Verify that the 'udiv' is not hoisted to the preheader, and it remains in the
179; loop body.
180define i32 @ZeroDivNoHoist(ptr nocapture readonly %ptr, i32 %start1, i32 %start2) {
181; INDVARCHECK-LABEL: @ZeroDivNoHoist(
182; INDVARCHECK-NEXT:  entry:
183; INDVARCHECK-NOT:     udiv
184; INDVARCHECK:       for.body3:
185; INDVARCHECK:         [[TMP1:%.*]] = udiv i64 16, [[INDVARS_IV:%.*]]
186; INDVARCHECK:       for.cond1.for.end_crit_edge:
187;
188entry:
189  br label %for.cond
190
191for.cond:                                         ; preds = %for.end, %entry
192  %val.0 = phi i32 [ %start1, %entry ], [ %val.1.lcssa, %for.end ]
193  %counter1.0 = phi i32 [ %start1, %entry ], [ %inc9, %for.end ]
194  %cmp = icmp ult i32 %counter1.0, 100
195  br i1 %cmp, label %for.body, label %for.end10
196
197for.body:                                         ; preds = %for.cond
198  %tmp = load i32, ptr %ptr, align 4
199  %add = add i32 %tmp, %val.0
200  %cmp224 = icmp ult i32 %start2, 10
201  br i1 %cmp224, label %for.body3.lr.ph, label %for.end
202
203for.body3.lr.ph:                                  ; preds = %for.body
204  br label %for.body3
205
206for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
207  %index.027 = phi i32 [ 0, %for.body3.lr.ph ], [ %add4, %for.body3 ]
208  %val.126 = phi i32 [ %add, %for.body3.lr.ph ], [ %add7, %for.body3 ]
209  %counter2.025 = phi i32 [ %start2, %for.body3.lr.ph ], [ %inc, %for.body3 ]
210  %div = udiv i32 16, %counter1.0
211  %add4 = add i32 %div, %index.027
212  %idxprom5 = zext i32 %add4 to i64
213  %arrayidx6 = getelementptr inbounds i32, ptr %ptr, i64 %idxprom5
214  %tmp1 = load i32, ptr %arrayidx6, align 4
215  %add7 = add i32 %tmp1, %val.126
216  %inc = add i32 %counter2.025, 1
217  %cmp2 = icmp ult i32 %inc, 10
218  br i1 %cmp2, label %for.body3, label %for.cond1.for.end_crit_edge
219
220for.cond1.for.end_crit_edge:                      ; preds = %for.body3
221  %split = phi i32 [ %add7, %for.body3 ]
222  br label %for.end
223
224for.end:                                          ; preds = %for.cond1.for.end_crit_edge, %for.body
225  %val.1.lcssa = phi i32 [ %split, %for.cond1.for.end_crit_edge ], [ %add, %for.body ]
226  %inc9 = add i32 %counter1.0, 1
227  br label %for.cond
228
229for.end10:                                        ; preds = %for.cond
230  %val.0.lcssa = phi i32 [ %val.0, %for.cond ]
231  ret i32 %val.0.lcssa
232}
233
234; This version is has a clearly safe division by a non-zero constant (16).  The
235; division is transformed to a logical-shift-right of 4, and it is safely
236; hoisted.
237;
238; Verify that the division-operation is hoisted, and that it appears as a
239; right-shift ('lshr') rather than an explicit division.
240define i32 @DivBy16Hoist(ptr nocapture readonly %ptr, i32 %start1, i32 %start2) {
241; INDVARCHECK-LABEL: @DivBy16Hoist(
242; INDVARCHECK-NEXT:  entry:
243; INDVARCHECK:       for.cond:
244; INDVARCHECK:         [[TMP1:%.*]] = lshr i64 [[INDVARS_IV:%.*]], 4
245; INDVARCHECK:       for.body:
246; INDVARCHECK-NOT:     lshr
247; INDVARCHECK-NOT:     udiv
248; INDVARCHECK:       for.end10:
249;
250entry:
251  br label %for.cond
252
253for.cond:                                         ; preds = %for.end, %entry
254  %val.0 = phi i32 [ %start1, %entry ], [ %val.1.lcssa, %for.end ]
255  %counter1.0 = phi i32 [ %start1, %entry ], [ %inc9, %for.end ]
256  %cmp = icmp ult i32 %counter1.0, 100
257  br i1 %cmp, label %for.body, label %for.end10
258
259for.body:                                         ; preds = %for.cond
260  %tmp = load i32, ptr %ptr, align 4
261  %add = add i32 %tmp, %val.0
262  %cmp224 = icmp ult i32 %start2, 10
263  br i1 %cmp224, label %for.body3.lr.ph, label %for.end
264
265for.body3.lr.ph:                                  ; preds = %for.body
266  br label %for.body3
267
268for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
269  %index.027 = phi i32 [ 0, %for.body3.lr.ph ], [ %add4, %for.body3 ]
270  %val.126 = phi i32 [ %add, %for.body3.lr.ph ], [ %add7, %for.body3 ]
271  %counter2.025 = phi i32 [ %start2, %for.body3.lr.ph ], [ %inc, %for.body3 ]
272  %div = udiv i32 %counter1.0, 16
273  %add4 = add i32 %div, %index.027
274  %idxprom5 = zext i32 %add4 to i64
275  %arrayidx6 = getelementptr inbounds i32, ptr %ptr, i64 %idxprom5
276  %tmp1 = load i32, ptr %arrayidx6, align 4
277  %add7 = add i32 %tmp1, %val.126
278  %inc = add i32 %counter2.025, 1
279  %cmp2 = icmp ult i32 %inc, 10
280  br i1 %cmp2, label %for.body3, label %for.cond1.for.end_crit_edge
281
282for.cond1.for.end_crit_edge:                      ; preds = %for.body3
283  %split = phi i32 [ %add7, %for.body3 ]
284  br label %for.end
285
286for.end:                                          ; preds = %for.cond1.for.end_crit_edge, %for.body
287  %val.1.lcssa = phi i32 [ %split, %for.cond1.for.end_crit_edge ], [ %add, %for.body ]
288  %inc9 = add i32 %counter1.0, 1
289  br label %for.cond
290
291for.end10:                                        ; preds = %for.cond
292  %val.0.lcssa = phi i32 [ %val.0, %for.cond ]
293  ret i32 %val.0.lcssa
294}
295
296; This version is has a clearly safe division by a non-zero constant (17).  The
297; division is safely hoisted, as it was in the 'DivBy16Hoist' verison, but here
298; it remains a division, rather than being transformed to a right-shift.
299;
300; Verify that the division-operation is hoisted.
301define i32 @DivBy17Hoist(ptr nocapture readonly %ptr, i32 %start1, i32 %start2) {
302; INDVARCHECK-LABEL: @DivBy17Hoist(
303; INDVARCHECK-NEXT:  entry:
304; INDVARCHECK:       for.cond:
305; INDVARCHECK:         [[TMP1:%.*]] = udiv i64 [[INDVARS_IV:%.*]], 17
306; INDVARCHECK:       for.body:
307; INDVARCHECK-NOT:     udiv
308; INDVARCHECK:       for.end10:
309;
310entry:
311  br label %for.cond
312
313for.cond:                                         ; preds = %for.end, %entry
314  %val.0 = phi i32 [ %start1, %entry ], [ %val.1.lcssa, %for.end ]
315  %counter1.0 = phi i32 [ %start1, %entry ], [ %inc9, %for.end ]
316  %cmp = icmp ult i32 %counter1.0, 100
317  br i1 %cmp, label %for.body, label %for.end10
318
319for.body:                                         ; preds = %for.cond
320  %tmp = load i32, ptr %ptr, align 4
321  %add = add i32 %tmp, %val.0
322  %cmp224 = icmp ult i32 %start2, 10
323  br i1 %cmp224, label %for.body3.lr.ph, label %for.end
324
325for.body3.lr.ph:                                  ; preds = %for.body
326  br label %for.body3
327
328for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
329  %index.027 = phi i32 [ 0, %for.body3.lr.ph ], [ %add4, %for.body3 ]
330  %val.126 = phi i32 [ %add, %for.body3.lr.ph ], [ %add7, %for.body3 ]
331  %counter2.025 = phi i32 [ %start2, %for.body3.lr.ph ], [ %inc, %for.body3 ]
332  %div = udiv i32 %counter1.0, 17
333  %add4 = add i32 %div, %index.027
334  %idxprom5 = zext i32 %add4 to i64
335  %arrayidx6 = getelementptr inbounds i32, ptr %ptr, i64 %idxprom5
336  %tmp1 = load i32, ptr %arrayidx6, align 4
337  %add7 = add i32 %tmp1, %val.126
338  %inc = add i32 %counter2.025, 1
339  %cmp2 = icmp ult i32 %inc, 10
340  br i1 %cmp2, label %for.body3, label %for.cond1.for.end_crit_edge
341
342for.cond1.for.end_crit_edge:                      ; preds = %for.body3
343  %split = phi i32 [ %add7, %for.body3 ]
344  br label %for.end
345
346for.end:                                          ; preds = %for.cond1.for.end_crit_edge, %for.body
347  %val.1.lcssa = phi i32 [ %split, %for.cond1.for.end_crit_edge ], [ %add, %for.body ]
348  %inc9 = add i32 %counter1.0, 1
349  br label %for.cond
350
351for.end10:                                        ; preds = %for.cond
352  %val.0.lcssa = phi i32 [ %val.0, %for.cond ]
353  ret i32 %val.0.lcssa
354}
355