; Extracted from /llvm-project/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-allowed.ll (revision 2fab927546b34f5af7770541a9bbb974d9818c5c)
;
; Runs the loop vectorizer for an MVE-capable Thumb target with tail
; predication enabled and pipes the textual IR into FileCheck.
; RUN: opt -mtriple=thumbv8.1m.main-arm-eabihf -mattr=+mve.fp -passes=loop-vectorize -tail-predication=enabled -S < %s | \
; RUN:  FileCheck %s

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

; Test that ARMTTIImpl::preferPredicateOverEpilogue triggers tail-folding.

; @f1: element-wise i32 addition, A[i] = B[i] + C[i] for i in [0, N).
; The entry block bypasses the loop when N <= 0. The checks below only
; require a call to @llvm.get.active.lane.mask to appear after the entry
; block and before the closing brace of the function in the opt output.
define dso_local void @f1(ptr noalias nocapture %A, ptr noalias nocapture readonly %B, ptr noalias nocapture readonly %C, i32 %N) {
; CHECK-LABEL: f1(
; CHECK:       entry:
; CHECK:       @llvm.get.active.lane.mask
; CHECK:       }
entry:
  %cmp8 = icmp sgt i32 %N, 0
  br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret void

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.09
  %0 = load i32, ptr %arrayidx, align 4
  %arrayidx1 = getelementptr inbounds i32, ptr %C, i32 %i.09
  %1 = load i32, ptr %arrayidx1, align 4
  %add = add nsw i32 %1, %0
  %arrayidx2 = getelementptr inbounds i32, ptr %A, i32 %i.09
  store i32 %add, ptr %arrayidx2, align 4
  %inc = add nuw nsw i32 %i.09, 1
  %exitcond.not = icmp eq i32 %inc, %N
  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
}

; @f32_reduction: fast-math float sum over N elements read through a pointer
; that advances one float per iteration while %blkCnt counts down from N to 0.
; After the loop the sum is divided by N (converted with uitofp) and stored to
; %Output; when N == 0 the loop is skipped and the sum is 0.0.
; The checks below expect a vector.body block that contains a call to
; @llvm.masked.load and ends in a conditional branch back to %vector.body.
define dso_local void @f32_reduction(ptr nocapture readonly %Input, i32 %N, ptr nocapture %Output) {
; CHECK-LABEL: f32_reduction(
; CHECK:       vector.body:
; CHECK:       @llvm.masked.load
; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  br label %while.body

while.body:                                       ; preds = %while.body.preheader, %while.body
  %blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %sum.08 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
  %Input.addr.07 = phi ptr [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ]
  %incdec.ptr = getelementptr inbounds float, ptr %Input.addr.07, i32 1
  %0 = load float, ptr %Input.addr.07, align 4
  %add = fadd fast float %0, %sum.08
  %dec = add i32 %blkCnt.09, -1
  %cmp = icmp eq i32 %dec, 0
  br i1 %cmp, label %while.end.loopexit, label %while.body

while.end.loopexit:                               ; preds = %while.body
  %add.lcssa = phi float [ %add, %while.body ]
  br label %while.end

while.end:                                        ; preds = %while.end.loopexit, %entry
  %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
  %conv = uitofp i32 %N to float
  %div = fdiv fast float %sum.0.lcssa, %conv
  store float %div, ptr %Output, align 4
  ret void
}

; @f16_reduction: half-precision counterpart of the float sum reduction.
; The loop accumulates N half values with a fast-math fadd while %blkCnt
; counts down from N to 0; afterwards the sum is divided by N (converted with
; uitofp) and stored to %Output with 2-byte alignment.
; The checks below expect a vector.body block that contains a call to
; @llvm.masked.load and ends in a conditional branch back to %vector.body.
define dso_local void @f16_reduction(ptr nocapture readonly %Input, i32 %N, ptr nocapture %Output) {
; CHECK-LABEL: f16_reduction(
; CHECK:       vector.body:
; CHECK:       @llvm.masked.load
; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
  %cmp6 = icmp eq i32 %N, 0
  br i1 %cmp6, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  br label %while.body

while.body:                                       ; preds = %while.body.preheader, %while.body
  %blkCnt.09 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %sum.08 = phi half [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
  %Input.addr.07 = phi ptr [ %incdec.ptr, %while.body ], [ %Input, %while.body.preheader ]
  %incdec.ptr = getelementptr inbounds half, ptr %Input.addr.07, i32 1
  %0 = load half, ptr %Input.addr.07, align 2
  %add = fadd fast half %0, %sum.08
  %dec = add i32 %blkCnt.09, -1
  %cmp = icmp eq i32 %dec, 0
  br i1 %cmp, label %while.end.loopexit, label %while.body

while.end.loopexit:                               ; preds = %while.body
  %add.lcssa = phi half [ %add, %while.body ]
  br label %while.end

while.end:                                        ; preds = %while.end.loopexit, %entry
  %sum.0.lcssa = phi half [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
  %conv = uitofp i32 %N to half
  %div = fdiv fast half %sum.0.lcssa, %conv
  store half %div, ptr %Output, align 2
  ret void
}

; @mixed_f32_i32_reduction: one loop carrying two reductions at once, an
; i32 `add nsw` over %iInput and a fast-math float fadd over %fInput, while
; %blkCnt counts down from N to 0. After the loop the integer sum is converted
; to float (sitofp); both sums are divided by N as float. The float quotient
; is stored to %fOutput and the other quotient is converted back with fptosi
; and stored to %iOutput.
; The checks below expect a vector.body block that contains a call to
; @llvm.masked.load and ends in a conditional branch back to %vector.body.
define dso_local void @mixed_f32_i32_reduction(ptr nocapture readonly %fInput, ptr nocapture readonly %iInput, i32 %N, ptr nocapture %fOutput, ptr nocapture %iOutput) {
; CHECK-LABEL: mixed_f32_i32_reduction(
; CHECK:       vector.body:
; CHECK:       @llvm.masked.load
; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
  %cmp15 = icmp eq i32 %N, 0
  br i1 %cmp15, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  br label %while.body

while.body:                                       ; preds = %while.body.preheader, %while.body
  %blkCnt.020 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ]
  %isum.019 = phi i32 [ %add2, %while.body ], [ 0, %while.body.preheader ]
  %fsum.018 = phi float [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
  %fInput.addr.017 = phi ptr [ %incdec.ptr, %while.body ], [ %fInput, %while.body.preheader ]
  %iInput.addr.016 = phi ptr [ %incdec.ptr1, %while.body ], [ %iInput, %while.body.preheader ]
  %incdec.ptr = getelementptr inbounds float, ptr %fInput.addr.017, i32 1
  %incdec.ptr1 = getelementptr inbounds i32, ptr %iInput.addr.016, i32 1
  %0 = load i32, ptr %iInput.addr.016, align 4
  %add2 = add nsw i32 %0, %isum.019
  %1 = load float, ptr %fInput.addr.017, align 4
  %add = fadd fast float %1, %fsum.018
  %dec = add i32 %blkCnt.020, -1
  %cmp = icmp eq i32 %dec, 0
  br i1 %cmp, label %while.end.loopexit, label %while.body

while.end.loopexit:                               ; preds = %while.body
  %add.lcssa = phi float [ %add, %while.body ]
  %add2.lcssa = phi i32 [ %add2, %while.body ]
  %phitmp = sitofp i32 %add2.lcssa to float
  br label %while.end

while.end:                                        ; preds = %while.end.loopexit, %entry
  %fsum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
  %isum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp, %while.end.loopexit ]
  %conv = uitofp i32 %N to float
  %div = fdiv fast float %fsum.0.lcssa, %conv
  store float %div, ptr %fOutput, align 4
  %div5 = fdiv fast float %isum.0.lcssa, %conv
  %conv6 = fptosi float %div5 to i32
  store i32 %conv6, ptr %iOutput, align 4
  ret void
}

; @i32_mul_reduction: i32 product reduction, S = 1 * B[0] * ... * B[N-1]
; using `mul nsw`. Returns 1 when the entry guard (N > 0) fails.
; The checks below expect a vector.body block that contains a call to
; @llvm.masked.load and ends in a conditional branch back to %vector.body.
define dso_local i32 @i32_mul_reduction(ptr noalias nocapture readonly %B, i32 %N) {
; CHECK-LABEL: i32_mul_reduction(
; CHECK:       vector.body:
; CHECK:       @llvm.masked.load
; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
  %cmp6 = icmp sgt i32 %N, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  %mul.lcssa = phi i32 [ %mul, %for.body ]
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  %S.0.lcssa = phi i32 [ 1, %entry ], [ %mul.lcssa, %for.cond.cleanup.loopexit ]
  ret i32 %S.0.lcssa

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %S.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.08
  %0 = load i32, ptr %arrayidx, align 4
  %mul = mul nsw i32 %0, %S.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; @i32_or_reduction: i32 bitwise-OR reduction over B[0..N) whose accumulator
; starts at 1. Returns 1 when the entry guard (N > 0) fails.
; The checks below expect a vector.body block that contains a call to
; @llvm.masked.load and ends in a conditional branch back to %vector.body.
define dso_local i32 @i32_or_reduction(ptr noalias nocapture readonly %B, i32 %N) {
; CHECK-LABEL: i32_or_reduction(
; CHECK:       vector.body:
; CHECK:       @llvm.masked.load
; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
  %cmp6 = icmp sgt i32 %N, 0
  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  %or.lcssa = phi i32 [ %or, %for.body ]
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  %S.0.lcssa = phi i32 [ 1, %entry ], [ %or.lcssa, %for.cond.cleanup.loopexit ]
  ret i32 %S.0.lcssa

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %S.07 = phi i32 [ %or, %for.body ], [ 1, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, ptr %B, i32 %i.08
  %0 = load i32, ptr %arrayidx, align 4
  %or = or i32 %0, %S.07
  %inc = add nuw nsw i32 %i.08, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}

; @i32_and_reduction: i32 bitwise-AND reduction over A[0..N) whose accumulator
; starts at the caller-supplied %S. Returns %S unchanged when the entry guard
; (N > 0) fails.
; The checks below expect a vector.body block that contains a call to
; @llvm.masked.load and ends in a conditional branch back to %vector.body.
define dso_local i32 @i32_and_reduction(ptr noalias nocapture readonly %A, i32 %N, i32 %S) {
; CHECK-LABEL: i32_and_reduction(
; CHECK:       vector.body:
; CHECK:       @llvm.masked.load
; CHECK:       br i1 %{{.*}}, label {{.*}}, label %vector.body
entry:
  %cmp5 = icmp sgt i32 %N, 0
  br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  %and.lcssa = phi i32 [ %and, %for.body ]
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  %S.addr.0.lcssa = phi i32 [ %S, %entry ], [ %and.lcssa, %for.cond.cleanup.loopexit ]
  ret i32 %S.addr.0.lcssa

for.body:                                         ; preds = %for.body.preheader, %for.body
  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %S.addr.06 = phi i32 [ %and, %for.body ], [ %S, %for.body.preheader ]
  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.07
  %0 = load i32, ptr %arrayidx, align 4
  %and = and i32 %0, %S.addr.06
  %inc = add nuw nsw i32 %i.07, 1
  %exitcond = icmp eq i32 %inc, %N
  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
}