xref: /llvm-project/llvm/test/Transforms/LoopVectorize/ARM/arm-ieee-vectorize.ll (revision c836b8956d393f98e0d4e136799a33f1bd06e5f5)
1; RUN: opt -mtriple armv7-linux-gnueabihf -passes=loop-vectorize -S %s -debug-only=loop-vectorize --disable-output -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
2; RUN: opt -mtriple armv8-linux-gnu -passes=loop-vectorize -S %s -debug-only=loop-vectorize --disable-output -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
3; RUN: opt -mtriple armv8.1.m-none-eabi -mattr=+mve.fp -passes=loop-vectorize -S %s -debug-only=loop-vectorize --disable-output -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=MVE
4; RUN: opt -mtriple armv7-unknown-darwin -passes=loop-vectorize -S %s -debug-only=loop-vectorize --disable-output -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
5; REQUIRES: asserts
6
; Module-wide data layout string, shared by every opt invocation above since
; they all read this same file; `e` selects little-endian and `p:32:32`
; 32-bit pointers.
7target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
8
9; Tests whether the loop vectorizer can tell when SIMD is safe with respect
10; to the IEEE 754 standard.
11; On Linux, we only want the vectorizer to work when the -ffast-math flag is
12; set, because NEON is not IEEE compliant.
13; Darwin, on the other hand, doesn't support subnormals, so all optimizations
14; are allowed, even without -ffast-math.
15
16; Integer loops are always vectorizable
17; CHECK: Checking a loop in 'sumi'
18; CHECK: We can vectorize this loop!
; @sumi multiplies two i32 arrays element by element and stores the products,
; i.e. C[i] = B[i] * A[i] for i in [0, N). The loop body contains no
; floating-point operations.
19define void @sumi(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
20entry:
21  %cmp5 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
22  br i1 %cmp5, label %for.end, label %for.body.preheader
23
24for.body.preheader:                               ; preds = %entry
25  br label %for.body
26
27for.body:                                         ; preds = %for.body.preheader, %for.body
28  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop index, starts at 0
29  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.06
30  %0 = load i32, ptr %arrayidx, align 4
31  %arrayidx1 = getelementptr inbounds i32, ptr %B, i32 %i.06
32  %1 = load i32, ptr %arrayidx1, align 4
33  %mul = mul nsw i32 %1, %0
34  %arrayidx2 = getelementptr inbounds i32, ptr %C, i32 %i.06
35  store i32 %mul, ptr %arrayidx2, align 4
36  %inc = add nuw nsw i32 %i.06, 1
37  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
38  br i1 %exitcond, label %for.end.loopexit, label %for.body
39
40for.end.loopexit:                                 ; preds = %for.body
41  br label %for.end
42
43for.end:                                          ; preds = %for.end.loopexit, %entry
44  ret void
45}
46
47; Floating-point loops need fast-math to be vectorizable on Linux, while the MVE and Darwin runs vectorize them without it
48; LINUX: Checking a loop in 'sumf'
49; LINUX: Potentially unsafe FP op prevents vectorization
50; MVE: Checking a loop in 'sumf'
51; MVE: We can vectorize this loop!
52; DARWIN: Checking a loop in 'sumf'
53; DARWIN: We can vectorize this loop!
; @sumf is an element-wise float product loop, C[i] = A[i] * B[i] for i in
; [0, N). The fmul carries no fast-math flags.
54define void @sumf(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
55entry:
56  %cmp5 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
57  br i1 %cmp5, label %for.end, label %for.body.preheader
58
59for.body.preheader:                               ; preds = %entry
60  br label %for.body
61
62for.body:                                         ; preds = %for.body.preheader, %for.body
63  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop index, starts at 0
64  %arrayidx = getelementptr inbounds float, ptr %A, i32 %i.06
65  %0 = load float, ptr %arrayidx, align 4
66  %arrayidx1 = getelementptr inbounds float, ptr %B, i32 %i.06
67  %1 = load float, ptr %arrayidx1, align 4
68  %mul = fmul float %0, %1  ; plain fmul, no fast-math flags
69  %arrayidx2 = getelementptr inbounds float, ptr %C, i32 %i.06
70  store float %mul, ptr %arrayidx2, align 4
71  %inc = add nuw nsw i32 %i.06, 1
72  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
73  br i1 %exitcond, label %for.end.loopexit, label %for.body
74
75for.end.loopexit:                                 ; preds = %for.body
76  br label %for.end
77
78for.end:                                          ; preds = %for.end.loopexit, %entry
79  ret void
80}
81
82; Integer loops are always vectorizable
83; CHECK: Checking a loop in 'redi'
84; CHECK: We can vectorize this loop!
; @redi accumulates b[i] * a[i] over i in [0, N) into an i32 and returns the
; total. The accumulator phi is seeded with undef, and undef is also the value
; for.end yields when N == 0.
85define i32 @redi(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
86entry:
87  %cmp5 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
88  br i1 %cmp5, label %for.end, label %for.body.preheader
89
90for.body.preheader:                               ; preds = %entry
91  br label %for.body
92
93for.body:                                         ; preds = %for.body.preheader, %for.body
94  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop index, starts at 0
95  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running sum, undef on entry
96  %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07
97  %0 = load i32, ptr %arrayidx, align 4
98  %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07
99  %1 = load i32, ptr %arrayidx1, align 4
100  %mul = mul nsw i32 %1, %0
101  %add = add nsw i32 %mul, %Red.06  ; fold this iteration's product into the sum
102  %inc = add nuw nsw i32 %i.07, 1
103  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
104  br i1 %exitcond, label %for.end.loopexit, label %for.body
105
106for.end.loopexit:                                 ; preds = %for.body
107  %add.lcssa = phi i32 [ %add, %for.body ]
108  br label %for.end
109
110for.end:                                          ; preds = %for.end.loopexit, %entry
111  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]  ; undef when the loop never ran
112  ret i32 %Red.0.lcssa
113}
114
115; Floating-point loops need fast-math to be vectorizable on Linux, while the MVE and Darwin runs vectorize them without it
116; LINUX: Checking a loop in 'redf'
117; LINUX: Potentially unsafe FP op prevents vectorization
118; MVE: Checking a loop in 'redf'
119; MVE: We can vectorize this loop!
120; DARWIN: Checking a loop in 'redf'
121; DARWIN: We can vectorize this loop!
; @redf accumulates a[i] * b[i] over i in [0, N) into a float and returns the
; total. Neither the fmul nor the fadd carries fast-math flags. The
; accumulator phi is seeded with undef, and undef is also the value for.end
; yields when N == 0.
122define float @redf(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
123entry:
124  %cmp5 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
125  br i1 %cmp5, label %for.end, label %for.body.preheader
126
127for.body.preheader:                               ; preds = %entry
128  br label %for.body
129
130for.body:                                         ; preds = %for.body.preheader, %for.body
131  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop index, starts at 0
132  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running sum, undef on entry
133  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07
134  %0 = load float, ptr %arrayidx, align 4
135  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07
136  %1 = load float, ptr %arrayidx1, align 4
137  %mul = fmul float %0, %1  ; plain fmul, no fast-math flags
138  %add = fadd float %Red.06, %mul  ; plain fadd, no fast-math flags
139  %inc = add nuw nsw i32 %i.07, 1
140  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
141  br i1 %exitcond, label %for.end.loopexit, label %for.body
142
143for.end.loopexit:                                 ; preds = %for.body
144  %add.lcssa = phi float [ %add, %for.body ]
145  br label %for.end
146
147for.end:                                          ; preds = %for.end.loopexit, %entry
148  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]  ; undef when the loop never ran
149  ret float %Red.0.lcssa
150}
151
152; Make sure calls that turn into builtins are also covered
153; LINUX: Checking a loop in 'fabs'
154; LINUX: Potentially unsafe FP op prevents vectorization
155; DARWIN: Checking a loop in 'fabs'
156; DARWIN: We can vectorize this loop!
; @fabs stores C[i] = A[i] * fabsf(B[i]) for i in [0, N). Neither the call to
; @fabsf (attribute group #1) nor the fmul carries fast-math flags. The loop
; is entered directly from entry and exits directly to for.end.
157define void @fabs(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
158entry:
159  %cmp10 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
160  br i1 %cmp10, label %for.end, label %for.body
161
162for.body:                                         ; preds = %entry, %for.body
163  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index, starts at 0
164  %arrayidx = getelementptr inbounds float, ptr %A, i32 %i.011
165  %0 = load float, ptr %arrayidx, align 4
166  %arrayidx1 = getelementptr inbounds float, ptr %B, i32 %i.011
167  %1 = load float, ptr %arrayidx1, align 4
168  %fabsf = tail call float @fabsf(float %1) #1  ; library call, no fast-math flags
169  %conv3 = fmul float %0, %fabsf  ; plain fmul, no fast-math flags
170  %arrayidx4 = getelementptr inbounds float, ptr %C, i32 %i.011
171  store float %conv3, ptr %arrayidx4, align 4
172  %inc = add nuw nsw i32 %i.011, 1
173  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
174  br i1 %exitcond, label %for.end, label %for.body
175
176for.end:                                          ; preds = %for.body, %entry
177  ret void
178}
179
180; Integer loops are always vectorizable
181; CHECK: Checking a loop in 'sumi_fast'
182; CHECK: We can vectorize this loop!
; @sumi_fast stores C[i] = B[i] * A[i] for i in [0, N) on i32 arrays. Its body
; is instruction-for-instruction the same as @sumi; only the function name
; differs.
183define void @sumi_fast(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
184entry:
185  %cmp5 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
186  br i1 %cmp5, label %for.end, label %for.body.preheader
187
188for.body.preheader:                               ; preds = %entry
189  br label %for.body
190
191for.body:                                         ; preds = %for.body.preheader, %for.body
192  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop index, starts at 0
193  %arrayidx = getelementptr inbounds i32, ptr %A, i32 %i.06
194  %0 = load i32, ptr %arrayidx, align 4
195  %arrayidx1 = getelementptr inbounds i32, ptr %B, i32 %i.06
196  %1 = load i32, ptr %arrayidx1, align 4
197  %mul = mul nsw i32 %1, %0
198  %arrayidx2 = getelementptr inbounds i32, ptr %C, i32 %i.06
199  store i32 %mul, ptr %arrayidx2, align 4
200  %inc = add nuw nsw i32 %i.06, 1
201  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
202  br i1 %exitcond, label %for.end.loopexit, label %for.body
203
204for.end.loopexit:                                 ; preds = %for.body
205  br label %for.end
206
207for.end:                                          ; preds = %for.end.loopexit, %entry
208  ret void
209}
210
211; Floating-point loops can be vectorized with fast-math
212; CHECK: Checking a loop in 'sumf_fast'
213; CHECK: We can vectorize this loop!
; @sumf_fast stores C[i] = B[i] * A[i] for i in [0, N) on float arrays. The
; multiply is an `fmul fast`, i.e. it carries the fast-math flags.
214define void @sumf_fast(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
215entry:
216  %cmp5 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
217  br i1 %cmp5, label %for.end, label %for.body.preheader
218
219for.body.preheader:                               ; preds = %entry
220  br label %for.body
221
222for.body:                                         ; preds = %for.body.preheader, %for.body
223  %i.06 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop index, starts at 0
224  %arrayidx = getelementptr inbounds float, ptr %A, i32 %i.06
225  %0 = load float, ptr %arrayidx, align 4
226  %arrayidx1 = getelementptr inbounds float, ptr %B, i32 %i.06
227  %1 = load float, ptr %arrayidx1, align 4
228  %mul = fmul fast float %1, %0  ; fast-math flags set on the multiply
229  %arrayidx2 = getelementptr inbounds float, ptr %C, i32 %i.06
230  store float %mul, ptr %arrayidx2, align 4
231  %inc = add nuw nsw i32 %i.06, 1
232  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
233  br i1 %exitcond, label %for.end.loopexit, label %for.body
234
235for.end.loopexit:                                 ; preds = %for.body
236  br label %for.end
237
238for.end:                                          ; preds = %for.end.loopexit, %entry
239  ret void
240}
241
242; Integer loops are always vectorizable
243; CHECK: Checking a loop in 'redi_fast'
244; CHECK: We can vectorize this loop!
; @redi_fast accumulates b[i] * a[i] over i in [0, N) into an i32 and returns
; the total. Its body is instruction-for-instruction the same as @redi; the
; accumulator is seeded with undef, which is also the result when N == 0.
245define i32 @redi_fast(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
246entry:
247  %cmp5 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
248  br i1 %cmp5, label %for.end, label %for.body.preheader
249
250for.body.preheader:                               ; preds = %entry
251  br label %for.body
252
253for.body:                                         ; preds = %for.body.preheader, %for.body
254  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop index, starts at 0
255  %Red.06 = phi i32 [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running sum, undef on entry
256  %arrayidx = getelementptr inbounds i32, ptr %a, i32 %i.07
257  %0 = load i32, ptr %arrayidx, align 4
258  %arrayidx1 = getelementptr inbounds i32, ptr %b, i32 %i.07
259  %1 = load i32, ptr %arrayidx1, align 4
260  %mul = mul nsw i32 %1, %0
261  %add = add nsw i32 %mul, %Red.06  ; fold this iteration's product into the sum
262  %inc = add nuw nsw i32 %i.07, 1
263  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
264  br i1 %exitcond, label %for.end.loopexit, label %for.body
265
266for.end.loopexit:                                 ; preds = %for.body
267  %add.lcssa = phi i32 [ %add, %for.body ]
268  br label %for.end
269
270for.end:                                          ; preds = %for.end.loopexit, %entry
271  %Red.0.lcssa = phi i32 [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]  ; undef when the loop never ran
272  ret i32 %Red.0.lcssa
273}
274
275; Floating-point loops can be vectorized with fast-math
276; CHECK: Checking a loop in 'redf_fast'
277; CHECK: We can vectorize this loop!
; @redf_fast accumulates b[i] * a[i] over i in [0, N) into a float and returns
; the total. Both the multiply and the add carry the `fast` flag. The
; accumulator phi is seeded with undef, and undef is also the value for.end
; yields when N == 0.
278define float @redf_fast(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
279entry:
280  %cmp5 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
281  br i1 %cmp5, label %for.end, label %for.body.preheader
282
283for.body.preheader:                               ; preds = %entry
284  br label %for.body
285
286for.body:                                         ; preds = %for.body.preheader, %for.body
287  %i.07 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]  ; loop index, starts at 0
288  %Red.06 = phi float [ %add, %for.body ], [ undef, %for.body.preheader ]  ; running sum, undef on entry
289  %arrayidx = getelementptr inbounds float, ptr %a, i32 %i.07
290  %0 = load float, ptr %arrayidx, align 4
291  %arrayidx1 = getelementptr inbounds float, ptr %b, i32 %i.07
292  %1 = load float, ptr %arrayidx1, align 4
293  %mul = fmul fast float %1, %0  ; fast-math flags set on the multiply
294  %add = fadd fast float %mul, %Red.06  ; fast-math flags set on the reduction add
295  %inc = add nuw nsw i32 %i.07, 1
296  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
297  br i1 %exitcond, label %for.end.loopexit, label %for.body
298
299for.end.loopexit:                                 ; preds = %for.body
300  %add.lcssa = phi float [ %add, %for.body ]
301  br label %for.end
302
303for.end:                                          ; preds = %for.end.loopexit, %entry
304  %Red.0.lcssa = phi float [ undef, %entry ], [ %add.lcssa, %for.end.loopexit ]  ; undef when the loop never ran
305  ret float %Red.0.lcssa
306}
307
308; Make sure calls that turn into builtins are also covered
309; CHECK: Checking a loop in 'fabs_fast'
310; CHECK: We can vectorize this loop!
; @fabs_fast stores C[i] = fabsf(B[i]) * A[i] for i in [0, N). Both the call
; to @fabsf (attribute group #2) and the fmul carry the `fast` flag. The loop
; is entered directly from entry and exits directly to for.end.
311define void @fabs_fast(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
312entry:
313  %cmp10 = icmp eq i32 %N, 0  ; N == 0 skips the loop entirely
314  br i1 %cmp10, label %for.end, label %for.body
315
316for.body:                                         ; preds = %entry, %for.body
317  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %entry ]  ; loop index, starts at 0
318  %arrayidx = getelementptr inbounds float, ptr %A, i32 %i.011
319  %0 = load float, ptr %arrayidx, align 4
320  %arrayidx1 = getelementptr inbounds float, ptr %B, i32 %i.011
321  %1 = load float, ptr %arrayidx1, align 4
322  %fabsf = tail call fast float @fabsf(float %1) #2  ; library call with fast-math flags
323  %conv3 = fmul fast float %fabsf, %0  ; fast-math flags set on the multiply
324  %arrayidx4 = getelementptr inbounds float, ptr %C, i32 %i.011
325  store float %conv3, ptr %arrayidx4, align 4
326  %inc = add nuw nsw i32 %i.011, 1
327  %exitcond = icmp eq i32 %inc, %N  ; leave once the next index equals N
328  br i1 %exitcond, label %for.end, label %for.body
329
330for.end:                                          ; preds = %for.body, %entry
331  ret void
332}
333
; External declaration of fabsf; it is called from @fabs and @fabs_fast.
334declare float @fabsf(float)
335
; Group #1 is attached to the fabsf call in @fabs; its "no-infs-fp-math",
; "no-nans-fp-math" and "unsafe-fp-math" strings are all "false".
336attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
; Group #2 is attached to the fabsf call in @fabs_fast; it differs from #1
; only in setting those three fp-math strings to "true".
337attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }
338