; RUN: opt -mtriple armv7-linux-gnueabihf -passes=loop-vectorize -S %s -debug-only=loop-vectorize --disable-output -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv8-linux-gnu -passes=loop-vectorize -S %s -debug-only=loop-vectorize --disable-output -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=LINUX
; RUN: opt -mtriple armv8.1.m-none-eabi -mattr=+mve.fp -passes=loop-vectorize -S %s -debug-only=loop-vectorize --disable-output -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=MVE
; RUN: opt -mtriple armv7-unknown-darwin -passes=loop-vectorize -S %s -debug-only=loop-vectorize --disable-output -o /dev/null 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=DARWIN
; REQUIRES: asserts

target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"

; Tests the ability of the loop vectorizer to tell when SIMD is safe or not
; with respect to the IEEE 754 standard.
; On Linux, we only want the vectorizer to work when the -ffast-math flag is
; set, because NEON is not IEEE compliant.
; Darwin, on the other hand, doesn't support subnormals, and all optimizations
; are allowed, even without -ffast-math.
; The MVE run likewise expects the floating-point loops it checks to be
; vectorized without fast-math.
;
; The expectations are matched against the -debug-only=loop-vectorize output,
; which is why an assertions-enabled build is required.

; Integer loops are always vectorizable
; CHECK: Checking a loop in 'sumi'
; CHECK: We can vectorize this loop!
; C[i] = B[i] * A[i] over i32 elements, for i in [0, N); skipped when N == 0.
define void @sumi(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop.ph

loop.ph:                                          ; preds = %entry
  br label %loop

loop:                                             ; preds = %loop.ph, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds i32, ptr %A, i32 %iv
  %a.val = load i32, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds i32, ptr %B, i32 %iv
  %b.val = load i32, ptr %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %c.addr = getelementptr inbounds i32, ptr %C, i32 %iv
  store i32 %prod, ptr %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:                                        ; preds = %loop
  br label %exit

exit:                                             ; preds = %loop.exit, %entry
  ret void
}

; On Linux, floating-point loops need fast-math to be vectorizable; the MVE
; and Darwin runs expect the same loop to be vectorized without it.
; LINUX: Checking a loop in 'sumf'
; LINUX: Potentially unsafe FP op prevents vectorization
; MVE: Checking a loop in 'sumf'
; MVE: We can vectorize this loop!
; DARWIN: Checking a loop in 'sumf'
; DARWIN: We can vectorize this loop!
; C[i] = A[i] * B[i] over float elements, for i in [0, N); the fmul carries no
; fast-math flags.
define void @sumf(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop.ph

loop.ph:                                          ; preds = %entry
  br label %loop

loop:                                             ; preds = %loop.ph, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds float, ptr %A, i32 %iv
  %a.val = load float, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds float, ptr %B, i32 %iv
  %b.val = load float, ptr %b.addr, align 4
  %prod = fmul float %a.val, %b.val
  %c.addr = getelementptr inbounds float, ptr %C, i32 %iv
  store float %prod, ptr %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:                                        ; preds = %loop
  br label %exit

exit:                                             ; preds = %loop.exit, %entry
  ret void
}

; Integer reductions are always vectorizable
; CHECK: Checking a loop in 'redi'
; CHECK: We can vectorize this loop!
; Accumulates b[i] * a[i] (i32) over i in [0, N) and returns the running sum.
; The accumulator starts from undef, and undef is also returned when N == 0.
define i32 @redi(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop.ph

loop.ph:                                          ; preds = %entry
  br label %loop

loop:                                             ; preds = %loop.ph, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %acc = phi i32 [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds i32, ptr %a, i32 %iv
  %a.val = load i32, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds i32, ptr %b, i32 %iv
  %b.val = load i32, ptr %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %acc.next = add nsw i32 %prod, %acc
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:                                        ; preds = %loop
  %acc.lcssa = phi i32 [ %acc.next, %loop ]
  br label %exit

exit:                                             ; preds = %loop.exit, %entry
  %result = phi i32 [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret i32 %result
}

; On Linux, floating-point reductions need fast-math to be vectorizable; the
; MVE and Darwin runs expect the same loop to be vectorized without it.
; LINUX: Checking a loop in 'redf'
; LINUX: Potentially unsafe FP op prevents vectorization
; MVE: Checking a loop in 'redf'
; MVE: We can vectorize this loop!
; DARWIN: Checking a loop in 'redf'
; DARWIN: We can vectorize this loop!
; Accumulates a[i] * b[i] (float) over i in [0, N) and returns the running sum.
; Neither the fmul nor the fadd carries fast-math flags. The accumulator starts
; from undef, and undef is also returned when N == 0.
define float @redf(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop.ph

loop.ph:                                          ; preds = %entry
  br label %loop

loop:                                             ; preds = %loop.ph, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %acc = phi float [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds float, ptr %a, i32 %iv
  %a.val = load float, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds float, ptr %b, i32 %iv
  %b.val = load float, ptr %b.addr, align 4
  %prod = fmul float %a.val, %b.val
  %acc.next = fadd float %acc, %prod
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:                                        ; preds = %loop
  %acc.lcssa = phi float [ %acc.next, %loop ]
  br label %exit

exit:                                             ; preds = %loop.exit, %entry
  %result = phi float [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret float %result
}

; Make sure calls that turn into builtins are also covered. Only the Linux and
; Darwin runs have expectations for this function.
; LINUX: Checking a loop in 'fabs'
; LINUX: Potentially unsafe FP op prevents vectorization
; DARWIN: Checking a loop in 'fabs'
; DARWIN: We can vectorize this loop!
; C[i] = A[i] * fabsf(B[i]) over float elements, for i in [0, N). Neither the
; call nor the fmul carries fast-math flags; the call uses attribute group #1.
define void @fabs(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop

loop:                                             ; preds = %entry, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
  %a.addr = getelementptr inbounds float, ptr %A, i32 %iv
  %a.val = load float, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds float, ptr %B, i32 %iv
  %b.val = load float, ptr %b.addr, align 4
  %b.abs = tail call float @fabsf(float %b.val) #1
  %prod = fmul float %a.val, %b.abs
  %c.addr = getelementptr inbounds float, ptr %C, i32 %iv
  store float %prod, ptr %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %exit, label %loop

exit:                                             ; preds = %loop, %entry
  ret void
}

; Integer loops are always vectorizable
; CHECK: Checking a loop in 'sumi_fast'
; CHECK: We can vectorize this loop!
; C[i] = B[i] * A[i] over i32 elements, for i in [0, N); integer counterpart
; placed alongside the fast-math floating-point variants.
define void @sumi_fast(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop.ph

loop.ph:                                          ; preds = %entry
  br label %loop

loop:                                             ; preds = %loop.ph, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds i32, ptr %A, i32 %iv
  %a.val = load i32, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds i32, ptr %B, i32 %iv
  %b.val = load i32, ptr %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %c.addr = getelementptr inbounds i32, ptr %C, i32 %iv
  store i32 %prod, ptr %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:                                        ; preds = %loop
  br label %exit

exit:                                             ; preds = %loop.exit, %entry
  ret void
}

; Floating-point loops can be vectorized when fast-math flags are present
; CHECK: Checking a loop in 'sumf_fast'
; CHECK: We can vectorize this loop!
; C[i] = B[i] * A[i] over float elements, for i in [0, N); the fmul is marked
; `fast`.
define void @sumf_fast(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop.ph

loop.ph:                                          ; preds = %entry
  br label %loop

loop:                                             ; preds = %loop.ph, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %a.addr = getelementptr inbounds float, ptr %A, i32 %iv
  %a.val = load float, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds float, ptr %B, i32 %iv
  %b.val = load float, ptr %b.addr, align 4
  %prod = fmul fast float %b.val, %a.val
  %c.addr = getelementptr inbounds float, ptr %C, i32 %iv
  store float %prod, ptr %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:                                        ; preds = %loop
  br label %exit

exit:                                             ; preds = %loop.exit, %entry
  ret void
}

; Integer reductions are always vectorizable
; CHECK: Checking a loop in 'redi_fast'
; CHECK: We can vectorize this loop!
; Accumulates b[i] * a[i] (i32) over i in [0, N) and returns the running sum;
; integer counterpart placed alongside the fast-math floating-point variants.
; The accumulator starts from undef, and undef is also returned when N == 0.
define i32 @redi_fast(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop.ph

loop.ph:                                          ; preds = %entry
  br label %loop

loop:                                             ; preds = %loop.ph, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %acc = phi i32 [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds i32, ptr %a, i32 %iv
  %a.val = load i32, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds i32, ptr %b, i32 %iv
  %b.val = load i32, ptr %b.addr, align 4
  %prod = mul nsw i32 %b.val, %a.val
  %acc.next = add nsw i32 %prod, %acc
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:                                        ; preds = %loop
  %acc.lcssa = phi i32 [ %acc.next, %loop ]
  br label %exit

exit:                                             ; preds = %loop.exit, %entry
  %result = phi i32 [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret i32 %result
}

; Floating-point reductions can be vectorized when fast-math flags are present
; CHECK: Checking a loop in 'redf_fast'
; CHECK: We can vectorize this loop!
; Accumulates b[i] * a[i] (float) over i in [0, N) and returns the running sum.
; Both the fmul and the fadd are marked `fast`. The accumulator starts from
; undef, and undef is also returned when N == 0.
define float @redf_fast(ptr noalias nocapture readonly %a, ptr noalias nocapture readonly %b, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop.ph

loop.ph:                                          ; preds = %entry
  br label %loop

loop:                                             ; preds = %loop.ph, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %loop.ph ]
  %acc = phi float [ %acc.next, %loop ], [ undef, %loop.ph ]
  %a.addr = getelementptr inbounds float, ptr %a, i32 %iv
  %a.val = load float, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds float, ptr %b, i32 %iv
  %b.val = load float, ptr %b.addr, align 4
  %prod = fmul fast float %b.val, %a.val
  %acc.next = fadd fast float %prod, %acc
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %loop.exit, label %loop

loop.exit:                                        ; preds = %loop
  %acc.lcssa = phi float [ %acc.next, %loop ]
  br label %exit

exit:                                             ; preds = %loop.exit, %entry
  %result = phi float [ undef, %entry ], [ %acc.lcssa, %loop.exit ]
  ret float %result
}

; Make sure calls that turn into builtins are also covered when they carry
; fast-math flags
; CHECK: Checking a loop in 'fabs_fast'
; CHECK: We can vectorize this loop!
; C[i] = fabsf(B[i]) * A[i] over float elements, for i in [0, N). Both the call
; and the fmul are marked `fast`; the call uses attribute group #2.
define void @fabs_fast(ptr noalias nocapture readonly %A, ptr noalias nocapture readonly %B, ptr noalias nocapture %C, i32 %N) {
entry:
  %n.is.zero = icmp eq i32 %N, 0
  br i1 %n.is.zero, label %exit, label %loop

loop:                                             ; preds = %entry, %loop
  %iv = phi i32 [ %iv.next, %loop ], [ 0, %entry ]
  %a.addr = getelementptr inbounds float, ptr %A, i32 %iv
  %a.val = load float, ptr %a.addr, align 4
  %b.addr = getelementptr inbounds float, ptr %B, i32 %iv
  %b.val = load float, ptr %b.addr, align 4
  %b.abs = tail call fast float @fabsf(float %b.val) #2
  %prod = fmul fast float %b.abs, %a.val
  %c.addr = getelementptr inbounds float, ptr %C, i32 %iv
  store float %prod, ptr %c.addr, align 4
  %iv.next = add nuw nsw i32 %iv, 1
  %done = icmp eq i32 %iv.next, %N
  br i1 %done, label %exit, label %loop

exit:                                             ; preds = %loop, %entry
  ret void
}

; External declaration of the function called by @fabs and @fabs_fast.
declare float @fabsf(float)

; Call-site attribute groups: #1 is used by the call in @fabs (FP-math string
; attributes set to "false"), #2 by the call in @fabs_fast (set to "true").
attributes #1 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #2 = { nounwind readnone "disable-tail-calls"="false" "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+vfp3" "unsafe-fp-math"="true" "use-soft-float"="false" }