; RUN: opt -passes="loop-vectorize" -mtriple=x86_64-unknown-linux -S -debug %s 2>&1 | FileCheck %s
; REQUIRES: asserts

target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"

target triple = "x86_64-unknown-linux"

declare double @llvm.pow.f64(double, double)

; Test case where the memory runtime checks and vector body is more expensive
; than running the scalar loop.
define void @test(ptr nocapture %A, ptr nocapture %B, ptr nocapture %C, ptr nocapture %D, ptr nocapture %E) {

; CHECK: Calculating cost of runtime checks:
; CHECK-NEXT: 0 for {{.+}} = getelementptr i8, ptr %A, i64 128
; CHECK-NEXT: 0 for {{.+}} = getelementptr i8, ptr %B, i64 128
; CHECK-NEXT: 0 for {{.+}} = getelementptr i8, ptr %E, i64 128
; CHECK-NEXT: 0 for {{.+}} = getelementptr i8, ptr %C, i64 128
; CHECK-NEXT: 0 for {{.+}} = getelementptr i8, ptr %D, i64 128
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = and i1
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = and i1
; CHECK-NEXT: 1 for {{.+}} = or i1
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = and i1
; CHECK-NEXT: 1 for {{.+}} = or i1
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = and i1
; CHECK-NEXT: 1 for {{.+}} = or i1
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = and i1
; CHECK-NEXT: 1 for {{.+}} = or i1
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = and i1
; CHECK-NEXT: 1 for {{.+}} = or i1
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = and i1
; CHECK-NEXT: 1 for {{.+}} = or i1
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = and i1
; CHECK-NEXT: 1 for {{.+}} = or i1
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = icmp ult ptr
; CHECK-NEXT: 1 for {{.+}} = and i1
; CHECK-NEXT: 1 for {{.+}} = or i1
; CHECK-NEXT: Total cost of runtime checks: 35

; CHECK: LV: Vectorization is not beneficial: expected trip count < minimum profitable VF (16 < 24)
;
; CHECK-LABEL: @test(
; CHECK-NEXT: entry:
; CHECK-NEXT: br label %for.body
; CHECK-NOT: vector.memcheck
; CHECK-NOT: vector.body
;
entry:
  br label %for.body

for.body:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
  %gep.A = getelementptr inbounds double, ptr %A, i64 %iv
  %l.A = load double, ptr %gep.A, align 4
  store double 0.0, ptr %gep.A, align 4
  %p.1 = call double @llvm.pow.f64(double %l.A, double 2.0)

  %gep.B = getelementptr inbounds double, ptr %B, i64 %iv
  %l.B = load double, ptr %gep.B, align 4
  %p.2 = call double @llvm.pow.f64(double %l.B, double %p.1)
  store double 0.0, ptr %gep.B, align 4

  %gep.C = getelementptr inbounds double, ptr %C, i64 %iv
  %l.C = load double, ptr %gep.C, align 4
  %p.3 = call double @llvm.pow.f64(double %p.1, double %l.C)

  %gep.D = getelementptr inbounds double, ptr %D, i64 %iv
  %l.D = load double, ptr %gep.D
  %p.4 = call double @llvm.pow.f64(double %p.2, double %l.D)
  %p.5 = call double @llvm.pow.f64(double %p.4, double %p.3)
  %mul = fmul double 2.0, %p.5
  %mul.2 = fmul double %mul, 2.0
  %mul.3 = fmul double %mul, %mul.2
  %gep.E = getelementptr inbounds double, ptr %E, i64 %iv
  store double %mul.3, ptr %gep.E, align 4
  %iv.next = add i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, 16
  br i1 %exitcond, label %for.end, label %for.body

for.end:
  ret void
}