1; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core2 -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s -check-prefix=CORE2 2; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=corei7 -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s -check-prefix=COREI7 3 4; If SSE4.1 roundps instruction is available it is cheap to lower, otherwise 5; it'll be scalarized into calls which are expensive. 6define void @test1(ptr nocapture %f) nounwind { 7vector.ph: 8 br label %vector.body 9 10vector.body: ; preds = %vector.body, %vector.ph 11 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] 12 %0 = getelementptr inbounds float, ptr %f, i64 %index 13 %wide.load = load <4 x float>, ptr %0, align 4 14 %1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) 15 store <4 x float> %1, ptr %0, align 4 16 %index.next = add i64 %index, 4 17 %2 = icmp eq i64 %index.next, 1024 18 br i1 %2, label %for.end, label %vector.body 19 20for.end: ; preds = %vector.body 21 ret void 22 23; CORE2: function 'test1' 24; CORE2: Cost Model: Found an estimated cost of 46 for instruction: %1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) 25 26; COREI7: function 'test1' 27; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load) 28 29} 30 31declare <4 x float> @llvm.ceil.v4f32(<4 x float>) nounwind readnone 32 33define void @test2(ptr nocapture %f) nounwind { 34vector.ph: 35 br label %vector.body 36 37vector.body: ; preds = %vector.body, %vector.ph 38 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] 39 %0 = getelementptr inbounds float, ptr %f, i64 %index 40 %wide.load = load <4 x float>, ptr %0, align 4 41 %1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load) 42 store <4 x float> %1, ptr %0, align 4 43 %index.next = add i64 %index, 4 44 %2 = icmp eq i64 %index.next, 1024 45 br i1 %2, label %for.end, label %vector.body 46 47for.end: ; preds = %vector.body 48 ret void 49 50; CORE2: function 'test2' 51; CORE2: Cost Model: Found an estimated cost of 46 for instruction: %1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load) 52 53; COREI7: function 'test2' 54; COREI7: Cost Model: Found an estimated cost of 1 for instruction: %1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load) 55 56} 57 58declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) nounwind readnone 59 60define void @test3(ptr nocapture %f, <4 x float> %b, <4 x float> %c) nounwind { 61vector.ph: 62 br label %vector.body 63 64vector.body: ; preds = %vector.body, %vector.ph 65 %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] 66 %0 = getelementptr inbounds float, ptr %f, i64 %index 67 %wide.load = load <4 x float>, ptr %0, align 4 68 %1 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) 69 store <4 x float> %1, ptr %0, align 4 70 %index.next = add i64 %index, 4 71 %2 = icmp eq i64 %index.next, 1024 72 br i1 %2, label %for.end, label %vector.body 73 74for.end: ; preds = %vector.body 75 ret void 76 77; CORE2: function 'test3' 78; CORE2: Cost Model: Found an estimated cost of 4 for instruction: %1 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) 79 80; COREI7: function 'test3' 81; COREI7: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c) 82 83} 84 85declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone 86