; xref: /llvm-project/llvm/test/Analysis/CostModel/X86/intrinsic-cost.ll (revision 68c50b111d74afb9489cf97770fa917d0a1c7f77)
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=core2 -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s -check-prefix=CORE2
; RUN: opt -S -mtriple=x86_64-apple-darwin -mcpu=corei7 -passes="print<cost-model>" 2>&1 -disable-output < %s | FileCheck %s -check-prefix=COREI7

; test1: cost of a <4 x float> llvm.ceil call.
; If the SSE4.1 roundps instruction is available, the call is cheap to lower;
; otherwise it is scalarized into calls, which are expensive.
;
; The loop walks %f four floats at a time (index += 4) and exits once the
; index reaches 1024. Only the cost reported for the intrinsic call (%1) is
; checked below, once per check prefix.
define void @test1(ptr nocapture %f) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %f, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  ; Instruction under test: its printed form must stay in sync with the
  ; check lines at the end of this function.
  %1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)
  store <4 x float> %1, ptr %0, align 4
  %index.next = add i64 %index, 4
  %2 = icmp eq i64 %index.next, 1024
  br i1 %2, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; CORE2: function 'test1'
; CORE2: Cost Model: Found an estimated cost of 46 for instruction:   %1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)

; COREI7: function 'test1'
; COREI7: Cost Model: Found an estimated cost of 1 for instruction:   %1 = call <4 x float> @llvm.ceil.v4f32(<4 x float> %wide.load)

}

; Declaration of the <4 x float> ceil intrinsic called by @test1.
declare <4 x float> @llvm.ceil.v4f32(<4 x float>)  nounwind readnone

; test2: cost of a <4 x float> llvm.nearbyint call.
; Same loop shape as @test1 (four floats per iteration, exit when the index
; reaches 1024); only the intrinsic differs. The expected costs are the same
; as for llvm.ceil in @test1.
; NOTE(review): presumably for the same reason (roundps availability) --
; confirm against the X86 cost tables.
define void @test2(ptr nocapture %f) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %f, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  ; Instruction under test: its printed form must stay in sync with the
  ; check lines at the end of this function.
  %1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)
  store <4 x float> %1, ptr %0, align 4
  %index.next = add i64 %index, 4
  %2 = icmp eq i64 %index.next, 1024
  br i1 %2, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; CORE2: function 'test2'
; CORE2: Cost Model: Found an estimated cost of 46 for instruction:   %1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)

; COREI7: function 'test2'
; COREI7: Cost Model: Found an estimated cost of 1 for instruction:   %1 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %wide.load)

}

; Declaration of the <4 x float> nearbyint intrinsic called by @test2.
declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>)  nounwind readnone

; test3: cost of a <4 x float> llvm.fmuladd call.
; Each iteration loads four floats from %f, computes fmuladd(load, %b, %c)
; with the loop-invariant vector arguments %b and %c, and stores the result
; back. The index advances by 4 and the loop exits when it reaches 1024.
; Only the cost reported for the intrinsic call (%1) is checked below.
; NOTE(review): the expected costs presumably reflect a separate multiply and
; add on these CPUs -- confirm against the X86 cost tables.
define void @test3(ptr nocapture %f, <4 x float> %b, <4 x float> %c) nounwind {
vector.ph:
  br label %vector.body

vector.body:                                      ; preds = %vector.body, %vector.ph
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  %0 = getelementptr inbounds float, ptr %f, i64 %index
  %wide.load = load <4 x float>, ptr %0, align 4
  ; Instruction under test: its printed form must stay in sync with the
  ; check lines at the end of this function.
  %1 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)
  store <4 x float> %1, ptr %0, align 4
  %index.next = add i64 %index, 4
  %2 = icmp eq i64 %index.next, 1024
  br i1 %2, label %for.end, label %vector.body

for.end:                                          ; preds = %vector.body
  ret void

; CORE2: function 'test3'
; CORE2: Cost Model: Found an estimated cost of 4 for instruction: %1 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)

; COREI7: function 'test3'
; COREI7: Cost Model: Found an estimated cost of 2 for instruction: %1 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %wide.load, <4 x float> %b, <4 x float> %c)

}

; Declaration of the <4 x float> fmuladd intrinsic called by @test3.
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
