xref: /llvm-project/llvm/test/CodeGen/ARM/cortex-a57-misched-vfma.ll (revision fb8c9a339a9d0b78370fbd814d62dd5779f1e196)
1; REQUIRES: asserts
2; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-DEFAULT
3; RUN: llc < %s -mtriple=armv8r-eabi -mcpu=cortex-a57 -enable-misched -verify-misched -debug-only=machine-scheduler -o - 2>&1 > /dev/null -fp-contract=fast | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FAST
4; Check latencies of vmul/vfma accumulate chains.
5
6define arm_aapcs_vfpcc float @Test1(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
7; CHECK:       ********** MI Scheduling **********
8; CHECK:       Test1:%bb.0
9
10; CHECK:       VMULS
11; > VMULS common latency = 5
12; CHECK:       Latency            : 5
13; CHECK:       Successors:
14; CHECK:       Data
15; > VMULS read-advanced latency to VMLAS/VFMAS = 0
16; CHECK-SAME:  Latency=0
17
18; CHECK-DEFAULT: VMLAS
19; CHECK-FAST:    VFMAS
20; > VMLAS/VFMAS common latency = 9
21; CHECK:       Latency            : 9
22; CHECK:       Successors:
23; CHECK:       Data
24; > VMLAS/VFMAS read-advanced latency to the next VMLAS/VFMAS = 4
25; CHECK-SAME:  Latency=4
26
27; CHECK-DEFAULT: VMLAS
28; CHECK-FAST:    VFMAS
29; CHECK:       Latency            : 9
30; CHECK:       Successors:
31; CHECK:       Data
32; > VMLAS/VFMAS not-optimized latency to VMOVRS = 9
33; CHECK-SAME:  Latency=9
34
35; f1 * f2 + f3 * f4 + f5 * f6  ==>  VMULS, VMLAS, VMLAS (VFMAS instead of VMLAS in the -fp-contract=fast run)
36  %mul1 = fmul float %f1, %f2
37  %mul2 = fmul float %f3, %f4
38  %mul3 = fmul float %f5, %f6
39  %add1 = fadd float %mul1, %mul2
40  %add2 = fadd float %add1, %mul3
41  ret float %add2
42}
43
44; ASIMD form
45define arm_aapcs_vfpcc <2 x float> @Test2(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
46; CHECK:       ********** MI Scheduling **********
47; CHECK:       Test2:%bb.0
48
49; CHECK:       VMULfd
50; > VMULfd common latency = 5
51; CHECK:       Latency            : 5
52; CHECK:       Successors:
53; CHECK:       Data
54; > VMULfd read-advanced latency to VMLAfd/VFMAfd = 0
55; CHECK-SAME:  Latency=0
56
57; CHECK-DEFAULT: VMLAfd
58; CHECK-FAST:    VFMAfd
59; > VMLAfd/VFMAfd common latency = 9
60; CHECK:       Latency            : 9
61; CHECK:       Successors:
62; CHECK:       Data
63; > VMLAfd/VFMAfd read-advanced latency to the next VMLAfd/VFMAfd = 4
64; CHECK-SAME:  Latency=4
65
66; CHECK-DEFAULT: VMLAfd
67; CHECK-FAST:    VFMAfd
68; CHECK:       Latency            : 9
69; CHECK:       Successors:
70; CHECK:       Data
71; > VMLAfd/VFMAfd not-optimized latency to VMOVRRD = 9
72; CHECK-SAME:  Latency=9
73
74; f1 * f2 + f3 * f4 + f5 * f6  ==>  VMULfd, VMLAfd, VMLAfd (VFMAfd instead of VMLAfd in the -fp-contract=fast run)
75  %mul1 = fmul <2 x float> %f1, %f2
76  %mul2 = fmul <2 x float> %f3, %f4
77  %mul3 = fmul <2 x float> %f5, %f6
78  %add1 = fadd <2 x float> %mul1, %mul2
79  %add2 = fadd <2 x float> %add1, %mul3
80  ret <2 x float> %add2
81}
82
83define arm_aapcs_vfpcc float @Test3(float %f1, float %f2, float %f3, float %f4, float %f5, float %f6) {
84; CHECK:       ********** MI Scheduling **********
85; CHECK:       Test3:%bb.0
86
87; CHECK:       VMULS
88; > VMULS common latency = 5
89; CHECK:       Latency            : 5
90; CHECK:       Successors:
91; CHECK:       Data
92; > VMULS read-advanced latency to VMLSS/VFNMSS = 0
93; CHECK-SAME:  Latency=0
94
95; CHECK-DEFAULT: VMLSS
96; CHECK-FAST:    VFNMSS
97; > VMLSS/VFNMSS common latency = 9
98; CHECK:       Latency            : 9
99; CHECK:       Successors:
100; CHECK:       Data
101; > VMLSS/VFNMSS read-advanced latency to the next VMLSS/VFMSS = 4
102; CHECK-SAME:  Latency=4
103
104; CHECK-DEFAULT: VMLSS
105; CHECK-FAST:    VFMSS
106; CHECK:       Latency            : 9
107; CHECK:       Successors:
108; CHECK:       Data
109; > VMLSS/VFMSS not-optimized latency to VMOVRS = 9
110; CHECK-SAME:  Latency=9
111
112; f1 * f2 - f3 * f4 - f5 * f6  ==>  VMULS, VMLSS, VMLSS (VMULS, VFNMSS, VFMSS in the -fp-contract=fast run)
113  %mul1 = fmul float %f1, %f2
114  %mul2 = fmul float %f3, %f4
115  %mul3 = fmul float %f5, %f6
116  %sub1 = fsub float %mul1, %mul2
117  %sub2 = fsub float %sub1, %mul3
118  ret float %sub2
119}
120
121; ASIMD form
122define arm_aapcs_vfpcc <2 x float> @Test4(<2 x float> %f1, <2 x float> %f2, <2 x float> %f3, <2 x float> %f4, <2 x float> %f5, <2 x float> %f6) {
123; CHECK:       ********** MI Scheduling **********
124; CHECK:       Test4:%bb.0
125
126; CHECK:       VMULfd
127; > VMULfd common latency = 5
128; CHECK:       Latency            : 5
129; CHECK:       Successors:
130; CHECK:       Data
131; > VMULfd read-advanced latency to VMLSfd/VFMSfd = 0
132; CHECK-SAME:  Latency=0
133
134; CHECK-DEFAULT: VMLSfd
135; CHECK-FAST:    VFMSfd
136; > VMLSfd/VFMSfd common latency = 9
137; CHECK:       Latency            : 9
138; CHECK:       Successors:
139; CHECK:       Data
140; > VMLSfd/VFMSfd read-advanced latency to the next VMLSfd/VFMSfd = 4
141; CHECK-SAME:  Latency=4
142
143; CHECK-DEFAULT: VMLSfd
144; CHECK-FAST:    VFMSfd
145; CHECK:       Latency            : 9
146; CHECK:       Successors:
147; CHECK:       Data
148; > VMLSfd/VFMSfd not-optimized latency to VMOVRRD = 9
149; CHECK-SAME:  Latency=9
150
151; f1 * f2 - f3 * f4 - f5 * f6  ==>  VMULfd, VMLSfd, VMLSfd (VFMSfd instead of VMLSfd in the -fp-contract=fast run)
152  %mul1 = fmul <2 x float> %f1, %f2
153  %mul2 = fmul <2 x float> %f3, %f4
154  %mul3 = fmul <2 x float> %f5, %f6
155  %sub1 = fsub <2 x float> %mul1, %mul2
156  %sub2 = fsub <2 x float> %sub1, %mul3
157  ret <2 x float> %sub2
158}
159
160define arm_aapcs_vfpcc float @Test5(float %f1, float %f2, float %f3) {
161; CHECK:       ********** MI Scheduling **********
162; CHECK:       Test5:%bb.0
163
164; CHECK-DEFAULT: VNMLS
165; CHECK-FAST:    VFNMS
166; CHECK:       Latency            : 9
167; CHECK:       Successors:
168; CHECK:       Data
169; > VNMLS/VFNMS not-optimized latency to VMOVRS = 9
170; CHECK-SAME:  Latency=9
171
172; f1 * f2 - f3  ==>  VNMLS (VFNMS in the -fp-contract=fast run)
173  %mul = fmul float %f1, %f2
174  %sub = fsub float %mul, %f3
175  ret float %sub
176}
177
178
179define arm_aapcs_vfpcc float @Test6(float %f1, float %f2, float %f3) {
180; CHECK:       ********** MI Scheduling **********
181; CHECK:       Test6:%bb.0
182
183; CHECK-DEFAULT: VNMLA
184; CHECK-FAST:    VFNMA
185; CHECK:       Latency            : 9
186; CHECK:       Successors:
187; CHECK:       Data
188; > VNMLA/VFNMA not-optimized latency to VMOVRS = 9
189; CHECK-SAME:  Latency=9
190
191; -(f1 * f2) - f2  ==>  VNMLA (VFNMA in the -fp-contract=fast run). NOTE(review): %f3 is unused; the last fsub may have been meant to subtract %f3 -- confirm before changing.
192  %mul = fmul float %f1, %f2
193  %sub1 = fsub float -0.0, %mul
194  %sub2 = fsub float %sub1, %f2
195  ret float %sub2
196}
197