; xref: /llvm-project/llvm/test/CodeGen/X86/recip-fastmath2.ll (revision a2a0089ac3a5781ba74d4d319c87c9e8b46d4eda)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2      | FileCheck %s --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx       | FileCheck %s --check-prefixes=AVX,AVX-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma  | FileCheck %s --check-prefixes=AVX,FMA-RECIP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2      | FileCheck %s --check-prefixes=AVX,BDVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2      | FileCheck %s --check-prefixes=AVX,BTVER2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge | FileCheck %s --check-prefixes=AVX,SANDY
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell     | FileCheck %s --check-prefixes=AVX,HASWELL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefixes=AVX,HASWELL-NO-FMA
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl         | FileCheck %s --check-prefixes=AVX,AVX512,KNL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx         | FileCheck %s --check-prefixes=AVX,AVX512,SKX

; Extra test coverage for reciprocal-estimate codegen, as discussed on D26855.

; 1234.0 / x with a non-1.0 numerator and no refinement steps
; (fn attr #3 is defined outside this chunk — presumably recip with 0 steps;
; verify against the attribute list at the end of the file).
define float @f32_no_step_2(float %x) #3 {
; SSE-LABEL: f32_no_step_2:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm0
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_no_step_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    retq
  %div = fdiv fast float 1234.0, %x
  ret float %div
}

; 3456.0 / x with one Newton-Raphson refinement step
; (fn attr #1 is defined outside this chunk — presumably recip with 1 step;
; verify against the attribute list at the end of the file).
define float @f32_one_step_2(float %x) #1 {
; SSE-LABEL: f32_one_step_2:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm1 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm3
; SSE-NEXT:    mulss %xmm3, %xmm0
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_one_step_2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; AVX-RECIP-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_one_step_2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; FMA-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; FMA-RECIP-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: f32_one_step_2:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT:    vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; BDVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; BDVER2-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2
; BDVER2-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: f32_one_step_2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; BTVER2-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: f32_one_step_2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; SANDY-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: f32_one_step_2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; HASWELL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; HASWELL-NO-FMA-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: f32_one_step_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; AVX512-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; AVX512-NEXT:    retq
  %div = fdiv fast float 3456.0, %x
  ret float %div
}

; 3456.0 / x / x — two divides by the same value should share a single
; refined reciprocal estimate (fn attr #1, defined outside this chunk —
; presumably recip with 1 step).
define float @f32_one_step_2_divs(float %x) #1 {
; SSE-LABEL: f32_one_step_2_divs:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    subss %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm0 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_one_step_2_divs:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_one_step_2_divs:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; FMA-RECIP-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: f32_one_step_2_divs:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
; BDVER2-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; BDVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: f32_one_step_2_divs:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: f32_one_step_2_divs:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: f32_one_step_2_divs:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; HASWELL-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; HASWELL-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: f32_one_step_2_divs:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; AVX512-NEXT:    vfnmadd132ss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %div = fdiv fast float 3456.0, %x
  %div2 = fdiv fast float %div, %x
  ret float %div2
}

; 6789.0 / x with two Newton-Raphson refinement steps
; (fn attr #2, defined outside this chunk — presumably recip with 2 steps).
define float @f32_two_step_2(float %x) #2 {
; SSE-LABEL: f32_two_step_2:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpss %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    subss %xmm2, %xmm3
; SSE-NEXT:    mulss %xmm1, %xmm3
; SSE-NEXT:    addss %xmm1, %xmm3
; SSE-NEXT:    movss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    subss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm3, %xmm1
; SSE-NEXT:    addss %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: f32_two_step_2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; AVX-RECIP-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: f32_two_step_2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; FMA-RECIP-NEXT:    vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; FMA-RECIP-NEXT:    vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; FMA-RECIP-NEXT:    vmulss %xmm1, %xmm2, %xmm3
; FMA-RECIP-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; FMA-RECIP-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: f32_two_step_2:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BDVER2-NEXT:    vfmsubss {{.*#+}} xmm2 = (xmm0 * xmm1) - mem
; BDVER2-NEXT:    vmovss {{.*#+}} xmm4 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; BDVER2-NEXT:    vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1
; BDVER2-NEXT:    vmulss %xmm4, %xmm1, %xmm3
; BDVER2-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4
; BDVER2-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: f32_two_step_2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; BTVER2-NEXT:    vmovss {{.*#+}} xmm4 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; BTVER2-NEXT:    vmulss %xmm4, %xmm1, %xmm3
; BTVER2-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; BTVER2-NEXT:    vsubss %xmm0, %xmm4, %xmm0
; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: f32_two_step_2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; SANDY-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: f32_two_step_2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NEXT:    vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; HASWELL-NEXT:    vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; HASWELL-NEXT:    vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NEXT:    vmulss %xmm1, %xmm2, %xmm3
; HASWELL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; HASWELL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: f32_two_step_2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm3
; HASWELL-NO-FMA-NEXT:    vmulss %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: f32_two_step_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX512-NEXT:    vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; AVX512-NEXT:    vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; AVX512-NEXT:    vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm3
; AVX512-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT:    retq
  %div = fdiv fast float 6789.0, %x
  ret float %div
}

; Vector case: <1,2,3,4> / x with one refinement step (fn attr #1,
; defined outside this chunk). The non-splat numerator must stay a
; full constant-pool vector, not a broadcast.
define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step2:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT:    movaps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm0
; SSE-NEXT:    subps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_one_step2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; AVX-RECIP-NEXT:    vmulps %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_one_step2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; FMA-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v4f32_one_step2:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
; BDVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BDVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; BDVER2-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2
; BDVER2-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v4f32_one_step2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; BTVER2-NEXT:    vmulps %xmm3, %xmm0, %xmm0
; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v4f32_one_step2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm0, %xmm1
; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; SANDY-NEXT:    vmulps %xmm3, %xmm0, %xmm0
; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v4f32_one_step2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; HASWELL-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; HASWELL-NO-FMA-NEXT:    vmulps %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v4f32_one_step2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpps %xmm0, %xmm1
; AVX512-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; AVX512-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; AVX512-NEXT:    retq
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  ret <4 x float> %div
}

; Vector case: <1,2,3,4> / x / x — both divides should reuse one refined
; reciprocal estimate (fn attr #1, defined outside this chunk). KNL and SKX
; diverge here, hence the separate check prefixes instead of AVX512.
define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_one_step_2_divs:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    subps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; FMA-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v4f32_one_step_2_divs:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
; BDVER2-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - mem
; BDVER2-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
; BDVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; BDVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v4f32_one_step_2_divs:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v4f32_one_step_2_divs:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm0, %xmm1
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; SANDY-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v4f32_one_step_2_divs:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; HASWELL-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; HASWELL-NEXT:    vmulps %xmm2, %xmm0, %xmm0
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; KNL-LABEL: v4f32_one_step_2_divs:
; KNL:       # %bb.0:
; KNL-NEXT:    vrcpps %xmm0, %xmm1
; KNL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; KNL-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm1 * xmm0) - xmm2
; KNL-NEXT:    vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; KNL-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; KNL-NEXT:    vmulps %xmm2, %xmm0, %xmm0
; KNL-NEXT:    retq
;
; SKX-LABEL: v4f32_one_step_2_divs:
; SKX:       # %bb.0:
; SKX-NEXT:    vrcpps %xmm0, %xmm1
; SKX-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - mem
; SKX-NEXT:    vfnmadd132ps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
; SKX-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; SKX-NEXT:    retq
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  %div2 = fdiv fast <4 x float> %div, %x
  ret <4 x float> %div2
}

; Vector case: <1,2,3,4> / x with two refinement steps (fn attr #2,
; defined outside this chunk — presumably recip with 2 steps).
define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
; SSE-LABEL: v4f32_two_step2:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    subps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm3
; SSE-NEXT:    addps %xmm1, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    subps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    addps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v4f32_two_step2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX-RECIP-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; AVX-RECIP-NEXT:    vmulps %xmm3, %xmm0, %xmm0
; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v4f32_two_step2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
; FMA-RECIP-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; FMA-RECIP-NEXT:    vmulps %xmm1, %xmm2, %xmm3
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v4f32_two_step2:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %xmm0, %xmm1
; BDVER2-NEXT:    vfmsubps {{.*#+}} xmm2 = (xmm0 * xmm1) - mem
; BDVER2-NEXT:    vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BDVER2-NEXT:    vfnmaddps {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1
; BDVER2-NEXT:    vmulps %xmm4, %xmm1, %xmm3
; BDVER2-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4
; BDVER2-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v4f32_two_step2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
; BTVER2-NEXT:    vmovaps {{.*#+}} xmm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; BTVER2-NEXT:    vmulps %xmm4, %xmm1, %xmm3
; BTVER2-NEXT:    vmulps %xmm3, %xmm0, %xmm0
; BTVER2-NEXT:    vsubps %xmm0, %xmm4, %xmm0
; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; BTVER2-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v4f32_two_step2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %xmm0, %xmm1
; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; SANDY-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; SANDY-NEXT:    vmulps %xmm3, %xmm0, %xmm0
; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; SANDY-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v4f32_two_step2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; HASWELL-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; HASWELL-NEXT:    vmulps %xmm1, %xmm2, %xmm3
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm3
; HASWELL-NO-FMA-NEXT:    vmulps %xmm3, %xmm0, %xmm0
; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm3, %xmm0
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v4f32_two_step2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcpps %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vfmsub231ps {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
; AVX512-NEXT:    vfnmadd132ps {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
; AVX512-NEXT:    vmovaps {{.*#+}} xmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm3
; AVX512-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
; AVX512-NEXT:    retq
  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
  ret <4 x float> %div
}

; Checks codegen for <1..8> / %x (fdiv fast) under fn attribute #1.
; Attribute #1 is defined outside this chunk -- presumably it requests a
; reciprocal estimate with ONE refinement step, matching the "one_step"
; name; confirm against the attribute list at the bottom of the file.
; Expected pattern per target: rcpps/vrcpps estimate e, then one
; refinement round (mul/sub/mul/add, or fused fmsub/fnmadd on FMA
; targets). SSE has no 256-bit ops, so the v8f32 op is split into two
; 128-bit halves with separate constant pools [1..4] and [5..8].
708define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
709; SSE-LABEL: v8f32_one_step2:
710; SSE:       # %bb.0:
711; SSE-NEXT:    rcpps %xmm0, %xmm3
712; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
713; SSE-NEXT:    movaps %xmm3, %xmm4
714; SSE-NEXT:    mulps %xmm2, %xmm4
715; SSE-NEXT:    mulps %xmm4, %xmm0
716; SSE-NEXT:    subps %xmm0, %xmm2
717; SSE-NEXT:    mulps %xmm3, %xmm2
718; SSE-NEXT:    addps %xmm4, %xmm2
719; SSE-NEXT:    rcpps %xmm1, %xmm0
720; SSE-NEXT:    movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
721; SSE-NEXT:    movaps %xmm0, %xmm4
722; SSE-NEXT:    mulps %xmm3, %xmm4
723; SSE-NEXT:    mulps %xmm4, %xmm1
724; SSE-NEXT:    subps %xmm1, %xmm3
725; SSE-NEXT:    mulps %xmm0, %xmm3
726; SSE-NEXT:    addps %xmm4, %xmm3
727; SSE-NEXT:    movaps %xmm2, %xmm0
728; SSE-NEXT:    movaps %xmm3, %xmm1
729; SSE-NEXT:    retq
730;
731; AVX-RECIP-LABEL: v8f32_one_step2:
732; AVX-RECIP:       # %bb.0:
733; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
734; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
735; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
736; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm0, %ymm0
737; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
738; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
739; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm3, %ymm0
740; AVX-RECIP-NEXT:    retq
741;
742; FMA-RECIP-LABEL: v8f32_one_step2:
743; FMA-RECIP:       # %bb.0:
744; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
745; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
746; FMA-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
747; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
748; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
749; FMA-RECIP-NEXT:    retq
750;
751; BDVER2-LABEL: v8f32_one_step2:
752; BDVER2:       # %bb.0:
753; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
754; BDVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
755; BDVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3
756; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm2
757; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
758; BDVER2-NEXT:    retq
759;
760; BTVER2-LABEL: v8f32_one_step2:
761; BTVER2:       # %bb.0:
762; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
763; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
764; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3
765; BTVER2-NEXT:    vmulps %ymm3, %ymm0, %ymm0
766; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
767; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
768; BTVER2-NEXT:    vaddps %ymm0, %ymm3, %ymm0
769; BTVER2-NEXT:    retq
770;
771; SANDY-LABEL: v8f32_one_step2:
772; SANDY:       # %bb.0:
773; SANDY-NEXT:    vrcpps %ymm0, %ymm1
774; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
775; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3
776; SANDY-NEXT:    vmulps %ymm3, %ymm0, %ymm0
777; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
778; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
779; SANDY-NEXT:    vaddps %ymm0, %ymm3, %ymm0
780; SANDY-NEXT:    retq
781;
782; HASWELL-LABEL: v8f32_one_step2:
783; HASWELL:       # %bb.0:
784; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
785; HASWELL-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
786; HASWELL-NEXT:    vmulps %ymm2, %ymm1, %ymm3
787; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
788; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
789; HASWELL-NEXT:    retq
790;
791; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
792; HASWELL-NO-FMA:       # %bb.0:
793; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
794; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
795; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3
796; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm0, %ymm0
797; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
798; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
799; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm3, %ymm0
800; HASWELL-NO-FMA-NEXT:    retq
801;
802; AVX512-LABEL: v8f32_one_step2:
803; AVX512:       # %bb.0:
804; AVX512-NEXT:    vrcpps %ymm0, %ymm1
805; AVX512-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
806; AVX512-NEXT:    vmulps %ymm2, %ymm1, %ymm3
807; AVX512-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm2
808; AVX512-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
809; AVX512-NEXT:    retq
; IR under test: non-splat constant numerator (the "2" suffix cases),
; so the numerator stays a constant-pool vector rather than folding away.
810  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
811  ret <8 x float> %div
812}
813
; Checks codegen for two dependent fast divides: %div = C / %x followed
; by %div2 = %div / %x, under fn attribute #1 (defined outside this
; chunk -- presumably one refinement step; confirm). The recip estimate
; of %x is computed once (rcpps + one refinement round), then reused:
; the final result is (C * e) * e, visible as the trailing pair of
; vmulps (one against the constant pool, one squaring the estimate).
814define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
815; SSE-LABEL: v8f32_one_step_2_divs:
816; SSE:       # %bb.0:
817; SSE-NEXT:    rcpps %xmm0, %xmm2
818; SSE-NEXT:    mulps %xmm2, %xmm0
819; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
820; SSE-NEXT:    movaps %xmm3, %xmm4
821; SSE-NEXT:    subps %xmm0, %xmm4
822; SSE-NEXT:    mulps %xmm2, %xmm4
823; SSE-NEXT:    addps %xmm2, %xmm4
824; SSE-NEXT:    rcpps %xmm1, %xmm0
825; SSE-NEXT:    mulps %xmm0, %xmm1
826; SSE-NEXT:    subps %xmm1, %xmm3
827; SSE-NEXT:    mulps %xmm0, %xmm3
828; SSE-NEXT:    addps %xmm0, %xmm3
829; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
830; SSE-NEXT:    mulps %xmm3, %xmm1
831; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
832; SSE-NEXT:    mulps %xmm4, %xmm0
833; SSE-NEXT:    mulps %xmm4, %xmm0
834; SSE-NEXT:    mulps %xmm3, %xmm1
835; SSE-NEXT:    retq
836;
837; AVX-RECIP-LABEL: v8f32_one_step_2_divs:
838; AVX-RECIP:       # %bb.0:
839; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
840; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
841; AVX-RECIP-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
842; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
843; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
844; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
845; AVX-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
846; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
847; AVX-RECIP-NEXT:    retq
848;
849; FMA-RECIP-LABEL: v8f32_one_step_2_divs:
850; FMA-RECIP:       # %bb.0:
851; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
852; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem
853; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
854; FMA-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
855; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
856; FMA-RECIP-NEXT:    retq
857;
858; BDVER2-LABEL: v8f32_one_step_2_divs:
859; BDVER2:       # %bb.0:
860; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
861; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - mem
862; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm1
863; BDVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
864; BDVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
865; BDVER2-NEXT:    retq
866;
867; BTVER2-LABEL: v8f32_one_step_2_divs:
868; BTVER2:       # %bb.0:
869; BTVER2-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
870; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
871; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
872; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
873; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
874; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
875; BTVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
876; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
877; BTVER2-NEXT:    retq
878;
879; SANDY-LABEL: v8f32_one_step_2_divs:
880; SANDY:       # %bb.0:
881; SANDY-NEXT:    vrcpps %ymm0, %ymm1
882; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
883; SANDY-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
884; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
885; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
886; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
887; SANDY-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
888; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
889; SANDY-NEXT:    retq
890;
891; HASWELL-LABEL: v8f32_one_step_2_divs:
892; HASWELL:       # %bb.0:
893; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
894; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
895; HASWELL-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
896; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
897; HASWELL-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
898; HASWELL-NEXT:    vmulps %ymm2, %ymm0, %ymm0
899; HASWELL-NEXT:    retq
900;
901; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
902; HASWELL-NO-FMA:       # %bb.0:
903; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
904; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
905; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
906; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
907; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
908; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
909; HASWELL-NO-FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
910; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
911; HASWELL-NO-FMA-NEXT:    retq
912;
913; KNL-LABEL: v8f32_one_step_2_divs:
914; KNL:       # %bb.0:
915; KNL-NEXT:    vrcpps %ymm0, %ymm1
916; KNL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
917; KNL-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm1 * ymm0) - ymm2
918; KNL-NEXT:    vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
919; KNL-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
920; KNL-NEXT:    vmulps %ymm2, %ymm0, %ymm0
921; KNL-NEXT:    retq
922;
923; SKX-LABEL: v8f32_one_step_2_divs:
924; SKX:       # %bb.0:
925; SKX-NEXT:    vrcpps %ymm0, %ymm1
926; SKX-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - mem
927; SKX-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm1
928; SKX-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
929; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
930; SKX-NEXT:    retq
; IR under test: chained divides by the same %x; note KNL and SKX diverge
; here (memory-folded FMA on SKX), hence separate prefixes instead of AVX512.
931  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
932  %div2 = fdiv fast <8 x float> %div, %x
933  ret <8 x float> %div2
934}
935
; Checks codegen for <1..8> / %x under fn attribute #2 (defined outside
; this chunk -- presumably requests TWO refinement steps, matching the
; "two_step" name; confirm). Expected pattern: rcpps estimate, a first
; refinement against the all-ones splat, then a second refinement folded
; with the constant numerator [1..8]. Non-FMA targets show two full
; mul/sub/mul/add rounds; FMA targets use fmsub/fnmadd pairs.
936define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
937; SSE-LABEL: v8f32_two_step2:
938; SSE:       # %bb.0:
939; SSE-NEXT:    rcpps %xmm0, %xmm2
940; SSE-NEXT:    movaps %xmm0, %xmm3
941; SSE-NEXT:    mulps %xmm2, %xmm3
942; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
943; SSE-NEXT:    movaps %xmm4, %xmm5
944; SSE-NEXT:    subps %xmm3, %xmm5
945; SSE-NEXT:    mulps %xmm2, %xmm5
946; SSE-NEXT:    addps %xmm2, %xmm5
947; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
948; SSE-NEXT:    movaps %xmm5, %xmm3
949; SSE-NEXT:    mulps %xmm2, %xmm3
950; SSE-NEXT:    mulps %xmm3, %xmm0
951; SSE-NEXT:    subps %xmm0, %xmm2
952; SSE-NEXT:    mulps %xmm5, %xmm2
953; SSE-NEXT:    addps %xmm3, %xmm2
954; SSE-NEXT:    rcpps %xmm1, %xmm0
955; SSE-NEXT:    movaps %xmm1, %xmm3
956; SSE-NEXT:    mulps %xmm0, %xmm3
957; SSE-NEXT:    subps %xmm3, %xmm4
958; SSE-NEXT:    mulps %xmm0, %xmm4
959; SSE-NEXT:    addps %xmm0, %xmm4
960; SSE-NEXT:    movaps {{.*#+}} xmm3 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
961; SSE-NEXT:    movaps %xmm4, %xmm0
962; SSE-NEXT:    mulps %xmm3, %xmm0
963; SSE-NEXT:    mulps %xmm0, %xmm1
964; SSE-NEXT:    subps %xmm1, %xmm3
965; SSE-NEXT:    mulps %xmm4, %xmm3
966; SSE-NEXT:    addps %xmm0, %xmm3
967; SSE-NEXT:    movaps %xmm2, %xmm0
968; SSE-NEXT:    movaps %xmm3, %xmm1
969; SSE-NEXT:    retq
970;
971; AVX-RECIP-LABEL: v8f32_two_step2:
972; AVX-RECIP:       # %bb.0:
973; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
974; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
975; AVX-RECIP-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
976; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
977; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
978; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
979; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
980; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
981; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm0, %ymm0
982; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
983; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
984; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm3, %ymm0
985; AVX-RECIP-NEXT:    retq
986;
987; FMA-RECIP-LABEL: v8f32_two_step2:
988; FMA-RECIP:       # %bb.0:
989; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
990; FMA-RECIP-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
991; FMA-RECIP-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
992; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
993; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
994; FMA-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm3
995; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1
996; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
997; FMA-RECIP-NEXT:    retq
998;
999; BDVER2-LABEL: v8f32_two_step2:
1000; BDVER2:       # %bb.0:
1001; BDVER2-NEXT:    vrcpps %ymm0, %ymm1
1002; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm2 = (ymm0 * ymm1) - mem
1003; BDVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1004; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm1
1005; BDVER2-NEXT:    vmulps %ymm4, %ymm1, %ymm3
1006; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm3) - ymm4
1007; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm3
1008; BDVER2-NEXT:    retq
1009;
1010; BTVER2-LABEL: v8f32_two_step2:
1011; BTVER2:       # %bb.0:
1012; BTVER2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1013; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
1014; BTVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1015; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2
1016; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2
1017; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
1018; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
1019; BTVER2-NEXT:    vmulps %ymm4, %ymm1, %ymm3
1020; BTVER2-NEXT:    vmulps %ymm3, %ymm0, %ymm0
1021; BTVER2-NEXT:    vsubps %ymm0, %ymm4, %ymm0
1022; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
1023; BTVER2-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1024; BTVER2-NEXT:    retq
1025;
1026; SANDY-LABEL: v8f32_two_step2:
1027; SANDY:       # %bb.0:
1028; SANDY-NEXT:    vrcpps %ymm0, %ymm1
1029; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2
1030; SANDY-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1031; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2
1032; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2
1033; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1
1034; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1035; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3
1036; SANDY-NEXT:    vmulps %ymm3, %ymm0, %ymm0
1037; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
1038; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
1039; SANDY-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1040; SANDY-NEXT:    retq
1041;
1042; HASWELL-LABEL: v8f32_two_step2:
1043; HASWELL:       # %bb.0:
1044; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
1045; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1046; HASWELL-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
1047; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
1048; HASWELL-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1049; HASWELL-NEXT:    vmulps %ymm1, %ymm2, %ymm3
1050; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1
1051; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
1052; HASWELL-NEXT:    retq
1053;
1054; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
1055; HASWELL-NO-FMA:       # %bb.0:
1056; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
1057; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
1058; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1059; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
1060; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
1061; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
1062; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1063; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm3
1064; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm0, %ymm0
1065; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
1066; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
1067; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm3, %ymm0
1068; HASWELL-NO-FMA-NEXT:    retq
1069;
1070; AVX512-LABEL: v8f32_two_step2:
1071; AVX512:       # %bb.0:
1072; AVX512-NEXT:    vrcpps %ymm0, %ymm1
1073; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1074; AVX512-NEXT:    vfmsub231ps {{.*#+}} ymm2 = (ymm0 * ymm1) - ymm2
1075; AVX512-NEXT:    vfnmadd132ps {{.*#+}} ymm2 = -(ymm2 * ymm1) + ymm1
1076; AVX512-NEXT:    vmovaps {{.*#+}} ymm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1077; AVX512-NEXT:    vmulps %ymm1, %ymm2, %ymm3
1078; AVX512-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm3 * ymm0) - ymm1
1079; AVX512-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm3
1080; AVX512-NEXT:    retq
; IR under test: same non-splat numerator as the one-step variant, only
; the function attribute (#2 vs #1) differs.
1081  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
1082  ret <8 x float> %div
1083}
1084
; Checks codegen for the all-ones numerator 1.0 / %x under fn attribute
; #3 (defined outside this chunk -- presumably disables refinement
; steps, matching f32_no_step_2 earlier in the file; confirm). With a
; splat-of-1.0 numerator the result is the raw estimate: a bare
; rcpps/vrcpps and no multiply at all.
1085define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
1086; SSE-LABEL: v8f32_no_step:
1087; SSE:       # %bb.0:
1088; SSE-NEXT:    rcpps %xmm0, %xmm0
1089; SSE-NEXT:    rcpps %xmm1, %xmm1
1090; SSE-NEXT:    retq
1091;
1092; AVX-LABEL: v8f32_no_step:
1093; AVX:       # %bb.0:
1094; AVX-NEXT:    vrcpps %ymm0, %ymm0
1095; AVX-NEXT:    retq
1096  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
1097  ret <8 x float> %div
1098}
1099
; Same as v8f32_no_step but with a non-splat numerator <1..8>: the
; estimate is still unrefined (attribute #3, defined outside this chunk
; -- presumably "no refinement steps"; confirm), so the expected asm is
; rcpps followed by a single multiply with the constant pool. SSE splits
; the 256-bit op into two 128-bit halves.
1100define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
1101; SSE-LABEL: v8f32_no_step2:
1102; SSE:       # %bb.0:
1103; SSE-NEXT:    rcpps %xmm0, %xmm0
1104; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1105; SSE-NEXT:    rcpps %xmm1, %xmm1
1106; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1107; SSE-NEXT:    retq
1108;
1109; AVX-LABEL: v8f32_no_step2:
1110; AVX:       # %bb.0:
1111; AVX-NEXT:    vrcpps %ymm0, %ymm0
1112; AVX-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1113; AVX-NEXT:    retq
1114  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
1115  ret <8 x float> %div
1116}
1117
; Checks codegen for the 512-bit case <1..16> / %x under fn attribute #1
; (defined outside this chunk -- presumably one refinement step;
; confirm). Legalization differs by target width: SSE splits into four
; 128-bit halves, AVX/FMA targets into two 256-bit halves (each with its
; own constant pool), while AVX512 keeps a single zmm op and uses the
; AVX-512 vrcp14ps estimate instead of vrcpps.
1118define <16 x float> @v16f32_one_step2(<16 x float> %x) #1 {
1119; SSE-LABEL: v16f32_one_step2:
1120; SSE:       # %bb.0:
1121; SSE-NEXT:    movaps %xmm1, %xmm4
1122; SSE-NEXT:    movaps %xmm0, %xmm1
1123; SSE-NEXT:    rcpps %xmm0, %xmm5
1124; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
1125; SSE-NEXT:    movaps %xmm5, %xmm6
1126; SSE-NEXT:    mulps %xmm0, %xmm6
1127; SSE-NEXT:    mulps %xmm6, %xmm1
1128; SSE-NEXT:    subps %xmm1, %xmm0
1129; SSE-NEXT:    mulps %xmm5, %xmm0
1130; SSE-NEXT:    addps %xmm6, %xmm0
1131; SSE-NEXT:    rcpps %xmm4, %xmm5
1132; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1133; SSE-NEXT:    movaps %xmm5, %xmm6
1134; SSE-NEXT:    mulps %xmm1, %xmm6
1135; SSE-NEXT:    mulps %xmm6, %xmm4
1136; SSE-NEXT:    subps %xmm4, %xmm1
1137; SSE-NEXT:    mulps %xmm5, %xmm1
1138; SSE-NEXT:    addps %xmm6, %xmm1
1139; SSE-NEXT:    rcpps %xmm2, %xmm5
1140; SSE-NEXT:    movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
1141; SSE-NEXT:    movaps %xmm5, %xmm6
1142; SSE-NEXT:    mulps %xmm4, %xmm6
1143; SSE-NEXT:    mulps %xmm6, %xmm2
1144; SSE-NEXT:    subps %xmm2, %xmm4
1145; SSE-NEXT:    mulps %xmm5, %xmm4
1146; SSE-NEXT:    addps %xmm6, %xmm4
1147; SSE-NEXT:    rcpps %xmm3, %xmm2
1148; SSE-NEXT:    movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1149; SSE-NEXT:    movaps %xmm2, %xmm6
1150; SSE-NEXT:    mulps %xmm5, %xmm6
1151; SSE-NEXT:    mulps %xmm6, %xmm3
1152; SSE-NEXT:    subps %xmm3, %xmm5
1153; SSE-NEXT:    mulps %xmm2, %xmm5
1154; SSE-NEXT:    addps %xmm6, %xmm5
1155; SSE-NEXT:    movaps %xmm4, %xmm2
1156; SSE-NEXT:    movaps %xmm5, %xmm3
1157; SSE-NEXT:    retq
1158;
1159; AVX-RECIP-LABEL: v16f32_one_step2:
1160; AVX-RECIP:       # %bb.0:
1161; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1162; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1163; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1164; AVX-RECIP-NEXT:    vmulps %ymm4, %ymm0, %ymm0
1165; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1166; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1167; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm4, %ymm0
1168; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1169; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1170; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1171; AVX-RECIP-NEXT:    vmulps %ymm4, %ymm1, %ymm1
1172; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1173; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1174; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm4, %ymm1
1175; AVX-RECIP-NEXT:    retq
1176;
1177; FMA-RECIP-LABEL: v16f32_one_step2:
1178; FMA-RECIP:       # %bb.0:
1179; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1180; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1181; FMA-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1182; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3
1183; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
1184; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1185; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1186; FMA-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1187; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
1188; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
1189; FMA-RECIP-NEXT:    retq
1190;
1191; BDVER2-LABEL: v16f32_one_step2:
1192; BDVER2:       # %bb.0:
1193; BDVER2-NEXT:    vrcpps %ymm0, %ymm2
1194; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1195; BDVER2-NEXT:    vrcpps %ymm1, %ymm5
1196; BDVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1197; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm3
1198; BDVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1199; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
1200; BDVER2-NEXT:    vmulps %ymm3, %ymm5, %ymm4
1201; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm3
1202; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm1 = -(ymm5 * ymm1) + ymm4
1203; BDVER2-NEXT:    retq
1204;
1205; BTVER2-LABEL: v16f32_one_step2:
1206; BTVER2:       # %bb.0:
1207; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1208; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
1209; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1210; BTVER2-NEXT:    vmulps %ymm4, %ymm0, %ymm0
1211; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1212; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1213; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1214; BTVER2-NEXT:    vrcpps %ymm1, %ymm2
1215; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm5
1216; BTVER2-NEXT:    vaddps %ymm0, %ymm4, %ymm0
1217; BTVER2-NEXT:    vmulps %ymm5, %ymm1, %ymm1
1218; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1219; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1220; BTVER2-NEXT:    vaddps %ymm1, %ymm5, %ymm1
1221; BTVER2-NEXT:    retq
1222;
1223; SANDY-LABEL: v16f32_one_step2:
1224; SANDY:       # %bb.0:
1225; SANDY-NEXT:    vrcpps %ymm0, %ymm2
1226; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1227; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1228; SANDY-NEXT:    vmulps %ymm4, %ymm0, %ymm0
1229; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1230; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1231; SANDY-NEXT:    vaddps %ymm0, %ymm4, %ymm0
1232; SANDY-NEXT:    vrcpps %ymm1, %ymm2
1233; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1234; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1235; SANDY-NEXT:    vmulps %ymm4, %ymm1, %ymm1
1236; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1237; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1238; SANDY-NEXT:    vaddps %ymm1, %ymm4, %ymm1
1239; SANDY-NEXT:    retq
1240;
1241; HASWELL-LABEL: v16f32_one_step2:
1242; HASWELL:       # %bb.0:
1243; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
1244; HASWELL-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1245; HASWELL-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1246; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm3
1247; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
1248; HASWELL-NEXT:    vrcpps %ymm1, %ymm2
1249; HASWELL-NEXT:    vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1250; HASWELL-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1251; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm3
1252; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
1253; HASWELL-NEXT:    retq
1254;
1255; HASWELL-NO-FMA-LABEL: v16f32_one_step2:
1256; HASWELL-NO-FMA:       # %bb.0:
1257; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
1258; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1259; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm4
1260; HASWELL-NO-FMA-NEXT:    vmulps %ymm4, %ymm0, %ymm0
1261; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1262; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm3
1263; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1264; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm4, %ymm0
1265; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1266; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm3, %ymm4
1267; HASWELL-NO-FMA-NEXT:    vmulps %ymm4, %ymm1, %ymm1
1268; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm2, %ymm1
1269; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm3, %ymm1
1270; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm4, %ymm1
1271; HASWELL-NO-FMA-NEXT:    retq
1272;
1273; AVX512-LABEL: v16f32_one_step2:
1274; AVX512:       # %bb.0:
1275; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
1276; AVX512-NEXT:    vmovaps {{.*#+}} zmm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1277; AVX512-NEXT:    vmulps %zmm2, %zmm1, %zmm3
1278; AVX512-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm2
1279; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm3
1280; AVX512-NEXT:    retq
; IR under test: full 512-bit vector with a non-splat <1..16> numerator.
1281  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1282  ret <16 x float> %div
1283}
1284
1285define <16 x float> @v16f32_one_step_2_divs(<16 x float> %x) #1 {
1286; SSE-LABEL: v16f32_one_step_2_divs:
1287; SSE:       # %bb.0:
1288; SSE-NEXT:    rcpps %xmm0, %xmm6
1289; SSE-NEXT:    mulps %xmm6, %xmm0
1290; SSE-NEXT:    movaps {{.*#+}} xmm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1291; SSE-NEXT:    movaps %xmm4, %xmm5
1292; SSE-NEXT:    subps %xmm0, %xmm5
1293; SSE-NEXT:    mulps %xmm6, %xmm5
1294; SSE-NEXT:    addps %xmm6, %xmm5
1295; SSE-NEXT:    rcpps %xmm1, %xmm0
1296; SSE-NEXT:    mulps %xmm0, %xmm1
1297; SSE-NEXT:    movaps %xmm4, %xmm6
1298; SSE-NEXT:    subps %xmm1, %xmm6
1299; SSE-NEXT:    mulps %xmm0, %xmm6
1300; SSE-NEXT:    addps %xmm0, %xmm6
1301; SSE-NEXT:    rcpps %xmm2, %xmm0
1302; SSE-NEXT:    mulps %xmm0, %xmm2
1303; SSE-NEXT:    movaps %xmm4, %xmm7
1304; SSE-NEXT:    subps %xmm2, %xmm7
1305; SSE-NEXT:    mulps %xmm0, %xmm7
1306; SSE-NEXT:    addps %xmm0, %xmm7
1307; SSE-NEXT:    rcpps %xmm3, %xmm0
1308; SSE-NEXT:    mulps %xmm0, %xmm3
1309; SSE-NEXT:    subps %xmm3, %xmm4
1310; SSE-NEXT:    mulps %xmm0, %xmm4
1311; SSE-NEXT:    addps %xmm0, %xmm4
1312; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
1313; SSE-NEXT:    mulps %xmm4, %xmm3
1314; SSE-NEXT:    movaps {{.*#+}} xmm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
1315; SSE-NEXT:    mulps %xmm7, %xmm2
1316; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
1317; SSE-NEXT:    mulps %xmm6, %xmm1
1318; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
1319; SSE-NEXT:    mulps %xmm5, %xmm0
1320; SSE-NEXT:    mulps %xmm5, %xmm0
1321; SSE-NEXT:    mulps %xmm6, %xmm1
1322; SSE-NEXT:    mulps %xmm7, %xmm2
1323; SSE-NEXT:    mulps %xmm4, %xmm3
1324; SSE-NEXT:    retq
1325;
1326; AVX-RECIP-LABEL: v16f32_one_step_2_divs:
1327; AVX-RECIP:       # %bb.0:
1328; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1329; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1330; AVX-RECIP-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1331; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1332; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1333; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1334; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1335; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm1
1336; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1337; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1338; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm2, %ymm1
1339; AVX-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1340; AVX-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
1341; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm3, %ymm0
1342; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1343; AVX-RECIP-NEXT:    retq
1344;
1345; FMA-RECIP-LABEL: v16f32_one_step_2_divs:
1346; FMA-RECIP:       # %bb.0:
1347; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
1348; FMA-RECIP-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1349; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
1350; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
1351; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
1352; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3
1353; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
1354; FMA-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1355; FMA-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
1356; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm3, %ymm0
1357; FMA-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1358; FMA-RECIP-NEXT:    retq
1359;
1360; BDVER2-LABEL: v16f32_one_step_2_divs:
1361; BDVER2:       # %bb.0:
1362; BDVER2-NEXT:    vrcpps %ymm0, %ymm2
1363; BDVER2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1364; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm3
1365; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
1366; BDVER2-NEXT:    vrcpps %ymm1, %ymm2
1367; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm2) - ymm3
1368; BDVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
1369; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm2
1370; BDVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1371; BDVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0
1372; BDVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1373; BDVER2-NEXT:    retq
1374;
1375; BTVER2-LABEL: v16f32_one_step_2_divs:
1376; BTVER2:       # %bb.0:
1377; BTVER2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1378; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
1379; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1380; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1381; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1382; BTVER2-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1383; BTVER2-NEXT:    vrcpps %ymm1, %ymm2
1384; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm1
1385; BTVER2-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1386; BTVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
1387; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1388; BTVER2-NEXT:    vaddps %ymm1, %ymm2, %ymm1
1389; BTVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1390; BTVER2-NEXT:    vmulps %ymm0, %ymm3, %ymm0
1391; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1392; BTVER2-NEXT:    retq
1393;
1394; SANDY-LABEL: v16f32_one_step_2_divs:
1395; SANDY:       # %bb.0:
1396; SANDY-NEXT:    vrcpps %ymm0, %ymm2
1397; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1398; SANDY-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1399; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1400; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1401; SANDY-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1402; SANDY-NEXT:    vrcpps %ymm1, %ymm2
1403; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm1
1404; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1405; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1406; SANDY-NEXT:    vaddps %ymm1, %ymm2, %ymm1
1407; SANDY-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1408; SANDY-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
1409; SANDY-NEXT:    vmulps %ymm0, %ymm3, %ymm0
1410; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1411; SANDY-NEXT:    retq
1412;
1413; HASWELL-LABEL: v16f32_one_step_2_divs:
1414; HASWELL:       # %bb.0:
1415; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
1416; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1417; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm3
1418; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
1419; HASWELL-NEXT:    vrcpps %ymm1, %ymm2
1420; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm3
1421; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm1 = -(ymm1 * ymm2) + ymm2
1422; HASWELL-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1423; HASWELL-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
1424; HASWELL-NEXT:    vmulps %ymm0, %ymm3, %ymm0
1425; HASWELL-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1426; HASWELL-NEXT:    retq
1427;
1428; HASWELL-NO-FMA-LABEL: v16f32_one_step_2_divs:
1429; HASWELL-NO-FMA:       # %bb.0:
1430; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
1431; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm0
1432; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1433; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
1434; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm4
1435; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
1436; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm2, %ymm0
1437; HASWELL-NO-FMA-NEXT:    vmulps %ymm4, %ymm1, %ymm1
1438; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1
1439; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm4, %ymm1
1440; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm4, %ymm1
1441; HASWELL-NO-FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2
1442; HASWELL-NO-FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
1443; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm3, %ymm0
1444; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
1445; HASWELL-NO-FMA-NEXT:    retq
1446;
1447; AVX512-LABEL: v16f32_one_step_2_divs:
1448; AVX512:       # %bb.0:
1449; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
1450; AVX512-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - mem
1451; AVX512-NEXT:    vfnmadd132ps {{.*#+}} zmm0 = -(zmm0 * zmm1) + zmm1
1452; AVX512-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
1453; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
1454; AVX512-NEXT:    retq
1455  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
1456  %div2 = fdiv fast <16 x float> %div, %x
1457  ret <16 x float> %div2
1458}
1459
define <16 x float> @v16f32_two_step2(<16 x float> %x) #2 {
; NOTE: Tests <1..16>/x with attribute #2 ("reciprocal-estimates"="divf:2,vec-divf:2"):
; rcpps/vrcpps/vrcp14ps estimate followed by two Newton-Raphson refinement
; iterations, folded with the non-splat constant numerator.  The CHECK lines
; below are autogenerated by update_llc_test_checks.py; regenerate them with
; that script rather than editing by hand.
; SSE-LABEL: v16f32_two_step2:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm4
; SSE-NEXT:    movaps %xmm0, %xmm1
; SSE-NEXT:    rcpps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm1, %xmm5
; SSE-NEXT:    mulps %xmm0, %xmm5
; SSE-NEXT:    movaps {{.*#+}} xmm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm6, %xmm7
; SSE-NEXT:    subps %xmm5, %xmm7
; SSE-NEXT:    mulps %xmm0, %xmm7
; SSE-NEXT:    addps %xmm0, %xmm7
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0]
; SSE-NEXT:    movaps %xmm7, %xmm5
; SSE-NEXT:    mulps %xmm0, %xmm5
; SSE-NEXT:    mulps %xmm5, %xmm1
; SSE-NEXT:    subps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm7, %xmm0
; SSE-NEXT:    addps %xmm5, %xmm0
; SSE-NEXT:    rcpps %xmm4, %xmm1
; SSE-NEXT:    movaps %xmm4, %xmm5
; SSE-NEXT:    mulps %xmm1, %xmm5
; SSE-NEXT:    movaps %xmm6, %xmm7
; SSE-NEXT:    subps %xmm5, %xmm7
; SSE-NEXT:    mulps %xmm1, %xmm7
; SSE-NEXT:    addps %xmm1, %xmm7
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SSE-NEXT:    movaps %xmm7, %xmm5
; SSE-NEXT:    mulps %xmm1, %xmm5
; SSE-NEXT:    mulps %xmm5, %xmm4
; SSE-NEXT:    subps %xmm4, %xmm1
; SSE-NEXT:    mulps %xmm7, %xmm1
; SSE-NEXT:    addps %xmm5, %xmm1
; SSE-NEXT:    rcpps %xmm2, %xmm4
; SSE-NEXT:    movaps %xmm2, %xmm5
; SSE-NEXT:    mulps %xmm4, %xmm5
; SSE-NEXT:    movaps %xmm6, %xmm7
; SSE-NEXT:    subps %xmm5, %xmm7
; SSE-NEXT:    mulps %xmm4, %xmm7
; SSE-NEXT:    addps %xmm4, %xmm7
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1]
; SSE-NEXT:    movaps %xmm7, %xmm5
; SSE-NEXT:    mulps %xmm4, %xmm5
; SSE-NEXT:    mulps %xmm5, %xmm2
; SSE-NEXT:    subps %xmm2, %xmm4
; SSE-NEXT:    mulps %xmm7, %xmm4
; SSE-NEXT:    addps %xmm5, %xmm4
; SSE-NEXT:    rcpps %xmm3, %xmm2
; SSE-NEXT:    movaps %xmm3, %xmm5
; SSE-NEXT:    mulps %xmm2, %xmm5
; SSE-NEXT:    subps %xmm5, %xmm6
; SSE-NEXT:    mulps %xmm2, %xmm6
; SSE-NEXT:    addps %xmm2, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm5 = [1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; SSE-NEXT:    movaps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    subps %xmm3, %xmm5
; SSE-NEXT:    mulps %xmm6, %xmm5
; SSE-NEXT:    addps %xmm2, %xmm5
; SSE-NEXT:    movaps %xmm4, %xmm2
; SSE-NEXT:    movaps %xmm5, %xmm3
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_two_step2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; AVX-RECIP-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm5
; AVX-RECIP-NEXT:    vmulps %ymm5, %ymm0, %ymm0
; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm5, %ymm0
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; AVX-RECIP-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; AVX-RECIP-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; AVX-RECIP-NEXT:    vmulps %ymm3, %ymm2, %ymm4
; AVX-RECIP-NEXT:    vmulps %ymm4, %ymm1, %ymm1
; AVX-RECIP-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; AVX-RECIP-NEXT:    vaddps %ymm1, %ymm4, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_two_step2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm2
; FMA-RECIP-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; FMA-RECIP-NEXT:    vmovaps %ymm2, %ymm4
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; FMA-RECIP-NEXT:    vmulps %ymm2, %ymm4, %ymm5
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm5 * ymm0) - ymm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm5
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm2
; FMA-RECIP-NEXT:    vfmsub231ps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3
; FMA-RECIP-NEXT:    vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm2) + ymm2
; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; FMA-RECIP-NEXT:    vmulps %ymm2, %ymm3, %ymm4
; FMA-RECIP-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm2
; FMA-RECIP-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm4
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v16f32_two_step2:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %ymm0, %ymm2
; BDVER2-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm4 = (ymm0 * ymm2) - ymm3
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm4) + ymm2
; BDVER2-NEXT:    vmovaps {{.*#+}} ymm4 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BDVER2-NEXT:    vmulps %ymm4, %ymm2, %ymm5
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm5) - ymm4
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm5
; BDVER2-NEXT:    vrcpps %ymm1, %ymm2
; BDVER2-NEXT:    vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm3 = (ymm1 * ymm2) - ymm3
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm2 = -(ymm2 * ymm3) + ymm2
; BDVER2-NEXT:    vmulps %ymm5, %ymm2, %ymm4
; BDVER2-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm4) - ymm5
; BDVER2-NEXT:    vfnmaddps {{.*#+}} ymm1 = -(ymm2 * ymm1) + ymm4
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v16f32_two_step2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; BTVER2-NEXT:    vrcpps %ymm0, %ymm2
; BTVER2-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm5
; BTVER2-NEXT:    vmulps %ymm5, %ymm0, %ymm0
; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; BTVER2-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; BTVER2-NEXT:    vrcpps %ymm1, %ymm2
; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; BTVER2-NEXT:    vaddps %ymm0, %ymm5, %ymm0
; BTVER2-NEXT:    vmovaps {{.*#+}} ymm5 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; BTVER2-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; BTVER2-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; BTVER2-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; BTVER2-NEXT:    vmulps %ymm5, %ymm2, %ymm4
; BTVER2-NEXT:    vmulps %ymm4, %ymm1, %ymm1
; BTVER2-NEXT:    vsubps %ymm1, %ymm5, %ymm1
; BTVER2-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; BTVER2-NEXT:    vaddps %ymm1, %ymm4, %ymm1
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v16f32_two_step2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm2
; SANDY-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; SANDY-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm5
; SANDY-NEXT:    vmulps %ymm5, %ymm0, %ymm0
; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; SANDY-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; SANDY-NEXT:    vaddps %ymm0, %ymm5, %ymm0
; SANDY-NEXT:    vrcpps %ymm1, %ymm2
; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm3
; SANDY-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; SANDY-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; SANDY-NEXT:    vmulps %ymm3, %ymm2, %ymm4
; SANDY-NEXT:    vmulps %ymm4, %ymm1, %ymm1
; SANDY-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; SANDY-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; SANDY-NEXT:    vaddps %ymm1, %ymm4, %ymm1
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v16f32_two_step2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm2
; HASWELL-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NEXT:    vmovaps %ymm2, %ymm4
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm4 = (ymm0 * ymm4) - ymm3
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm4 = -(ymm4 * ymm2) + ymm2
; HASWELL-NEXT:    vmovaps {{.*#+}} ymm2 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NEXT:    vmulps %ymm2, %ymm4, %ymm5
; HASWELL-NEXT:    vrcpps %ymm1, %ymm6
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm5 * ymm0) - ymm2
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm4 * ymm0) + ymm5
; HASWELL-NEXT:    vfmsub231ps {{.*#+}} ymm3 = (ymm1 * ymm6) - ymm3
; HASWELL-NEXT:    vfnmadd132ps {{.*#+}} ymm3 = -(ymm3 * ymm6) + ymm6
; HASWELL-NEXT:    vmovaps {{.*#+}} ymm2 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; HASWELL-NEXT:    vmulps %ymm2, %ymm3, %ymm4
; HASWELL-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm4 * ymm1) - ymm2
; HASWELL-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm4
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v16f32_two_step2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm0, %ymm3
; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; HASWELL-NO-FMA-NEXT:    vsubps %ymm3, %ymm4, %ymm3
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm3
; HASWELL-NO-FMA-NEXT:    vaddps %ymm3, %ymm2, %ymm2
; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} ymm3 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm5
; HASWELL-NO-FMA-NEXT:    vmulps %ymm5, %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm3
; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm2, %ymm0
; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm5, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm1, %ymm2
; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm4, %ymm2
; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm3, %ymm2
; HASWELL-NO-FMA-NEXT:    vmovaps {{.*#+}} ymm3 = [9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; HASWELL-NO-FMA-NEXT:    vmulps %ymm3, %ymm2, %ymm4
; HASWELL-NO-FMA-NEXT:    vmulps %ymm4, %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    vsubps %ymm1, %ymm3, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm2, %ymm1
; HASWELL-NO-FMA-NEXT:    vaddps %ymm1, %ymm4, %ymm1
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v16f32_two_step2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcp14ps %zmm0, %zmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vfmsub231ps {{.*#+}} zmm2 = (zmm0 * zmm1) - zmm2
; AVX512-NEXT:    vfnmadd132ps {{.*#+}} zmm2 = -(zmm2 * zmm1) + zmm1
; AVX512-NEXT:    vmovaps {{.*#+}} zmm1 = [1.0E+0,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0,8.0E+0,9.0E+0,1.0E+1,1.1E+1,1.2E+1,1.3E+1,1.4E+1,1.5E+1,1.6E+1]
; AVX512-NEXT:    vmulps %zmm1, %zmm2, %zmm3
; AVX512-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm3 * zmm0) - zmm1
; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm2 * zmm0) + zmm3
; AVX512-NEXT:    retq
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}
1706
define <16 x float> @v16f32_no_step(<16 x float> %x) #3 {
; NOTE: Tests a splat-1.0 numerator with attribute #3
; ("reciprocal-estimates"="divf:0,vec-divf:0"): zero refinement steps, so the
; division lowers to bare rcpps/vrcpps/vrcp14ps estimates with no
; Newton-Raphson iteration and no extra multiply.  CHECK lines are
; autogenerated by update_llc_test_checks.py; regenerate rather than
; hand-editing.
; SSE-LABEL: v16f32_no_step:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm0
; SSE-NEXT:    rcpps %xmm1, %xmm1
; SSE-NEXT:    rcpps %xmm2, %xmm2
; SSE-NEXT:    rcpps %xmm3, %xmm3
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_no_step:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_no_step:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v16f32_no_step:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %ymm0, %ymm0
; BDVER2-NEXT:    vrcpps %ymm1, %ymm1
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v16f32_no_step:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vrcpps %ymm0, %ymm0
; BTVER2-NEXT:    vrcpps %ymm1, %ymm1
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v16f32_no_step:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm0
; SANDY-NEXT:    vrcpps %ymm1, %ymm1
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v16f32_no_step:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm0
; HASWELL-NEXT:    vrcpps %ymm1, %ymm1
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v16f32_no_step:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_step:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcp14ps %zmm0, %zmm0
; AVX512-NEXT:    retq
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
  ret <16 x float> %div
}
1765
define <16 x float> @v16f32_no_step2(<16 x float> %x) #3 {
; NOTE: Same as v16f32_no_step but with a non-splat <1..16> numerator and
; attribute #3 ("reciprocal-estimates"="divf:0,vec-divf:0"): the estimate is
; not refined, and the constant numerator becomes a single multiply by a
; constant-pool vector (the LCPI load).  CHECK lines are autogenerated by
; update_llc_test_checks.py; regenerate rather than hand-editing.
; SSE-LABEL: v16f32_no_step2:
; SSE:       # %bb.0:
; SSE-NEXT:    rcpps %xmm0, %xmm0
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    rcpps %xmm1, %xmm1
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    rcpps %xmm2, %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    rcpps %xmm3, %xmm3
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT:    retq
;
; AVX-RECIP-LABEL: v16f32_no_step2:
; AVX-RECIP:       # %bb.0:
; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; AVX-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX-RECIP-NEXT:    vrcpps %ymm1, %ymm1
; AVX-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX-RECIP-NEXT:    retq
;
; FMA-RECIP-LABEL: v16f32_no_step2:
; FMA-RECIP:       # %bb.0:
; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
; FMA-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; FMA-RECIP-NEXT:    vrcpps %ymm1, %ymm1
; FMA-RECIP-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; FMA-RECIP-NEXT:    retq
;
; BDVER2-LABEL: v16f32_no_step2:
; BDVER2:       # %bb.0:
; BDVER2-NEXT:    vrcpps %ymm0, %ymm0
; BDVER2-NEXT:    vrcpps %ymm1, %ymm1
; BDVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; BDVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; BDVER2-NEXT:    retq
;
; BTVER2-LABEL: v16f32_no_step2:
; BTVER2:       # %bb.0:
; BTVER2-NEXT:    vrcpps %ymm0, %ymm0
; BTVER2-NEXT:    vrcpps %ymm1, %ymm1
; BTVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; BTVER2-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; BTVER2-NEXT:    retq
;
; SANDY-LABEL: v16f32_no_step2:
; SANDY:       # %bb.0:
; SANDY-NEXT:    vrcpps %ymm0, %ymm0
; SANDY-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; SANDY-NEXT:    vrcpps %ymm1, %ymm1
; SANDY-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; SANDY-NEXT:    retq
;
; HASWELL-LABEL: v16f32_no_step2:
; HASWELL:       # %bb.0:
; HASWELL-NEXT:    vrcpps %ymm0, %ymm0
; HASWELL-NEXT:    vrcpps %ymm1, %ymm1
; HASWELL-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; HASWELL-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; HASWELL-NEXT:    retq
;
; HASWELL-NO-FMA-LABEL: v16f32_no_step2:
; HASWELL-NO-FMA:       # %bb.0:
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vrcpps %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; HASWELL-NO-FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; HASWELL-NO-FMA-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_step2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrcp14ps %zmm0, %zmm0
; AVX512-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512-NEXT:    retq
  %div = fdiv fast <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, %x
  ret <16 x float> %div
}
1843
1844attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!divf,!vec-divf" }
1845attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf,vec-divf" }
1846attributes #2 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:2,vec-divf:2" }
1847attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="divf:0,vec-divf:0" }
1848
1849