; xref: /llvm-project/llvm/test/CodeGen/X86/sqrt-fastmath.ll (revision 401d123a1fdcbbf4ae7a20178957b7e3a625c044)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64--linux-gnu -mcpu=x86-64 -mattr=+avx512f  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512

declare double @__sqrt_finite(double)
declare float @__sqrtf_finite(float)
declare x86_fp80 @__sqrtl_finite(x86_fp80)
declare float @llvm.sqrt.f32(float)
declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
declare double @llvm.sqrt.f64(double)
declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)

declare float @llvm.fabs.f32(float)
declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
declare double @llvm.fabs.f64(double)

; Finite f64 sqrt libcall lowers to a single hardware sqrt instruction.
define double @finite_f64_no_estimate(double %d) #0 {
; SSE-LABEL: finite_f64_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; No estimates for doubles.

define double @finite_f64_estimate(double %d) #1 {
; SSE-LABEL: finite_f64_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f64_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call double @__sqrt_finite(double %d) #2
  ret double %call
}

; Finite f32 sqrt libcall lowers to a single hardware sqrt instruction.
define float @finite_f32_no_estimate(float %f) #0 {
; SSE-LABEL: finite_f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

; Without ninf/afn on the call, codegen still uses plain sqrtss.
define float @finite_f32_estimate_ieee(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

; Same as above but the call carries ninf+afn fast-math flags.
define float @finite_f32_estimate_ieee_ninf(float %f) #1 {
; SSE-LABEL: finite_f32_estimate_ieee_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_ieee_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

; Same libcall under attribute set #4 (daz variant of the test).
define float @finite_f32_estimate_daz(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %f) #2
  ret float %call
}

; daz variant with ninf+afn flags on the call.
define float @finite_f32_estimate_daz_ninf(float %f) #4 {
; SSE-LABEL: finite_f32_estimate_daz_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: finite_f32_estimate_daz_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %f) #2
  ret float %call
}

; x86_fp80 sqrt lowers to x87 fsqrt on all subtargets.
define x86_fp80 @finite_f80_no_estimate(x86_fp80 %ld) #0 {
; CHECK-LABEL: finite_f80_no_estimate:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; Don't die on the impossible.

define x86_fp80 @finite_f80_estimate_but_no(x86_fp80 %ld) #1 {
; CHECK-LABEL: finite_f80_estimate_but_no:
; CHECK:       # %bb.0:
; CHECK-NEXT:    fldt {{[0-9]+}}(%rsp)
; CHECK-NEXT:    fsqrt
; CHECK-NEXT:    retq
  %call = tail call x86_fp80 @__sqrtl_finite(x86_fp80 %ld) #2
  ret x86_fp80 %call
}

; PR34994 - https://bugs.llvm.org/show_bug.cgi?id=34994

define float @sqrtf_check_denorms(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call float @__sqrtf_finite(float %x) #2
  ret float %call
}

; PR34994 variant with ninf+afn flags on the call.
define float @sqrtf_check_denorms_ninf(float %x) #3 {
; SSE-LABEL: sqrtf_check_denorms_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrtf_check_denorms_ninf:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call ninf afn float @__sqrtf_finite(float %x) #2
  ret float %call
}

; Without fast-math flags, vector sqrt stays as the hardware instruction.
define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_v4f32_check_denorms:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    retq
  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

; With ninf+afn, the estimate sequence is used, plus a denormal-guard compare
; against the smallest normalized float (1.17549435E-38).
define <4 x float> @sqrt_v4f32_check_denorms_ieee_ninf(<4 x float> %x) #3 {
; SSE-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_ieee_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

; Same IR as the ieee_ninf test but under attribute set #6 (dynamic
; denormal mode); expected codegen is identical.
define <4 x float> @sqrt_v4f32_check_denorms_dynamic_ninf(<4 x float> %x) #6 {
; SSE-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm3, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: sqrt_v4f32_check_denorms_dynamic_ninf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %call = tail call ninf afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
  ret <4 x float> %call
}

; 1/sqrt(x) without estimates: sqrt followed by a real divide.
define float @f32_no_estimate(float %x) #0 {
; SSE-LABEL: f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm1
; SSE-NEXT:    movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; SSE-NEXT:    divss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT:    vdivss %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

; 1/sqrt(x) with estimates enabled: rsqrtss plus one Newton-Raphson step.
define float @f32_estimate(float %x) #1 {
; SSE-LABEL: f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtss %xmm0, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm1
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call float @llvm.sqrt.f32(float %x)
  %div = fdiv fast float 1.0, %sqrt
  ret float %div
}

; A plain (non-reciprocal) fast sqrt stays as sqrtss under attribute set #5.
define float @f32_estimate2(float %x) #5 {
; SSE-LABEL: f32_estimate2:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtss %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: f32_estimate2:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call fast float @llvm.sqrt.f32(float %x)
  ret float %sqrt
}

; Vector 1/sqrt without estimates: sqrtps then a real divide.
define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
; SSE-LABEL: v4f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: v4f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

; Vector 1/sqrt with estimates: rsqrtps plus one refinement step.
define <4 x float> @v4f32_estimate(<4 x float> %x) #1 {
; SSE-LABEL: v4f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm0 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vmulps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <4 x float> %div
}

; fast sqrt via rsqrt estimate with a denormal-guard compare, under #5.
define <4 x float> @v4f32_estimate2(<4 x float> %x) #5 {
; SSE-LABEL: v4f32_estimate2:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    mulps %xmm0, %xmm2
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; SSE-NEXT:    cmpleps %xmm0, %xmm1
; SSE-NEXT:    andps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: v4f32_estimate2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX1-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v4f32_estimate2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm0, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.17549435E-38,1.17549435E-38,1.17549435E-38,1.17549435E-38]
; AVX512-NEXT:    vcmpleps %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %sqrt = tail call fast <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  ret <4 x float> %sqrt
}

; 8-wide 1/sqrt without estimates: split for SSE, one ymm op for AVX.
define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
; SSE-LABEL: v8f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm3
; SSE-NEXT:    movaps {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    divps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    retq
;
; AVX-LABEL: v8f32_no_estimate:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %ymm0, %ymm0
; AVX-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
; AVX-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

; 8-wide 1/sqrt with estimates: per-half rsqrtps on SSE, ymm sequence on AVX.
define <8 x float> @v8f32_estimate(<8 x float> %x) #1 {
; SSE-LABEL: v8f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm4, %xmm0
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps %xmm4, %xmm1
; SSE-NEXT:    mulps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: v8f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v8f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %ymm0, %ymm1
; AVX512-NEXT:    vmulps %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} ymm2 = (ymm1 * ymm0) + ymm2
; AVX512-NEXT:    vmulps %ymm3, %ymm1, %ymm0
; AVX512-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
  %sqrt = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %x)
  %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <8 x float> %div
}

; 16-wide 1/sqrt without estimates: four xmm ops for SSE, two ymm for AVX1,
; one zmm op for AVX512.
define <16 x float> @v16f32_no_estimate(<16 x float> %x) #0 {
; SSE-LABEL: v16f32_no_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm3, %xmm4
; SSE-NEXT:    sqrtps %xmm2, %xmm5
; SSE-NEXT:    sqrtps %xmm1, %xmm2
; SSE-NEXT:    sqrtps %xmm0, %xmm1
; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm1
; SSE-NEXT:    divps %xmm2, %xmm1
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    divps %xmm5, %xmm2
; SSE-NEXT:    divps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_no_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vsqrtps %ymm1, %ymm1
; AVX1-NEXT:    vsqrtps %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX1-NEXT:    vdivps %ymm0, %ymm2, %ymm0
; AVX1-NEXT:    vdivps %ymm1, %ymm2, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_no_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vsqrtps %zmm0, %zmm0
; AVX512-NEXT:    vbroadcastss {{.*#+}} zmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
; AVX512-NEXT:    vdivps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

; 16-wide 1/sqrt with estimates; AVX512 uses vrsqrt14ps on a full zmm.
define <16 x float> @v16f32_estimate(<16 x float> %x) #1 {
; SSE-LABEL: v16f32_estimate:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm0, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    movaps {{.*#+}} xmm5 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; SSE-NEXT:    addps %xmm5, %xmm0
; SSE-NEXT:    mulps %xmm6, %xmm0
; SSE-NEXT:    rsqrtps %xmm1, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm1
; SSE-NEXT:    mulps %xmm6, %xmm1
; SSE-NEXT:    rsqrtps %xmm2, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    mulps %xmm4, %xmm6
; SSE-NEXT:    addps %xmm5, %xmm2
; SSE-NEXT:    mulps %xmm6, %xmm2
; SSE-NEXT:    rsqrtps %xmm3, %xmm6
; SSE-NEXT:    mulps %xmm6, %xmm4
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    mulps %xmm6, %xmm3
; SSE-NEXT:    addps %xmm5, %xmm3
; SSE-NEXT:    mulps %xmm4, %xmm3
; SSE-NEXT:    retq
;
; AVX1-LABEL: v16f32_estimate:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %ymm0, %ymm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm3 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX1-NEXT:    vmulps %ymm3, %ymm2, %ymm4
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vmulps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} ymm2 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX1-NEXT:    vaddps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    vrsqrtps %ymm1, %ymm5
; AVX1-NEXT:    vmulps %ymm0, %ymm4, %ymm0
; AVX1-NEXT:    vmulps %ymm3, %ymm5, %ymm3
; AVX1-NEXT:    vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm5, %ymm1, %ymm1
; AVX1-NEXT:    vaddps %ymm2, %ymm1, %ymm1
; AVX1-NEXT:    vmulps %ymm1, %ymm3, %ymm1
; AVX1-NEXT:    retq
;
; AVX512-LABEL: v16f32_estimate:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrt14ps %zmm0, %zmm1
; AVX512-NEXT:    vmulps %zmm1, %zmm0, %zmm0
; AVX512-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + mem
; AVX512-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
; AVX512-NEXT:    vmulps %zmm0, %zmm1, %zmm0
; AVX512-NEXT:    retq
  %sqrt = tail call <16 x float> @llvm.sqrt.v16f32(<16 x float> %x)
  %div = fdiv fast <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt
  ret <16 x float> %div
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)

define float @div_sqrt_fabs_f32(float %x, float %y, float %z) {
; SSE-LABEL: div_sqrt_fabs_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulss %xmm1, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    xorps %xmm2, %xmm2
; SSE-NEXT:    rsqrtss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm2, %xmm1
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %z)
  %a = call fast float @llvm.fabs.f32(float %y)
  %m = fmul fast float %s, %a
  %d = fdiv fast float %x, %m
  ret float %d
}

; x / (fabs(y) * sqrt(z)) --> x * rsqrt(y*y*z)

define <4 x float> @div_sqrt_fabs_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    rsqrtps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %a, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; This has 'arcp' but does not have 'reassoc' FMF.
; We allow converting the sqrt to an estimate, but
; do not pull the divisor into the estimate.
; x / (fabs(y) * sqrt(z)) --> x * rsqrt(z) / fabs(y)

define <4 x float> @div_sqrt_fabs_v4f32_fmf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
; SSE-LABEL: div_sqrt_fabs_v4f32_fmf:
; SSE:       # %bb.0:
; SSE-NEXT:    rsqrtps %xmm2, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    mulps %xmm3, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm3
; SSE-NEXT:    divps %xmm1, %xmm3
; SSE-NEXT:    mulps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_fabs_v4f32_fmf:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vrsqrtps %xmm2, %xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm4 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm4, %xmm3, %xmm4
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vaddps %xmm3, %xmm2, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm4, %xmm2
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vandps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vdivps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %z)
  %a = call <4 x float> @llvm.fabs.v4f32(<4 x float> %y)
  %m = fmul <4 x float> %a, %s
  %d = fdiv arcp <4 x float> %x, %m
  ret <4 x float> %d
}

; No estimates for f64, so do not convert fabs into an fmul.

define double @div_sqrt_fabs_f64(double %x, double %y, double %z) {
; SSE-LABEL: div_sqrt_fabs_f64:
; SSE:       # %bb.0:
; SSE-NEXT:    andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    sqrtsd %xmm2, %xmm2
; SSE-NEXT:    mulsd %xmm2, %xmm1
; SSE-NEXT:    divsd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: div_sqrt_fabs_f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vsqrtsd %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vmulsd %xmm1, %xmm2, %xmm1
; AVX-NEXT:    vdivsd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %s = call fast double @llvm.sqrt.f64(double %z)
  %a = call fast double @llvm.fabs.f64(double %y)
  %m = fmul fast double %s, %a
  %d = fdiv fast double %x, %m
  ret double %d
}
786
787; This is a special case for the general pattern above -
788; if the sqrt operand is the same as the other mul op,
789; then fabs may be omitted.
790; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)
791
; Scalar f32 case: with 'fast' on all ops, x / (y * sqrt(y)) is lowered as
; x * rsqrt(y*y*y) — an rsqrt(s)s estimate followed by one Newton-Raphson
; refinement step (the surrounding mul/mul/add/mul sequence).
define float @div_sqrt_f32(float %x, float %y) {
; SSE-LABEL: div_sqrt_f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    xorps %xmm1, %xmm1
; SSE-NEXT:    rsqrtss %xmm2, %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    mulss %xmm1, %xmm2
; SSE-NEXT:    addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulss %xmm1, %xmm0
; SSE-NEXT:    mulss %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtss %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm1 = (xmm2 * xmm1) + mem
; AVX512-NEXT:    vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512-NEXT:    vmulss %xmm2, %xmm0, %xmm0
; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call fast float @llvm.sqrt.f32(float %y)
  %m = fmul fast float %s, %y
  %d = fdiv fast float %x, %m
  ret float %d
}
837
; This is the same special case as above (the sqrt operand matches the
; other mul operand, so fabs may be omitted), now tested with a v4f32
; vector and the minimal fast-math flags (reassoc + contract on the fmul,
; plus arcp on the fdiv) instead of 'fast'.
; x / (y * sqrt(y)) --> x * rsqrt(y*y*y)
842
; Vector (v4f32) version of the fold above. Note the IR uses the minimal
; fast-math flags (reassoc+contract on the fmul, plus arcp on the fdiv)
; rather than 'fast', and the rsqrtps estimate + refinement still fires.
define <4 x float> @div_sqrt_v4f32(<4 x float> %x, <4 x float> %y) {
; SSE-LABEL: div_sqrt_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    rsqrtps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    mulps %xmm1, %xmm2
; SSE-NEXT:    addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    mulps %xmm2, %xmm1
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: div_sqrt_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: div_sqrt_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmulps %xmm1, %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vrsqrtps %xmm1, %xmm2
; AVX512-NEXT:    vmulps %xmm2, %xmm1, %xmm1
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-3.0E+0,-3.0E+0,-3.0E+0,-3.0E+0]
; AVX512-NEXT:    vfmadd231ps {{.*#+}} xmm3 = (xmm2 * xmm1) + xmm3
; AVX512-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; AVX512-NEXT:    vmulps %xmm1, %xmm2, %xmm1
; AVX512-NEXT:    vmulps %xmm3, %xmm1, %xmm1
; AVX512-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
  %s = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %y)
  %m = fmul contract reassoc <4 x float> %y, %s
  %d = fdiv contract reassoc arcp <4 x float> %x, %m
  ret <4 x float> %d
}
889
; x / sqrt(x) --> sqrt(x): with 'fast' flags the divide folds away entirely
; and a single sqrtsd is emitted.
define double @sqrt_fdiv_common_operand(double %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  %r = fdiv fast double %x, %sqrt
  ret double %r
}
904
; Vector form of x / sqrt(x) --> sqrt(x). Here only the fdiv carries flags
; (arcp+nsz+reassoc), which is still enough to fold to a lone sqrtpd.
define <2 x double> @sqrt_fdiv_common_operand_vec(<2 x double> %x) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    retq
  %sqrt = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %r = fdiv arcp nsz reassoc <2 x double> %x, %sqrt
  ret <2 x double> %r
}
919
; x / sqrt(x) --> sqrt(x) still fires when the sqrt has an extra use:
; the single sqrtsd result is both stored through %p and returned.
define double @sqrt_fdiv_common_operand_extra_use(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_fdiv_common_operand_extra_use:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_fdiv_common_operand_extra_use:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd %xmm0, (%rdi)
; AVX-NEXT:    retq
  %sqrt = call fast double @llvm.sqrt.f64(double %x)
  store double %sqrt, ptr %p
  %r = fdiv fast double %x, %sqrt
  ret double %r
}
937
; The x / sqrt(x) use is simplified to sqrt(x) even though the same sqrt also
; feeds a 1.0 / sqrt(x): one sqrtsd plus a real divsd for the stored rsqrt.
define double @sqrt_simplify_before_recip(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %rsqrt = fdiv fast double 1.0, %sqrt
  %sqrt_fast = fdiv fast double %x, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}
960
; Vector (v2f64) form of the previous test: x / sqrt(x) folds to a single
; sqrtpd while the stored <1.0, 1.0> / sqrt(x) keeps a real divpd.
define <2 x double> @sqrt_simplify_before_recip_vec(<2 x double> %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_vec:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtpd %xmm0, %xmm0
; SSE-NEXT:    movapd {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; SSE-NEXT:    divpd %xmm0, %xmm1
; SSE-NEXT:    movupd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_vec:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtpd %xmm0, %xmm0
; AVX-NEXT:    vmovddup {{.*#+}} xmm1 = [1.0E+0,1.0E+0]
; AVX-NEXT:    # xmm1 = mem[0,0]
; AVX-NEXT:    vdivpd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovupd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> %x)
  %rsqrt = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt
  %sqrt_fast = fdiv fast <2 x double> %x, %sqrt
  store <2 x double> %rsqrt, ptr %p, align 8
  ret <2 x double> %sqrt_fast
}
984
; Same as sqrt_simplify_before_recip, but with the two fdiv users in the
; opposite IR order (and 42.0 instead of 1.0 as the reciprocal numerator);
; the simplification is order-independent and produces identical codegen.
define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
; SSE-LABEL: sqrt_simplify_before_recip_order:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtsd %xmm0, %xmm0
; SSE-NEXT:    movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
; SSE-NEXT:    divsd %xmm0, %xmm1
; SSE-NEXT:    movsd %xmm1, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: sqrt_simplify_before_recip_order:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
; AVX-NEXT:    vdivsd %xmm0, %xmm1, %xmm1
; AVX-NEXT:    vmovsd %xmm1, (%rdi)
; AVX-NEXT:    retq
  %sqrt = tail call fast double @llvm.sqrt.f64(double %x)
  %sqrt_fast = fdiv fast double %x, %sqrt
  %rsqrt = fdiv fast double 42.0, %sqrt
  store double %rsqrt, ptr %p, align 8
  ret double %sqrt_fast
}
1007
1008attributes #0 = { "unsafe-fp-math"="true" "reciprocal-estimates"="!sqrtf,!vec-sqrtf,!divf,!vec-divf" }
1009attributes #1 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" }
1010attributes #2 = { nounwind readnone }
1011attributes #3 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,ieee" }
1012attributes #4 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="ieee,preserve-sign" }
1013attributes #5 = { "unsafe-fp-math"="true" "reciprocal-estimates"="all:0" }
1014attributes #6 = { "unsafe-fp-math"="true" "reciprocal-estimates"="sqrt,vec-sqrt" "denormal-fp-math"="preserve-sign,dynamic" }
1015