xref: /llvm-project/llvm/test/CodeGen/X86/fma_patterns.ll (revision a2a0089ac3a5781ba74d4d319c87c9e8b46d4eda)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefixes=FMA,FMA-INFS
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefixes=FMA4,FMA4-INFS
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefixes=FMA4,FMA4-INFS
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast | FileCheck %s --check-prefixes=AVX512,AVX512-INFS
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA,FMA-NOINFS
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=FMA4,FMA4-NOINFS
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefixes=AVX512,AVX512-NOINFS
10
11;
12; Pattern: (fadd (fmul x, y), z) -> (fmadd x, y, z)
13;
14
; Scalar float: fmul of %a0 and %a1, then fadd of the product with %a2.
; Every run configuration expects one fused multiply-add
; (vfmadd213ss, or vfmaddss on the FMA4 runs) followed by retq.
15define float @test_f32_fmadd(float %a0, float %a1, float %a2) {
16; FMA-LABEL: test_f32_fmadd:
17; FMA:       # %bb.0:
18; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
19; FMA-NEXT:    retq
20;
21; FMA4-LABEL: test_f32_fmadd:
22; FMA4:       # %bb.0:
23; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
24; FMA4-NEXT:    retq
25;
26; AVX512-LABEL: test_f32_fmadd:
27; AVX512:       # %bb.0:
28; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
29; AVX512-NEXT:    retq
30  %x = fmul float %a0, %a1
31  %res = fadd float %x, %a2
32  ret float %res
33}
34
; <4 x float>: fmul of %a0 and %a1, then fadd of the product with %a2.
; Every run configuration expects one packed fused multiply-add on xmm
; registers (vfmadd213ps, or vfmaddps on the FMA4 runs).
35define <4 x float> @test_4f32_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
36; FMA-LABEL: test_4f32_fmadd:
37; FMA:       # %bb.0:
38; FMA-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
39; FMA-NEXT:    retq
40;
41; FMA4-LABEL: test_4f32_fmadd:
42; FMA4:       # %bb.0:
43; FMA4-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
44; FMA4-NEXT:    retq
45;
46; AVX512-LABEL: test_4f32_fmadd:
47; AVX512:       # %bb.0:
48; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
49; AVX512-NEXT:    retq
50  %x = fmul <4 x float> %a0, %a1
51  %res = fadd <4 x float> %x, %a2
52  ret <4 x float> %res
53}
54
; <8 x float>: fmul of %a0 and %a1, then fadd of the product with %a2.
; Every run configuration expects one packed fused multiply-add on ymm
; registers (vfmadd213ps, or vfmaddps on the FMA4 runs).
55define <8 x float> @test_8f32_fmadd(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
56; FMA-LABEL: test_8f32_fmadd:
57; FMA:       # %bb.0:
58; FMA-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
59; FMA-NEXT:    retq
60;
61; FMA4-LABEL: test_8f32_fmadd:
62; FMA4:       # %bb.0:
63; FMA4-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
64; FMA4-NEXT:    retq
65;
66; AVX512-LABEL: test_8f32_fmadd:
67; AVX512:       # %bb.0:
68; AVX512-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
69; AVX512-NEXT:    retq
70  %x = fmul <8 x float> %a0, %a1
71  %res = fadd <8 x float> %x, %a2
72  ret <8 x float> %res
73}
74
; Scalar double: fmul of %a0 and %a1, then fadd of the product with %a2.
; Every run configuration expects one fused multiply-add
; (vfmadd213sd, or vfmaddsd on the FMA4 runs).
75define double @test_f64_fmadd(double %a0, double %a1, double %a2) {
76; FMA-LABEL: test_f64_fmadd:
77; FMA:       # %bb.0:
78; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
79; FMA-NEXT:    retq
80;
81; FMA4-LABEL: test_f64_fmadd:
82; FMA4:       # %bb.0:
83; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
84; FMA4-NEXT:    retq
85;
86; AVX512-LABEL: test_f64_fmadd:
87; AVX512:       # %bb.0:
88; AVX512-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
89; AVX512-NEXT:    retq
90  %x = fmul double %a0, %a1
91  %res = fadd double %x, %a2
92  ret double %res
93}
94
; <2 x double>: fmul of %a0 and %a1, then fadd of the product with %a2.
; Every run configuration expects one packed fused multiply-add on xmm
; registers (vfmadd213pd, or vfmaddpd on the FMA4 runs).
95define <2 x double> @test_2f64_fmadd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
96; FMA-LABEL: test_2f64_fmadd:
97; FMA:       # %bb.0:
98; FMA-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
99; FMA-NEXT:    retq
100;
101; FMA4-LABEL: test_2f64_fmadd:
102; FMA4:       # %bb.0:
103; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
104; FMA4-NEXT:    retq
105;
106; AVX512-LABEL: test_2f64_fmadd:
107; AVX512:       # %bb.0:
108; AVX512-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
109; AVX512-NEXT:    retq
110  %x = fmul <2 x double> %a0, %a1
111  %res = fadd <2 x double> %x, %a2
112  ret <2 x double> %res
113}
114
; <4 x double>: fmul of %a0 and %a1, then fadd of the product with %a2.
; Every run configuration expects one packed fused multiply-add on ymm
; registers (vfmadd213pd, or vfmaddpd on the FMA4 runs).
115define <4 x double> @test_4f64_fmadd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
116; FMA-LABEL: test_4f64_fmadd:
117; FMA:       # %bb.0:
118; FMA-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
119; FMA-NEXT:    retq
120;
121; FMA4-LABEL: test_4f64_fmadd:
122; FMA4:       # %bb.0:
123; FMA4-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
124; FMA4-NEXT:    retq
125;
126; AVX512-LABEL: test_4f64_fmadd:
127; AVX512:       # %bb.0:
128; AVX512-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
129; AVX512-NEXT:    retq
130  %x = fmul <4 x double> %a0, %a1
131  %res = fadd <4 x double> %x, %a2
132  ret <4 x double> %res
133}
134
135;
136; Pattern: (fsub (fmul x, y), z) -> (fmsub x, y, z)
137;
138
; Scalar float: fmul of %a0 and %a1, then fsub of %a2 from the product.
; Every run configuration expects one fused multiply-subtract
; (vfmsub213ss, or vfmsubss on the FMA4 runs).
139define float @test_f32_fmsub(float %a0, float %a1, float %a2) {
140; FMA-LABEL: test_f32_fmsub:
141; FMA:       # %bb.0:
142; FMA-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
143; FMA-NEXT:    retq
144;
145; FMA4-LABEL: test_f32_fmsub:
146; FMA4:       # %bb.0:
147; FMA4-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
148; FMA4-NEXT:    retq
149;
150; AVX512-LABEL: test_f32_fmsub:
151; AVX512:       # %bb.0:
152; AVX512-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
153; AVX512-NEXT:    retq
154  %x = fmul float %a0, %a1
155  %res = fsub float %x, %a2
156  ret float %res
157}
158
; <4 x float>: fmul of %a0 and %a1, then fsub of %a2 from the product.
; Every run configuration expects one packed fused multiply-subtract on xmm
; registers (vfmsub213ps, or vfmsubps on the FMA4 runs).
159define <4 x float> @test_4f32_fmsub(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
160; FMA-LABEL: test_4f32_fmsub:
161; FMA:       # %bb.0:
162; FMA-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
163; FMA-NEXT:    retq
164;
165; FMA4-LABEL: test_4f32_fmsub:
166; FMA4:       # %bb.0:
167; FMA4-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
168; FMA4-NEXT:    retq
169;
170; AVX512-LABEL: test_4f32_fmsub:
171; AVX512:       # %bb.0:
172; AVX512-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
173; AVX512-NEXT:    retq
174  %x = fmul <4 x float> %a0, %a1
175  %res = fsub <4 x float> %x, %a2
176  ret <4 x float> %res
177}
178
; <8 x float>: fmul of %a0 and %a1, then fsub of %a2 from the product.
; Every run configuration expects one packed fused multiply-subtract on ymm
; registers (vfmsub213ps, or vfmsubps on the FMA4 runs).
179define <8 x float> @test_8f32_fmsub(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
180; FMA-LABEL: test_8f32_fmsub:
181; FMA:       # %bb.0:
182; FMA-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
183; FMA-NEXT:    retq
184;
185; FMA4-LABEL: test_8f32_fmsub:
186; FMA4:       # %bb.0:
187; FMA4-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
188; FMA4-NEXT:    retq
189;
190; AVX512-LABEL: test_8f32_fmsub:
191; AVX512:       # %bb.0:
192; AVX512-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
193; AVX512-NEXT:    retq
194  %x = fmul <8 x float> %a0, %a1
195  %res = fsub <8 x float> %x, %a2
196  ret <8 x float> %res
197}
198
; Scalar double: fmul of %a0 and %a1, then fsub of %a2 from the product.
; Every run configuration expects one fused multiply-subtract
; (vfmsub213sd, or vfmsubsd on the FMA4 runs).
199define double @test_f64_fmsub(double %a0, double %a1, double %a2) {
200; FMA-LABEL: test_f64_fmsub:
201; FMA:       # %bb.0:
202; FMA-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
203; FMA-NEXT:    retq
204;
205; FMA4-LABEL: test_f64_fmsub:
206; FMA4:       # %bb.0:
207; FMA4-NEXT:    vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
208; FMA4-NEXT:    retq
209;
210; AVX512-LABEL: test_f64_fmsub:
211; AVX512:       # %bb.0:
212; AVX512-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
213; AVX512-NEXT:    retq
214  %x = fmul double %a0, %a1
215  %res = fsub double %x, %a2
216  ret double %res
217}
218
; <2 x double>: fmul of %a0 and %a1, then fsub of %a2 from the product.
; Every run configuration expects one packed fused multiply-subtract on xmm
; registers (vfmsub213pd, or vfmsubpd on the FMA4 runs).
219define <2 x double> @test_2f64_fmsub(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
220; FMA-LABEL: test_2f64_fmsub:
221; FMA:       # %bb.0:
222; FMA-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
223; FMA-NEXT:    retq
224;
225; FMA4-LABEL: test_2f64_fmsub:
226; FMA4:       # %bb.0:
227; FMA4-NEXT:    vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
228; FMA4-NEXT:    retq
229;
230; AVX512-LABEL: test_2f64_fmsub:
231; AVX512:       # %bb.0:
232; AVX512-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
233; AVX512-NEXT:    retq
234  %x = fmul <2 x double> %a0, %a1
235  %res = fsub <2 x double> %x, %a2
236  ret <2 x double> %res
237}
238
; <4 x double>: fmul of %a0 and %a1, then fsub of %a2 from the product.
; Every run configuration expects one packed fused multiply-subtract on ymm
; registers (vfmsub213pd, or vfmsubpd on the FMA4 runs).
239define <4 x double> @test_4f64_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
240; FMA-LABEL: test_4f64_fmsub:
241; FMA:       # %bb.0:
242; FMA-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
243; FMA-NEXT:    retq
244;
245; FMA4-LABEL: test_4f64_fmsub:
246; FMA4:       # %bb.0:
247; FMA4-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm1) - ymm2
248; FMA4-NEXT:    retq
249;
250; AVX512-LABEL: test_4f64_fmsub:
251; AVX512:       # %bb.0:
252; AVX512-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
253; AVX512-NEXT:    retq
254  %x = fmul <4 x double> %a0, %a1
255  %res = fsub <4 x double> %x, %a2
256  ret <4 x double> %res
257}
258
259;
260; Pattern: (fsub z, (fmul x, y)) -> (fnmadd x, y, z)
261;
262
; Scalar float: fmul of %a0 and %a1, then fsub of the product from %a2
; (the product is the subtrahend). Every run configuration expects one
; negated fused multiply-add (vfnmadd213ss, or vfnmaddss on the FMA4 runs).
263define float @test_f32_fnmadd(float %a0, float %a1, float %a2) {
264; FMA-LABEL: test_f32_fnmadd:
265; FMA:       # %bb.0:
266; FMA-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
267; FMA-NEXT:    retq
268;
269; FMA4-LABEL: test_f32_fnmadd:
270; FMA4:       # %bb.0:
271; FMA4-NEXT:    vfnmaddss {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
272; FMA4-NEXT:    retq
273;
274; AVX512-LABEL: test_f32_fnmadd:
275; AVX512:       # %bb.0:
276; AVX512-NEXT:    vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
277; AVX512-NEXT:    retq
278  %x = fmul float %a0, %a1
279  %res = fsub float %a2, %x
280  ret float %res
281}
282
; <4 x float>: fmul of %a0 and %a1, then fsub of the product from %a2.
; Every run configuration expects one packed negated fused multiply-add on
; xmm registers (vfnmadd213ps, or vfnmaddps on the FMA4 runs).
283define <4 x float> @test_4f32_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
284; FMA-LABEL: test_4f32_fnmadd:
285; FMA:       # %bb.0:
286; FMA-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
287; FMA-NEXT:    retq
288;
289; FMA4-LABEL: test_4f32_fnmadd:
290; FMA4:       # %bb.0:
291; FMA4-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
292; FMA4-NEXT:    retq
293;
294; AVX512-LABEL: test_4f32_fnmadd:
295; AVX512:       # %bb.0:
296; AVX512-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
297; AVX512-NEXT:    retq
298  %x = fmul <4 x float> %a0, %a1
299  %res = fsub <4 x float> %a2, %x
300  ret <4 x float> %res
301}
302
; <8 x float>: fmul of %a0 and %a1, then fsub of the product from %a2.
; Every run configuration expects one packed negated fused multiply-add on
; ymm registers (vfnmadd213ps, or vfnmaddps on the FMA4 runs).
303define <8 x float> @test_8f32_fnmadd(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
304; FMA-LABEL: test_8f32_fnmadd:
305; FMA:       # %bb.0:
306; FMA-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
307; FMA-NEXT:    retq
308;
309; FMA4-LABEL: test_8f32_fnmadd:
310; FMA4:       # %bb.0:
311; FMA4-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
312; FMA4-NEXT:    retq
313;
314; AVX512-LABEL: test_8f32_fnmadd:
315; AVX512:       # %bb.0:
316; AVX512-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
317; AVX512-NEXT:    retq
318  %x = fmul <8 x float> %a0, %a1
319  %res = fsub <8 x float> %a2, %x
320  ret <8 x float> %res
321}
322
; Scalar double: fmul of %a0 and %a1, then fsub of the product from %a2.
; Every run configuration expects one negated fused multiply-add
; (vfnmadd213sd, or vfnmaddsd on the FMA4 runs).
323define double @test_f64_fnmadd(double %a0, double %a1, double %a2) {
324; FMA-LABEL: test_f64_fnmadd:
325; FMA:       # %bb.0:
326; FMA-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
327; FMA-NEXT:    retq
328;
329; FMA4-LABEL: test_f64_fnmadd:
330; FMA4:       # %bb.0:
331; FMA4-NEXT:    vfnmaddsd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
332; FMA4-NEXT:    retq
333;
334; AVX512-LABEL: test_f64_fnmadd:
335; AVX512:       # %bb.0:
336; AVX512-NEXT:    vfnmadd213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
337; AVX512-NEXT:    retq
338  %x = fmul double %a0, %a1
339  %res = fsub double %a2, %x
340  ret double %res
341}
342
; <2 x double>: fmul of %a0 and %a1, then fsub of the product from %a2.
; Every run configuration expects one packed negated fused multiply-add on
; xmm registers (vfnmadd213pd, or vfnmaddpd on the FMA4 runs).
343define <2 x double> @test_2f64_fnmadd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
344; FMA-LABEL: test_2f64_fnmadd:
345; FMA:       # %bb.0:
346; FMA-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
347; FMA-NEXT:    retq
348;
349; FMA4-LABEL: test_2f64_fnmadd:
350; FMA4:       # %bb.0:
351; FMA4-NEXT:    vfnmaddpd {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm2
352; FMA4-NEXT:    retq
353;
354; AVX512-LABEL: test_2f64_fnmadd:
355; AVX512:       # %bb.0:
356; AVX512-NEXT:    vfnmadd213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm2
357; AVX512-NEXT:    retq
358  %x = fmul <2 x double> %a0, %a1
359  %res = fsub <2 x double> %a2, %x
360  ret <2 x double> %res
361}
362
; <4 x double>: fmul of %a0 and %a1, then fsub of the product from %a2.
; Every run configuration expects one packed negated fused multiply-add on
; ymm registers (vfnmadd213pd, or vfnmaddpd on the FMA4 runs).
363define <4 x double> @test_4f64_fnmadd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
364; FMA-LABEL: test_4f64_fnmadd:
365; FMA:       # %bb.0:
366; FMA-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
367; FMA-NEXT:    retq
368;
369; FMA4-LABEL: test_4f64_fnmadd:
370; FMA4:       # %bb.0:
371; FMA4-NEXT:    vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
372; FMA4-NEXT:    retq
373;
374; AVX512-LABEL: test_4f64_fnmadd:
375; AVX512:       # %bb.0:
376; AVX512-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
377; AVX512-NEXT:    retq
378  %x = fmul <4 x double> %a0, %a1
379  %res = fsub <4 x double> %a2, %x
380  ret <4 x double> %res
381}
382
383;
384; Pattern: (fsub (fneg (fmul x, y)), z) -> (fnmsub x, y, z)
385;
386
; Scalar float: the product of %a0 and %a1 is negated (written as an fsub
; from -0.0) and %a2 is then subtracted from it. Every run configuration
; expects one negated fused multiply-subtract (vfnmsub213ss, or vfnmsubss on
; the FMA4 runs).
387define float @test_f32_fnmsub(float %a0, float %a1, float %a2) {
388; FMA-LABEL: test_f32_fnmsub:
389; FMA:       # %bb.0:
390; FMA-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
391; FMA-NEXT:    retq
392;
393; FMA4-LABEL: test_f32_fnmsub:
394; FMA4:       # %bb.0:
395; FMA4-NEXT:    vfnmsubss {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
396; FMA4-NEXT:    retq
397;
398; AVX512-LABEL: test_f32_fnmsub:
399; AVX512:       # %bb.0:
400; AVX512-NEXT:    vfnmsub213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
401; AVX512-NEXT:    retq
402  %x = fmul float %a0, %a1
403  %y = fsub float -0.000000e+00, %x
404  %res = fsub float %y, %a2
405  ret float %res
406}
407
; <4 x float>: the product of %a0 and %a1 is negated (written as an fsub
; from a -0.0 splat) and %a2 is then subtracted from it. Every run
; configuration expects one packed negated fused multiply-subtract on xmm
; registers (vfnmsub213ps, or vfnmsubps on the FMA4 runs).
408define <4 x float> @test_4f32_fnmsub(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
409; FMA-LABEL: test_4f32_fnmsub:
410; FMA:       # %bb.0:
411; FMA-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
412; FMA-NEXT:    retq
413;
414; FMA4-LABEL: test_4f32_fnmsub:
415; FMA4:       # %bb.0:
416; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
417; FMA4-NEXT:    retq
418;
419; AVX512-LABEL: test_4f32_fnmsub:
420; AVX512:       # %bb.0:
421; AVX512-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
422; AVX512-NEXT:    retq
423  %x = fmul <4 x float> %a0, %a1
424  %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
425  %res = fsub <4 x float> %y, %a2
426  ret <4 x float> %res
427}
428
; <8 x float>: the product of %a0 and %a1 is negated (written as an fsub
; from a -0.0 splat) and %a2 is then subtracted from it. Every run
; configuration expects one packed negated fused multiply-subtract on ymm
; registers (vfnmsub213ps, or vfnmsubps on the FMA4 runs).
429define <8 x float> @test_8f32_fnmsub(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
430; FMA-LABEL: test_8f32_fnmsub:
431; FMA:       # %bb.0:
432; FMA-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
433; FMA-NEXT:    retq
434;
435; FMA4-LABEL: test_8f32_fnmsub:
436; FMA4:       # %bb.0:
437; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
438; FMA4-NEXT:    retq
439;
440; AVX512-LABEL: test_8f32_fnmsub:
441; AVX512:       # %bb.0:
442; AVX512-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
443; AVX512-NEXT:    retq
444  %x = fmul <8 x float> %a0, %a1
445  %y = fsub <8 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
446  %res = fsub <8 x float> %y, %a2
447  ret <8 x float> %res
448}
449
; Scalar double: the product of %a0 and %a1 is negated (written as an fsub
; from -0.0) and %a2 is then subtracted from it. Every run configuration
; expects one negated fused multiply-subtract (vfnmsub213sd, or vfnmsubsd on
; the FMA4 runs).
450define double @test_f64_fnmsub(double %a0, double %a1, double %a2) {
451; FMA-LABEL: test_f64_fnmsub:
452; FMA:       # %bb.0:
453; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
454; FMA-NEXT:    retq
455;
456; FMA4-LABEL: test_f64_fnmsub:
457; FMA4:       # %bb.0:
458; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
459; FMA4-NEXT:    retq
460;
461; AVX512-LABEL: test_f64_fnmsub:
462; AVX512:       # %bb.0:
463; AVX512-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
464; AVX512-NEXT:    retq
465  %x = fmul double %a0, %a1
466  %y = fsub double -0.000000e+00, %x
467  %res = fsub double %y, %a2
468  ret double %res
469}
470
; <2 x double>: the product of %a0 and %a1 is negated (written as an fsub
; from a -0.0 splat) and %a2 is then subtracted from it. Every run
; configuration expects one packed negated fused multiply-subtract on xmm
; registers (vfnmsub213pd, or vfnmsubpd on the FMA4 runs).
471define <2 x double> @test_2f64_fnmsub(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
472; FMA-LABEL: test_2f64_fnmsub:
473; FMA:       # %bb.0:
474; FMA-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
475; FMA-NEXT:    retq
476;
477; FMA4-LABEL: test_2f64_fnmsub:
478; FMA4:       # %bb.0:
479; FMA4-NEXT:    vfnmsubpd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
480; FMA4-NEXT:    retq
481;
482; AVX512-LABEL: test_2f64_fnmsub:
483; AVX512:       # %bb.0:
484; AVX512-NEXT:    vfnmsub213pd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
485; AVX512-NEXT:    retq
486  %x = fmul <2 x double> %a0, %a1
487  %y = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %x
488  %res = fsub <2 x double> %y, %a2
489  ret <2 x double> %res
490}
491
; <4 x double>: the product of %a0 and %a1 is negated (written as an fsub
; from a -0.0 splat) and %a2 is then subtracted from it. Every run
; configuration expects one packed negated fused multiply-subtract on ymm
; registers (vfnmsub213pd, or vfnmsubpd on the FMA4 runs).
492define <4 x double> @test_4f64_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
493; FMA-LABEL: test_4f64_fnmsub:
494; FMA:       # %bb.0:
495; FMA-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
496; FMA-NEXT:    retq
497;
498; FMA4-LABEL: test_4f64_fnmsub:
499; FMA4:       # %bb.0:
500; FMA4-NEXT:    vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
501; FMA4-NEXT:    retq
502;
503; AVX512-LABEL: test_4f64_fnmsub:
504; AVX512:       # %bb.0:
505; AVX512-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
506; AVX512-NEXT:    retq
507  %x = fmul <4 x double> %a0, %a1
508  %y = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x
509  %res = fsub <4 x double> %y, %a2
510  ret <4 x double> %res
511}
512
513;
514; Load Folding Patterns
515;
516
; One multiplicand is loaded from the pointer %a0; the product with %a1 is
; then added to %a2. Every run configuration expects the load to appear as
; the memory operand of the fused instruction itself (shown as "mem" in the
; expected asm: vfmadd132ps, or vfmaddps on the FMA4 runs), with no
; separate load instruction.
517define <4 x float> @test_4f32_fmadd_load(ptr %a0, <4 x float> %a1, <4 x float> %a2) {
518; FMA-LABEL: test_4f32_fmadd_load:
519; FMA:       # %bb.0:
520; FMA-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
521; FMA-NEXT:    retq
522;
523; FMA4-LABEL: test_4f32_fmadd_load:
524; FMA4:       # %bb.0:
525; FMA4-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
526; FMA4-NEXT:    retq
527;
528; AVX512-LABEL: test_4f32_fmadd_load:
529; AVX512:       # %bb.0:
530; AVX512-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
531; AVX512-NEXT:    retq
532  %x = load <4 x float>, ptr %a0
533  %y = fmul <4 x float> %x, %a1
534  %res = fadd <4 x float> %y, %a2
535  ret <4 x float> %res
536}
537
; One multiplicand is loaded from the pointer %a0; %a2 is subtracted from
; its product with %a1. Every run configuration expects the load to appear
; as the memory operand of the fused instruction itself (shown as "mem" in
; the expected asm: vfmsub132pd, or vfmsubpd on the FMA4 runs), with no
; separate load instruction.
538define <2 x double> @test_2f64_fmsub_load(ptr %a0, <2 x double> %a1, <2 x double> %a2) {
539; FMA-LABEL: test_2f64_fmsub_load:
540; FMA:       # %bb.0:
541; FMA-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1
542; FMA-NEXT:    retq
543;
544; FMA4-LABEL: test_2f64_fmsub_load:
545; FMA4:       # %bb.0:
546; FMA4-NEXT:    vfmsubpd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1
547; FMA4-NEXT:    retq
548;
549; AVX512-LABEL: test_2f64_fmsub_load:
550; AVX512:       # %bb.0:
551; AVX512-NEXT:    vfmsub132pd {{.*#+}} xmm0 = (xmm0 * mem) - xmm1
552; AVX512-NEXT:    retq
553  %x = load <2 x double>, ptr %a0
554  %y = fmul <2 x double> %x, %a1
555  %res = fsub <2 x double> %y, %a2
556  ret <2 x double> %res
557}
558
559;
560; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
561;
562
; (x + 1.0) * y, with the add result as the first fmul operand.
; Runs without -enable-no-infs-fp-math expect the add and multiply to stay
; separate (vaddps from a constant-pool operand, then vmulps). Runs with
; -enable-no-infs-fp-math expect a single fused multiply-add
; (vfmadd213ps, or vfmaddps on the FMA4 runs) whose addend is xmm1.
563define <4 x float> @test_v4f32_mul_add_x_one_y(<4 x float> %x, <4 x float> %y) {
564; FMA-INFS-LABEL: test_v4f32_mul_add_x_one_y:
565; FMA-INFS:       # %bb.0:
566; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
567; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
568; FMA-INFS-NEXT:    retq
569;
570; FMA4-INFS-LABEL: test_v4f32_mul_add_x_one_y:
571; FMA4-INFS:       # %bb.0:
572; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
573; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
574; FMA4-INFS-NEXT:    retq
575;
576; AVX512-INFS-LABEL: test_v4f32_mul_add_x_one_y:
577; AVX512-INFS:       # %bb.0:
578; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
579; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
580; AVX512-INFS-NEXT:    retq
581;
582; FMA-NOINFS-LABEL: test_v4f32_mul_add_x_one_y:
583; FMA-NOINFS:       # %bb.0:
584; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
585; FMA-NOINFS-NEXT:    retq
586;
587; FMA4-NOINFS-LABEL: test_v4f32_mul_add_x_one_y:
588; FMA4-NOINFS:       # %bb.0:
589; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
590; FMA4-NOINFS-NEXT:    retq
591;
592; AVX512-NOINFS-LABEL: test_v4f32_mul_add_x_one_y:
593; AVX512-NOINFS:       # %bb.0:
594; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
595; AVX512-NOINFS-NEXT:    retq
596  %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
597  %m = fmul <4 x float> %a, %y
598  ret <4 x float> %m
599}
600
; y * (x + 1.0): the commuted form, with the add result as the second fmul
; operand. Runs without -enable-no-infs-fp-math expect vaddps then vmulps
; (with the vmulps source operands in the opposite order to the
; non-commuted test). Runs with -enable-no-infs-fp-math expect a single
; fused multiply-add (vfmadd213ps, or vfmaddps on the FMA4 runs).
601define <4 x float> @test_v4f32_mul_y_add_x_one(<4 x float> %x, <4 x float> %y) {
602; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_one:
603; FMA-INFS:       # %bb.0:
604; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
605; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
606; FMA-INFS-NEXT:    retq
607;
608; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_one:
609; FMA4-INFS:       # %bb.0:
610; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
611; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
612; FMA4-INFS-NEXT:    retq
613;
614; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_one:
615; AVX512-INFS:       # %bb.0:
616; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
617; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
618; AVX512-INFS-NEXT:    retq
619;
620; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_one:
621; FMA-NOINFS:       # %bb.0:
622; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
623; FMA-NOINFS-NEXT:    retq
624;
625; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_one:
626; FMA4-NOINFS:       # %bb.0:
627; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
628; FMA4-NOINFS-NEXT:    retq
629;
630; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_one:
631; AVX512-NOINFS:       # %bb.0:
632; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
633; AVX512-NOINFS-NEXT:    retq
634  %a = fadd <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
635  %m = fmul <4 x float> %y, %a
636  ret <4 x float> %m
637}
638
; y * (x + C) where C has 1.0 in lanes 0 and 2 and undef in lanes 1 and 3.
; The expected instruction sequences are the same as for
; test_v4f32_mul_y_add_x_one, which uses a full 1.0 splat: separate
; vaddps + vmulps without -enable-no-infs-fp-math, a single fused
; multiply-add with it.
639define <4 x float> @test_v4f32_mul_y_add_x_one_undefs(<4 x float> %x, <4 x float> %y) {
640; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
641; FMA-INFS:       # %bb.0:
642; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
643; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
644; FMA-INFS-NEXT:    retq
645;
646; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
647; FMA4-INFS:       # %bb.0:
648; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
649; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
650; FMA4-INFS-NEXT:    retq
651;
652; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
653; AVX512-INFS:       # %bb.0:
654; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
655; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
656; AVX512-INFS-NEXT:    retq
657;
658; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
659; FMA-NOINFS:       # %bb.0:
660; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
661; FMA-NOINFS-NEXT:    retq
662;
663; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
664; FMA4-NOINFS:       # %bb.0:
665; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
666; FMA4-NOINFS-NEXT:    retq
667;
668; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_one_undefs:
669; AVX512-NOINFS:       # %bb.0:
670; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
671; AVX512-NOINFS-NEXT:    retq
672  %a = fadd <4 x float> %x, <float 1.0, float undef, float 1.0, float undef>
673  %m = fmul <4 x float> %y, %a
674  ret <4 x float> %m
675}
676
; (x + -1.0) * y, with the add result as the first fmul operand.
; Runs without -enable-no-infs-fp-math expect the add and multiply to stay
; separate (vaddps from a constant-pool operand, then vmulps). Runs with
; -enable-no-infs-fp-math expect a single fused multiply-subtract
; (vfmsub213ps, or vfmsubps on the FMA4 runs) whose subtrahend is xmm1.
677define <4 x float> @test_v4f32_mul_add_x_negone_y(<4 x float> %x, <4 x float> %y) {
678; FMA-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
679; FMA-INFS:       # %bb.0:
680; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
681; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
682; FMA-INFS-NEXT:    retq
683;
684; FMA4-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
685; FMA4-INFS:       # %bb.0:
686; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
687; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
688; FMA4-INFS-NEXT:    retq
689;
690; AVX512-INFS-LABEL: test_v4f32_mul_add_x_negone_y:
691; AVX512-INFS:       # %bb.0:
692; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
693; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
694; AVX512-INFS-NEXT:    retq
695;
696; FMA-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y:
697; FMA-NOINFS:       # %bb.0:
698; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
699; FMA-NOINFS-NEXT:    retq
700;
701; FMA4-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y:
702; FMA4-NOINFS:       # %bb.0:
703; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1
704; FMA4-NOINFS-NEXT:    retq
705;
706; AVX512-NOINFS-LABEL: test_v4f32_mul_add_x_negone_y:
707; AVX512-NOINFS:       # %bb.0:
708; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
709; AVX512-NOINFS-NEXT:    retq
710  %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
711  %m = fmul <4 x float> %a, %y
712  ret <4 x float> %m
713}
714
; y * (x + -1.0): the commuted form, with the add result as the second fmul
; operand. Runs without -enable-no-infs-fp-math expect vaddps then vmulps.
; Runs with -enable-no-infs-fp-math expect a single fused multiply-subtract
; (vfmsub213ps, or vfmsubps on the FMA4 runs).
715define <4 x float> @test_v4f32_mul_y_add_x_negone(<4 x float> %x, <4 x float> %y) {
716; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_negone:
717; FMA-INFS:       # %bb.0:
718; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
719; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
720; FMA-INFS-NEXT:    retq
721;
722; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_negone:
723; FMA4-INFS:       # %bb.0:
724; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
725; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
726; FMA4-INFS-NEXT:    retq
727;
728; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_negone:
729; AVX512-INFS:       # %bb.0:
730; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
731; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
732; AVX512-INFS-NEXT:    retq
733;
734; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone:
735; FMA-NOINFS:       # %bb.0:
736; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
737; FMA-NOINFS-NEXT:    retq
738;
739; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone:
740; FMA4-NOINFS:       # %bb.0:
741; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1
742; FMA4-NOINFS-NEXT:    retq
743;
744; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone:
745; AVX512-NOINFS:       # %bb.0:
746; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
747; AVX512-NOINFS-NEXT:    retq
748  %a = fadd <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
749  %m = fmul <4 x float> %y, %a
750  ret <4 x float> %m
751}
752
; y * (x + C) where C has -1.0 in lanes 1 and 3 and undef in lanes 0 and 2.
; The expected instruction sequences are the same as for
; test_v4f32_mul_y_add_x_negone, which uses a full -1.0 splat: separate
; vaddps + vmulps without -enable-no-infs-fp-math, a single fused
; multiply-subtract with it.
753define <4 x float> @test_v4f32_mul_y_add_x_negone_undefs(<4 x float> %x, <4 x float> %y) {
754; FMA-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
755; FMA-INFS:       # %bb.0:
756; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
757; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
758; FMA-INFS-NEXT:    retq
759;
760; FMA4-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
761; FMA4-INFS:       # %bb.0:
762; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
763; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
764; FMA4-INFS-NEXT:    retq
765;
766; AVX512-INFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
767; AVX512-INFS:       # %bb.0:
768; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
769; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
770; AVX512-INFS-NEXT:    retq
771;
772; FMA-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
773; FMA-NOINFS:       # %bb.0:
774; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
775; FMA-NOINFS-NEXT:    retq
776;
777; FMA4-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
778; FMA4-NOINFS:       # %bb.0:
779; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1
780; FMA4-NOINFS-NEXT:    retq
781;
782; AVX512-NOINFS-LABEL: test_v4f32_mul_y_add_x_negone_undefs:
783; AVX512-NOINFS:       # %bb.0:
784; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
785; AVX512-NOINFS-NEXT:    retq
786  %a = fadd <4 x float> %x, <float undef, float -1.0, float undef, float -1.0>
787  %m = fmul <4 x float> %y, %a
788  ret <4 x float> %m
789}
790
; (1.0 - x) * y, with the subtract result as the first fmul operand.
; Runs without -enable-no-infs-fp-math expect three instructions: a
; vbroadcastss materializing the 1.0 splat, vsubps, then vmulps. Runs with
; -enable-no-infs-fp-math expect a single negated fused multiply-add
; (vfnmadd213ps, or vfnmaddps on the FMA4 runs) whose addend is xmm1.
791define <4 x float> @test_v4f32_mul_sub_one_x_y(<4 x float> %x, <4 x float> %y) {
792; FMA-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
793; FMA-INFS:       # %bb.0:
794; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
795; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
796; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
797; FMA-INFS-NEXT:    retq
798;
799; FMA4-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
800; FMA4-INFS:       # %bb.0:
801; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
802; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
803; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
804; FMA4-INFS-NEXT:    retq
805;
806; AVX512-INFS-LABEL: test_v4f32_mul_sub_one_x_y:
807; AVX512-INFS:       # %bb.0:
808; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
809; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
810; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
811; AVX512-INFS-NEXT:    retq
812;
813; FMA-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y:
814; FMA-NOINFS:       # %bb.0:
815; FMA-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
816; FMA-NOINFS-NEXT:    retq
817;
818; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y:
819; FMA4-NOINFS:       # %bb.0:
820; FMA4-NOINFS-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
821; FMA4-NOINFS-NEXT:    retq
822;
823; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_one_x_y:
824; AVX512-NOINFS:       # %bb.0:
825; AVX512-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
826; AVX512-NOINFS-NEXT:    retq
827  %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
828  %m = fmul <4 x float> %s, %y
829  ret <4 x float> %m
830}
831
; y * (1.0 - x): the commuted form, with the subtract result as the second
; fmul operand. Runs without -enable-no-infs-fp-math expect vbroadcastss,
; vsubps, then vmulps. Runs with -enable-no-infs-fp-math expect a single
; negated fused multiply-add (vfnmadd213ps, or vfnmaddps on the FMA4 runs).
832define <4 x float> @test_v4f32_mul_y_sub_one_x(<4 x float> %x, <4 x float> %y) {
833; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
834; FMA-INFS:       # %bb.0:
835; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
836; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
837; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
838; FMA-INFS-NEXT:    retq
839;
840; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
841; FMA4-INFS:       # %bb.0:
842; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
843; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
844; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
845; FMA4-INFS-NEXT:    retq
846;
847; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x:
848; AVX512-INFS:       # %bb.0:
849; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
850; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
851; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
852; AVX512-INFS-NEXT:    retq
853;
854; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x:
855; FMA-NOINFS:       # %bb.0:
856; FMA-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
857; FMA-NOINFS-NEXT:    retq
858;
859; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x:
860; FMA4-NOINFS:       # %bb.0:
861; FMA4-NOINFS-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
862; FMA4-NOINFS-NEXT:    retq
863;
864; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x:
865; AVX512-NOINFS:       # %bb.0:
866; AVX512-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
867; AVX512-NOINFS-NEXT:    retq
868  %s = fsub <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
869  %m = fmul <4 x float> %y, %s
870  ret <4 x float> %m
871}
872
; Same computation as test_v4f32_mul_y_sub_one_x, y * (1.0 - x), except that
; lane 1 of the 1.0 constant vector is undef. The expected output is unchanged:
; the default configurations still materialize an all-1.0 splat, and the
; no-infs configurations still fold to -(x * y) + y.
873define <4 x float> @test_v4f32_mul_y_sub_one_x_undefs(<4 x float> %x, <4 x float> %y) {
874; FMA-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
875; FMA-INFS:       # %bb.0:
876; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
877; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
878; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
879; FMA-INFS-NEXT:    retq
880;
881; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
882; FMA4-INFS:       # %bb.0:
883; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
884; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
885; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
886; FMA4-INFS-NEXT:    retq
887;
888; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
889; AVX512-INFS:       # %bb.0:
890; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
891; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
892; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
893; AVX512-INFS-NEXT:    retq
894;
895; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
896; FMA-NOINFS:       # %bb.0:
897; FMA-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
898; FMA-NOINFS-NEXT:    retq
899;
900; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
901; FMA4-NOINFS:       # %bb.0:
902; FMA4-NOINFS-NEXT:    vfnmaddps {{.*#+}} xmm0 = -(xmm0 * xmm1) + xmm1
903; FMA4-NOINFS-NEXT:    retq
904;
905; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_one_x_undefs:
906; AVX512-NOINFS:       # %bb.0:
907; AVX512-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm1
908; AVX512-NOINFS-NEXT:    retq
909  %s = fsub <4 x float> <float 1.0, float undef, float 1.0, float 1.0>, %x
910  %m = fmul <4 x float> %y, %s
911  ret <4 x float> %m
912}
913
; Checks lowering of (-1.0 - x) * y. With infinities allowed the expected code
; is a -1.0 splat, vsubps and vmulps; under -enable-no-infs-fp-math a single
; negated multiply-subtract, -(x * y) - y, is expected instead.
914define <4 x float> @test_v4f32_mul_sub_negone_x_y(<4 x float> %x, <4 x float> %y) {
915; FMA-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
916; FMA-INFS:       # %bb.0:
917; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
918; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
919; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
920; FMA-INFS-NEXT:    retq
921;
922; FMA4-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
923; FMA4-INFS:       # %bb.0:
924; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
925; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
926; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
927; FMA4-INFS-NEXT:    retq
928;
929; AVX512-INFS-LABEL: test_v4f32_mul_sub_negone_x_y:
930; AVX512-INFS:       # %bb.0:
931; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
932; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
933; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
934; AVX512-INFS-NEXT:    retq
935;
936; FMA-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y:
937; FMA-NOINFS:       # %bb.0:
938; FMA-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
939; FMA-NOINFS-NEXT:    retq
940;
941; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y:
942; FMA4-NOINFS:       # %bb.0:
943; FMA4-NOINFS-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm1
944; FMA4-NOINFS-NEXT:    retq
945;
946; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_negone_x_y:
947; AVX512-NOINFS:       # %bb.0:
948; AVX512-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
949; AVX512-NOINFS-NEXT:    retq
950  %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
951  %m = fmul <4 x float> %s, %y
952  ret <4 x float> %m
953}
954
; Checks lowering of y * (-1.0 - x), i.e. test_v4f32_mul_sub_negone_x_y with the
; fmul operands commuted. The default configurations expect vsubps + vmulps
; (with the vmulps source operands swapped accordingly); the no-infs
; configurations expect the same single -(x * y) - y instruction.
955define <4 x float> @test_v4f32_mul_y_sub_negone_x(<4 x float> %x, <4 x float> %y) {
956; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
957; FMA-INFS:       # %bb.0:
958; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
959; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
960; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
961; FMA-INFS-NEXT:    retq
962;
963; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
964; FMA4-INFS:       # %bb.0:
965; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
966; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
967; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
968; FMA4-INFS-NEXT:    retq
969;
970; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x:
971; AVX512-INFS:       # %bb.0:
972; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
973; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
974; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
975; AVX512-INFS-NEXT:    retq
976;
977; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x:
978; FMA-NOINFS:       # %bb.0:
979; FMA-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
980; FMA-NOINFS-NEXT:    retq
981;
982; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x:
983; FMA4-NOINFS:       # %bb.0:
984; FMA4-NOINFS-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm1
985; FMA4-NOINFS-NEXT:    retq
986;
987; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x:
988; AVX512-NOINFS:       # %bb.0:
989; AVX512-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
990; AVX512-NOINFS-NEXT:    retq
991  %s = fsub <4 x float> <float -1.0, float -1.0, float -1.0, float -1.0>, %x
992  %m = fmul <4 x float> %y, %s
993  ret <4 x float> %m
994}
995
; Same computation as test_v4f32_mul_y_sub_negone_x, y * (-1.0 - x), except
; that lane 2 of the -1.0 constant vector is undef. The expected output is
; unchanged: an all -1.0 splat in the default configurations and a single
; -(x * y) - y instruction in the no-infs configurations.
996define <4 x float> @test_v4f32_mul_y_sub_negone_x_undefs(<4 x float> %x, <4 x float> %y) {
997; FMA-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
998; FMA-INFS:       # %bb.0:
999; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
1000; FMA-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
1001; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1002; FMA-INFS-NEXT:    retq
1003;
1004; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
1005; FMA4-INFS:       # %bb.0:
1006; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
1007; FMA4-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
1008; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1009; FMA4-INFS-NEXT:    retq
1010;
1011; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
1012; AVX512-INFS:       # %bb.0:
1013; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
1014; AVX512-INFS-NEXT:    vsubps %xmm0, %xmm2, %xmm0
1015; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1016; AVX512-INFS-NEXT:    retq
1017;
1018; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
1019; FMA-NOINFS:       # %bb.0:
1020; FMA-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
1021; FMA-NOINFS-NEXT:    retq
1022;
1023; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
1024; FMA4-NOINFS:       # %bb.0:
1025; FMA4-NOINFS-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm1
1026; FMA4-NOINFS-NEXT:    retq
1027;
1028; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_negone_x_undefs:
1029; AVX512-NOINFS:       # %bb.0:
1030; AVX512-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm1
1031; AVX512-NOINFS-NEXT:    retq
1032  %s = fsub <4 x float> <float -1.0, float -1.0, float undef, float -1.0>, %x
1033  %m = fmul <4 x float> %y, %s
1034  ret <4 x float> %m
1035}
1036
; Checks lowering of (x - 1.0) * y. With infinities allowed the subtraction is
; expected as a vaddps with a RIP-relative constant-pool operand (broadcast via
; {1to4} on AVX512; presumably the negated constant, its value is not checked),
; followed by vmulps. Under -enable-no-infs-fp-math a single multiply-subtract,
; (x * y) - y, is expected instead.
1037define <4 x float> @test_v4f32_mul_sub_x_one_y(<4 x float> %x, <4 x float> %y) {
1038; FMA-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
1039; FMA-INFS:       # %bb.0:
1040; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1041; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
1042; FMA-INFS-NEXT:    retq
1043;
1044; FMA4-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
1045; FMA4-INFS:       # %bb.0:
1046; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1047; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
1048; FMA4-INFS-NEXT:    retq
1049;
1050; AVX512-INFS-LABEL: test_v4f32_mul_sub_x_one_y:
1051; AVX512-INFS:       # %bb.0:
1052; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1053; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
1054; AVX512-INFS-NEXT:    retq
1055;
1056; FMA-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y:
1057; FMA-NOINFS:       # %bb.0:
1058; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
1059; FMA-NOINFS-NEXT:    retq
1060;
1061; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y:
1062; FMA4-NOINFS:       # %bb.0:
1063; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1
1064; FMA4-NOINFS-NEXT:    retq
1065;
1066; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_x_one_y:
1067; AVX512-NOINFS:       # %bb.0:
1068; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
1069; AVX512-NOINFS-NEXT:    retq
1070  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
1071  %m = fmul <4 x float> %s, %y
1072  ret <4 x float> %m
1073}
1074
; Checks lowering of y * (x - 1.0), i.e. test_v4f32_mul_sub_x_one_y with the
; fmul operands commuted. The default configurations expect vaddps from the
; constant pool plus vmulps; the no-infs configurations expect a single
; (x * y) - y instruction.
1075define <4 x float> @test_v4f32_mul_y_sub_x_one(<4 x float> %x, <4 x float> %y) {
1076; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_one:
1077; FMA-INFS:       # %bb.0:
1078; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1079; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1080; FMA-INFS-NEXT:    retq
1081;
1082; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_one:
1083; FMA4-INFS:       # %bb.0:
1084; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1085; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1086; FMA4-INFS-NEXT:    retq
1087;
1088; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_one:
1089; AVX512-INFS:       # %bb.0:
1090; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1091; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1092; AVX512-INFS-NEXT:    retq
1093;
1094; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one:
1095; FMA-NOINFS:       # %bb.0:
1096; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
1097; FMA-NOINFS-NEXT:    retq
1098;
1099; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one:
1100; FMA4-NOINFS:       # %bb.0:
1101; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1
1102; FMA4-NOINFS-NEXT:    retq
1103;
1104; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one:
1105; AVX512-NOINFS:       # %bb.0:
1106; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
1107; AVX512-NOINFS-NEXT:    retq
1108  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0>
1109  %m = fmul <4 x float> %y, %s
1110  ret <4 x float> %m
1111}
1112
; Same computation as test_v4f32_mul_y_sub_x_one, y * (x - 1.0), except that
; lane 3 of the 1.0 constant vector is undef. The expected output matches the
; fully-defined variant in every configuration.
1113define <4 x float> @test_v4f32_mul_y_sub_x_one_undefs(<4 x float> %x, <4 x float> %y) {
1114; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
1115; FMA-INFS:       # %bb.0:
1116; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1117; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1118; FMA-INFS-NEXT:    retq
1119;
1120; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
1121; FMA4-INFS:       # %bb.0:
1122; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1123; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1124; FMA4-INFS-NEXT:    retq
1125;
1126; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
1127; AVX512-INFS:       # %bb.0:
1128; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1129; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1130; AVX512-INFS-NEXT:    retq
1131;
1132; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
1133; FMA-NOINFS:       # %bb.0:
1134; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
1135; FMA-NOINFS-NEXT:    retq
1136;
1137; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
1138; FMA4-NOINFS:       # %bb.0:
1139; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm1
1140; FMA4-NOINFS-NEXT:    retq
1141;
1142; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_one_undefs:
1143; AVX512-NOINFS:       # %bb.0:
1144; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm1
1145; AVX512-NOINFS-NEXT:    retq
1146  %s = fsub <4 x float> %x, <float 1.0, float 1.0, float 1.0, float undef>
1147  %m = fmul <4 x float> %y, %s
1148  ret <4 x float> %m
1149}
1150
; Checks lowering of (x - (-1.0)) * y. With infinities allowed the expected
; code is a vaddps with a RIP-relative constant-pool operand (broadcast via
; {1to4} on AVX512; the constant's value is not checked) followed by vmulps.
; Under -enable-no-infs-fp-math a single multiply-add, (x * y) + y, is expected.
1151define <4 x float> @test_v4f32_mul_sub_x_negone_y(<4 x float> %x, <4 x float> %y) {
1152; FMA-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
1153; FMA-INFS:       # %bb.0:
1154; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1155; FMA-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
1156; FMA-INFS-NEXT:    retq
1157;
1158; FMA4-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
1159; FMA4-INFS:       # %bb.0:
1160; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1161; FMA4-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
1162; FMA4-INFS-NEXT:    retq
1163;
1164; AVX512-INFS-LABEL: test_v4f32_mul_sub_x_negone_y:
1165; AVX512-INFS:       # %bb.0:
1166; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1167; AVX512-INFS-NEXT:    vmulps %xmm1, %xmm0, %xmm0
1168; AVX512-INFS-NEXT:    retq
1169;
1170; FMA-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y:
1171; FMA-NOINFS:       # %bb.0:
1172; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
1173; FMA-NOINFS-NEXT:    retq
1174;
1175; FMA4-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y:
1176; FMA4-NOINFS:       # %bb.0:
1177; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
1178; FMA4-NOINFS-NEXT:    retq
1179;
1180; AVX512-NOINFS-LABEL: test_v4f32_mul_sub_x_negone_y:
1181; AVX512-NOINFS:       # %bb.0:
1182; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
1183; AVX512-NOINFS-NEXT:    retq
1184  %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
1185  %m = fmul <4 x float> %s, %y
1186  ret <4 x float> %m
1187}
1188
; Checks lowering of y * (x - (-1.0)), i.e. test_v4f32_mul_sub_x_negone_y with
; the fmul operands commuted. The default configurations expect vaddps from the
; constant pool plus vmulps; the no-infs configurations expect a single
; (x * y) + y instruction.
1189define <4 x float> @test_v4f32_mul_y_sub_x_negone(<4 x float> %x, <4 x float> %y) {
1190; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_negone:
1191; FMA-INFS:       # %bb.0:
1192; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1193; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1194; FMA-INFS-NEXT:    retq
1195;
1196; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_negone:
1197; FMA4-INFS:       # %bb.0:
1198; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1199; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1200; FMA4-INFS-NEXT:    retq
1201;
1202; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_negone:
1203; AVX512-INFS:       # %bb.0:
1204; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1205; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1206; AVX512-INFS-NEXT:    retq
1207;
1208; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone:
1209; FMA-NOINFS:       # %bb.0:
1210; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
1211; FMA-NOINFS-NEXT:    retq
1212;
1213; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone:
1214; FMA4-NOINFS:       # %bb.0:
1215; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
1216; FMA4-NOINFS-NEXT:    retq
1217;
1218; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone:
1219; AVX512-NOINFS:       # %bb.0:
1220; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
1221; AVX512-NOINFS-NEXT:    retq
1222  %s = fsub <4 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0>
1223  %m = fmul <4 x float> %y, %s
1224  ret <4 x float> %m
1225}
1226
; Same computation as test_v4f32_mul_y_sub_x_negone, y * (x - (-1.0)), except
; that lane 0 of the -1.0 constant vector is undef. The expected output matches
; the fully-defined variant in every configuration.
1227define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x float> %y) {
1228; FMA-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
1229; FMA-INFS:       # %bb.0:
1230; FMA-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1231; FMA-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1232; FMA-INFS-NEXT:    retq
1233;
1234; FMA4-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
1235; FMA4-INFS:       # %bb.0:
1236; FMA4-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1237; FMA4-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1238; FMA4-INFS-NEXT:    retq
1239;
1240; AVX512-INFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
1241; AVX512-INFS:       # %bb.0:
1242; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1243; AVX512-INFS-NEXT:    vmulps %xmm0, %xmm1, %xmm0
1244; AVX512-INFS-NEXT:    retq
1245;
1246; FMA-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
1247; FMA-NOINFS:       # %bb.0:
1248; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
1249; FMA-NOINFS-NEXT:    retq
1250;
1251; FMA4-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
1252; FMA4-NOINFS:       # %bb.0:
1253; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm1
1254; FMA4-NOINFS-NEXT:    retq
1255;
1256; AVX512-NOINFS-LABEL: test_v4f32_mul_y_sub_x_negone_undefs:
1257; AVX512-NOINFS:       # %bb.0:
1258; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
1259; AVX512-NOINFS-NEXT:    retq
1260  %s = fsub <4 x float> %x, <float undef, float -1.0, float -1.0, float -1.0>
1261  %m = fmul <4 x float> %y, %s
1262  ret <4 x float> %m
1263}
1264
1265;
1266; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
1267;
1268
; Scalar float interpolation: x * t + y * (1.0 - t), with nsz on every
; operation. With infinities allowed, (1.0 - t) and y * (1.0 - t) are expected
; as separate vsubss/vmulss and only the final add is fused. Under
; -enable-no-infs-fp-math two chained multiply-subtracts are expected:
; tmp = t * y - y, then result = x * t - tmp.
1269define float @test_f32_interp(float %x, float %y, float %t) {
1270; FMA-INFS-LABEL: test_f32_interp:
1271; FMA-INFS:       # %bb.0:
1272; FMA-INFS-NEXT:    vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
1273; FMA-INFS-NEXT:    vsubss %xmm2, %xmm3, %xmm3
1274; FMA-INFS-NEXT:    vmulss %xmm3, %xmm1, %xmm1
1275; FMA-INFS-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
1276; FMA-INFS-NEXT:    retq
1277;
1278; FMA4-INFS-LABEL: test_f32_interp:
1279; FMA4-INFS:       # %bb.0:
1280; FMA4-INFS-NEXT:    vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
1281; FMA4-INFS-NEXT:    vsubss %xmm2, %xmm3, %xmm3
1282; FMA4-INFS-NEXT:    vmulss %xmm3, %xmm1, %xmm1
1283; FMA4-INFS-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1
1284; FMA4-INFS-NEXT:    retq
1285;
1286; AVX512-INFS-LABEL: test_f32_interp:
1287; AVX512-INFS:       # %bb.0:
1288; AVX512-INFS-NEXT:    vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
1289; AVX512-INFS-NEXT:    vsubss %xmm2, %xmm3, %xmm3
1290; AVX512-INFS-NEXT:    vmulss %xmm3, %xmm1, %xmm1
1291; AVX512-INFS-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
1292; AVX512-INFS-NEXT:    retq
1293;
1294; FMA-NOINFS-LABEL: test_f32_interp:
1295; FMA-NOINFS:       # %bb.0:
1296; FMA-NOINFS-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1297; FMA-NOINFS-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
1298; FMA-NOINFS-NEXT:    retq
1299;
1300; FMA4-NOINFS-LABEL: test_f32_interp:
1301; FMA4-NOINFS:       # %bb.0:
1302; FMA4-NOINFS-NEXT:    vfmsubss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1303; FMA4-NOINFS-NEXT:    vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1
1304; FMA4-NOINFS-NEXT:    retq
1305;
1306; AVX512-NOINFS-LABEL: test_f32_interp:
1307; AVX512-NOINFS:       # %bb.0:
1308; AVX512-NOINFS-NEXT:    vfmsub213ss {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1309; AVX512-NOINFS-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
1310; AVX512-NOINFS-NEXT:    retq
1311  %t1 = fsub nsz float 1.0, %t
1312  %tx = fmul nsz float %x, %t
1313  %ty = fmul nsz float %y, %t1
1314  %r = fadd nsz float %tx, %ty
1315  ret float %r
1316}
1317
; <4 x float> interpolation: x * t + y * (1.0 - t), with nsz on every
; operation. The default configurations expect a 1.0 splat, vsubps, vmulps and
; one fused multiply-add; the no-infs configurations expect two chained
; multiply-subtracts (t * y - y, then x * t minus that result).
1318define <4 x float> @test_v4f32_interp(<4 x float> %x, <4 x float> %y, <4 x float> %t) {
1319; FMA-INFS-LABEL: test_v4f32_interp:
1320; FMA-INFS:       # %bb.0:
1321; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1322; FMA-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
1323; FMA-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
1324; FMA-INFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
1325; FMA-INFS-NEXT:    retq
1326;
1327; FMA4-INFS-LABEL: test_v4f32_interp:
1328; FMA4-INFS:       # %bb.0:
1329; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1330; FMA4-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
1331; FMA4-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
1332; FMA4-INFS-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1
1333; FMA4-INFS-NEXT:    retq
1334;
1335; AVX512-INFS-LABEL: test_v4f32_interp:
1336; AVX512-INFS:       # %bb.0:
1337; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1338; AVX512-INFS-NEXT:    vsubps %xmm2, %xmm3, %xmm3
1339; AVX512-INFS-NEXT:    vmulps %xmm3, %xmm1, %xmm1
1340; AVX512-INFS-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
1341; AVX512-INFS-NEXT:    retq
1342;
1343; FMA-NOINFS-LABEL: test_v4f32_interp:
1344; FMA-NOINFS:       # %bb.0:
1345; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1346; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
1347; FMA-NOINFS-NEXT:    retq
1348;
1349; FMA4-NOINFS-LABEL: test_v4f32_interp:
1350; FMA4-NOINFS:       # %bb.0:
1351; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1352; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1
1353; FMA4-NOINFS-NEXT:    retq
1354;
1355; AVX512-NOINFS-LABEL: test_v4f32_interp:
1356; AVX512-NOINFS:       # %bb.0:
1357; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1358; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
1359; AVX512-NOINFS-NEXT:    retq
1360  %t1 = fsub nsz <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %t
1361  %tx = fmul nsz <4 x float> %x, %t
1362  %ty = fmul nsz <4 x float> %y, %t1
1363  %r = fadd nsz <4 x float> %tx, %ty
1364  ret <4 x float> %r
1365}
1366
; <8 x float> interpolation: x * t + y * (1.0 - t), with nsz on every
; operation; the expected code uses ymm registers. The default configurations
; expect a 1.0 splat, vsubps, vmulps and one fused multiply-add; the no-infs
; configurations expect two chained multiply-subtracts.
1367define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float> %t) {
1368; FMA-INFS-LABEL: test_v8f32_interp:
1369; FMA-INFS:       # %bb.0:
1370; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1371; FMA-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
1372; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
1373; FMA-INFS-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
1374; FMA-INFS-NEXT:    retq
1375;
1376; FMA4-INFS-LABEL: test_v8f32_interp:
1377; FMA4-INFS:       # %bb.0:
1378; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1379; FMA4-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
1380; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
1381; FMA4-INFS-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1
1382; FMA4-INFS-NEXT:    retq
1383;
1384; AVX512-INFS-LABEL: test_v8f32_interp:
1385; AVX512-INFS:       # %bb.0:
1386; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1387; AVX512-INFS-NEXT:    vsubps %ymm2, %ymm3, %ymm3
1388; AVX512-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
1389; AVX512-INFS-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
1390; AVX512-INFS-NEXT:    retq
1391;
1392; FMA-NOINFS-LABEL: test_v8f32_interp:
1393; FMA-NOINFS:       # %bb.0:
1394; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1
1395; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1
1396; FMA-NOINFS-NEXT:    retq
1397;
1398; FMA4-NOINFS-LABEL: test_v8f32_interp:
1399; FMA4-NOINFS:       # %bb.0:
1400; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1
1401; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1
1402; FMA4-NOINFS-NEXT:    retq
1403;
1404; AVX512-NOINFS-LABEL: test_v8f32_interp:
1405; AVX512-NOINFS:       # %bb.0:
1406; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1
1407; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1
1408; AVX512-NOINFS-NEXT:    retq
1409  %t1 = fsub nsz <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
1410  %tx = fmul nsz <8 x float> %x, %t
1411  %ty = fmul nsz <8 x float> %y, %t1
1412  %r = fadd nsz <8 x float> %tx, %ty
1413  ret <8 x float> %r
1414}
1415
; Scalar double interpolation: x * t + y * (1.0 - t), with nsz on every
; operation. The default configurations expect vsubsd, vmulsd and one fused
; multiply-add; the no-infs configurations expect two chained
; multiply-subtracts (t * y - y, then x * t minus that result).
1416define double @test_f64_interp(double %x, double %y, double %t) {
1417; FMA-INFS-LABEL: test_f64_interp:
1418; FMA-INFS:       # %bb.0:
1419; FMA-INFS-NEXT:    vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0]
1420; FMA-INFS-NEXT:    vsubsd %xmm2, %xmm3, %xmm3
1421; FMA-INFS-NEXT:    vmulsd %xmm3, %xmm1, %xmm1
1422; FMA-INFS-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
1423; FMA-INFS-NEXT:    retq
1424;
1425; FMA4-INFS-LABEL: test_f64_interp:
1426; FMA4-INFS:       # %bb.0:
1427; FMA4-INFS-NEXT:    vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0]
1428; FMA4-INFS-NEXT:    vsubsd %xmm2, %xmm3, %xmm3
1429; FMA4-INFS-NEXT:    vmulsd %xmm3, %xmm1, %xmm1
1430; FMA4-INFS-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1
1431; FMA4-INFS-NEXT:    retq
1432;
1433; AVX512-INFS-LABEL: test_f64_interp:
1434; AVX512-INFS:       # %bb.0:
1435; AVX512-INFS-NEXT:    vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0]
1436; AVX512-INFS-NEXT:    vsubsd %xmm2, %xmm3, %xmm3
1437; AVX512-INFS-NEXT:    vmulsd %xmm3, %xmm1, %xmm1
1438; AVX512-INFS-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
1439; AVX512-INFS-NEXT:    retq
1440;
1441; FMA-NOINFS-LABEL: test_f64_interp:
1442; FMA-NOINFS:       # %bb.0:
1443; FMA-NOINFS-NEXT:    vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1444; FMA-NOINFS-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
1445; FMA-NOINFS-NEXT:    retq
1446;
1447; FMA4-NOINFS-LABEL: test_f64_interp:
1448; FMA4-NOINFS:       # %bb.0:
1449; FMA4-NOINFS-NEXT:    vfmsubsd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1450; FMA4-NOINFS-NEXT:    vfmsubsd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1
1451; FMA4-NOINFS-NEXT:    retq
1452;
1453; AVX512-NOINFS-LABEL: test_f64_interp:
1454; AVX512-NOINFS:       # %bb.0:
1455; AVX512-NOINFS-NEXT:    vfmsub213sd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1456; AVX512-NOINFS-NEXT:    vfmsub213sd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
1457; AVX512-NOINFS-NEXT:    retq
1458  %t1 = fsub nsz double 1.0, %t
1459  %tx = fmul nsz double %x, %t
1460  %ty = fmul nsz double %y, %t1
1461  %r = fadd nsz double %tx, %ty
1462  ret double %r
1463}
1464
; <2 x double> interpolation: x * t + y * (1.0 - t), with nsz on every
; operation. The default configurations expect the 1.0 splat to be loaded with
; vmovddup, followed by vsubpd, vmulpd and one fused multiply-add; the no-infs
; configurations expect two chained multiply-subtracts.
1465define <2 x double> @test_v2f64_interp(<2 x double> %x, <2 x double> %y, <2 x double> %t) {
1466; FMA-INFS-LABEL: test_v2f64_interp:
1467; FMA-INFS:       # %bb.0:
1468; FMA-INFS-NEXT:    vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
1469; FMA-INFS-NEXT:    # xmm3 = mem[0,0]
1470; FMA-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
1471; FMA-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
1472; FMA-INFS-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
1473; FMA-INFS-NEXT:    retq
1474;
1475; FMA4-INFS-LABEL: test_v2f64_interp:
1476; FMA4-INFS:       # %bb.0:
1477; FMA4-INFS-NEXT:    vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
1478; FMA4-INFS-NEXT:    # xmm3 = mem[0,0]
1479; FMA4-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
1480; FMA4-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
1481; FMA4-INFS-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1
1482; FMA4-INFS-NEXT:    retq
1483;
1484; AVX512-INFS-LABEL: test_v2f64_interp:
1485; AVX512-INFS:       # %bb.0:
1486; AVX512-INFS-NEXT:    vmovddup {{.*#+}} xmm3 = [1.0E+0,1.0E+0]
1487; AVX512-INFS-NEXT:    # xmm3 = mem[0,0]
1488; AVX512-INFS-NEXT:    vsubpd %xmm2, %xmm3, %xmm3
1489; AVX512-INFS-NEXT:    vmulpd %xmm3, %xmm1, %xmm1
1490; AVX512-INFS-NEXT:    vfmadd213pd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1
1491; AVX512-INFS-NEXT:    retq
1492;
1493; FMA-NOINFS-LABEL: test_v2f64_interp:
1494; FMA-NOINFS:       # %bb.0:
1495; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1496; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
1497; FMA-NOINFS-NEXT:    retq
1498;
1499; FMA4-NOINFS-LABEL: test_v2f64_interp:
1500; FMA4-NOINFS:       # %bb.0:
1501; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1502; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} xmm0 = (xmm0 * xmm2) - xmm1
1503; FMA4-NOINFS-NEXT:    retq
1504;
1505; AVX512-NOINFS-LABEL: test_v2f64_interp:
1506; AVX512-NOINFS:       # %bb.0:
1507; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} xmm1 = (xmm2 * xmm1) - xmm1
1508; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} xmm0 = (xmm2 * xmm0) - xmm1
1509; AVX512-NOINFS-NEXT:    retq
1510  %t1 = fsub nsz <2 x double> <double 1.0, double 1.0>, %t
1511  %tx = fmul nsz <2 x double> %x, %t
1512  %ty = fmul nsz <2 x double> %y, %t1
1513  %r = fadd nsz <2 x double> %tx, %ty
1514  ret <2 x double> %r
1515}
1516
; <4 x double> interpolation: x * t + y * (1.0 - t), with nsz on every
; operation; the expected code uses ymm registers. The default configurations
; expect a vbroadcastsd 1.0 splat, vsubpd, vmulpd and one fused multiply-add;
; the no-infs configurations expect two chained multiply-subtracts.
1517define <4 x double> @test_v4f64_interp(<4 x double> %x, <4 x double> %y, <4 x double> %t) {
1518; FMA-INFS-LABEL: test_v4f64_interp:
1519; FMA-INFS:       # %bb.0:
1520; FMA-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1521; FMA-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
1522; FMA-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
1523; FMA-INFS-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
1524; FMA-INFS-NEXT:    retq
1525;
1526; FMA4-INFS-LABEL: test_v4f64_interp:
1527; FMA4-INFS:       # %bb.0:
1528; FMA4-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1529; FMA4-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
1530; FMA4-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
1531; FMA4-INFS-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm1
1532; FMA4-INFS-NEXT:    retq
1533;
1534; AVX512-INFS-LABEL: test_v4f64_interp:
1535; AVX512-INFS:       # %bb.0:
1536; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
1537; AVX512-INFS-NEXT:    vsubpd %ymm2, %ymm3, %ymm3
1538; AVX512-INFS-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
1539; AVX512-INFS-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm1
1540; AVX512-INFS-NEXT:    retq
1541;
1542; FMA-NOINFS-LABEL: test_v4f64_interp:
1543; FMA-NOINFS:       # %bb.0:
1544; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1
1545; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1
1546; FMA-NOINFS-NEXT:    retq
1547;
1548; FMA4-NOINFS-LABEL: test_v4f64_interp:
1549; FMA4-NOINFS:       # %bb.0:
1550; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1
1551; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm1
1552; FMA4-NOINFS-NEXT:    retq
1553;
1554; AVX512-NOINFS-LABEL: test_v4f64_interp:
1555; AVX512-NOINFS:       # %bb.0:
1556; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm1 = (ymm2 * ymm1) - ymm1
1557; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm1
1558; AVX512-NOINFS-NEXT:    retq
1559  %t1 = fsub nsz <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %t
1560  %tx = fmul nsz <4 x double> %x, %t
1561  %ty = fmul nsz <4 x double> %y, %t1
1562  %r = fadd nsz <4 x double> %tx, %ty
1563  ret <4 x double> %r
1564}
1565
1566;
1567; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z)
1568;
1569
; Checks that -(a0 * a1 + a2) becomes a single negated multiply-subtract,
; -(a0 * a1) - a2. The negation is written as an fsub from a -0.0 splat and all
; operations carry nsz. The shared check prefixes are used, so the same output
; is expected with and without -enable-no-infs-fp-math.
; NOTE(review): attribute group #0 is defined outside this part of the file;
; its contents are not visible here.
1570define <4 x float> @test_v4f32_fneg_fmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
1571; FMA-LABEL: test_v4f32_fneg_fmadd:
1572; FMA:       # %bb.0:
1573; FMA-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
1574; FMA-NEXT:    retq
1575;
1576; FMA4-LABEL: test_v4f32_fneg_fmadd:
1577; FMA4:       # %bb.0:
1578; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
1579; FMA4-NEXT:    retq
1580;
1581; AVX512-LABEL: test_v4f32_fneg_fmadd:
1582; AVX512:       # %bb.0:
1583; AVX512-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
1584; AVX512-NEXT:    retq
1585  %mul = fmul nsz <4 x float> %a0, %a1
1586  %add = fadd nsz <4 x float> %mul, %a2
1587  %neg = fsub nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
1588  ret <4 x float> %neg
1589}
1590
; Checks that -(a0 * a1 - a2) becomes a single negated multiply-add,
; -(a0 * a1) + a2, on <4 x double> (ymm registers). The negation is written as
; an fsub from a -0.0 splat and all operations carry nsz. The shared check
; prefixes are used, so the output is expected in every configuration.
; NOTE(review): attribute group #0 is defined outside this part of the file;
; its contents are not visible here.
1591define <4 x double> @test_v4f64_fneg_fmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
1592; FMA-LABEL: test_v4f64_fneg_fmsub:
1593; FMA:       # %bb.0:
1594; FMA-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
1595; FMA-NEXT:    retq
1596;
1597; FMA4-LABEL: test_v4f64_fneg_fmsub:
1598; FMA4:       # %bb.0:
1599; FMA4-NEXT:    vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm1) + ymm2
1600; FMA4-NEXT:    retq
1601;
1602; AVX512-LABEL: test_v4f64_fneg_fmsub:
1603; AVX512:       # %bb.0:
1604; AVX512-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
1605; AVX512-NEXT:    retq
1606  %mul = fmul nsz <4 x double> %a0, %a1
1607  %sub = fsub nsz <4 x double> %mul, %a2
1608  %neg = fsub nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
1609  ret <4 x double> %neg
1610}
1611
; Negation of a negated-multiply-add on <4 x float>: the IR computes
; -((-(a0 * a1)) + a2). Both negations are written as (fsub -0.0, x). Every
; check group expects exactly one fused multiply-subtract, (a0 * a1) - a2.
1612define <4 x float> @test_v4f32_fneg_fnmadd(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) #0 {
1613; FMA-LABEL: test_v4f32_fneg_fnmadd:
1614; FMA:       # %bb.0:
1615; FMA-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
1616; FMA-NEXT:    retq
1617;
1618; FMA4-LABEL: test_v4f32_fneg_fnmadd:
1619; FMA4:       # %bb.0:
1620; FMA4-NEXT:    vfmsubps {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
1621; FMA4-NEXT:    retq
1622;
1623; AVX512-LABEL: test_v4f32_fneg_fnmadd:
1624; AVX512:       # %bb.0:
1625; AVX512-NEXT:    vfmsub213ps {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
1626; AVX512-NEXT:    retq
1627  %mul = fmul nsz <4 x float> %a0, %a1
; First negation: applied to the product.
1628  %neg0 = fsub nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %mul
1629  %add = fadd nsz <4 x float> %neg0, %a2
; Second negation: applied to the whole sum.
1630  %neg1 = fsub nsz <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %add
1631  ret <4 x float> %neg1
1632}
1633
; Negation of a negated-multiply-subtract on <4 x double>: the IR computes
; -((-(a0 * a1)) - a2). Both negations are written as (fsub -0.0, x). Every
; check group expects exactly one fused multiply-add, (a0 * a1) + a2.
1634define <4 x double> @test_v4f64_fneg_fnmsub(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) #0 {
1635; FMA-LABEL: test_v4f64_fneg_fnmsub:
1636; FMA:       # %bb.0:
1637; FMA-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
1638; FMA-NEXT:    retq
1639;
1640; FMA4-LABEL: test_v4f64_fneg_fnmsub:
1641; FMA4:       # %bb.0:
1642; FMA4-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm1) + ymm2
1643; FMA4-NEXT:    retq
1644;
1645; AVX512-LABEL: test_v4f64_fneg_fnmsub:
1646; AVX512:       # %bb.0:
1647; AVX512-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
1648; AVX512-NEXT:    retq
1649  %mul = fmul nsz  <4 x double> %a0, %a1
; First negation: applied to the product.
1650  %neg0 = fsub nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %mul
1651  %sub = fsub nsz <4 x double> %neg0, %a2
; Second negation: applied to the whole difference.
1652  %neg1 = fsub nsz <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %sub
1653  ret <4 x double> %neg1
1654}
1655
1656;
1657; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
1658;
1659
; Sum of two products of the same vector %x with different constant vectors:
; x * <1,2,3,4> + x * <4,3,2,1>. Every check group expects this to collapse to
; one vmulps of %x by a constant-pool operand, with no fused multiply-add.
; The per-lane constant sums are all 5.0, which is consistent with the AVX512
; checks using a {1to4} broadcast memory operand.
1660define <4 x float> @test_v4f32_fma_x_c1_fmul_x_c2(<4 x float> %x) #0 {
1661; FMA-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
1662; FMA:       # %bb.0:
1663; FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1664; FMA-NEXT:    retq
1665;
1666; FMA4-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
1667; FMA4:       # %bb.0:
1668; FMA4-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1669; FMA4-NEXT:    retq
1670;
1671; AVX512-LABEL: test_v4f32_fma_x_c1_fmul_x_c2:
1672; AVX512:       # %bb.0:
1673; AVX512-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1674; AVX512-NEXT:    retq
1675  %m0 = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
1676  %m1 = fmul <4 x float> %x, <float 4.0, float 3.0, float 2.0, float 1.0>
1677  %a  = fadd <4 x float> %m0, %m1
1678  ret <4 x float> %a
1679}
1680
1681;
1682; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
1683;
1684
; Two chained constant multiplies followed by an add:
; ((x * <1,2,3,4>) * <4,3,2,1>) + y. Every check group expects one fused
; multiply-add of %x with a single memory operand plus %y, i.e. the two
; constant multiplies are expected to be merged into one constant.
1685define <4 x float> @test_v4f32_fma_fmul_x_c1_c2_y(<4 x float> %x, <4 x float> %y) #0 {
1686; FMA-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
1687; FMA:       # %bb.0:
1688; FMA-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
1689; FMA-NEXT:    retq
1690;
1691; FMA4-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
1692; FMA4:       # %bb.0:
1693; FMA4-NEXT:    vfmaddps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
1694; FMA4-NEXT:    retq
1695;
1696; AVX512-LABEL: test_v4f32_fma_fmul_x_c1_c2_y:
1697; AVX512:       # %bb.0:
1698; AVX512-NEXT:    vfmadd132ps {{.*#+}} xmm0 = (xmm0 * mem) + xmm1
1699; AVX512-NEXT:    retq
1700  %m0 = fmul <4 x float> %x,  <float 1.0, float 2.0, float 3.0, float 4.0>
1701  %m1 = fmul <4 x float> %m0, <float 4.0, float 3.0, float 2.0, float 1.0>
1702  %a  = fadd <4 x float> %m1, %y
1703  ret <4 x float> %a
1704}
1705
1706; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0)
1707
; Scalar double negated product: -(x * y), where only the fmul carries nsz and
; the negation is written as (fsub -0.0, %m). Every check group expects a
; register zeroed with vxorpd, then one fused negate-multiply-subtract that
; computes -(x * y) - 0.
1708define double @test_f64_fneg_fmul(double %x, double %y) #0 {
1709; FMA-LABEL: test_f64_fneg_fmul:
1710; FMA:       # %bb.0:
1711; FMA-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1712; FMA-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
1713; FMA-NEXT:    retq
1714;
1715; FMA4-LABEL: test_f64_fneg_fmul:
1716; FMA4:       # %bb.0:
1717; FMA4-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1718; FMA4-NEXT:    vfnmsubsd {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
1719; FMA4-NEXT:    retq
1720;
1721; AVX512-LABEL: test_f64_fneg_fmul:
1722; AVX512:       # %bb.0:
1723; AVX512-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1724; AVX512-NEXT:    vfnmsub213sd {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
1725; AVX512-NEXT:    retq
1726  %m = fmul nsz double %x, %y
; The negation itself has no fast-math flags; only the multiply above has nsz.
1727  %n = fsub double -0.0, %m
1728  ret double %n
1729}
1730
; <4 x float> negated product: -(x * y), where only the fmul carries nsz and
; the negation is written as (fsub -0.0, %m). Every check group expects a
; register zeroed with vxorps, then one fused negate-multiply-subtract that
; computes -(x * y) - 0.
1731define <4 x float> @test_v4f32_fneg_fmul(<4 x float> %x, <4 x float> %y) #0 {
1732; FMA-LABEL: test_v4f32_fneg_fmul:
1733; FMA:       # %bb.0:
1734; FMA-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1735; FMA-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
1736; FMA-NEXT:    retq
1737;
1738; FMA4-LABEL: test_v4f32_fneg_fmul:
1739; FMA4:       # %bb.0:
1740; FMA4-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1741; FMA4-NEXT:    vfnmsubps {{.*#+}} xmm0 = -(xmm0 * xmm1) - xmm2
1742; FMA4-NEXT:    retq
1743;
1744; AVX512-LABEL: test_v4f32_fneg_fmul:
1745; AVX512:       # %bb.0:
1746; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1747; AVX512-NEXT:    vfnmsub213ps {{.*#+}} xmm0 = -(xmm1 * xmm0) - xmm2
1748; AVX512-NEXT:    retq
1749  %m = fmul nsz <4 x float> %x, %y
; The negation itself has no fast-math flags; only the multiply above has nsz.
1750  %n = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %m
1751  ret <4 x float> %n
1752}
1753
; <4 x double> negated product: -(x * y), where only the fmul carries nsz and
; the negation is written as (fsub -0.0, %m). Every check group expects a
; vxorpd that zeroes xmm2, then one fused negate-multiply-subtract on ymm
; registers that computes -(x * y) - ymm2.
1754define <4 x double> @test_v4f64_fneg_fmul(<4 x double> %x, <4 x double> %y) #0 {
1755; FMA-LABEL: test_v4f64_fneg_fmul:
1756; FMA:       # %bb.0:
1757; FMA-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1758; FMA-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
1759; FMA-NEXT:    retq
1760;
1761; FMA4-LABEL: test_v4f64_fneg_fmul:
1762; FMA4:       # %bb.0:
1763; FMA4-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1764; FMA4-NEXT:    vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm1) - ymm2
1765; FMA4-NEXT:    retq
1766;
1767; AVX512-LABEL: test_v4f64_fneg_fmul:
1768; AVX512:       # %bb.0:
1769; AVX512-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1770; AVX512-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm1 * ymm0) - ymm2
1771; AVX512-NEXT:    retq
1772  %m = fmul nsz <4 x double> %x, %y
; The negation itself has no fast-math flags; only the multiply above has nsz.
1773  %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
1774  ret <4 x double> %n
1775}
1776
; Same -(x * y) shape as test_v4f64_fneg_fmul, but the fmul here has no nsz
; flag. Every check group expects no fused instruction: a plain vmulpd followed
; by a vxorpd with a constant-pool operand (presumably the sign-bit mask that
; performs the negation; the constant's value is not visible in the checks).
1777define <4 x double> @test_v4f64_fneg_fmul_no_nsz(<4 x double> %x, <4 x double> %y) #0 {
1778; FMA-LABEL: test_v4f64_fneg_fmul_no_nsz:
1779; FMA:       # %bb.0:
1780; FMA-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
1781; FMA-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1782; FMA-NEXT:    retq
1783;
1784; FMA4-LABEL: test_v4f64_fneg_fmul_no_nsz:
1785; FMA4:       # %bb.0:
1786; FMA4-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
1787; FMA4-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1788; FMA4-NEXT:    retq
1789;
1790; AVX512-LABEL: test_v4f64_fneg_fmul_no_nsz:
1791; AVX512:       # %bb.0:
1792; AVX512-NEXT:    vmulpd %ymm1, %ymm0, %ymm0
1793; AVX512-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm0, %ymm0
1794; AVX512-NEXT:    retq
1795  %m = fmul <4 x double> %x, %y
1796  %n = fsub <4 x double> <double -0.0, double -0.0, double -0.0, double -0.0>, %m
1797  ret <4 x double> %n
1798}
1799
1800; ((a*b) + (c*d)) + n1 --> (a*b) + ((c*d) + n1)
1801
; Scalar double ((a * b) + (c * d)) + n1 with the 'fast' flag on every
; operation. Every check group expects two chained fused multiply-adds and no
; standalone multiply or add: first (c * d) + n1, then (a * b) + that result.
1802define double @fadd_fma_fmul_1(double %a, double %b, double %c, double %d, double %n1) nounwind {
1803; FMA-LABEL: fadd_fma_fmul_1:
1804; FMA:       # %bb.0:
1805; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4
1806; FMA-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
1807; FMA-NEXT:    retq
1808;
1809; FMA4-LABEL: fadd_fma_fmul_1:
1810; FMA4:       # %bb.0:
1811; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm2 = (xmm2 * xmm3) + xmm4
1812; FMA4-NEXT:    vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
1813; FMA4-NEXT:    retq
1814;
1815; AVX512-LABEL: fadd_fma_fmul_1:
1816; AVX512:       # %bb.0:
1817; AVX512-NEXT:    vfmadd213sd {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4
1818; AVX512-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
1819; AVX512-NEXT:    retq
1820  %m1 = fmul fast double %a, %b
1821  %m2 = fmul fast double %c, %d
1822  %a1 = fadd fast double %m1, %m2
1823  %a2 = fadd fast double %a1, %n1
1824  ret double %a2
1825}
1826
1827; Minimum FMF - the 1st fadd is contracted because that combines
1828; fmul+fadd as specified by the order of operations; the 2nd fadd
1829; requires reassociation to fuse with c*d.
1830
; Scalar float n0 + ((a * b) + (c * d)) using only per-instruction flags: the
; fmuls carry none, the inner fadd has 'contract', and the outer fadd has
; 'reassoc'. Every check group still expects two chained fused multiply-adds:
; first (c * d) + n0, then (a * b) + that result.
1831define float @fadd_fma_fmul_fmf(float %a, float %b, float %c, float %d, float %n0) nounwind {
1832; FMA-LABEL: fadd_fma_fmul_fmf:
1833; FMA:       # %bb.0:
1834; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4
1835; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
1836; FMA-NEXT:    retq
1837;
1838; FMA4-LABEL: fadd_fma_fmul_fmf:
1839; FMA4:       # %bb.0:
1840; FMA4-NEXT:    vfmaddss {{.*#+}} xmm2 = (xmm2 * xmm3) + xmm4
1841; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
1842; FMA4-NEXT:    retq
1843;
1844; AVX512-LABEL: fadd_fma_fmul_fmf:
1845; AVX512:       # %bb.0:
1846; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm4
1847; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
1848; AVX512-NEXT:    retq
1849  %m1 = fmul float %a, %b
1850  %m2 = fmul float %c, %d
1851  %a1 = fadd contract float %m1, %m2
1852  %a2 = fadd reassoc float %n0, %a1
1853  ret float %a2
1854}
1855
1856; Not minimum FMF - the 2nd fadd has 'contract' but lacks 'reassoc', so it is not reassociated to fuse with c*d.
1857
; Same expression shape as fadd_fma_fmul_fmf, except the outer fadd carries
; 'contract' where that test uses 'reassoc'. Every check group expects only
; one fusion: a standalone vmulss for c * d, a fused multiply-add for
; (a * b) + (c * d), and a separate vaddss that adds n0.
1858define float @fadd_fma_fmul_2(float %a, float %b, float %c, float %d, float %n0) nounwind {
1859; FMA-LABEL: fadd_fma_fmul_2:
1860; FMA:       # %bb.0:
1861; FMA-NEXT:    vmulss %xmm3, %xmm2, %xmm2
1862; FMA-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
1863; FMA-NEXT:    vaddss %xmm2, %xmm4, %xmm0
1864; FMA-NEXT:    retq
1865;
1866; FMA4-LABEL: fadd_fma_fmul_2:
1867; FMA4:       # %bb.0:
1868; FMA4-NEXT:    vmulss %xmm3, %xmm2, %xmm2
1869; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
1870; FMA4-NEXT:    vaddss %xmm0, %xmm4, %xmm0
1871; FMA4-NEXT:    retq
1872;
1873; AVX512-LABEL: fadd_fma_fmul_2:
1874; AVX512:       # %bb.0:
1875; AVX512-NEXT:    vmulss %xmm3, %xmm2, %xmm2
1876; AVX512-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
1877; AVX512-NEXT:    vaddss %xmm2, %xmm4, %xmm0
1878; AVX512-NEXT:    retq
1879  %m1 = fmul float %a, %b
1880  %m2 = fmul float %c, %d
1881  %a1 = fadd contract float %m1, %m2
1882  %a2 = fadd contract float %n0, %a1
1883  ret float %a2
1884}
1885
1886; The final fadd can be folded with either one of the leading fmuls.
1887
; Sum of four <2 x double> products, ((x1*x2) + (x3*x4)) + ((x5*x6) + (x7*x8)),
; with 'fast' on every operation. Every check group expects one standalone
; vmulpd (x3 * x4) followed by three fused multiply-adds that accumulate
; x1*x2, then x7*x8, then x5*x6. The 231-form sequences accumulate in xmm2 and
; copy it to xmm0 with a final vmovapd; the FMA4 sequence accumulates in xmm0
; directly and has no such move.
1888define <2 x double> @fadd_fma_fmul_3(<2 x double> %x1, <2 x double> %x2, <2 x double> %x3, <2 x double> %x4, <2 x double> %x5, <2 x double> %x6, <2 x double> %x7, <2 x double> %x8) nounwind {
1889; FMA-LABEL: fadd_fma_fmul_3:
1890; FMA:       # %bb.0:
1891; FMA-NEXT:    vmulpd %xmm3, %xmm2, %xmm2
1892; FMA-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
1893; FMA-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2
1894; FMA-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2
1895; FMA-NEXT:    vmovapd %xmm2, %xmm0
1896; FMA-NEXT:    retq
1897;
1898; FMA4-LABEL: fadd_fma_fmul_3:
1899; FMA4:       # %bb.0:
1900; FMA4-NEXT:    vmulpd %xmm3, %xmm2, %xmm2
1901; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
1902; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm6 * xmm7) + xmm0
1903; FMA4-NEXT:    vfmaddpd {{.*#+}} xmm0 = (xmm4 * xmm5) + xmm0
1904; FMA4-NEXT:    retq
1905;
1906; AVX512-LABEL: fadd_fma_fmul_3:
1907; AVX512:       # %bb.0:
1908; AVX512-NEXT:    vmulpd %xmm3, %xmm2, %xmm2
1909; AVX512-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
1910; AVX512-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm7 * xmm6) + xmm2
1911; AVX512-NEXT:    vfmadd231pd {{.*#+}} xmm2 = (xmm5 * xmm4) + xmm2
1912; AVX512-NEXT:    vmovapd %xmm2, %xmm0
1913; AVX512-NEXT:    retq
1914  %m1 = fmul fast <2 x double> %x1, %x2
1915  %m2 = fmul fast <2 x double> %x3, %x4
1916  %m3 = fmul fast <2 x double> %x5, %x6
1917  %m4 = fmul fast <2 x double> %x7, %x8
1918  %a1 = fadd fast <2 x double> %m1, %m2
1919  %a2 = fadd fast <2 x double> %m3, %m4
1920  %a3 = fadd fast <2 x double> %a1, %a2
1921  ret <2 x double> %a3
1922}
1923
1924; negative test
1925
; n0 + ((a * b) + (c * d)) with 'fast' everywhere, but the first product %m1
; has an extra use: it is also stored through %p. Every check group expects
; a * b as a standalone vmulss whose result is stored, then one fused
; multiply-add for (c * d) + m1, and a separate vaddss that adds n0.
1926define float @fadd_fma_fmul_extra_use_1(float %a, float %b, float %c, float %d, float %n0, ptr %p) nounwind {
1927; FMA-LABEL: fadd_fma_fmul_extra_use_1:
1928; FMA:       # %bb.0:
1929; FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
1930; FMA-NEXT:    vmovss %xmm0, (%rdi)
1931; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm0
1932; FMA-NEXT:    vaddss %xmm2, %xmm4, %xmm0
1933; FMA-NEXT:    retq
1934;
1935; FMA4-LABEL: fadd_fma_fmul_extra_use_1:
1936; FMA4:       # %bb.0:
1937; FMA4-NEXT:    vmulss %xmm1, %xmm0, %xmm0
1938; FMA4-NEXT:    vmovss %xmm0, (%rdi)
1939; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm2 * xmm3) + xmm0
1940; FMA4-NEXT:    vaddss %xmm0, %xmm4, %xmm0
1941; FMA4-NEXT:    retq
1942;
1943; AVX512-LABEL: fadd_fma_fmul_extra_use_1:
1944; AVX512:       # %bb.0:
1945; AVX512-NEXT:    vmulss %xmm1, %xmm0, %xmm0
1946; AVX512-NEXT:    vmovss %xmm0, (%rdi)
1947; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm2 = (xmm3 * xmm2) + xmm0
1948; AVX512-NEXT:    vaddss %xmm2, %xmm4, %xmm0
1949; AVX512-NEXT:    retq
1950  %m1 = fmul fast float %a, %b
; Extra use of the first product.
1951  store float %m1, ptr %p
1952  %m2 = fmul fast float %c, %d
1953  %a1 = fadd fast float %m1, %m2
1954  %a2 = fadd fast float %n0, %a1
1955  ret float %a2
1956}
1957
1958; negative test
1959
; n0 + ((a * b) + (c * d)) with 'fast' everywhere, but the second product %m2
; has an extra use: it is also stored through %p. Every check group expects
; c * d as a standalone vmulss whose result is stored, then one fused
; multiply-add for (a * b) + m2, and a separate vaddss that adds n0.
1960define float @fadd_fma_fmul_extra_use_2(float %a, float %b, float %c, float %d, float %n0, ptr %p) nounwind {
1961; FMA-LABEL: fadd_fma_fmul_extra_use_2:
1962; FMA:       # %bb.0:
1963; FMA-NEXT:    vmulss %xmm3, %xmm2, %xmm2
1964; FMA-NEXT:    vmovss %xmm2, (%rdi)
1965; FMA-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
1966; FMA-NEXT:    vaddss %xmm0, %xmm4, %xmm0
1967; FMA-NEXT:    retq
1968;
1969; FMA4-LABEL: fadd_fma_fmul_extra_use_2:
1970; FMA4:       # %bb.0:
1971; FMA4-NEXT:    vmulss %xmm3, %xmm2, %xmm2
1972; FMA4-NEXT:    vmovss %xmm2, (%rdi)
1973; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
1974; FMA4-NEXT:    vaddss %xmm0, %xmm4, %xmm0
1975; FMA4-NEXT:    retq
1976;
1977; AVX512-LABEL: fadd_fma_fmul_extra_use_2:
1978; AVX512:       # %bb.0:
1979; AVX512-NEXT:    vmulss %xmm3, %xmm2, %xmm2
1980; AVX512-NEXT:    vmovss %xmm2, (%rdi)
1981; AVX512-NEXT:    vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
1982; AVX512-NEXT:    vaddss %xmm0, %xmm4, %xmm0
1983; AVX512-NEXT:    retq
1984  %m1 = fmul fast float %a, %b
1985  %m2 = fmul fast float %c, %d
; Extra use of the second product.
1986  store float %m2, ptr %p
1987  %a1 = fadd fast float %m1, %m2
1988  %a2 = fadd fast float %n0, %a1
1989  ret float %a2
1990}
1991
1992; negative test
1993
; n0 + ((a * b) + (c * d)) with 'fast' everywhere, but the inner sum %a1 has
; an extra use: it is also stored through %p. Every check group expects a
; standalone vmulss for c * d, one fused multiply-add for (a * b) + (c * d)
; whose result is stored, and a separate vaddss that adds n0.
1994define float @fadd_fma_fmul_extra_use_3(float %a, float %b, float %c, float %d, float %n0, ptr %p) nounwind {
1995; FMA-LABEL: fadd_fma_fmul_extra_use_3:
1996; FMA:       # %bb.0:
1997; FMA-NEXT:    vmulss %xmm3, %xmm2, %xmm2
1998; FMA-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
1999; FMA-NEXT:    vmovss %xmm2, (%rdi)
2000; FMA-NEXT:    vaddss %xmm2, %xmm4, %xmm0
2001; FMA-NEXT:    retq
2002;
2003; FMA4-LABEL: fadd_fma_fmul_extra_use_3:
2004; FMA4:       # %bb.0:
2005; FMA4-NEXT:    vmulss %xmm3, %xmm2, %xmm2
2006; FMA4-NEXT:    vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm1) + xmm2
2007; FMA4-NEXT:    vmovss %xmm0, (%rdi)
2008; FMA4-NEXT:    vaddss %xmm0, %xmm4, %xmm0
2009; FMA4-NEXT:    retq
2010;
2011; AVX512-LABEL: fadd_fma_fmul_extra_use_3:
2012; AVX512:       # %bb.0:
2013; AVX512-NEXT:    vmulss %xmm3, %xmm2, %xmm2
2014; AVX512-NEXT:    vfmadd231ss {{.*#+}} xmm2 = (xmm1 * xmm0) + xmm2
2015; AVX512-NEXT:    vmovss %xmm2, (%rdi)
2016; AVX512-NEXT:    vaddss %xmm2, %xmm4, %xmm0
2017; AVX512-NEXT:    retq
2018  %m1 = fmul fast float %a, %b
2019  %m2 = fmul fast float %c, %d
2020  %a1 = fadd fast float %m1, %m2
; Extra use of the inner sum.
2021  store float %a1, ptr %p
2022  %a2 = fadd fast float %n0, %a1
2023  ret float %a2
2024}
2025
; Attribute group #0, referenced by the functions in this file that are
; declared with "#0"; it sets the "unsafe-fp-math" function attribute to "true".
2026attributes #0 = { "unsafe-fp-math"="true" }
2027