xref: /llvm-project/llvm/test/CodeGen/X86/fma_patterns_wide.ll (revision 834cc88c5d08ca55664b7742590463de813d768f)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA --check-prefix=FMA-INFS
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-INFS
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-INFS
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA --check-prefix=FMA-NOINFS
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4,+fma -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma4 -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=FMA4 --check-prefix=FMA4-NOINFS
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq -fp-contract=fast -enable-no-infs-fp-math | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX512-NOINFS
10
11;
12; Pattern: (fadd (fmul x, y), z) -> (fmadd x, y, z)
13;
14
; Multiplies %a0 by %a1 and adds %a2 across 16 float lanes.
; The CHECK lines expect the fmul/fadd pair to be emitted as fused
; multiply-adds: two ymm instructions under the FMA and FMA4 prefixes and a
; single zmm instruction under the AVX512 prefix.
15define <16 x float> @test_16f32_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
16; FMA-LABEL: test_16f32_fmadd:
17; FMA:       # %bb.0:
18; FMA-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4
19; FMA-NEXT:    vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm5
20; FMA-NEXT:    retq
21;
22; FMA4-LABEL: test_16f32_fmadd:
23; FMA4:       # %bb.0:
24; FMA4-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm4
25; FMA4-NEXT:    vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm5
26; FMA4-NEXT:    retq
27;
28; AVX512-LABEL: test_16f32_fmadd:
29; AVX512:       # %bb.0:
30; AVX512-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
31; AVX512-NEXT:    retq
32  %x = fmul <16 x float> %a0, %a1
33  %res = fadd <16 x float> %x, %a2
34  ret <16 x float> %res
35}
36
; Multiplies %a0 by %a1 and adds %a2 across 8 double lanes.
; The CHECK lines expect fused multiply-adds: two ymm instructions under the
; FMA and FMA4 prefixes and a single zmm instruction under the AVX512 prefix.
37define <8 x double> @test_8f64_fmadd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
38; FMA-LABEL: test_8f64_fmadd:
39; FMA:       # %bb.0:
40; FMA-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4
41; FMA-NEXT:    vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm5
42; FMA-NEXT:    retq
43;
44; FMA4-LABEL: test_8f64_fmadd:
45; FMA4:       # %bb.0:
46; FMA4-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm4
47; FMA4-NEXT:    vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm5
48; FMA4-NEXT:    retq
49;
50; AVX512-LABEL: test_8f64_fmadd:
51; AVX512:       # %bb.0:
52; AVX512-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
53; AVX512-NEXT:    retq
54  %x = fmul <8 x double> %a0, %a1
55  %res = fadd <8 x double> %x, %a2
56  ret <8 x double> %res
57}
58
59;
60; Pattern: (fsub (fmul x, y), z) -> (fmsub x, y, z)
61;
62
; Multiplies %a0 by %a1 and subtracts %a2 from the product (16 float lanes).
; The CHECK lines expect fused multiply-subtracts of the form (a * b) - c:
; two ymm instructions under FMA and FMA4, one zmm instruction under AVX512.
63define <16 x float> @test_16f32_fmsub(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
64; FMA-LABEL: test_16f32_fmsub:
65; FMA:       # %bb.0:
66; FMA-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm4
67; FMA-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm5
68; FMA-NEXT:    retq
69;
70; FMA4-LABEL: test_16f32_fmsub:
71; FMA4:       # %bb.0:
72; FMA4-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm4
73; FMA4-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm5
74; FMA4-NEXT:    retq
75;
76; AVX512-LABEL: test_16f32_fmsub:
77; AVX512:       # %bb.0:
78; AVX512-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
79; AVX512-NEXT:    retq
80  %x = fmul <16 x float> %a0, %a1
81  %res = fsub <16 x float> %x, %a2
82  ret <16 x float> %res
83}
84
; Multiplies %a0 by %a1 and subtracts %a2 from the product (8 double lanes).
; The CHECK lines expect fused multiply-subtracts of the form (a * b) - c:
; two ymm instructions under FMA and FMA4, one zmm instruction under AVX512.
85define <8 x double> @test_8f64_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
86; FMA-LABEL: test_8f64_fmsub:
87; FMA:       # %bb.0:
88; FMA-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm4
89; FMA-NEXT:    vfmsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm5
90; FMA-NEXT:    retq
91;
92; FMA4-LABEL: test_8f64_fmsub:
93; FMA4:       # %bb.0:
94; FMA4-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm4
95; FMA4-NEXT:    vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm5
96; FMA4-NEXT:    retq
97;
98; AVX512-LABEL: test_8f64_fmsub:
99; AVX512:       # %bb.0:
100; AVX512-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
101; AVX512-NEXT:    retq
102  %x = fmul <8 x double> %a0, %a1
103  %res = fsub <8 x double> %x, %a2
104  ret <8 x double> %res
105}
106
107;
108; Pattern: (fsub z, (fmul x, y)) -> (fnmadd x, y, z)
109;
110
; Subtracts the product %a0 * %a1 from %a2 (16 float lanes); note the fsub
; operand order is %a2 first, product second.
; The CHECK lines expect fused negated multiply-adds of the form -(a * b) + c:
; two ymm instructions under FMA and FMA4, one zmm instruction under AVX512.
111define <16 x float> @test_16f32_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
112; FMA-LABEL: test_16f32_fnmadd:
113; FMA:       # %bb.0:
114; FMA-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
115; FMA-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm5
116; FMA-NEXT:    retq
117;
118; FMA4-LABEL: test_16f32_fnmadd:
119; FMA4:       # %bb.0:
120; FMA4-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm4
121; FMA4-NEXT:    vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm5
122; FMA4-NEXT:    retq
123;
124; AVX512-LABEL: test_16f32_fnmadd:
125; AVX512:       # %bb.0:
126; AVX512-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
127; AVX512-NEXT:    retq
128  %x = fmul <16 x float> %a0, %a1
129  %res = fsub <16 x float> %a2, %x
130  ret <16 x float> %res
131}
132
; Subtracts the product %a0 * %a1 from %a2 (8 double lanes); note the fsub
; operand order is %a2 first, product second.
; The CHECK lines expect fused negated multiply-adds of the form -(a * b) + c:
; two ymm instructions under FMA and FMA4, one zmm instruction under AVX512.
133define <8 x double> @test_8f64_fnmadd(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
134; FMA-LABEL: test_8f64_fnmadd:
135; FMA:       # %bb.0:
136; FMA-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
137; FMA-NEXT:    vfnmadd213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm5
138; FMA-NEXT:    retq
139;
140; FMA4-LABEL: test_8f64_fnmadd:
141; FMA4:       # %bb.0:
142; FMA4-NEXT:    vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm4
143; FMA4-NEXT:    vfnmaddpd {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm5
144; FMA4-NEXT:    retq
145;
146; AVX512-LABEL: test_8f64_fnmadd:
147; AVX512:       # %bb.0:
148; AVX512-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
149; AVX512-NEXT:    retq
150  %x = fmul <8 x double> %a0, %a1
151  %res = fsub <8 x double> %a2, %x
152  ret <8 x double> %res
153}
154
155;
156; Pattern: (fsub (fneg (fmul x, y)), z) -> (fnmsub x, y, z)
157;
158
; Negates the product %a0 * %a1 and then subtracts %a2 (16 float lanes).
; The negation is written as `fsub <-0.0 splat>, %x` rather than a unary
; negate instruction.
; The CHECK lines expect fused instructions of the form -(a * b) - c:
; two ymm instructions under FMA and FMA4, one zmm instruction under AVX512.
159define <16 x float> @test_16f32_fnmsub(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) {
160; FMA-LABEL: test_16f32_fnmsub:
161; FMA:       # %bb.0:
162; FMA-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm4
163; FMA-NEXT:    vfnmsub213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm5
164; FMA-NEXT:    retq
165;
166; FMA4-LABEL: test_16f32_fnmsub:
167; FMA4:       # %bb.0:
168; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm4
169; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm5
170; FMA4-NEXT:    retq
171;
172; AVX512-LABEL: test_16f32_fnmsub:
173; AVX512:       # %bb.0:
174; AVX512-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
175; AVX512-NEXT:    retq
176  %x = fmul <16 x float> %a0, %a1
177  %y = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
178  %res = fsub <16 x float> %y, %a2
179  ret <16 x float> %res
180}
181
; Negates the product %a0 * %a1 and then subtracts %a2 (8 double lanes).
; The negation is written as `fsub <-0.0 splat>, %x` rather than a unary
; negate instruction.
; The CHECK lines expect fused instructions of the form -(a * b) - c:
; two ymm instructions under FMA and FMA4, one zmm instruction under AVX512.
182define <8 x double> @test_8f64_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) {
183; FMA-LABEL: test_8f64_fnmsub:
184; FMA:       # %bb.0:
185; FMA-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm4
186; FMA-NEXT:    vfnmsub213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm5
187; FMA-NEXT:    retq
188;
189; FMA4-LABEL: test_8f64_fnmsub:
190; FMA4:       # %bb.0:
191; FMA4-NEXT:    vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm4
192; FMA4-NEXT:    vfnmsubpd {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm5
193; FMA4-NEXT:    retq
194;
195; AVX512-LABEL: test_8f64_fnmsub:
196; AVX512:       # %bb.0:
197; AVX512-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
198; AVX512-NEXT:    retq
199  %x = fmul <8 x double> %a0, %a1
200  %y = fsub <8 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %x
201  %res = fsub <8 x double> %y, %a2
202  ret <8 x double> %res
203}
204
205;
206; Load Folding Patterns
207;
208
; Loads a <16 x float> from the pointer %a0, multiplies it by %a1 and adds %a2.
; The CHECK lines expect the load to appear as the `mem` operand of the fused
; multiply-add itself; no separate load instruction is listed before it.
209define <16 x float> @test_16f32_fmadd_load(ptr %a0, <16 x float> %a1, <16 x float> %a2) {
210; FMA-LABEL: test_16f32_fmadd_load:
211; FMA:       # %bb.0:
212; FMA-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm2
213; FMA-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * mem) + ymm3
214; FMA-NEXT:    retq
215;
216; FMA4-LABEL: test_16f32_fmadd_load:
217; FMA4:       # %bb.0:
218; FMA4-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * mem) + ymm2
219; FMA4-NEXT:    vfmaddps {{.*#+}} ymm1 = (ymm1 * mem) + ymm3
220; FMA4-NEXT:    retq
221;
222; AVX512-LABEL: test_16f32_fmadd_load:
223; AVX512:       # %bb.0:
224; AVX512-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1
225; AVX512-NEXT:    retq
226  %x = load <16 x float>, ptr %a0
227  %y = fmul <16 x float> %x, %a1
228  %res = fadd <16 x float> %y, %a2
229  ret <16 x float> %res
230}
231
; Loads an <8 x double> from the pointer %a0, multiplies it by %a1 and
; subtracts %a2.
; The CHECK lines expect the load to appear as the `mem` operand of the fused
; multiply-subtract itself; no separate load instruction is listed before it.
232define <8 x double> @test_8f64_fmsub_load(ptr %a0, <8 x double> %a1, <8 x double> %a2) {
233; FMA-LABEL: test_8f64_fmsub_load:
234; FMA:       # %bb.0:
235; FMA-NEXT:    vfmsub132pd {{.*#+}} ymm0 = (ymm0 * mem) - ymm2
236; FMA-NEXT:    vfmsub132pd {{.*#+}} ymm1 = (ymm1 * mem) - ymm3
237; FMA-NEXT:    retq
238;
239; FMA4-LABEL: test_8f64_fmsub_load:
240; FMA4:       # %bb.0:
241; FMA4-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * mem) - ymm2
242; FMA4-NEXT:    vfmsubpd {{.*#+}} ymm1 = (ymm1 * mem) - ymm3
243; FMA4-NEXT:    retq
244;
245; AVX512-LABEL: test_8f64_fmsub_load:
246; AVX512:       # %bb.0:
247; AVX512-NEXT:    vfmsub132pd {{.*#+}} zmm0 = (zmm0 * mem) - zmm1
248; AVX512-NEXT:    retq
249  %x = load <8 x double>, ptr %a0
250  %y = fmul <8 x double> %x, %a1
251  %res = fsub <8 x double> %y, %a2
252  ret <8 x double> %res
253}
254
255;
256; Patterns (+ fneg variants): mul(add(1.0,x),y), mul(sub(1.0,x),y), mul(sub(x,1.0),y)
257;
258
; Computes (%x + 1.0) * %y over 16 float lanes.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect a
; separate add followed by a multiply. The *-NOINFS prefixes (RUN lines with
; -enable-no-infs-fp-math) expect a single fused multiply-add per register of
; the form (y * x) + y.
259define <16 x float> @test_v16f32_mul_add_x_one_y(<16 x float> %x, <16 x float> %y) {
260; FMA-INFS-LABEL: test_v16f32_mul_add_x_one_y:
261; FMA-INFS:       # %bb.0:
262; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
263; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
264; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
265; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
266; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
267; FMA-INFS-NEXT:    retq
268;
269; FMA4-INFS-LABEL: test_v16f32_mul_add_x_one_y:
270; FMA4-INFS:       # %bb.0:
271; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
272; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
273; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
274; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
275; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
276; FMA4-INFS-NEXT:    retq
277;
278; AVX512-INFS-LABEL: test_v16f32_mul_add_x_one_y:
279; AVX512-INFS:       # %bb.0:
280; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
281; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
282; AVX512-INFS-NEXT:    retq
283;
284; FMA-NOINFS-LABEL: test_v16f32_mul_add_x_one_y:
285; FMA-NOINFS:       # %bb.0:
286; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2
287; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3
288; FMA-NOINFS-NEXT:    retq
289;
290; FMA4-NOINFS-LABEL: test_v16f32_mul_add_x_one_y:
291; FMA4-NOINFS:       # %bb.0:
292; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
293; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3
294; FMA4-NOINFS-NEXT:    retq
295;
296; AVX512-NOINFS-LABEL: test_v16f32_mul_add_x_one_y:
297; AVX512-NOINFS:       # %bb.0:
298; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1
299; AVX512-NOINFS-NEXT:    retq
300  %a = fadd <16 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
301  %m = fmul <16 x float> %a, %y
302  ret <16 x float> %m
303}
304
; Computes %y * (%x + 1.0) over 8 double lanes; %y is the first fmul operand.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect a
; separate add followed by a multiply. The *-NOINFS prefixes (RUN lines with
; -enable-no-infs-fp-math) expect a single fused multiply-add per register of
; the form (y * x) + y.
305define <8 x double> @test_v8f64_mul_y_add_x_one(<8 x double> %x, <8 x double> %y) {
306; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_one:
307; FMA-INFS:       # %bb.0:
308; FMA-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
309; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
310; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
311; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
312; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
313; FMA-INFS-NEXT:    retq
314;
315; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_one:
316; FMA4-INFS:       # %bb.0:
317; FMA4-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
318; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
319; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
320; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
321; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
322; FMA4-INFS-NEXT:    retq
323;
324; AVX512-INFS-LABEL: test_v8f64_mul_y_add_x_one:
325; AVX512-INFS:       # %bb.0:
326; AVX512-INFS-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
327; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
328; AVX512-INFS-NEXT:    retq
329;
330; FMA-NOINFS-LABEL: test_v8f64_mul_y_add_x_one:
331; FMA-NOINFS:       # %bb.0:
332; FMA-NOINFS-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2
333; FMA-NOINFS-NEXT:    vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3
334; FMA-NOINFS-NEXT:    retq
335;
336; FMA4-NOINFS-LABEL: test_v8f64_mul_y_add_x_one:
337; FMA4-NOINFS:       # %bb.0:
338; FMA4-NOINFS-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
339; FMA4-NOINFS-NEXT:    vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3
340; FMA4-NOINFS-NEXT:    retq
341;
342; AVX512-NOINFS-LABEL: test_v8f64_mul_y_add_x_one:
343; AVX512-NOINFS:       # %bb.0:
344; AVX512-NOINFS-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1
345; AVX512-NOINFS-NEXT:    retq
346  %a = fadd <8 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>
347  %m = fmul <8 x double> %y, %a
348  ret <8 x double> %m
349}
350
; Computes (%x + -1.0) * %y over 16 float lanes.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect a
; separate add followed by a multiply. The *-NOINFS prefixes (RUN lines with
; -enable-no-infs-fp-math) expect a single fused multiply-subtract per
; register of the form (y * x) - y.
351define <16 x float> @test_v16f32_mul_add_x_negone_y(<16 x float> %x, <16 x float> %y) {
352; FMA-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
353; FMA-INFS:       # %bb.0:
354; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
355; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
356; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
357; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
358; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
359; FMA-INFS-NEXT:    retq
360;
361; FMA4-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
362; FMA4-INFS:       # %bb.0:
363; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
364; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
365; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
366; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
367; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
368; FMA4-INFS-NEXT:    retq
369;
370; AVX512-INFS-LABEL: test_v16f32_mul_add_x_negone_y:
371; AVX512-INFS:       # %bb.0:
372; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
373; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
374; AVX512-INFS-NEXT:    retq
375;
376; FMA-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y:
377; FMA-NOINFS:       # %bb.0:
378; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2
379; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3
380; FMA-NOINFS-NEXT:    retq
381;
382; FMA4-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y:
383; FMA4-NOINFS:       # %bb.0:
384; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2
385; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3
386; FMA4-NOINFS-NEXT:    retq
387;
388; AVX512-NOINFS-LABEL: test_v16f32_mul_add_x_negone_y:
389; AVX512-NOINFS:       # %bb.0:
390; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1
391; AVX512-NOINFS-NEXT:    retq
392  %a = fadd <16 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>
393  %m = fmul <16 x float> %a, %y
394  ret <16 x float> %m
395}
396
; Computes %y * (%x + -1.0) over 8 double lanes; %y is the first fmul operand.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect a
; separate add followed by a multiply. The *-NOINFS prefixes (RUN lines with
; -enable-no-infs-fp-math) expect a single fused multiply-subtract per
; register of the form (y * x) - y.
397define <8 x double> @test_v8f64_mul_y_add_x_negone(<8 x double> %x, <8 x double> %y) {
398; FMA-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
399; FMA-INFS:       # %bb.0:
400; FMA-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
401; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
402; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
403; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
404; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
405; FMA-INFS-NEXT:    retq
406;
407; FMA4-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
408; FMA4-INFS:       # %bb.0:
409; FMA4-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
410; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
411; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
412; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
413; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
414; FMA4-INFS-NEXT:    retq
415;
416; AVX512-INFS-LABEL: test_v8f64_mul_y_add_x_negone:
417; AVX512-INFS:       # %bb.0:
418; AVX512-INFS-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
419; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
420; AVX512-INFS-NEXT:    retq
421;
422; FMA-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone:
423; FMA-NOINFS:       # %bb.0:
424; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2
425; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3
426; FMA-NOINFS-NEXT:    retq
427;
428; FMA4-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone:
429; FMA4-NOINFS:       # %bb.0:
430; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2
431; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3
432; FMA4-NOINFS-NEXT:    retq
433;
434; AVX512-NOINFS-LABEL: test_v8f64_mul_y_add_x_negone:
435; AVX512-NOINFS:       # %bb.0:
436; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1
437; AVX512-NOINFS-NEXT:    retq
438  %a = fadd <8 x double> %x, <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
439  %m = fmul <8 x double> %y, %a
440  ret <8 x double> %m
441}
442
; Computes (1.0 - %x) * %y over 16 float lanes.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect a
; broadcast of 1.0, a subtract and then a multiply. The *-NOINFS prefixes
; (RUN lines with -enable-no-infs-fp-math) expect a single fused instruction
; per register of the form -(y * x) + y.
443define <16 x float> @test_v16f32_mul_sub_one_x_y(<16 x float> %x, <16 x float> %y) {
444; FMA-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
445; FMA-INFS:       # %bb.0:
446; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
447; FMA-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
448; FMA-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
449; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
450; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
451; FMA-INFS-NEXT:    retq
452;
453; FMA4-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
454; FMA4-INFS:       # %bb.0:
455; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
456; FMA4-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
457; FMA4-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
458; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
459; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
460; FMA4-INFS-NEXT:    retq
461;
462; AVX512-INFS-LABEL: test_v16f32_mul_sub_one_x_y:
463; AVX512-INFS:       # %bb.0:
464; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
465; AVX512-INFS-NEXT:    vsubps %zmm0, %zmm2, %zmm0
466; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
467; AVX512-INFS-NEXT:    retq
468;
469; FMA-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y:
470; FMA-NOINFS:       # %bb.0:
471; FMA-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
472; FMA-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm3
473; FMA-NOINFS-NEXT:    retq
474;
475; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y:
476; FMA4-NOINFS:       # %bb.0:
477; FMA4-NOINFS-NEXT:    vfnmaddps {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
478; FMA4-NOINFS-NEXT:    vfnmaddps {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm3
479; FMA4-NOINFS-NEXT:    retq
480;
481; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_one_x_y:
482; AVX512-NOINFS:       # %bb.0:
483; AVX512-NOINFS-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm1
484; AVX512-NOINFS-NEXT:    retq
485  %s = fsub <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
486  %m = fmul <16 x float> %s, %y
487  ret <16 x float> %m
488}
489
; Computes %y * (1.0 - %x) over 8 double lanes; %y is the first fmul operand.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect a
; broadcast of 1.0, a subtract and then a multiply. The *-NOINFS prefixes
; (RUN lines with -enable-no-infs-fp-math) expect a single fused instruction
; per register of the form -(y * x) + y.
490define <8 x double> @test_v8f64_mul_y_sub_one_x(<8 x double> %x, <8 x double> %y) {
491; FMA-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
492; FMA-INFS:       # %bb.0:
493; FMA-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
494; FMA-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
495; FMA-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
496; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
497; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
498; FMA-INFS-NEXT:    retq
499;
500; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
501; FMA4-INFS:       # %bb.0:
502; FMA4-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
503; FMA4-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
504; FMA4-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
505; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
506; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
507; FMA4-INFS-NEXT:    retq
508;
509; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_one_x:
510; AVX512-INFS:       # %bb.0:
511; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
512; AVX512-INFS-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
513; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
514; AVX512-INFS-NEXT:    retq
515;
516; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x:
517; FMA-NOINFS:       # %bb.0:
518; FMA-NOINFS-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm2
519; FMA-NOINFS-NEXT:    vfnmadd213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm3
520; FMA-NOINFS-NEXT:    retq
521;
522; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x:
523; FMA4-NOINFS:       # %bb.0:
524; FMA4-NOINFS-NEXT:    vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm2
525; FMA4-NOINFS-NEXT:    vfnmaddpd {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm3
526; FMA4-NOINFS-NEXT:    retq
527;
528; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_one_x:
529; AVX512-NOINFS:       # %bb.0:
530; AVX512-NOINFS-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm1
531; AVX512-NOINFS-NEXT:    retq
532  %s = fsub <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>, %x
533  %m = fmul <8 x double> %y, %s
534  ret <8 x double> %m
535}
536
; Computes (-1.0 - %x) * %y over 16 float lanes.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect a
; broadcast of -1.0, a subtract and then a multiply. The *-NOINFS prefixes
; (RUN lines with -enable-no-infs-fp-math) expect a single fused instruction
; per register of the form -(y * x) - y.
537define <16 x float> @test_v16f32_mul_sub_negone_x_y(<16 x float> %x, <16 x float> %y) {
538; FMA-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
539; FMA-INFS:       # %bb.0:
540; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
541; FMA-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
542; FMA-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
543; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
544; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
545; FMA-INFS-NEXT:    retq
546;
547; FMA4-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
548; FMA4-INFS:       # %bb.0:
549; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
550; FMA4-INFS-NEXT:    vsubps %ymm1, %ymm4, %ymm1
551; FMA4-INFS-NEXT:    vsubps %ymm0, %ymm4, %ymm0
552; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
553; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
554; FMA4-INFS-NEXT:    retq
555;
556; AVX512-INFS-LABEL: test_v16f32_mul_sub_negone_x_y:
557; AVX512-INFS:       # %bb.0:
558; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
559; AVX512-INFS-NEXT:    vsubps %zmm0, %zmm2, %zmm0
560; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
561; AVX512-INFS-NEXT:    retq
562;
563; FMA-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y:
564; FMA-NOINFS:       # %bb.0:
565; FMA-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm2
566; FMA-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm3
567; FMA-NOINFS-NEXT:    retq
568;
569; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y:
570; FMA4-NOINFS:       # %bb.0:
571; FMA4-NOINFS-NEXT:    vfnmsubps {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm2
572; FMA4-NOINFS-NEXT:    vfnmsubps {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm3
573; FMA4-NOINFS-NEXT:    retq
574;
575; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_negone_x_y:
576; AVX512-NOINFS:       # %bb.0:
577; AVX512-NOINFS-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm1
578; AVX512-NOINFS-NEXT:    retq
579  %s = fsub <16 x float> <float -1.0, float -1.0, float -1.0, float -1.0,float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>, %x
580  %m = fmul <16 x float> %s, %y
581  ret <16 x float> %m
582}
583
; Computes %y * (-1.0 - %x) over 8 double lanes; %y is the first fmul operand.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect a
; broadcast of -1.0, a subtract and then a multiply. The *-NOINFS prefixes
; (RUN lines with -enable-no-infs-fp-math) expect a single fused instruction
; per register of the form -(y * x) - y.
584define <8 x double> @test_v8f64_mul_y_sub_negone_x(<8 x double> %x, <8 x double> %y) {
585; FMA-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
586; FMA-INFS:       # %bb.0:
587; FMA-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
588; FMA-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
589; FMA-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
590; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
591; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
592; FMA-INFS-NEXT:    retq
593;
594; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
595; FMA4-INFS:       # %bb.0:
596; FMA4-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
597; FMA4-INFS-NEXT:    vsubpd %ymm1, %ymm4, %ymm1
598; FMA4-INFS-NEXT:    vsubpd %ymm0, %ymm4, %ymm0
599; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
600; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
601; FMA4-INFS-NEXT:    retq
602;
603; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_negone_x:
604; AVX512-INFS:       # %bb.0:
605; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
606; AVX512-INFS-NEXT:    vsubpd %zmm0, %zmm2, %zmm0
607; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
608; AVX512-INFS-NEXT:    retq
609;
610; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x:
611; FMA-NOINFS:       # %bb.0:
612; FMA-NOINFS-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm2
613; FMA-NOINFS-NEXT:    vfnmsub213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm3
614; FMA-NOINFS-NEXT:    retq
615;
616; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x:
617; FMA4-NOINFS:       # %bb.0:
618; FMA4-NOINFS-NEXT:    vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm2
619; FMA4-NOINFS-NEXT:    vfnmsubpd {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm3
620; FMA4-NOINFS-NEXT:    retq
621;
622; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_negone_x:
623; AVX512-NOINFS:       # %bb.0:
624; AVX512-NOINFS-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm1
625; AVX512-NOINFS-NEXT:    retq
626  %s = fsub <8 x double> <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>, %x
627  %m = fmul <8 x double> %y, %s
628  ret <8 x double> %m
629}
630
; Computes (%x - 1.0) * %y over 16 float lanes.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect the
; subtraction to be emitted as an add of a -1.0 constant, followed by a
; multiply. The *-NOINFS prefixes (RUN lines with -enable-no-infs-fp-math)
; expect a single fused multiply-subtract per register of the form (y * x) - y.
631define <16 x float> @test_v16f32_mul_sub_x_one_y(<16 x float> %x, <16 x float> %y) {
632; FMA-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
633; FMA-INFS:       # %bb.0:
634; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
635; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
636; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
637; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
638; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
639; FMA-INFS-NEXT:    retq
640;
641; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
642; FMA4-INFS:       # %bb.0:
643; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
644; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
645; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
646; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
647; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
648; FMA4-INFS-NEXT:    retq
649;
650; AVX512-INFS-LABEL: test_v16f32_mul_sub_x_one_y:
651; AVX512-INFS:       # %bb.0:
652; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
653; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
654; AVX512-INFS-NEXT:    retq
655;
656; FMA-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y:
657; FMA-NOINFS:       # %bb.0:
658; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2
659; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3
660; FMA-NOINFS-NEXT:    retq
661;
662; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y:
663; FMA4-NOINFS:       # %bb.0:
664; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2
665; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3
666; FMA4-NOINFS-NEXT:    retq
667;
668; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_x_one_y:
669; AVX512-NOINFS:       # %bb.0:
670; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1
671; AVX512-NOINFS-NEXT:    retq
672  %s = fsub <16 x float> %x, <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>
673  %m = fmul <16 x float> %s, %y
674  ret <16 x float> %m
675}
676
; Computes %y * (%x - 1.0) over 8 double lanes; %y is the first fmul operand.
; The *-INFS prefixes (RUN lines without -enable-no-infs-fp-math) expect the
; subtraction to be emitted as an add of a -1.0 constant, followed by a
; multiply. The *-NOINFS prefixes (RUN lines with -enable-no-infs-fp-math)
; expect a single fused multiply-subtract per register of the form (y * x) - y.
677define <8 x double> @test_v8f64_mul_y_sub_x_one(<8 x double> %x, <8 x double> %y) {
678; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
679; FMA-INFS:       # %bb.0:
680; FMA-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
681; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
682; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
683; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
684; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
685; FMA-INFS-NEXT:    retq
686;
687; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
688; FMA4-INFS:       # %bb.0:
689; FMA4-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
690; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
691; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
692; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
693; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
694; FMA4-INFS-NEXT:    retq
695;
696; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_x_one:
697; AVX512-INFS:       # %bb.0:
698; AVX512-INFS-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
699; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
700; AVX512-INFS-NEXT:    retq
701;
702; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one:
703; FMA-NOINFS:       # %bb.0:
704; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm2
705; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm3
706; FMA-NOINFS-NEXT:    retq
707;
708; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one:
709; FMA4-NOINFS:       # %bb.0:
710; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm2
711; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm3
712; FMA4-NOINFS-NEXT:    retq
713;
714; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_x_one:
715; AVX512-NOINFS:       # %bb.0:
716; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm1
717; AVX512-NOINFS-NEXT:    retq
718  %s = fsub <8 x double> %x, <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>
719  %m = fmul <8 x double> %y, %s
720  ret <8 x double> %m
721}
722
; Computes (x - (-1.0)) * y on <16 x float>; the fsub result is the first
; operand of the fmul. In the default run lines the expected code adds +1.0
; and then multiplies. In the run lines that pass -enable-no-infs-fp-math a
; single fmadd per register is expected, computing (x * y) + y.
723define <16 x float> @test_v16f32_mul_sub_x_negone_y(<16 x float> %x, <16 x float> %y) {
724; FMA-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
725; FMA-INFS:       # %bb.0:
726; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
727; FMA-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
728; FMA-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
729; FMA-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
730; FMA-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
731; FMA-INFS-NEXT:    retq
732;
733; FMA4-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
734; FMA4-INFS:       # %bb.0:
735; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
736; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm1, %ymm1
737; FMA4-INFS-NEXT:    vaddps %ymm4, %ymm0, %ymm0
738; FMA4-INFS-NEXT:    vmulps %ymm2, %ymm0, %ymm0
739; FMA4-INFS-NEXT:    vmulps %ymm3, %ymm1, %ymm1
740; FMA4-INFS-NEXT:    retq
741;
742; AVX512-INFS-LABEL: test_v16f32_mul_sub_x_negone_y:
743; AVX512-INFS:       # %bb.0:
744; AVX512-INFS-NEXT:    vaddps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
745; AVX512-INFS-NEXT:    vmulps %zmm1, %zmm0, %zmm0
746; AVX512-INFS-NEXT:    retq
747;
748; FMA-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y:
749; FMA-NOINFS:       # %bb.0:
750; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2
751; FMA-NOINFS-NEXT:    vfmadd213ps {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3
752; FMA-NOINFS-NEXT:    retq
753;
754; FMA4-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y:
755; FMA4-NOINFS:       # %bb.0:
756; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
757; FMA4-NOINFS-NEXT:    vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3
758; FMA4-NOINFS-NEXT:    retq
759;
760; AVX512-NOINFS-LABEL: test_v16f32_mul_sub_x_negone_y:
761; AVX512-NOINFS:       # %bb.0:
762; AVX512-NOINFS-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1
763; AVX512-NOINFS-NEXT:    retq
764  %s = fsub <16 x float> %x, <float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0, float -1.0>
765  %m = fmul <16 x float> %s, %y
766  ret <16 x float> %m
767}
768
; Computes y * (x - (-1.0)) on <8 x double>; the fsub result is the second
; operand of the fmul. In the default run lines the expected code adds +1.0
; and then multiplies. In the run lines that pass -enable-no-infs-fp-math a
; single fmadd per register is expected, computing (x * y) + y.
769define <8 x double> @test_v8f64_mul_y_sub_x_negone(<8 x double> %x, <8 x double> %y) {
770; FMA-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
771; FMA-INFS:       # %bb.0:
772; FMA-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
773; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
774; FMA-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
775; FMA-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
776; FMA-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
777; FMA-INFS-NEXT:    retq
778;
779; FMA4-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
780; FMA4-INFS:       # %bb.0:
781; FMA4-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm4 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
782; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm1, %ymm1
783; FMA4-INFS-NEXT:    vaddpd %ymm4, %ymm0, %ymm0
784; FMA4-INFS-NEXT:    vmulpd %ymm0, %ymm2, %ymm0
785; FMA4-INFS-NEXT:    vmulpd %ymm1, %ymm3, %ymm1
786; FMA4-INFS-NEXT:    retq
787;
788; AVX512-INFS-LABEL: test_v8f64_mul_y_sub_x_negone:
789; AVX512-INFS:       # %bb.0:
790; AVX512-INFS-NEXT:    vaddpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
791; AVX512-INFS-NEXT:    vmulpd %zmm0, %zmm1, %zmm0
792; AVX512-INFS-NEXT:    retq
793;
794; FMA-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone:
795; FMA-NOINFS:       # %bb.0:
796; FMA-NOINFS-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm2
797; FMA-NOINFS-NEXT:    vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm3
798; FMA-NOINFS-NEXT:    retq
799;
800; FMA4-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone:
801; FMA4-NOINFS:       # %bb.0:
802; FMA4-NOINFS-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm2
803; FMA4-NOINFS-NEXT:    vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm3
804; FMA4-NOINFS-NEXT:    retq
805;
806; AVX512-NOINFS-LABEL: test_v8f64_mul_y_sub_x_negone:
807; AVX512-NOINFS:       # %bb.0:
808; AVX512-NOINFS-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm1
809; AVX512-NOINFS-NEXT:    retq
810  %s = fsub <8 x double> %x, <double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0, double -1.0>
811  %m = fmul <8 x double> %y, %s
812  ret <8 x double> %m
813}
814
815;
816; Interpolation Patterns: add(mul(x,t),mul(sub(1.0,t),y))
817;
818
; Linear interpolation on <16 x float>, written as x*t + y*(1.0 - t) with
; every operation flagged nsz. In the default run lines the expected code
; keeps the subtract from 1.0 and the multiply by y, and only the final
; x*t + (...) becomes an fmadd. In the run lines that pass
; -enable-no-infs-fp-math two chained fmsub instructions per register are
; expected, first (t * y) - y, then (t * x) - that result.
819define <16 x float> @test_v16f32_interp(<16 x float> %x, <16 x float> %y, <16 x float> %t) {
820; FMA-INFS-LABEL: test_v16f32_interp:
821; FMA-INFS:       # %bb.0:
822; FMA-INFS-NEXT:    vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
823; FMA-INFS-NEXT:    vsubps %ymm4, %ymm6, %ymm7
824; FMA-INFS-NEXT:    vsubps %ymm5, %ymm6, %ymm6
825; FMA-INFS-NEXT:    vmulps %ymm6, %ymm3, %ymm3
826; FMA-INFS-NEXT:    vmulps %ymm7, %ymm2, %ymm2
827; FMA-INFS-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2
828; FMA-INFS-NEXT:    vfmadd213ps {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3
829; FMA-INFS-NEXT:    retq
830;
831; FMA4-INFS-LABEL: test_v16f32_interp:
832; FMA4-INFS:       # %bb.0:
833; FMA4-INFS-NEXT:    vbroadcastss {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
834; FMA4-INFS-NEXT:    vsubps %ymm4, %ymm6, %ymm7
835; FMA4-INFS-NEXT:    vsubps %ymm5, %ymm6, %ymm6
836; FMA4-INFS-NEXT:    vmulps %ymm6, %ymm3, %ymm3
837; FMA4-INFS-NEXT:    vmulps %ymm7, %ymm2, %ymm2
838; FMA4-INFS-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2
839; FMA4-INFS-NEXT:    vfmaddps {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3
840; FMA4-INFS-NEXT:    retq
841;
842; AVX512-INFS-LABEL: test_v16f32_interp:
843; AVX512-INFS:       # %bb.0:
844; AVX512-INFS-NEXT:    vbroadcastss {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
845; AVX512-INFS-NEXT:    vsubps %zmm2, %zmm3, %zmm3
846; AVX512-INFS-NEXT:    vmulps %zmm3, %zmm1, %zmm1
847; AVX512-INFS-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1
848; AVX512-INFS-NEXT:    retq
849;
850; FMA-NOINFS-LABEL: test_v16f32_interp:
851; FMA-NOINFS:       # %bb.0:
852; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3
853; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2
854; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2
855; FMA-NOINFS-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3
856; FMA-NOINFS-NEXT:    retq
857;
858; FMA4-NOINFS-LABEL: test_v16f32_interp:
859; FMA4-NOINFS:       # %bb.0:
860; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3
861; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2
862; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2
863; FMA4-NOINFS-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3
864; FMA4-NOINFS-NEXT:    retq
865;
866; AVX512-NOINFS-LABEL: test_v16f32_interp:
867; AVX512-NOINFS:       # %bb.0:
868; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1
869; AVX512-NOINFS-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1
870; AVX512-NOINFS-NEXT:    retq
871  %t1 = fsub nsz <16 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %t
872  %tx = fmul nsz <16 x float> %x, %t
873  %ty = fmul nsz <16 x float> %y, %t1
874  %r = fadd nsz <16 x float> %tx, %ty
875  ret <16 x float> %r
876}
877
; Linear interpolation on <8 x double>, written as x*t + y*(1.0 - t) with
; every operation flagged nsz. In the default run lines the expected code
; keeps the subtract from 1.0 and the multiply by y, and only the final
; x*t + (...) becomes an fmadd. In the run lines that pass
; -enable-no-infs-fp-math two chained fmsub instructions per register are
; expected, first (t * y) - y, then (t * x) - that result.
878define <8 x double> @test_v8f64_interp(<8 x double> %x, <8 x double> %y, <8 x double> %t) {
879; FMA-INFS-LABEL: test_v8f64_interp:
880; FMA-INFS:       # %bb.0:
881; FMA-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
882; FMA-INFS-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
883; FMA-INFS-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
884; FMA-INFS-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
885; FMA-INFS-NEXT:    vmulpd %ymm7, %ymm2, %ymm2
886; FMA-INFS-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm4 * ymm0) + ymm2
887; FMA-INFS-NEXT:    vfmadd213pd {{.*#+}} ymm1 = (ymm5 * ymm1) + ymm3
888; FMA-INFS-NEXT:    retq
889;
890; FMA4-INFS-LABEL: test_v8f64_interp:
891; FMA4-INFS:       # %bb.0:
892; FMA4-INFS-NEXT:    vbroadcastsd {{.*#+}} ymm6 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
893; FMA4-INFS-NEXT:    vsubpd %ymm4, %ymm6, %ymm7
894; FMA4-INFS-NEXT:    vsubpd %ymm5, %ymm6, %ymm6
895; FMA4-INFS-NEXT:    vmulpd %ymm6, %ymm3, %ymm3
896; FMA4-INFS-NEXT:    vmulpd %ymm7, %ymm2, %ymm2
897; FMA4-INFS-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm4) + ymm2
898; FMA4-INFS-NEXT:    vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm5) + ymm3
899; FMA4-INFS-NEXT:    retq
900;
901; AVX512-INFS-LABEL: test_v8f64_interp:
902; AVX512-INFS:       # %bb.0:
903; AVX512-INFS-NEXT:    vbroadcastsd {{.*#+}} zmm3 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
904; AVX512-INFS-NEXT:    vsubpd %zmm2, %zmm3, %zmm3
905; AVX512-INFS-NEXT:    vmulpd %zmm3, %zmm1, %zmm1
906; AVX512-INFS-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm2 * zmm0) + zmm1
907; AVX512-INFS-NEXT:    retq
908;
909; FMA-NOINFS-LABEL: test_v8f64_interp:
910; FMA-NOINFS:       # %bb.0:
911; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3
912; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2
913; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm0 = (ymm4 * ymm0) - ymm2
914; FMA-NOINFS-NEXT:    vfmsub213pd {{.*#+}} ymm1 = (ymm5 * ymm1) - ymm3
915; FMA-NOINFS-NEXT:    retq
916;
917; FMA4-NOINFS-LABEL: test_v8f64_interp:
918; FMA4-NOINFS:       # %bb.0:
919; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm3 = (ymm5 * ymm3) - ymm3
920; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm2 = (ymm4 * ymm2) - ymm2
921; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm0 = (ymm0 * ymm4) - ymm2
922; FMA4-NOINFS-NEXT:    vfmsubpd {{.*#+}} ymm1 = (ymm1 * ymm5) - ymm3
923; FMA4-NOINFS-NEXT:    retq
924;
925; AVX512-NOINFS-LABEL: test_v8f64_interp:
926; AVX512-NOINFS:       # %bb.0:
927; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} zmm1 = (zmm2 * zmm1) - zmm1
928; AVX512-NOINFS-NEXT:    vfmsub213pd {{.*#+}} zmm0 = (zmm2 * zmm0) - zmm1
929; AVX512-NOINFS-NEXT:    retq
930  %t1 = fsub nsz <8 x double> <double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0, double 1.0>, %t
931  %tx = fmul nsz <8 x double> %x, %t
932  %ty = fmul nsz <8 x double> %y, %t1
933  %r = fadd nsz <8 x double> %tx, %ty
934  ret <8 x double> %r
935}
936
937;
938; Pattern: (fneg (fma x, y, z)) -> (fma x, -y, -z)
939;
940
; Negates (a0 * a1) + a2 on <16 x float>. The negation is spelled as
; fsub -0.0, X and every operation is flagged nsz; the function also uses
; attribute group #0. All run lines expect the negation to be absorbed into
; one fnmsub per register, computing -(a0 * a1) - a2.
941define <16 x float> @test_v16f32_fneg_fmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
942; FMA-LABEL: test_v16f32_fneg_fmadd:
943; FMA:       # %bb.0:
944; FMA-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm4
945; FMA-NEXT:    vfnmsub213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm5
946; FMA-NEXT:    retq
947;
948; FMA4-LABEL: test_v16f32_fneg_fmadd:
949; FMA4:       # %bb.0:
950; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm4
951; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm5
952; FMA4-NEXT:    retq
953;
954; AVX512-LABEL: test_v16f32_fneg_fmadd:
955; AVX512:       # %bb.0:
956; AVX512-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
957; AVX512-NEXT:    retq
958  %mul = fmul nsz <16 x float> %a0, %a1
959  %add = fadd nsz <16 x float> %mul, %a2
960  %neg = fsub nsz <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %add
961  ret <16 x float> %neg
962}
963
; Negates (a0 * a1) - a2 on <8 x double>. The negation is spelled as
; fsub -0.0, X and every operation is flagged nsz; the function also uses
; attribute group #0. All run lines expect the negation to be absorbed into
; one fnmadd per register, computing -(a0 * a1) + a2.
964define <8 x double> @test_v8f64_fneg_fmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
965; FMA-LABEL: test_v8f64_fneg_fmsub:
966; FMA:       # %bb.0:
967; FMA-NEXT:    vfnmadd213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) + ymm4
968; FMA-NEXT:    vfnmadd213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) + ymm5
969; FMA-NEXT:    retq
970;
971; FMA4-LABEL: test_v8f64_fneg_fmsub:
972; FMA4:       # %bb.0:
973; FMA4-NEXT:    vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm4
974; FMA4-NEXT:    vfnmaddpd {{.*#+}} ymm1 = -(ymm1 * ymm3) + ymm5
975; FMA4-NEXT:    retq
976;
977; AVX512-LABEL: test_v8f64_fneg_fmsub:
978; AVX512:       # %bb.0:
979; AVX512-NEXT:    vfnmadd213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
980; AVX512-NEXT:    retq
981  %mul = fmul nsz <8 x double> %a0, %a1
982  %sub = fsub nsz <8 x double> %mul, %a2
983  %neg = fsub nsz <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %sub
984  ret <8 x double> %neg
985}
986
; Negates (-(a0 * a1)) + a2 on <16 x float>. Both negations are spelled as
; fsub -0.0, X and every operation is flagged nsz; the function also uses
; attribute group #0. All run lines expect the two negations to collapse
; into one fmsub per register, computing (a0 * a1) - a2.
987define <16 x float> @test_v16f32_fneg_fnmadd(<16 x float> %a0, <16 x float> %a1, <16 x float> %a2) #0 {
988; FMA-LABEL: test_v16f32_fneg_fnmadd:
989; FMA:       # %bb.0:
990; FMA-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm2 * ymm0) - ymm4
991; FMA-NEXT:    vfmsub213ps {{.*#+}} ymm1 = (ymm3 * ymm1) - ymm5
992; FMA-NEXT:    retq
993;
994; FMA4-LABEL: test_v16f32_fneg_fnmadd:
995; FMA4:       # %bb.0:
996; FMA4-NEXT:    vfmsubps {{.*#+}} ymm0 = (ymm0 * ymm2) - ymm4
997; FMA4-NEXT:    vfmsubps {{.*#+}} ymm1 = (ymm1 * ymm3) - ymm5
998; FMA4-NEXT:    retq
999;
1000; AVX512-LABEL: test_v16f32_fneg_fnmadd:
1001; AVX512:       # %bb.0:
1002; AVX512-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
1003; AVX512-NEXT:    retq
1004  %mul = fmul nsz <16 x float> %a0, %a1
1005  %neg0 = fsub nsz <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %mul
1006  %add = fadd nsz <16 x float> %neg0, %a2
1007  %neg1 = fsub nsz <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %add
1008  ret <16 x float> %neg1
1009}
1010
; Negates (-(a0 * a1)) - a2 on <8 x double>. Both negations are spelled as
; fsub -0.0, X and every operation is flagged nsz; the function also uses
; attribute group #0. All run lines expect the two negations to collapse
; into one fmadd per register, computing (a0 * a1) + a2.
1011define <8 x double> @test_v8f64_fneg_fnmsub(<8 x double> %a0, <8 x double> %a1, <8 x double> %a2) #0 {
1012; FMA-LABEL: test_v8f64_fneg_fnmsub:
1013; FMA:       # %bb.0:
1014; FMA-NEXT:    vfmadd213pd {{.*#+}} ymm0 = (ymm2 * ymm0) + ymm4
1015; FMA-NEXT:    vfmadd213pd {{.*#+}} ymm1 = (ymm3 * ymm1) + ymm5
1016; FMA-NEXT:    retq
1017;
1018; FMA4-LABEL: test_v8f64_fneg_fnmsub:
1019; FMA4:       # %bb.0:
1020; FMA4-NEXT:    vfmaddpd {{.*#+}} ymm0 = (ymm0 * ymm2) + ymm4
1021; FMA4-NEXT:    vfmaddpd {{.*#+}} ymm1 = (ymm1 * ymm3) + ymm5
1022; FMA4-NEXT:    retq
1023;
1024; AVX512-LABEL: test_v8f64_fneg_fnmsub:
1025; AVX512:       # %bb.0:
1026; AVX512-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
1027; AVX512-NEXT:    retq
1028  %mul = fmul nsz <8 x double> %a0, %a1
1029  %neg0 = fsub nsz <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %mul
1030  %sub = fsub nsz <8 x double> %neg0, %a2
1031  %neg1 = fsub nsz <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %sub
1032  ret <8 x double> %neg1
1033}
1034
1035;
1036; Pattern: (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
1037;
1038
; Computes x*C1 + x*C2 on <16 x float>, where C1 and C2 are the two constant
; vectors below; the function uses attribute group #0. All run lines expect
; no fused multiply-add at all, only a single vmulps by a constant-pool
; operand per register. That matches folding to x * (C1 + C2) as named in
; the section header, although the value of the pooled constant is not
; itself checked.
1039define <16 x float> @test_v16f32_fma_x_c1_fmul_x_c2(<16 x float> %x) #0 {
1040; FMA-LABEL: test_v16f32_fma_x_c1_fmul_x_c2:
1041; FMA:       # %bb.0:
1042; FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1043; FMA-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1044; FMA-NEXT:    retq
1045;
1046; FMA4-LABEL: test_v16f32_fma_x_c1_fmul_x_c2:
1047; FMA4:       # %bb.0:
1048; FMA4-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1049; FMA4-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
1050; FMA4-NEXT:    retq
1051;
1052; AVX512-LABEL: test_v16f32_fma_x_c1_fmul_x_c2:
1053; AVX512:       # %bb.0:
1054; AVX512-NEXT:    vmulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1055; AVX512-NEXT:    retq
1056  %m0 = fmul <16 x float> %x, <float 17.0, float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0>
1057  %m1 = fmul <16 x float> %x, <float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0, float 1.0>
1058  %a  = fadd <16 x float> %m0, %m1
1059  ret <16 x float> %a
1060}
1061
1062;
1063; Pattern: (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
1064;
1065
; Computes ((x * C1) * C2) + y on <16 x float>, where C1 and C2 are the two
; constant vectors below; the function uses attribute group #0. All run
; lines expect the two constant multiplies to disappear into one fmadd per
; register with a memory operand, (x * mem) + y. That matches combining the
; constants as named in the section header, although the value of the
; memory operand is not itself checked.
1066define <16 x float> @test_v16f32_fma_fmul_x_c1_c2_y(<16 x float> %x, <16 x float> %y) #0 {
1067; FMA-LABEL: test_v16f32_fma_fmul_x_c1_c2_y:
1068; FMA:       # %bb.0:
1069; FMA-NEXT:    vfmadd132ps {{.*#+}} ymm0 = (ymm0 * mem) + ymm2
1070; FMA-NEXT:    vfmadd132ps {{.*#+}} ymm1 = (ymm1 * mem) + ymm3
1071; FMA-NEXT:    retq
1072;
1073; FMA4-LABEL: test_v16f32_fma_fmul_x_c1_c2_y:
1074; FMA4:       # %bb.0:
1075; FMA4-NEXT:    vfmaddps {{.*#+}} ymm0 = (ymm0 * mem) + ymm2
1076; FMA4-NEXT:    vfmaddps {{.*#+}} ymm1 = (ymm1 * mem) + ymm3
1077; FMA4-NEXT:    retq
1078;
1079; AVX512-LABEL: test_v16f32_fma_fmul_x_c1_c2_y:
1080; AVX512:       # %bb.0:
1081; AVX512-NEXT:    vfmadd132ps {{.*#+}} zmm0 = (zmm0 * mem) + zmm1
1082; AVX512-NEXT:    retq
1083  %m0 = fmul <16 x float> %x,  <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>
1084  %m1 = fmul <16 x float> %m0, <float 16.0, float 15.0, float 14.0, float 13.0, float 12.0, float 11.0, float 10.0, float 9.0, float 8.0, float 7.0, float 6.0, float 5.0, float 4.0, float 3.0, float 2.0, float 1.0>
1085  %a  = fadd <16 x float> %m1, %y
1086  ret <16 x float> %a
1087}
1088
1089; Pattern: (fneg (fmul x, y)) -> (fnmsub x, y, 0), formed only when the fmul carries nsz
1090
; Negates x * y on <16 x float>. The fmul is flagged nsz, the negation is
; spelled as fsub -0.0, X without nsz, and the function uses attribute
; group #0. All run lines expect a zeroed register (vxorps) followed by one
; fnmsub per register, computing -(x * y) - 0.
1091define <16 x float> @test_v16f32_fneg_fmul(<16 x float> %x, <16 x float> %y) #0 {
1092; FMA-LABEL: test_v16f32_fneg_fmul:
1093; FMA:       # %bb.0:
1094; FMA-NEXT:    vxorps %xmm4, %xmm4, %xmm4
1095; FMA-NEXT:    vfnmsub213ps {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm4
1096; FMA-NEXT:    vfnmsub213ps {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm4
1097; FMA-NEXT:    retq
1098;
1099; FMA4-LABEL: test_v16f32_fneg_fmul:
1100; FMA4:       # %bb.0:
1101; FMA4-NEXT:    vxorps %xmm4, %xmm4, %xmm4
1102; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm4
1103; FMA4-NEXT:    vfnmsubps {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm4
1104; FMA4-NEXT:    retq
1105;
1106; AVX512-LABEL: test_v16f32_fneg_fmul:
1107; AVX512:       # %bb.0:
1108; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
1109; AVX512-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
1110; AVX512-NEXT:    retq
1111  %m = fmul nsz <16 x float> %x, %y
1112  %n = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %m
1113  ret <16 x float> %n
1114}
1115
; Negates x * y on <8 x double>. The fmul is flagged nsz, the negation is
; spelled as fsub -0.0, X without nsz, and the function uses attribute
; group #0. All run lines expect a zeroed register (vxorpd) followed by one
; fnmsub per register, computing -(x * y) - 0.
1116define <8 x double> @test_v8f64_fneg_fmul(<8 x double> %x, <8 x double> %y) #0 {
1117; FMA-LABEL: test_v8f64_fneg_fmul:
1118; FMA:       # %bb.0:
1119; FMA-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
1120; FMA-NEXT:    vfnmsub213pd {{.*#+}} ymm0 = -(ymm2 * ymm0) - ymm4
1121; FMA-NEXT:    vfnmsub213pd {{.*#+}} ymm1 = -(ymm3 * ymm1) - ymm4
1122; FMA-NEXT:    retq
1123;
1124; FMA4-LABEL: test_v8f64_fneg_fmul:
1125; FMA4:       # %bb.0:
1126; FMA4-NEXT:    vxorpd %xmm4, %xmm4, %xmm4
1127; FMA4-NEXT:    vfnmsubpd {{.*#+}} ymm0 = -(ymm0 * ymm2) - ymm4
1128; FMA4-NEXT:    vfnmsubpd {{.*#+}} ymm1 = -(ymm1 * ymm3) - ymm4
1129; FMA4-NEXT:    retq
1130;
1131; AVX512-LABEL: test_v8f64_fneg_fmul:
1132; AVX512:       # %bb.0:
1133; AVX512-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1134; AVX512-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
1135; AVX512-NEXT:    retq
1136  %m = fmul nsz <8 x double> %x, %y
1137  %n = fsub <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %m
1138  ret <8 x double> %n
1139}
1140
; Negative counterpart of the nsz fneg-of-fmul tests: negates x * y on
; <8 x double>, but here the fmul carries no nsz flag (the function still
; uses attribute group #0). All run lines expect no fused instruction, only
; a plain vmulpd followed by a vxorpd with a -0.0 constant to flip the sign
; bit.
1141define <8 x double> @test_v8f64_fneg_fmul_no_nsz(<8 x double> %x, <8 x double> %y) #0 {
1142; FMA-LABEL: test_v8f64_fneg_fmul_no_nsz:
1143; FMA:       # %bb.0:
1144; FMA-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
1145; FMA-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
1146; FMA-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1147; FMA-NEXT:    vxorpd %ymm2, %ymm0, %ymm0
1148; FMA-NEXT:    vxorpd %ymm2, %ymm1, %ymm1
1149; FMA-NEXT:    retq
1150;
1151; FMA4-LABEL: test_v8f64_fneg_fmul_no_nsz:
1152; FMA4:       # %bb.0:
1153; FMA4-NEXT:    vmulpd %ymm3, %ymm1, %ymm1
1154; FMA4-NEXT:    vmulpd %ymm2, %ymm0, %ymm0
1155; FMA4-NEXT:    vbroadcastsd {{.*#+}} ymm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1156; FMA4-NEXT:    vxorpd %ymm2, %ymm0, %ymm0
1157; FMA4-NEXT:    vxorpd %ymm2, %ymm1, %ymm1
1158; FMA4-NEXT:    retq
1159;
1160; AVX512-LABEL: test_v8f64_fneg_fmul_no_nsz:
1161; AVX512:       # %bb.0:
1162; AVX512-NEXT:    vmulpd %zmm1, %zmm0, %zmm0
1163; AVX512-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
1164; AVX512-NEXT:    retq
1165  %m = fmul <8 x double> %x, %y
1166  %n = fsub <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %m
1167  ret <8 x double> %n
1168}
1169
; Attribute group #0 sets "unsafe-fp-math"="true"; it applies to every
; function in this file whose define line ends in #0.
1170attributes #0 = { "unsafe-fp-math"="true" }
1171