xref: /llvm-project/llvm/test/CodeGen/X86/fma-fneg-combine.ll (revision 9be6e7b0f2496efa76229f70654ebe6494dc2cf9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512bw -mattr=+avx512vl -mattr=+avx512dq  | FileCheck %s  --check-prefix=CHECK --check-prefix=SKX
3; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f -mattr=+fma | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
4
5; This test checks combinations of FNEG and FMA intrinsics on AVX-512 targets.
6; PR28892
7
; Declarations of the FMA intrinsics exercised by the functions in this file.
; The generic llvm.fma.* forms take only the three vector operands. The
; llvm.x86.avx512.* forms take a trailing i32 operand; judging by the expected
; assembly in this file, 4 yields the plain (no embedded rounding) instruction,
; 8 yields {rn-sae}, 9 yields {rd-sae} and 10 yields {ru-sae}. The "mask" and
; "mask3" forms additionally take an integer mask operand before that i32.
8declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
9declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)
10declare <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i32)
11declare <16 x float> @llvm.x86.avx512.mask.vfnmadd.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
12declare <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i16, i32)
13declare <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>)
14declare <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32)
15declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8, i32)
16declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
17declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
18declare <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float>, <16 x float>, <16 x float>, i32)
19declare <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double>, <8 x double>, <8 x double>, i32)
20
; test1: the addend %c is negated with fneg and passed to an unmasked 512-bit
; llvm.x86.avx512.vfmadd.ps.512 call (rounding operand 4). The expected output
; is a single vfmsub213ps, i.e. the negation is absorbed into the FMA opcode.
21define <16 x float> @test1(<16 x float> %a, <16 x float> %b, <16 x float> %c)  {
22; CHECK-LABEL: test1:
23; CHECK:       # %bb.0:
24; CHECK-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
25; CHECK-NEXT:    retq
26  %sub.i = fneg <16 x float> %c
27  %t0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %sub.i, i32 4)
28  ret <16 x float> %t0
29}
30
; test2: the result of llvm.fma.v16f32 is negated with fneg, and the call has
; no fast-math flags. The expected output keeps vfmadd213ps and performs the
; negation afterwards with a separate xor against a broadcast constant-pool
; value (vxorps with the avx512dq run line, vpxord with the avx512f+fma one).
31define <16 x float> @test2(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
32; SKX-LABEL: test2:
33; SKX:       # %bb.0:
34; SKX-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
35; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
36; SKX-NEXT:    retq
37;
38; KNL-LABEL: test2:
39; KNL:       # %bb.0:
40; KNL-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
41; KNL-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
42; KNL-NEXT:    retq
43  %fma = call <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c)
44  %neg = fneg <16 x float> %fma
45  ret <16 x float> %neg
46}
47
; test2_nsz: llvm.fma.v16f32 followed by fneg of its result, with the call
; marked nsz. With that flag the expected output is a single vfnmsub213ps and
; no trailing xor.
48define <16 x float> @test2_nsz(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
49; CHECK-LABEL: test2_nsz:
50; CHECK:       # %bb.0:
51; CHECK-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
52; CHECK-NEXT:    retq
53  %fma = call nsz <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c)
54  %neg = fneg <16 x float> %fma
55  ret <16 x float> %neg
56}
57
; test3: the multiplicand %b is negated before llvm.fma.v16f32 and the FMA
; result is negated again; the call has no fast-math flags. The expected output
; folds only the operand negation (vfnmadd213ps) and keeps the result negation
; as a separate xor against a broadcast constant-pool value.
58define <16 x float> @test3(<16 x float> %a, <16 x float> %b, <16 x float> %c)  {
59; SKX-LABEL: test3:
60; SKX:       # %bb.0:
61; SKX-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
62; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
63; SKX-NEXT:    retq
64;
65; KNL-LABEL: test3:
66; KNL:       # %bb.0:
67; KNL-NEXT:    vfnmadd213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) + zmm2
68; KNL-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
69; KNL-NEXT:    retq
70  %t0 = fneg <16 x float> %b
71  %t1 = call <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %t0, <16 x float> %c)
72  %sub.i = fneg <16 x float> %t1
73  ret <16 x float> %sub.i
74}
75
; test3_nsz: negated multiplicand %b and negated FMA result, with the
; llvm.fma.v16f32 call marked nsz. Both negations are expected to fold into a
; single vfmsub213ps.
76define <16 x float> @test3_nsz(<16 x float> %a, <16 x float> %b, <16 x float> %c)  {
77; CHECK-LABEL: test3_nsz:
78; CHECK:       # %bb.0:
79; CHECK-NEXT:    vfmsub213ps {{.*#+}} zmm0 = (zmm1 * zmm0) - zmm2
80; CHECK-NEXT:    retq
81  %t0 = fneg <16 x float> %b
82  %t1 = call nsz <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %t0, <16 x float> %c)
83  %sub.i = fneg <16 x float> %t1
84  ret <16 x float> %sub.i
85}
86
; test4: both %b and %c are negated before llvm.fma.v16f32 (no fast-math
; flags), and the result is negated by subtracting it from a splat of -0.0.
; The expected output folds the operand negations (vfnmsub213ps) and keeps the
; result negation as a separate xor against a broadcast constant-pool value.
87define <16 x float> @test4(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
88; SKX-LABEL: test4:
89; SKX:       # %bb.0:
90; SKX-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
91; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
92; SKX-NEXT:    retq
93;
94; KNL-LABEL: test4:
95; KNL:       # %bb.0:
96; KNL-NEXT:    vfnmsub213ps {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
97; KNL-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
98; KNL-NEXT:    retq
99  %t0 = fneg <16 x float> %b
100  %t1 = fneg <16 x float> %c
101  %t2 = call <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %t0, <16 x float> %t1)
102  %sub.i = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %t2
103  ret <16 x float> %sub.i
104}
105
; test4_nsz: negated %b and %c feed an nsz-marked llvm.fma.v16f32 call whose
; result is negated via fsub from a splat of -0.0. All three negations are
; expected to cancel into a plain vfmadd213ps.
106define <16 x float> @test4_nsz(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
107; CHECK-LABEL: test4_nsz:
108; CHECK:       # %bb.0:
109; CHECK-NEXT:    vfmadd213ps {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
110; CHECK-NEXT:    retq
111  %t0 = fneg <16 x float> %b
112  %t1 = fneg <16 x float> %c
113  %t2 = call nsz <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %t0, <16 x float> %t1)
114  %sub.i = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %t2
115  ret <16 x float> %sub.i
116}
117
; test5: the addend %c is negated via fsub from a splat of -0.0 and passed to
; llvm.x86.avx512.vfmadd.ps.512 with rounding operand 10. The expected output
; is a single vfmsub213ps carrying the {ru-sae} embedded-rounding annotation.
118define <16 x float> @test5(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
119; CHECK-LABEL: test5:
120; CHECK:       # %bb.0: # %entry
121; CHECK-NEXT:    vfmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0
122; CHECK-NEXT:    retq
123entry:
124  %sub.i = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %c
125  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %sub.i, i32 10) #2
126  ret <16 x float> %0
127}
128
; test6: negated %b and %c feed llvm.x86.avx512.vfmadd.ps.512 with rounding
; operand 10 (no fast-math flags), and the result is negated via fsub from a
; splat of -0.0. The expected output is vfnmsub213ps {ru-sae} followed by a
; separate xor against a broadcast constant-pool value for the result negation.
129define <16 x float> @test6(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
130; SKX-LABEL: test6:
131; SKX:       # %bb.0:
132; SKX-NEXT:    vfnmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0
133; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
134; SKX-NEXT:    retq
135;
136; KNL-LABEL: test6:
137; KNL:       # %bb.0:
138; KNL-NEXT:    vfnmsub213ps {ru-sae}, %zmm2, %zmm1, %zmm0
139; KNL-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
140; KNL-NEXT:    retq
141  %t0 = fneg <16 x float> %b
142  %t1 = fneg <16 x float> %c
143  %t2 = call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %t0, <16 x float> %t1, i32 10)
144  %sub.i = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %t2
145  ret <16 x float> %sub.i
146}
147
; test6_nsz: negated %b and %c feed an nsz-marked
; llvm.x86.avx512.vfmadd.ps.512 call with rounding operand 10, and the result
; is negated via fsub from a splat of -0.0. The expected output is a single
; vfmadd213ps {ru-sae}.
148define <16 x float> @test6_nsz(<16 x float> %a, <16 x float> %b, <16 x float> %c) {
149; CHECK-LABEL: test6_nsz:
150; CHECK:       # %bb.0:
151; CHECK-NEXT:    vfmadd213ps {ru-sae}, %zmm2, %zmm1, %zmm0
152; CHECK-NEXT:    retq
153  %t0 = fneg <16 x float> %b
154  %t1 = fneg <16 x float> %c
155  %t2 = call nsz <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %t0, <16 x float> %t1, i32 10)
156  %sub.i = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %t2
157  ret <16 x float> %sub.i
158}
159
; test7: 256-bit case. The addend %c is negated before llvm.fma.v8f32 (no
; fast-math flags) and the result is negated via fsub from a splat of -0.0.
; The expected output is vfmsub213ps on ymm registers plus a separate xor: a
; broadcast memory operand with the avx512vl run line, and a vbroadcastss of
; -0.0 into a register followed by vxorps with the avx512f+fma run line.
160define <8 x float> @test7(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
161; SKX-LABEL: test7:
162; SKX:       # %bb.0:
163; SKX-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
164; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
165; SKX-NEXT:    retq
166;
167; KNL-LABEL: test7:
168; KNL:       # %bb.0:
169; KNL-NEXT:    vfmsub213ps {{.*#+}} ymm0 = (ymm1 * ymm0) - ymm2
170; KNL-NEXT:    vbroadcastss {{.*#+}} ymm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
171; KNL-NEXT:    vxorps %ymm1, %ymm0, %ymm0
172; KNL-NEXT:    retq
173  %t0 = fneg <8 x float> %c
174  %t1 = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %t0)
175  %sub.i = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %t1
176  ret <8 x float> %sub.i
177}
178
; test7_nsz: 256-bit case with the llvm.fma.v8f32 call marked nsz. The negated
; addend and the negated result are expected to fold into a single
; vfnmadd213ps on ymm registers.
179define <8 x float> @test7_nsz(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
180; CHECK-LABEL: test7_nsz:
181; CHECK:       # %bb.0:
182; CHECK-NEXT:    vfnmadd213ps {{.*#+}} ymm0 = -(ymm1 * ymm0) + ymm2
183; CHECK-NEXT:    retq
184  %t0 = fneg <8 x float> %c
185  %t1 = call nsz <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %t0)
186  %sub.i = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %t1
187  ret <8 x float> %sub.i
188}
189
; test8: the third operand of the 256-bit llvm.x86.fma.vfmsub.ps.256 intrinsic
; is %c negated via fsub from a splat of -0.0. Subtracting the negated value
; is expected to be emitted as a plain vfmadd213ps.
190define <8 x float> @test8(<8 x float> %a, <8 x float> %b, <8 x float> %c) {
191; CHECK-LABEL: test8:
192; CHECK:       # %bb.0: # %entry
193; CHECK-NEXT:    vfmadd213ps {{.*#+}} ymm0 = (ymm1 * ymm0) + ymm2
194; CHECK-NEXT:    retq
195entry:
196  %sub.c = fsub <8 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %c
197  %0 = tail call <8 x float> @llvm.x86.fma.vfmsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %sub.c) #2
198  ret <8 x float> %0
199}
200
; test9: double-precision case. The result of llvm.x86.avx512.vfmadd.pd.512
; (rounding operand 4, no fast-math flags) is negated with fneg. The expected
; output keeps vfmadd213pd and negates with a separate xor against a broadcast
; constant-pool value (vxorpd with avx512dq, vpxorq with avx512f+fma).
201define <8 x double> @test9(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
202; SKX-LABEL: test9:
203; SKX:       # %bb.0:
204; SKX-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
205; SKX-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
206; SKX-NEXT:    retq
207;
208; KNL-LABEL: test9:
209; KNL:       # %bb.0:
210; KNL-NEXT:    vfmadd213pd {{.*#+}} zmm0 = (zmm1 * zmm0) + zmm2
211; KNL-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
212; KNL-NEXT:    retq
213  %t0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 4)
214  %sub.i = fneg <8 x double> %t0
215  ret <8 x double> %sub.i
216}
217
; test9_nsz: the result of an nsz-marked llvm.x86.avx512.vfmadd.pd.512 call
; (rounding operand 4) is negated with fneg. The expected output is a single
; vfnmsub213pd with no trailing xor.
218define <8 x double> @test9_nsz(<8 x double> %a, <8 x double> %b, <8 x double> %c) {
219; CHECK-LABEL: test9_nsz:
220; CHECK:       # %bb.0:
221; CHECK-NEXT:    vfnmsub213pd {{.*#+}} zmm0 = -(zmm1 * zmm0) - zmm2
222; CHECK-NEXT:    retq
223  %t0 = tail call nsz <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 4)
224  %sub.i = fneg <8 x double> %t0
225  ret <8 x double> %sub.i
226}
227
; test10: scalar double case. llvm.x86.avx512.mask.vfmadd.sd is called with an
; all-ones mask (i8 -1) and rounding operand 4, and the whole <2 x double>
; result is negated via fsub from a splat of -0.0. The expected output keeps
; vfmadd213sd and negates with a separate vxorpd.
228define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
229; SKX-LABEL: test10:
230; SKX:       # %bb.0: # %entry
231; SKX-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
232; SKX-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
233; SKX-NEXT:    retq
234;
235; KNL-LABEL: test10:
236; KNL:       # %bb.0: # %entry
237; KNL-NEXT:    vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
238; KNL-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
239; KNL-NEXT:    retq
240entry:
241  %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1, i32 4) #2
242  %sub.i = fsub <2 x double> <double -0.0, double -0.0>, %0
243  ret <2 x double> %sub.i
244}
245
; test11: llvm.x86.avx512.mask3.vfmadd.ss with a variable mask, where the third
; operand is %c negated via fsub from a splat of -0.0. In the expected output
; the negated %c is still materialized with an xor into xmm3, an unmasked
; vfmsub213ss computes the FMA, and a masked vmovss merges that result into
; xmm3 under %k1 before it is copied to xmm0.
246define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
247; SKX-LABEL: test11:
248; SKX:       # %bb.0: # %entry
249; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm3
250; SKX-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
251; SKX-NEXT:    kmovd %edi, %k1
252; SKX-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k1}
253; SKX-NEXT:    vmovaps %xmm3, %xmm0
254; SKX-NEXT:    retq
255;
256; KNL-LABEL: test11:
257; KNL:       # %bb.0: # %entry
258; KNL-NEXT:    vbroadcastss {{.*#+}} xmm3 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
259; KNL-NEXT:    vxorps %xmm3, %xmm2, %xmm3
260; KNL-NEXT:    vfmsub213ss {{.*#+}} xmm0 = (xmm1 * xmm0) - xmm2
261; KNL-NEXT:    kmovw %edi, %k1
262; KNL-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k1}
263; KNL-NEXT:    vmovaps %xmm3, %xmm0
264; KNL-NEXT:    retq
265entry:
266  %sub.i = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %c
267  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
268  ret <4 x float> %0
269}
270
; test11b: llvm.x86.avx512.mask.vfmadd.ss with a variable mask and the third
; operand negated via fsub from a splat of -0.0. The expected output is a
; single masked vfmsub213ss under %k1, with no separate xor.
271define <4 x float> @test11b(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
272; SKX-LABEL: test11b:
273; SKX:       # %bb.0: # %entry
274; SKX-NEXT:    kmovd %edi, %k1
275; SKX-NEXT:    vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
276; SKX-NEXT:    retq
277;
278; KNL-LABEL: test11b:
279; KNL:       # %bb.0: # %entry
280; KNL-NEXT:    kmovw %edi, %k1
281; KNL-NEXT:    vfmsub213ss {{.*#+}} xmm0 {%k1} = (xmm1 * xmm0) - xmm2
282; KNL-NEXT:    retq
283entry:
284  %sub.i = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %c
285  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 4) #10
286  ret <4 x float> %0
287}
288
; test12: an unmasked llvm.x86.avx512.vfmadd.pd.512 result is blended with %a
; through a select on the bitcast mask, and the selected value is then negated
; via fsub from a splat of -0.0. The expected output is a masked vfmadd132pd
; under %k1 followed by a separate xor of the whole register.
289define <8 x double> @test12(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
290; SKX-LABEL: test12:
291; SKX:       # %bb.0: # %entry
292; SKX-NEXT:    kmovd %edi, %k1
293; SKX-NEXT:    vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
294; SKX-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
295; SKX-NEXT:    retq
296;
297; KNL-LABEL: test12:
298; KNL:       # %bb.0: # %entry
299; KNL-NEXT:    kmovw %edi, %k1
300; KNL-NEXT:    vfmadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) + zmm2
301; KNL-NEXT:    vpxorq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
302; KNL-NEXT:    retq
303entry:
304  %0 = tail call <8 x double> @llvm.x86.avx512.vfmadd.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %c, i32 4) #2
305  %bc = bitcast i8 %mask to <8 x i1>
306  %sel = select <8 x i1> %bc, <8 x double> %0, <8 x double> %a
307  %sub.i = fsub <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %sel
308  ret <8 x double> %sub.i
309}
310
; test13: llvm.x86.avx512.mask.vfmadd.sd with a variable mask, where the first
; operand is %a negated via fsub from a splat of -0.0. In the expected output
; the negated %a is still materialized with vxorpd into xmm3, an unmasked
; vfnmadd213sd computes the FMA, and a masked vmovsd merges that result into
; xmm3 under %k1 before it is copied to xmm0.
311define <2 x double> @test13(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
312; SKX-LABEL: test13:
313; SKX:       # %bb.0: # %entry
314; SKX-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm3
315; SKX-NEXT:    vfnmadd213sd {{.*#+}} xmm1 = -(xmm0 * xmm1) + xmm2
316; SKX-NEXT:    kmovd %edi, %k1
317; SKX-NEXT:    vmovsd %xmm1, %xmm3, %xmm3 {%k1}
318; SKX-NEXT:    vmovapd %xmm3, %xmm0
319; SKX-NEXT:    retq
320;
321; KNL-LABEL: test13:
322; KNL:       # %bb.0: # %entry
323; KNL-NEXT:    vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
324; KNL-NEXT:    vfnmadd213sd {{.*#+}} xmm1 = -(xmm0 * xmm1) + xmm2
325; KNL-NEXT:    kmovw %edi, %k1
326; KNL-NEXT:    vmovsd %xmm1, %xmm3, %xmm3 {%k1}
327; KNL-NEXT:    vmovapd %xmm3, %xmm0
328; KNL-NEXT:    retq
329entry:
330  %sub.i = fsub <2 x double> <double -0.0, double -0.0>, %a
331  %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %sub.i, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
332  ret <2 x double> %0
333}
334
; test14: the result of llvm.x86.avx512.mask.vfnmsub.ps.512 (variable mask,
; rounding operand 10) is negated via fsub from a splat of -0.0. The expected
; output is a masked vfnmsub132ps {ru-sae} under %k1 followed by a separate xor
; of the whole register.
335define <16 x float> @test14(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
336; SKX-LABEL: test14:
337; SKX:       # %bb.0: # %entry
338; SKX-NEXT:    kmovd %edi, %k1
339; SKX-NEXT:    vfnmsub132ps {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
340; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
341; SKX-NEXT:    retq
342;
343; KNL-LABEL: test14:
344; KNL:       # %bb.0: # %entry
345; KNL-NEXT:    kmovw %edi, %k1
346; KNL-NEXT:    vfnmsub132ps {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
347; KNL-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
348; KNL-NEXT:    retq
349entry:
350  %0 = tail call <16 x float> @llvm.x86.avx512.mask.vfnmsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask, i32 10) #2
351  %sub.i = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %0
352  ret <16 x float> %sub.i
353}
354
; test15: the negated %a (fsub from a splat of -0.0) has several uses. It is
; the first multiplicand of a vfmadd.ps.512 call with rounding operand 10, the
; fallback value of the first select, and the second multiplicand of a second
; vfmadd.ps.512 call with rounding operand 9 whose result is selected again
; under the same mask. In the expected output the negated value is
; materialized once with an xor into zmm3, the first FMA becomes an unmasked
; vfnmadd213ps {ru-sae} merged by a masked vmovaps, and the second becomes a
; masked vfnmadd132ps {rd-sae}.
355define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask)  {
356; SKX-LABEL: test15:
357; SKX:       # %bb.0: # %entry
358; SKX-NEXT:    kmovd %edi, %k1
359; SKX-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
360; SKX-NEXT:    vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
361; SKX-NEXT:    vmovaps %zmm1, %zmm3 {%k1}
362; SKX-NEXT:    vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1}
363; SKX-NEXT:    vmovaps %zmm3, %zmm0
364; SKX-NEXT:    retq
365;
366; KNL-LABEL: test15:
367; KNL:       # %bb.0: # %entry
368; KNL-NEXT:    kmovw %edi, %k1
369; KNL-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm3
370; KNL-NEXT:    vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
371; KNL-NEXT:    vmovaps %zmm1, %zmm3 {%k1}
372; KNL-NEXT:    vfnmadd132ps {rd-sae}, %zmm0, %zmm2, %zmm3 {%k1}
373; KNL-NEXT:    vmovaps %zmm3, %zmm0
374; KNL-NEXT:    retq
375entry:
376  %bc = bitcast i16 %mask to <16 x i1>
377  %sub.i = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %a
378  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sub.i, <16 x float> %b, <16 x float> %c, i32 10)
379  %sel = select <16 x i1> %bc, <16 x float> %0, <16 x float> %sub.i
380  %1 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %sel, <16 x float> %sub.i, <16 x float> %c, i32 9)
381  %sel2 = select <16 x i1> %bc, <16 x float> %1, <16 x float> %sel
382  ret <16 x float> %sel2
383}
384
; test16: llvm.x86.avx512.vfmaddsub.ps.512 with the third operand negated via
; fsub from a splat of -0.0 and rounding operand 9; the result is blended with
; %a through a select on the bitcast mask. The expected output is a single
; masked vfmsubadd132ps {rd-sae} under %k1.
385define <16 x float> @test16(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
386; SKX-LABEL: test16:
387; SKX:       # %bb.0:
388; SKX-NEXT:    kmovd %edi, %k1
389; SKX-NEXT:    vfmsubadd132ps {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
390; SKX-NEXT:    retq
391;
392; KNL-LABEL: test16:
393; KNL:       # %bb.0:
394; KNL-NEXT:    kmovw %edi, %k1
395; KNL-NEXT:    vfmsubadd132ps {rd-sae}, %zmm1, %zmm2, %zmm0 {%k1}
396; KNL-NEXT:    retq
397  %sub.i = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %c
398  %res = call <16 x float> @llvm.x86.avx512.vfmaddsub.ps.512(<16 x float> %a, <16 x float> %b, <16 x float> %sub.i, i32 9)
399  %bc = bitcast i16 %mask to <16 x i1>
400  %sel = select <16 x i1> %bc, <16 x float> %res, <16 x float> %a
401  ret <16 x float> %sel
402}
403
; test17: llvm.x86.avx512.vfmaddsub.pd.512 with the third operand negated via
; fsub from a splat of -0.0 and rounding operand 4; the result is blended with
; %a through a select on the bitcast mask. The expected output is a single
; masked vfmsubadd132pd under %k1.
404define <8 x double> @test17(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
405; SKX-LABEL: test17:
406; SKX:       # %bb.0:
407; SKX-NEXT:    kmovd %edi, %k1
408; SKX-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
409; SKX-NEXT:    retq
410;
411; KNL-LABEL: test17:
412; KNL:       # %bb.0:
413; KNL-NEXT:    kmovw %edi, %k1
414; KNL-NEXT:    vfmsubadd132pd {{.*#+}} zmm0 {%k1} = (zmm0 * zmm1) -/+ zmm2
415; KNL-NEXT:    retq
416  %sub.i = fsub <8 x double> <double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0, double -0.0>, %c
417  %res = call <8 x double> @llvm.x86.avx512.vfmaddsub.pd.512(<8 x double> %a, <8 x double> %b, <8 x double> %sub.i, i32 4)
418  %bc = bitcast i8 %mask to <8 x i1>
419  %sel = select <8 x i1> %bc, <8 x double> %res, <8 x double> %a
420  ret <8 x double> %sel
421}
422
; test18: llvm.x86.avx512.mask.vfmadd.ss with a variable mask and rounding
; operand 4, where the second multiplicand is %b negated via fsub from a splat
; of -0.0. The expected output is a single masked vfnmadd213ss under %k1.
423define <4 x float> @test18(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
424; SKX-LABEL: test18:
425; SKX:       # %bb.0: # %entry
426; SKX-NEXT:    kmovd %edi, %k1
427; SKX-NEXT:    vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
428; SKX-NEXT:    retq
429;
430; KNL-LABEL: test18:
431; KNL:       # %bb.0: # %entry
432; KNL-NEXT:    kmovw %edi, %k1
433; KNL-NEXT:    vfnmadd213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) + xmm2
434; KNL-NEXT:    retq
435entry:
436  %sub.i = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %b
437  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10
438  ret <4 x float> %0
439}
440
; test19: llvm.x86.avx512.mask.vfmadd.ss with a variable mask and rounding
; operand 4, where both %b and %c are negated via fsub from a splat of -0.0.
; The expected output is a single masked vfnmsub213ss under %k1.
441define <4 x float> @test19(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
442; SKX-LABEL: test19:
443; SKX:       # %bb.0: # %entry
444; SKX-NEXT:    kmovd %edi, %k1
445; SKX-NEXT:    vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
446; SKX-NEXT:    retq
447;
448; KNL-LABEL: test19:
449; KNL:       # %bb.0: # %entry
450; KNL-NEXT:    kmovw %edi, %k1
451; KNL-NEXT:    vfnmsub213ss {{.*#+}} xmm0 {%k1} = -(xmm1 * xmm0) - xmm2
452; KNL-NEXT:    retq
453entry:
454  %sub.i = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %b
455  %sub.i.2 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %c
456  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 4) #10
457  ret <4 x float> %0
458}
459
; test20: llvm.x86.avx512.mask3.vfmadd.ss with a variable mask and rounding
; operand 4, where the second multiplicand is %b negated via fsub from a splat
; of -0.0. The expected output is a masked vfnmadd231ss that accumulates into
; xmm2 under %k1, followed by a copy of xmm2 to xmm0.
460define <4 x float> @test20(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
461; SKX-LABEL: test20:
462; SKX:       # %bb.0: # %entry
463; SKX-NEXT:    kmovd %edi, %k1
464; SKX-NEXT:    vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
465; SKX-NEXT:    vmovaps %xmm2, %xmm0
466; SKX-NEXT:    retq
467;
468; KNL-LABEL: test20:
469; KNL:       # %bb.0: # %entry
470; KNL-NEXT:    kmovw %edi, %k1
471; KNL-NEXT:    vfnmadd231ss {{.*#+}} xmm2 {%k1} = -(xmm0 * xmm1) + xmm2
472; KNL-NEXT:    vmovaps %xmm2, %xmm0
473; KNL-NEXT:    retq
474entry:
475  %sub.i = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %b
476  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 4) #10
477  ret <4 x float> %0
478}
479
; test21: llvm.x86.avx512.mask.vfmadd.ss with a variable mask and rounding
; operand 8, where the second multiplicand is %b negated via fsub from a splat
; of -0.0. The expected output is a single masked vfnmadd213ss {rn-sae}.
480define <4 x float> @test21(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
481; SKX-LABEL: test21:
482; SKX:       # %bb.0: # %entry
483; SKX-NEXT:    kmovd %edi, %k1
484; SKX-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
485; SKX-NEXT:    retq
486;
487; KNL-LABEL: test21:
488; KNL:       # %bb.0: # %entry
489; KNL-NEXT:    kmovw %edi, %k1
490; KNL-NEXT:    vfnmadd213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
491; KNL-NEXT:    retq
492entry:
493  %sub.i = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %b
494  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10
495  ret <4 x float> %0
496}
497
; test22: llvm.x86.avx512.mask.vfmadd.ss with a variable mask and rounding
; operand 8, where both %b and %c are negated via fsub from a splat of -0.0.
; The expected output is a single masked vfnmsub213ss {rn-sae}.
498define <4 x float> @test22(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
499; SKX-LABEL: test22:
500; SKX:       # %bb.0: # %entry
501; SKX-NEXT:    kmovd %edi, %k1
502; SKX-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
503; SKX-NEXT:    retq
504;
505; KNL-LABEL: test22:
506; KNL:       # %bb.0: # %entry
507; KNL-NEXT:    kmovw %edi, %k1
508; KNL-NEXT:    vfnmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
509; KNL-NEXT:    retq
510entry:
511  %sub.i = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %b
512  %sub.i.2 = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %c
513  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %sub.i.2, i8 %mask, i32 8) #10
514  ret <4 x float> %0
515}
516
; test23: llvm.x86.avx512.mask3.vfmadd.ss with a variable mask and rounding
; operand 8, where the second multiplicand is %b negated via fsub from a splat
; of -0.0. The expected output is a masked vfnmadd231ss {rn-sae} that
; accumulates into xmm2 under %k1, followed by a copy of xmm2 to xmm0.
517define <4 x float> @test23(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
518; SKX-LABEL: test23:
519; SKX:       # %bb.0: # %entry
520; SKX-NEXT:    kmovd %edi, %k1
521; SKX-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
522; SKX-NEXT:    vmovaps %xmm2, %xmm0
523; SKX-NEXT:    retq
524;
525; KNL-LABEL: test23:
526; KNL:       # %bb.0: # %entry
527; KNL-NEXT:    kmovw %edi, %k1
528; KNL-NEXT:    vfnmadd231ss {rn-sae}, %xmm1, %xmm0, %xmm2 {%k1}
529; KNL-NEXT:    vmovaps %xmm2, %xmm0
530; KNL-NEXT:    retq
531entry:
532  %sub.i = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %b
533  %0 = tail call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %a, <4 x float> %sub.i, <4 x float> %c, i8 %mask, i32 8) #10
534  ret <4 x float> %0
535}
536
; test24: llvm.x86.avx512.mask.vfmadd.ss with a variable mask and rounding
; operand 8, where the third operand is %c negated via fsub from a splat of
; -0.0. The expected output is a single masked vfmsub213ss {rn-sae}.
537define <4 x float> @test24(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 zeroext %mask) local_unnamed_addr #0 {
538; SKX-LABEL: test24:
539; SKX:       # %bb.0: # %entry
540; SKX-NEXT:    kmovd %edi, %k1
541; SKX-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
542; SKX-NEXT:    retq
543;
544; KNL-LABEL: test24:
545; KNL:       # %bb.0: # %entry
546; KNL-NEXT:    kmovw %edi, %k1
547; KNL-NEXT:    vfmsub213ss {rn-sae}, %xmm2, %xmm1, %xmm0 {%k1}
548; KNL-NEXT:    retq
549entry:
550  %sub.i = fsub <4 x float> <float -0.0, float -0.0, float -0.0, float -0.0>, %c
551  %0 = tail call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %a, <4 x float> %b, <4 x float> %sub.i, i8 %mask, i32 8) #10
552  ret <4 x float> %0
553}
554
; test25: unmasked llvm.x86.avx512.vfmadd.ps.512 with rounding operand 8,
; where both %b and %c are negated via fsub from a splat of -0.0. The expected
; output is a single vfnmsub213ps {rn-sae}.
555define <16 x float> @test25(<16 x float> %a, <16 x float> %b, <16 x float> %c)  {
556; CHECK-LABEL: test25:
557; CHECK:       # %bb.0: # %entry
558; CHECK-NEXT:    vfnmsub213ps {rn-sae}, %zmm2, %zmm1, %zmm0
559; CHECK-NEXT:    retq
560entry:
561  %sub.i = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %b
562  %sub.i.2 = fsub <16 x float> <float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0, float -0.0>, %c
563  %0 = tail call <16 x float> @llvm.x86.avx512.vfmadd.ps.512(<16 x float> %a, <16 x float> %sub.i, <16 x float> %sub.i.2, i32 8) #2
564  ret <16 x float> %0
565}
565}
566