; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown                  | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2     | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

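; X86 has no native vector-predication support, so the llvm.vp.* floating-point
; intrinsics below are expanded into ordinary vector FP operations (or
; scalarized libcalls where no instruction exists); the all-true masks and the
; EVL arguments have no effect on the generated code.
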
define void @vp_fadd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fadd_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fadd_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_fadd_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fsub_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fsub_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fsub_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    subps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_fsub_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fmul_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fmul_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fmul_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_fmul_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fmul.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fdiv_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fdiv_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vdivps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fdiv_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_fdiv_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fdiv.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

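; There is no vector frem instruction, so vp.frem is scalarized into calls to
; the fmodf libcall and the results are reassembled into a vector.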
define void @vp_frem_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_frem_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $80, %esp
; X86-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    vextractps $2, %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    calll fmodf
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, (%esp)
; X86-NEXT:    calll fmodf
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    calll fmodf
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, (%esp)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    calll fmodf
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X86-NEXT:    vmovaps %xmm0, (%esi)
; X86-NEXT:    addl $80, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; SSE-LABEL: vp_frem_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    pushq %rbx
; SSE-NEXT:    subq $64, %rsp
; SSE-NEXT:    movq %rdi, %rbx
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    callq fmodf@PLT
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    callq fmodf@PLT
; SSE-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    callq fmodf@PLT
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT:    callq fmodf@PLT
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE-NEXT:    movaps %xmm1, (%rbx)
; SSE-NEXT:    addq $64, %rsp
; SSE-NEXT:    popq %rbx
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_frem_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $48, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq fmodf@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = mem[1,1,3,3]
; AVX-NEXT:    callq fmodf@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = mem[1,0]
; AVX-NEXT:    callq fmodf@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = mem[3,3,3,3]
; AVX-NEXT:    callq fmodf@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rbx)
; AVX-NEXT:    addq $48, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fabs_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fabs_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_fabs_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_fabs_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_fabs_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fabs.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fabs.v4f32(<4 x float>, <4 x i1>, i32)

define void @vp_sqrt_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_sqrt_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vsqrtps %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_sqrt_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_sqrt_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.sqrt.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.sqrt.v4f32(<4 x float>, <4 x i1>, i32)

define void @vp_fneg_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fneg_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fneg_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_fneg_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_fneg_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX2-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_fneg_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fneg.v4f32(<4 x float>, <4 x i1>, i32)

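; vp.fma is lowered to a single vfmadd213ps when FMA is available (the AVX512VL
; run line); the other configurations scalarize to fmaf libcalls.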
define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind {
; X86-LABEL: vp_fma_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $84, %esp
; X86-NEXT:    vmovupd %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    calll fmaf
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, (%esp)
; X86-NEXT:    vmovshdup {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X86-NEXT:    # xmm0 = mem[1,1,3,3]
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    calll fmaf
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    calll fmaf
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, (%esp)
; X86-NEXT:    vpermilps $255, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X86-NEXT:    # xmm0 = mem[3,3,3,3]
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    calll fmaf
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X86-NEXT:    vmovaps %xmm0, (%esi)
; X86-NEXT:    addl $84, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fma_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    pushq %rbx
; SSE-NEXT:    subq $64, %rsp
; SSE-NEXT:    movq %rdi, %rbx
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    callq fmaf@PLT
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    callq fmaf@PLT
; SSE-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    callq fmaf@PLT
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    callq fmaf@PLT
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE-NEXT:    movaps %xmm1, (%rbx)
; SSE-NEXT:    addq $64, %rsp
; SSE-NEXT:    popq %rbx
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_fma_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $48, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps %xmm1, %xmm2
; AVX1-NEXT:    callq fmaf@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = mem[1,1,3,3]
; AVX1-NEXT:    vmovaps %xmm1, %xmm2
; AVX1-NEXT:    callq fmaf@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = mem[1,0]
; AVX1-NEXT:    vmovapd %xmm1, %xmm2
; AVX1-NEXT:    callq fmaf@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = mem[3,3,3,3]
; AVX1-NEXT:    vmovaps %xmm1, %xmm2
; AVX1-NEXT:    callq fmaf@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rbx)
; AVX1-NEXT:    addq $48, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_fma_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $48, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps %xmm1, %xmm2
; AVX2-NEXT:    callq fmaf@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[1,1,3,3]
; AVX2-NEXT:    vmovaps %xmm1, %xmm2
; AVX2-NEXT:    callq fmaf@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[1,0]
; AVX2-NEXT:    vmovapd %xmm1, %xmm2
; AVX2-NEXT:    callq fmaf@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[3,3,3,3]
; AVX2-NEXT:    vmovaps %xmm1, %xmm2
; AVX2-NEXT:    callq fmaf@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rbx)
; AVX2-NEXT:    addq $48, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_fma_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 4)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind {
; X86-LABEL: vp_fmuladd_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fmuladd_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_fmuladd_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_fmuladd_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_fmuladd_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 4)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32)

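; vp.maxnum/vp.minnum expand to maxps/minps plus a cmpunordps-based select
; (andps/andnps/orps on SSE) so that NaN inputs are handled according to
; maxnum/minnum semantics.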
declare <4 x float> @llvm.vp.maxnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
define <4 x float> @vfmax_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmax_vv_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; X86-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfmax_vv_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    maxps %xmm0, %xmm2
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfmax_vv_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfmax_vv_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfmax_vv_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordps %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovaps %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x float> @llvm.vp.maxnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}

declare <8 x float> @llvm.vp.maxnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32)
define <8 x float> @vfmax_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmax_vv_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    vmaxps %ymm0, %ymm1, %ymm2
; X86-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; X86-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfmax_vv_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    maxps %xmm0, %xmm4
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm2
; SSE-NEXT:    andnps %xmm4, %xmm0
; SSE-NEXT:    orps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    maxps %xmm1, %xmm2
; SSE-NEXT:    cmpunordps %xmm1, %xmm1
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    andnps %xmm2, %xmm1
; SSE-NEXT:    orps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfmax_vv_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxps %ymm0, %ymm1, %ymm2
; AVX1-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfmax_vv_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmaxps %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfmax_vv_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxps %ymm0, %ymm1, %ymm2
; AVX512-NEXT:    vcmpunordps %ymm0, %ymm0, %k1
; AVX512-NEXT:    vmovaps %ymm1, %ymm2 {%k1}
; AVX512-NEXT:    vmovaps %ymm2, %ymm0
; AVX512-NEXT:    retq
  %v = call <8 x float> @llvm.vp.maxnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl)
  ret <8 x float> %v
}

declare <4 x float> @llvm.vp.minnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
define <4 x float> @vfmin_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmin_vv_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    vminps %xmm0, %xmm1, %xmm2
; X86-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfmin_vv_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    minps %xmm0, %xmm2
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfmin_vv_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vminps %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfmin_vv_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vminps %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfmin_vv_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vminps %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordps %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovaps %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x float> @llvm.vp.minnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}

declare <8 x float> @llvm.vp.minnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32)
define <8 x float> @vfmin_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmin_vv_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    vminps %ymm0, %ymm1, %ymm2
; X86-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; X86-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfmin_vv_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    minps %xmm0, %xmm4
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm2
; SSE-NEXT:    andnps %xmm4, %xmm0
; SSE-NEXT:    orps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    minps %xmm1, %xmm2
; SSE-NEXT:    cmpunordps %xmm1, %xmm1
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    andnps %xmm2, %xmm1
; SSE-NEXT:    orps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfmin_vv_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vminps %ymm0, %ymm1, %ymm2
; AVX1-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfmin_vv_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vminps %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfmin_vv_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vminps %ymm0, %ymm1, %ymm2
; AVX512-NEXT:    vcmpunordps %ymm0, %ymm0, %k1
; AVX512-NEXT:    vmovaps %ymm1, %ymm2 {%k1}
; AVX512-NEXT:    vmovaps %ymm2, %ymm0
; AVX512-NEXT:    retq
  %v = call <8 x float> @llvm.vp.minnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl)
  ret <8 x float> %v
}