; xref: /llvm-project/llvm/test/CodeGen/X86/fminimumnum-fmaximumnum.ll (revision 13c6abfac84fca4bc55c0721d1853ce86a385678)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86

; Declarations of the minimumnum/maximumnum intrinsics exercised by the tests below.
declare float @llvm.maximumnum.f32(float, float)
declare double @llvm.maximumnum.f64(double, double)
declare float @llvm.minimumnum.f32(float, float)
declare double @llvm.minimumnum.f64(double, double)
declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>)
declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>)
declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>)

;
; fmaximumnum
;

; Scalar f32 maximumnum with no fast-math flags: the generic lowering keys the
; operand swap off the sign bit of %x and uses cmpordss/blend to return the
; non-NaN operand.
define float @test_fmaximumnum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximumnum:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    js .LBB0_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:  .LBB0_2:
; SSE2-NEXT:    movdqa %xmm3, %xmm0
; SSE2-NEXT:    cmpordss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    andps %xmm3, %xmm4
; SSE2-NEXT:    js .LBB0_4
; SSE2-NEXT:  # %bb.3:
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:  .LBB0_4:
; SSE2-NEXT:    maxss %xmm1, %xmm3
; SSE2-NEXT:    andnps %xmm3, %xmm0
; SSE2-NEXT:    orps %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximumnum:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB0_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB0_3
; AVX1-NEXT:  .LBB0_1:
; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:  .LBB0_3:
; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vcmpordss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximumnum:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    testl %eax, %eax
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxss $17, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd %xmm2, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB0_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovdqa %xmm2, %xmm1
; X86-NEXT:    jmp .LBB0_3
; X86-NEXT:  .LBB0_1:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm2, %xmm0
; X86-NEXT:  .LBB0_3:
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
  ret float %1
}

; Vector maximumnum with nnan/nsz function attributes lowers to a plain packed max.
define <4 x float> @test_fmaximumnum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
; SSE2-LABEL: test_fmaximumnum_scalarize:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fmaximumnum_scalarize:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_scalarize:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_scalarize:
; X86:       # %bb.0:
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    retl
  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y)
  ret <4 x float> %r
}

; Constant NaN in operand 0: maximumnum folds to the other operand (%y).
define float @test_fmaximumnum_nan0(float %x, float %y) {
; SSE2-LABEL: test_fmaximumnum_nan0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fmaximumnum_nan0:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_nan0:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vmovaps %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_nan0:
; X86:       # %bb.0:
; X86-NEXT:    flds {{[0-9]+}}(%esp)
; X86-NEXT:    retl
  %1 = tail call float @llvm.maximumnum.f32(float 0x7fff000000000000, float %y)
  ret float %1
}

; Constant NaN in operand 1: maximumnum folds to %x (identity on x86-64).
define float @test_fmaximumnum_nan1(float %x, float %y) {
; SSE2-LABEL: test_fmaximumnum_nan1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fmaximumnum_nan1:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_nan1:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_nan1:
; X86:       # %bb.0:
; X86-NEXT:    flds {{[0-9]+}}(%esp)
; X86-NEXT:    retl
  %1 = tail call float @llvm.maximumnum.f32(float %x, float 0x7fff000000000000)
  ret float %1
}

; Operands carry nnan (from fadd/fsub), so the cmpord NaN-select is dropped;
; only the sign-based operand ordering remains.
define float @test_fmaximumnum_nnan(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_nnan:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    subss %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    js .LBB4_1
; SSE2-NEXT:  # %bb.2:
; SSE2-NEXT:    maxss %xmm2, %xmm0
; SSE2-NEXT:    retq
; SSE2-NEXT:  .LBB4_1:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximumnum_nnan:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm2, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB4_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
; AVX1-NEXT:  .LBB4_1:
; AVX1-NEXT:    vmovaps %xmm0, %xmm1
; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: test_fmaximumnum_nnan:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm2, %eax
; AVX512F-NEXT:    testl %eax, %eax
; AVX512F-NEXT:    sets %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovaps %xmm2, %xmm1
; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512F-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: test_fmaximumnum_nnan:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; AVX512DQ-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
; AVX512DQ-NEXT:    kmovw %k0, %k1
; AVX512DQ-NEXT:    vmovaps %xmm2, %xmm1
; AVX512DQ-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512DQ-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512DQ-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_nnan:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; AVX10_2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT:    vminmaxss $17, %xmm0, %xmm2
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_nnan:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vaddss %xmm0, %xmm2, %xmm1
; X86-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; X86-NEXT:    vmovd %xmm1, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB4_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovaps %xmm1, %xmm2
; X86-NEXT:    jmp .LBB4_3
; X86-NEXT:  .LBB4_1:
; X86-NEXT:    vmovaps %xmm0, %xmm2
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:  .LBB4_3:
; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = fadd nnan float %x, %y
  %2 = fsub nnan float %x, %y
  %3 = tail call float @llvm.maximumnum.f32(float %1, float %2)
  ret float %3
}

; maximumnum(+0.0, %y): the known-positive-zero constant removes the sign-swap
; branch; only the NaN check on %y remains.
define double @test_fmaximumnum_zero0(double %x, double %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_zero0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    cmpordsd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm2
; SSE2-NEXT:    andpd %xmm1, %xmm2
; SSE2-NEXT:    xorpd %xmm3, %xmm3
; SSE2-NEXT:    maxsd %xmm3, %xmm1
; SSE2-NEXT:    andnpd %xmm1, %xmm0
; SSE2-NEXT:    orpd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximumnum_zero0:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vcmpordsd %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximumnum_zero0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vcmpordsd %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_zero0:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
; AVX10_2-NEXT:    vminmaxsd $17, %xmm0, %xmm1
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_zero0:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.maximumnum.f64(double 0.0, double %y)
  ret double %1
}

; maximumnum(%x, +0.0): same as zero0 but with the constant in operand 1.
define double @test_fmaximumnum_zero1(double %x, double %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    cmpordsd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm2
; SSE2-NEXT:    andpd %xmm0, %xmm2
; SSE2-NEXT:    xorpd %xmm3, %xmm3
; SSE2-NEXT:    maxsd %xmm3, %xmm0
; SSE2-NEXT:    andnpd %xmm0, %xmm1
; SSE2-NEXT:    orpd %xmm2, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximumnum_zero1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximumnum_zero1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vcmpordsd %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, %xmm0
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_zero1:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX10_2-NEXT:    vminmaxsd $17, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_zero1:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.maximumnum.f64(double %x, double 0.0)
  ret double %1
}

; maximumnum(+0.0, -0.0) constant-folds to +0.0.
define double @test_fmaximumnum_zero2(double %x, double %y) {
; SSE2-LABEL: test_fmaximumnum_zero2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fmaximumnum_zero2:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_zero2:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_zero2:
; X86:       # %bb.0:
; X86-NEXT:    fldz
; X86-NEXT:    retl
  %1 = tail call double @llvm.maximumnum.f64(double 0.0, double -0.0)
  ret double %1
}

; With no-signed-zeros the sign-based operand swap is dropped; only the
; cmpord NaN-select remains around the max.
define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" nounwind {
; SSE2-LABEL: test_fmaximumnum_nsz:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    cmpordss %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm3
; SSE2-NEXT:    andps %xmm0, %xmm3
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    andnps %xmm0, %xmm2
; SSE2-NEXT:    orps %xmm3, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximumnum_nsz:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximumnum_nsz:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_nsz:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxss $17, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_nsz:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm1
; X86-NEXT:    vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
  ret float %1
}

; maximumnum where operand 1 is an nnan fdiv of the operands; checks how the
; NaN/sign handling combines with the preceding compare-feeding division.
define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_combine_cmps:
; SSE2:       # %bb.0:
; SSE2-NEXT:    divss %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    js .LBB9_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:  .LBB9_2:
; SSE2-NEXT:    movaps %xmm3, %xmm2
; SSE2-NEXT:    cmpordss %xmm3, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm4
; SSE2-NEXT:    andps %xmm3, %xmm4
; SSE2-NEXT:    js .LBB9_4
; SSE2-NEXT:  # %bb.3:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:  .LBB9_4:
; SSE2-NEXT:    maxss %xmm1, %xmm3
; SSE2-NEXT:    andnps %xmm3, %xmm2
; SSE2-NEXT:    orps %xmm4, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximumnum_combine_cmps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB9_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmovaps %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB9_3
; AVX1-NEXT:  .LBB9_1:
; AVX1-NEXT:    vmovaps %xmm1, %xmm2
; AVX1-NEXT:    vmovaps %xmm0, %xmm1
; AVX1-NEXT:  .LBB9_3:
; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vcmpordss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: test_fmaximumnum_combine_cmps:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vdivss %xmm0, %xmm1, %xmm1
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    testl %eax, %eax
; AVX512F-NEXT:    sets %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovaps %xmm0, %xmm2
; AVX512F-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512F-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512F-NEXT:    vcmpordss %xmm1, %xmm1, %k1
; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: test_fmaximumnum_combine_cmps:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vdivss %xmm0, %xmm1, %xmm1
; AVX512DQ-NEXT:    vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
; AVX512DQ-NEXT:    kmovw %k0, %k1
; AVX512DQ-NEXT:    vmovaps %xmm1, %xmm2
; AVX512DQ-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512DQ-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512DQ-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_combine_cmps:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
; AVX10_2-NEXT:    vminmaxss $17, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_combine_cmps:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm1, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB9_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovaps %xmm1, %xmm2
; X86-NEXT:    jmp .LBB9_3
; X86-NEXT:  .LBB9_1:
; X86-NEXT:    vmovaps %xmm0, %xmm2
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:  .LBB9_3:
; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = fdiv nnan float %y, %x
  %2 = tail call float @llvm.maximumnum.f32(float %x, float %1)
  ret float %2
}


;
; fminimumnum
;

; Scalar f32 minimumnum with no fast-math flags: sign-based operand ordering
; plus cmpordss/blend NaN handling, mirroring test_fmaximumnum.
define float @test_fminimumnum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimumnum:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    js .LBB10_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:  .LBB10_2:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    cmpordss %xmm3, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm4
; SSE2-NEXT:    andps %xmm3, %xmm4
; SSE2-NEXT:    js .LBB10_4
; SSE2-NEXT:  # %bb.3:
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:  .LBB10_4:
; SSE2-NEXT:    minss %xmm0, %xmm3
; SSE2-NEXT:    andnps %xmm3, %xmm2
; SSE2-NEXT:    orps %xmm4, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fminimumnum:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB10_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
; AVX1-NEXT:    jmp .LBB10_3
; AVX1-NEXT:  .LBB10_1:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
; AVX1-NEXT:  .LBB10_3:
; AVX1-NEXT:    vminss %xmm2, %xmm0, %xmm1
; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fminimumnum:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    testl %eax, %eax
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vmovaps %xmm1, %xmm2
; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    vminss %xmm2, %xmm0, %xmm1
; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimumnum:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxss $16, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimumnum:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB10_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovdqa %xmm1, %xmm2
; X86-NEXT:    jmp .LBB10_3
; X86-NEXT:  .LBB10_1:
; X86-NEXT:    vmovdqa %xmm0, %xmm2
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:  .LBB10_3:
; X86-NEXT:    vminss %xmm2, %xmm0, %xmm1
; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = tail call float @llvm.minimumnum.f32(float %x, float %y)
  ret float %1
}

; Vector minimumnum with nnan/nsz function attributes lowers to a plain packed min.
define <2 x double> @test_fminimumnum_scalarize(<2 x double> %x, <2 x double> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
; SSE2-LABEL: test_fminimumnum_scalarize:
; SSE2:       # %bb.0:
; SSE2-NEXT:    minpd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fminimumnum_scalarize:
; AVX:       # %bb.0:
; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimumnum_scalarize:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimumnum_scalarize:
; X86:       # %bb.0:
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    retl
  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
  ret <2 x double> %r
}

; Constant NaN in operand 0: minimumnum folds to the other operand (%y).
define float @test_fminimumnum_nan0(float %x, float %y) {
; SSE2-LABEL: test_fminimumnum_nan0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fminimumnum_nan0:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimumnum_nan0:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vmovaps %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimumnum_nan0:
; X86:       # %bb.0:
; X86-NEXT:    flds {{[0-9]+}}(%esp)
; X86-NEXT:    retl
  %1 = tail call float @llvm.minimumnum.f32(float 0x7fff000000000000, float %y)
  ret float %1
}

; Constant NaN in operand 1: minimumnum folds to %x (identity on x86-64).
define float @test_fminimumnum_nan1(float %x, float %y) {
; SSE2-LABEL: test_fminimumnum_nan1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fminimumnum_nan1:
; AVX:       # %bb.0:
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimumnum_nan1:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimumnum_nan1:
; X86:       # %bb.0:
; X86-NEXT:    flds {{[0-9]+}}(%esp)
; X86-NEXT:    retl
  %1 = tail call float @llvm.minimumnum.f32(float %x, float 0x7fff000000000000)
  ret float %1
}

; f64 minimumnum with the no-nans function attribute: the cmpord NaN-select is
; dropped; only the sign-based operand ordering remains.
define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="true" nounwind {
; SSE2-LABEL: test_fminimumnum_nnan:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    testq %rax, %rax
; SSE2-NEXT:    js .LBB14_1
; SSE2-NEXT:  # %bb.2:
; SSE2-NEXT:    minsd %xmm1, %xmm0
; SSE2-NEXT:    retq
; SSE2-NEXT:  .LBB14_1:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    minsd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fminimumnum_nnan:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB14_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
; AVX1-NEXT:  .LBB14_1:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:    vminsd %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: test_fminimumnum_nnan:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    testq %rax, %rax
; AVX512F-NEXT:    sets %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovapd %xmm1, %xmm2
; AVX512F-NEXT:    vmovsd %xmm0, %xmm2, %xmm2 {%k1}
; AVX512F-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT:    vminsd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: test_fminimumnum_nnan:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vfpclasssd $5, %xmm1, %k0 # k0 = isQuietNaN(xmm1) | isNegativeZero(xmm1)
; AVX512DQ-NEXT:    kmovw %k0, %k1
; AVX512DQ-NEXT:    vmovapd %xmm0, %xmm2
; AVX512DQ-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
; AVX512DQ-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512DQ-NEXT:    vminsd %xmm2, %xmm1, %xmm0
; AVX512DQ-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimumnum_nnan:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxsd $16, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimumnum_nnan:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vextractps $1, %xmm0, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB14_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovapd %xmm1, %xmm2
; X86-NEXT:    jmp .LBB14_3
; X86-NEXT:  .LBB14_1:
; X86-NEXT:    vmovapd %xmm0, %xmm2
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:  .LBB14_3:
; X86-NEXT:    vminsd %xmm2, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.minimumnum.f64(double %x, double %y)
  ret double %1
}

; minimumnum(-0.0, %y): the -0.0 constant comes from a constant pool; only the
; NaN check on %y remains.
define double @test_fminimumnum_zero0(double %x, double %y) nounwind {
; SSE2-LABEL: test_fminimumnum_zero0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    cmpordsd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm2
; SSE2-NEXT:    andpd %xmm1, %xmm2
; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    andnpd %xmm1, %xmm0
; SSE2-NEXT:    orpd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fminimumnum_zero0:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpordsd %xmm1, %xmm1, %xmm0
; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fminimumnum_zero0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpordsd %xmm1, %xmm1, %k1
; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimumnum_zero0:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimumnum_zero0:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm1
; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.minimumnum.f64(double -0.0, double %y)
  ret double %1
}

; minimumnum(%x, -0.0): same as zero0 but with the constant in operand 1.
define double @test_fminimumnum_zero1(double %x, double %y) nounwind {
; SSE2-LABEL: test_fminimumnum_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    cmpordsd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm2
; SSE2-NEXT:    andpd %xmm0, %xmm2
; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    andnpd %xmm0, %xmm1
; SSE2-NEXT:    orpd %xmm2, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fminimumnum_zero1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fminimumnum_zero1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpordsd %xmm0, %xmm0, %k1
; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, %xmm0
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimumnum_zero1:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimumnum_zero1:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vcmpordsd %xmm0, %xmm0, %xmm1
; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.minimumnum.f64(double %x, double -0.0)
  ret double %1
}

; minimumnum(-0.0, 0.0): both operands are constants, so the call is
; constant-folded to -0.0 (a single constant load; fldz+fchs on i686/x87).
903define double @test_fminimumnum_zero2(double %x, double %y) {
904; SSE2-LABEL: test_fminimumnum_zero2:
905; SSE2:       # %bb.0:
906; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
907; SSE2-NEXT:    retq
908;
909; AVX-LABEL: test_fminimumnum_zero2:
910; AVX:       # %bb.0:
911; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
912; AVX-NEXT:    retq
913;
914; AVX10_2-LABEL: test_fminimumnum_zero2:
915; AVX10_2:       # %bb.0:
916; AVX10_2-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
917; AVX10_2-NEXT:    retq
918;
919; X86-LABEL: test_fminimumnum_zero2:
920; X86:       # %bb.0:
921; X86-NEXT:    fldz
922; X86-NEXT:    fchs
923; X86-NEXT:    retl
924  %1 = tail call double @llvm.minimumnum.f64(double -0.0, double 0.0)
925  ret double %1
926}
927
; With the nsz fast-math flag the -0.0/+0.0 ordering logic (sign test and
; operand swap) is dropped: only minss plus the ordered-compare/blend that
; propagates a non-NaN x remains (single vminmaxss on AVX10.2).
928define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
929; SSE2-LABEL: test_fminimumnum_nsz:
930; SSE2:       # %bb.0:
931; SSE2-NEXT:    movaps %xmm0, %xmm2
932; SSE2-NEXT:    cmpordss %xmm0, %xmm2
933; SSE2-NEXT:    movaps %xmm2, %xmm3
934; SSE2-NEXT:    andps %xmm0, %xmm3
935; SSE2-NEXT:    minss %xmm1, %xmm0
936; SSE2-NEXT:    andnps %xmm0, %xmm2
937; SSE2-NEXT:    orps %xmm3, %xmm2
938; SSE2-NEXT:    movaps %xmm2, %xmm0
939; SSE2-NEXT:    retq
940;
941; AVX1-LABEL: test_fminimumnum_nsz:
942; AVX1:       # %bb.0:
943; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
944; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
945; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
946; AVX1-NEXT:    retq
947;
948; AVX512-LABEL: test_fminimumnum_nsz:
949; AVX512:       # %bb.0:
950; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm1
951; AVX512-NEXT:    vcmpordss %xmm0, %xmm0, %k1
952; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
953; AVX512-NEXT:    vmovaps %xmm1, %xmm0
954; AVX512-NEXT:    retq
955;
956; AVX10_2-LABEL: test_fminimumnum_nsz:
957; AVX10_2:       # %bb.0:
958; AVX10_2-NEXT:    vminmaxss $16, %xmm1, %xmm0
959; AVX10_2-NEXT:    retq
960;
961; X86-LABEL: test_fminimumnum_nsz:
962; X86:       # %bb.0:
963; X86-NEXT:    pushl %eax
964; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
965; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm1
966; X86-NEXT:    vminss {{[0-9]+}}(%esp), %xmm0, %xmm2
967; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
968; X86-NEXT:    vmovss %xmm0, (%esp)
969; X86-NEXT:    flds (%esp)
970; X86-NEXT:    popl %eax
971; X86-NEXT:    retl
972  %1 = tail call nsz float @llvm.minimumnum.f32(float %x, float %y)
973  ret float %1
974}
975
; One operand is an nnan fdiv result, so only %x can be NaN: a single ordered
; compare on the chosen operand suffices after the sign-based swap.  AVX512DQ
; uses vfpclassss to form the swap mask; AVX10.2 folds everything into
; vminmaxss after the division.
976define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
977; SSE2-LABEL: test_fminimumnum_combine_cmps:
978; SSE2:       # %bb.0:
979; SSE2-NEXT:    divss %xmm0, %xmm1
980; SSE2-NEXT:    movd %xmm0, %eax
981; SSE2-NEXT:    testl %eax, %eax
982; SSE2-NEXT:    movaps %xmm1, %xmm3
983; SSE2-NEXT:    js .LBB19_2
984; SSE2-NEXT:  # %bb.1:
985; SSE2-NEXT:    movaps %xmm0, %xmm3
986; SSE2-NEXT:  .LBB19_2:
987; SSE2-NEXT:    movaps %xmm3, %xmm2
988; SSE2-NEXT:    cmpordss %xmm3, %xmm2
989; SSE2-NEXT:    movaps %xmm2, %xmm4
990; SSE2-NEXT:    andps %xmm3, %xmm4
991; SSE2-NEXT:    js .LBB19_4
992; SSE2-NEXT:  # %bb.3:
993; SSE2-NEXT:    movaps %xmm1, %xmm0
994; SSE2-NEXT:  .LBB19_4:
995; SSE2-NEXT:    minss %xmm0, %xmm3
996; SSE2-NEXT:    andnps %xmm3, %xmm2
997; SSE2-NEXT:    orps %xmm4, %xmm2
998; SSE2-NEXT:    movaps %xmm2, %xmm0
999; SSE2-NEXT:    retq
1000;
1001; AVX1-LABEL: test_fminimumnum_combine_cmps:
1002; AVX1:       # %bb.0:
1003; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm2
1004; AVX1-NEXT:    vmovd %xmm0, %eax
1005; AVX1-NEXT:    testl %eax, %eax
1006; AVX1-NEXT:    js .LBB19_1
1007; AVX1-NEXT:  # %bb.2:
1008; AVX1-NEXT:    vmovaps %xmm2, %xmm1
1009; AVX1-NEXT:    jmp .LBB19_3
1010; AVX1-NEXT:  .LBB19_1:
1011; AVX1-NEXT:    vmovaps %xmm0, %xmm1
1012; AVX1-NEXT:    vmovaps %xmm2, %xmm0
1013; AVX1-NEXT:  .LBB19_3:
1014; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
1015; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
1016; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1017; AVX1-NEXT:    retq
1018;
1019; AVX512F-LABEL: test_fminimumnum_combine_cmps:
1020; AVX512F:       # %bb.0:
1021; AVX512F-NEXT:    vdivss %xmm0, %xmm1, %xmm1
1022; AVX512F-NEXT:    vmovd %xmm0, %eax
1023; AVX512F-NEXT:    testl %eax, %eax
1024; AVX512F-NEXT:    sets %al
1025; AVX512F-NEXT:    kmovw %eax, %k1
1026; AVX512F-NEXT:    vmovaps %xmm1, %xmm2
1027; AVX512F-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
1028; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
1029; AVX512F-NEXT:    vminss %xmm2, %xmm0, %xmm1
1030; AVX512F-NEXT:    vcmpordss %xmm0, %xmm0, %k1
1031; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
1032; AVX512F-NEXT:    vmovaps %xmm1, %xmm0
1033; AVX512F-NEXT:    retq
1034;
1035; AVX512DQ-LABEL: test_fminimumnum_combine_cmps:
1036; AVX512DQ:       # %bb.0:
1037; AVX512DQ-NEXT:    vdivss %xmm0, %xmm1, %xmm1
1038; AVX512DQ-NEXT:    vfpclassss $5, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isNegativeZero(xmm0)
1039; AVX512DQ-NEXT:    kmovw %k0, %k1
1040; AVX512DQ-NEXT:    vmovaps %xmm1, %xmm2
1041; AVX512DQ-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
1042; AVX512DQ-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
1043; AVX512DQ-NEXT:    vminss %xmm2, %xmm0, %xmm0
1044; AVX512DQ-NEXT:    retq
1045;
1046; AVX10_2-LABEL: test_fminimumnum_combine_cmps:
1047; AVX10_2:       # %bb.0:
1048; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
1049; AVX10_2-NEXT:    vminmaxss $16, %xmm1, %xmm0
1050; AVX10_2-NEXT:    retq
1051;
1052; X86-LABEL: test_fminimumnum_combine_cmps:
1053; X86:       # %bb.0:
1054; X86-NEXT:    pushl %eax
1055; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1056; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1057; X86-NEXT:    vdivss %xmm0, %xmm1, %xmm2
1058; X86-NEXT:    vmovd %xmm0, %eax
1059; X86-NEXT:    testl %eax, %eax
1060; X86-NEXT:    js .LBB19_1
1061; X86-NEXT:  # %bb.2:
1062; X86-NEXT:    vmovaps %xmm2, %xmm1
1063; X86-NEXT:    jmp .LBB19_3
1064; X86-NEXT:  .LBB19_1:
1065; X86-NEXT:    vmovaps %xmm0, %xmm1
1066; X86-NEXT:    vmovaps %xmm2, %xmm0
1067; X86-NEXT:  .LBB19_3:
1068; X86-NEXT:    vminss %xmm1, %xmm0, %xmm1
1069; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
1070; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1071; X86-NEXT:    vmovss %xmm0, (%esp)
1072; X86-NEXT:    flds (%esp)
1073; X86-NEXT:    popl %eax
1074; X86-NEXT:    retl
1075  %1 = fdiv nnan float %y, %x
1076  %2 = tail call float @llvm.minimumnum.f32(float %x, float %1)
1077  ret float %2
1078}
1079
; Generic v2f64 lowering with no folds available: sign-mask-driven operand
; swap (so -0.0 beats +0.0), minpd, then cmpordpd + blend to return the
; number operand when one lane is NaN.  AVX10.2 is a single vminmaxpd.
1080define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) {
1081; SSE2-LABEL: test_fminimumnum_vector:
1082; SSE2:       # %bb.0:
1083; SSE2-NEXT:    movaps %xmm0, %xmm2
1084; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
1085; SSE2-NEXT:    pxor %xmm3, %xmm3
1086; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
1087; SSE2-NEXT:    movdqa %xmm3, %xmm2
1088; SSE2-NEXT:    pandn %xmm1, %xmm2
1089; SSE2-NEXT:    movdqa %xmm3, %xmm4
1090; SSE2-NEXT:    pandn %xmm0, %xmm4
1091; SSE2-NEXT:    pand %xmm3, %xmm0
1092; SSE2-NEXT:    por %xmm2, %xmm0
1093; SSE2-NEXT:    pand %xmm1, %xmm3
1094; SSE2-NEXT:    por %xmm4, %xmm3
1095; SSE2-NEXT:    movdqa %xmm3, %xmm1
1096; SSE2-NEXT:    minpd %xmm0, %xmm1
1097; SSE2-NEXT:    movdqa %xmm3, %xmm0
1098; SSE2-NEXT:    cmpordpd %xmm3, %xmm0
1099; SSE2-NEXT:    andpd %xmm0, %xmm3
1100; SSE2-NEXT:    andnpd %xmm1, %xmm0
1101; SSE2-NEXT:    orpd %xmm3, %xmm0
1102; SSE2-NEXT:    retq
1103;
1104; AVX-LABEL: test_fminimumnum_vector:
1105; AVX:       # %bb.0:
1106; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
1107; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
1108; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
1109; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
1110; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1111; AVX-NEXT:    retq
1112;
1113; AVX10_2-LABEL: test_fminimumnum_vector:
1114; AVX10_2:       # %bb.0:
1115; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
1116; AVX10_2-NEXT:    retq
1117;
1118; X86-LABEL: test_fminimumnum_vector:
1119; X86:       # %bb.0:
1120; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
1121; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
1122; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
1123; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
1124; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1125; X86-NEXT:    retl
1126  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
1127  ret <2 x double> %r
1128}
1129
; With the no-nans + no-signed-zeros function attributes all NaN- and
; zero-handling is elided: the whole maximumnum lowers to one maxps.
1130define <4 x float> @test_fmaximumnum_vector(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
1131; SSE2-LABEL: test_fmaximumnum_vector:
1132; SSE2:       # %bb.0:
1133; SSE2-NEXT:    maxps %xmm1, %xmm0
1134; SSE2-NEXT:    retq
1135;
1136; AVX-LABEL: test_fmaximumnum_vector:
1137; AVX:       # %bb.0:
1138; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
1139; AVX-NEXT:    retq
1140;
1141; AVX10_2-LABEL: test_fmaximumnum_vector:
1142; AVX10_2:       # %bb.0:
1143; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
1144; AVX10_2-NEXT:    retq
1145;
1146; X86-LABEL: test_fmaximumnum_vector:
1147; X86:       # %bb.0:
1148; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
1149; X86-NEXT:    retl
1150  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y)
1151  ret <4 x float> %r
1152}
1153
; RHS is an all-+0.0 constant vector: folds to a single minpd with the
; zeroed (known-non-NaN) register as the first source, so no compare/blend
; is needed to quiet NaNs.
1154define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) {
1155; SSE2-LABEL: test_fminimumnum_vector_zero:
1156; SSE2:       # %bb.0:
1157; SSE2-NEXT:    xorpd %xmm1, %xmm1
1158; SSE2-NEXT:    minpd %xmm0, %xmm1
1159; SSE2-NEXT:    movapd %xmm1, %xmm0
1160; SSE2-NEXT:    retq
1161;
1162; AVX-LABEL: test_fminimumnum_vector_zero:
1163; AVX:       # %bb.0:
1164; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1165; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1166; AVX-NEXT:    retq
1167;
1168; AVX10_2-LABEL: test_fminimumnum_vector_zero:
1169; AVX10_2:       # %bb.0:
1170; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1171; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
1172; AVX10_2-NEXT:    retq
1173;
1174; X86-LABEL: test_fminimumnum_vector_zero:
1175; X86:       # %bb.0:
1176; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1177; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1178; X86-NEXT:    retl
1179  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>)
1180  ret <2 x double> %r
1181}
1182
; RHS is an all--0.0 constant (the weakest value for max): lowers to one
; maxps with the constant as the first source; no NaN blend required.
1183define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) {
1184; SSE2-LABEL: test_fmaximumnum_vector_signed_zero:
1185; SSE2:       # %bb.0:
1186; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1187; SSE2-NEXT:    maxps %xmm0, %xmm1
1188; SSE2-NEXT:    movaps %xmm1, %xmm0
1189; SSE2-NEXT:    retq
1190;
1191; AVX-LABEL: test_fmaximumnum_vector_signed_zero:
1192; AVX:       # %bb.0:
1193; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1194; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1195; AVX-NEXT:    retq
1196;
1197; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero:
1198; AVX10_2:       # %bb.0:
1199; AVX10_2-NEXT:    vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1200; AVX10_2-NEXT:    retq
1201;
1202; X86-LABEL: test_fmaximumnum_vector_signed_zero:
1203; X86:       # %bb.0:
1204; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1205; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1206; X86-NEXT:    retl
1207  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>)
1208  ret <4 x float> %r
1209}
1210
; Constant RHS <0.0, 5.0> (no NaN, no -0.0): still folds to a single minpd
; with the materialized constant as the non-NaN first source.
1211define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) {
1212; SSE2-LABEL: test_fminimumnum_vector_partially_zero:
1213; SSE2:       # %bb.0:
1214; SSE2-NEXT:    xorpd %xmm1, %xmm1
1215; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1216; SSE2-NEXT:    minpd %xmm0, %xmm1
1217; SSE2-NEXT:    movapd %xmm1, %xmm0
1218; SSE2-NEXT:    retq
1219;
1220; AVX-LABEL: test_fminimumnum_vector_partially_zero:
1221; AVX:       # %bb.0:
1222; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1223; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1224; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1225; AVX-NEXT:    retq
1226;
1227; AVX10_2-LABEL: test_fminimumnum_vector_partially_zero:
1228; AVX10_2:       # %bb.0:
1229; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1230; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1231; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
1232; AVX10_2-NEXT:    retq
1233;
1234; X86-LABEL: test_fminimumnum_vector_partially_zero:
1235; X86:       # %bb.0:
1236; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1237; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1238; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1239; X86-NEXT:    retl
1240  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>)
1241  ret <2 x double> %r
1242}
1243
; Constant RHS <0.0, -0.0> mixes zero signs, so on SSE2/AVX the full generic
; sequence survives: sign-based operand select, minpd, and the ordered
; compare/blend; only AVX10.2's vminmaxpd handles it in one instruction.
1244define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) {
1245; SSE2-LABEL: test_fminimumnum_vector_different_zeros:
1246; SSE2:       # %bb.0:
1247; SSE2-NEXT:    movaps %xmm0, %xmm1
1248; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
1249; SSE2-NEXT:    xorps %xmm2, %xmm2
1250; SSE2-NEXT:    pxor %xmm3, %xmm3
1251; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
1252; SSE2-NEXT:    movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1253; SSE2-NEXT:    movdqa %xmm3, %xmm1
1254; SSE2-NEXT:    pandn %xmm2, %xmm1
1255; SSE2-NEXT:    movaps %xmm0, %xmm4
1256; SSE2-NEXT:    andps %xmm3, %xmm4
1257; SSE2-NEXT:    orps %xmm1, %xmm4
1258; SSE2-NEXT:    pand %xmm0, %xmm2
1259; SSE2-NEXT:    pandn %xmm0, %xmm3
1260; SSE2-NEXT:    por %xmm2, %xmm3
1261; SSE2-NEXT:    movdqa %xmm3, %xmm1
1262; SSE2-NEXT:    minpd %xmm4, %xmm1
1263; SSE2-NEXT:    movdqa %xmm3, %xmm0
1264; SSE2-NEXT:    cmpordpd %xmm3, %xmm0
1265; SSE2-NEXT:    andpd %xmm0, %xmm3
1266; SSE2-NEXT:    andnpd %xmm1, %xmm0
1267; SSE2-NEXT:    orpd %xmm3, %xmm0
1268; SSE2-NEXT:    retq
1269;
1270; AVX-LABEL: test_fminimumnum_vector_different_zeros:
1271; AVX:       # %bb.0:
1272; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1273; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1274; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
1275; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
1276; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
1277; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
1278; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1279; AVX-NEXT:    retq
1280;
1281; AVX10_2-LABEL: test_fminimumnum_vector_different_zeros:
1282; AVX10_2:       # %bb.0:
1283; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1284; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1285; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
1286; AVX10_2-NEXT:    retq
1287;
1288; X86-LABEL: test_fminimumnum_vector_different_zeros:
1289; X86:       # %bb.0:
1290; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1291; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1292; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
1293; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
1294; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
1295; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm2
1296; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1297; X86-NEXT:    retl
1298  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>)
1299  ret <2 x double> %r
1300}
1301
; Finite nonzero constant RHS: no zero-sign or NaN issues from the constant,
; so the lowering is a single maxps with the constant as first source.
1302define <4 x float> @test_fmaximumnum_vector_non_zero(<4 x float> %x) {
1303; SSE2-LABEL: test_fmaximumnum_vector_non_zero:
1304; SSE2:       # %bb.0:
1305; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
1306; SSE2-NEXT:    maxps %xmm0, %xmm1
1307; SSE2-NEXT:    movaps %xmm1, %xmm0
1308; SSE2-NEXT:    retq
1309;
1310; AVX-LABEL: test_fmaximumnum_vector_non_zero:
1311; AVX:       # %bb.0:
1312; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
1313; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1314; AVX-NEXT:    retq
1315;
1316; AVX10_2-LABEL: test_fmaximumnum_vector_non_zero:
1317; AVX10_2:       # %bb.0:
1318; AVX10_2-NEXT:    vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1319; AVX10_2-NEXT:    retq
1320;
1321; X86-LABEL: test_fmaximumnum_vector_non_zero:
1322; X86:       # %bb.0:
1323; X86-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
1324; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1325; X86-NEXT:    retl
1326  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 5., float 4., float 3., float 2.>)
1327  ret <4 x float> %r
1328}
1329
; Second constant lane is a NaN: minimumnum must return the number operand
; for that lane, so after the minpd the NaN lane's result is replaced by the
; zero from the other operand via a final blend/movsd merge.
1330define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) {
1331; SSE2-LABEL: test_fminimumnum_vector_nan:
1332; SSE2:       # %bb.0:
1333; SSE2-NEXT:    xorpd %xmm2, %xmm2
1334; SSE2-NEXT:    xorpd %xmm1, %xmm1
1335; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1336; SSE2-NEXT:    minpd %xmm0, %xmm1
1337; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1]
1338; SSE2-NEXT:    movapd %xmm1, %xmm0
1339; SSE2-NEXT:    retq
1340;
1341; AVX-LABEL: test_fminimumnum_vector_nan:
1342; AVX:       # %bb.0:
1343; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1344; AVX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0]
1345; AVX-NEXT:    vminpd %xmm0, %xmm2, %xmm0
1346; AVX-NEXT:    vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1347; AVX-NEXT:    retq
1348;
1349; AVX10_2-LABEL: test_fminimumnum_vector_nan:
1350; AVX10_2:       # %bb.0:
1351; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1352; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1353; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
1354; AVX10_2-NEXT:    retq
1355;
1356; X86-LABEL: test_fminimumnum_vector_nan:
1357; X86:       # %bb.0:
1358; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1359; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1360; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1361; X86-NEXT:    vcmpordpd %xmm1, %xmm1, %xmm2
1362; X86-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1363; X86-NEXT:    retl
1364  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>)
1365  ret <2 x double> %r
1366}
1367
; Same as the zero-RHS case but with the +0.0 constant as the FIRST operand:
; commutation yields the identical single-minpd lowering.
1368define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) {
1369; SSE2-LABEL: test_fminimumnum_vector_zero_first:
1370; SSE2:       # %bb.0:
1371; SSE2-NEXT:    xorpd %xmm1, %xmm1
1372; SSE2-NEXT:    minpd %xmm0, %xmm1
1373; SSE2-NEXT:    movapd %xmm1, %xmm0
1374; SSE2-NEXT:    retq
1375;
1376; AVX-LABEL: test_fminimumnum_vector_zero_first:
1377; AVX:       # %bb.0:
1378; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1379; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1380; AVX-NEXT:    retq
1381;
1382; AVX10_2-LABEL: test_fminimumnum_vector_zero_first:
1383; AVX10_2:       # %bb.0:
1384; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1385; AVX10_2-NEXT:    vminmaxpd $16, %xmm1, %xmm0, %xmm0
1386; AVX10_2-NEXT:    retq
1387;
1388; X86-LABEL: test_fminimumnum_vector_zero_first:
1389; X86:       # %bb.0:
1390; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1391; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1392; X86-NEXT:    retl
1393  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x)
1394  ret <2 x double> %r
1395}
1396
; Splat -0.0 RHS of a min: the constant is the minimum zero, so only the
; NaN-quieting part remains — minpd against the constant, then an ordered
; compare/blend that keeps x whenever x is not NaN.
1397define <2 x double> @test_fminimumnum_vector_signed_zero(<2 x double> %x) {
1398; SSE2-LABEL: test_fminimumnum_vector_signed_zero:
1399; SSE2:       # %bb.0:
1400; SSE2-NEXT:    movapd %xmm0, %xmm1
1401; SSE2-NEXT:    minpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1402; SSE2-NEXT:    movapd %xmm0, %xmm2
1403; SSE2-NEXT:    cmpordpd %xmm0, %xmm2
1404; SSE2-NEXT:    andpd %xmm2, %xmm0
1405; SSE2-NEXT:    andnpd %xmm1, %xmm2
1406; SSE2-NEXT:    orpd %xmm2, %xmm0
1407; SSE2-NEXT:    retq
1408;
1409; AVX-LABEL: test_fminimumnum_vector_signed_zero:
1410; AVX:       # %bb.0:
1411; AVX-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm1
1412; AVX-NEXT:    vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
1413; AVX-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
1414; AVX-NEXT:    retq
1415;
1416; AVX10_2-LABEL: test_fminimumnum_vector_signed_zero:
1417; AVX10_2:       # %bb.0:
1418; AVX10_2-NEXT:    vminmaxpd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
1419; AVX10_2-NEXT:    retq
1420;
1421; X86-LABEL: test_fminimumnum_vector_signed_zero:
1422; X86:       # %bb.0:
1423; X86-NEXT:    vcmpordpd %xmm0, %xmm0, %xmm1
1424; X86-NEXT:    vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
1425; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
1426; X86-NEXT:    retl
1427  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double -0., double -0.>)
1428  ret <2 x double> %r
1429}
1430
; Splat -0.0 as the FIRST operand of a max: commuted and folded to a single
; maxps with the -0.0 constant as the non-NaN first source.
1431define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) {
1432; SSE2-LABEL: test_fmaximumnum_vector_signed_zero_first:
1433; SSE2:       # %bb.0:
1434; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1435; SSE2-NEXT:    maxps %xmm0, %xmm1
1436; SSE2-NEXT:    movaps %xmm1, %xmm0
1437; SSE2-NEXT:    retq
1438;
1439; AVX-LABEL: test_fmaximumnum_vector_signed_zero_first:
1440; AVX:       # %bb.0:
1441; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1442; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1443; AVX-NEXT:    retq
1444;
1445; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero_first:
1446; AVX10_2:       # %bb.0:
1447; AVX10_2-NEXT:    vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1448; AVX10_2-NEXT:    retq
1449;
1450; X86-LABEL: test_fmaximumnum_vector_signed_zero_first:
1451; X86:       # %bb.0:
1452; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1453; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1454; X86-NEXT:    retl
1455  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x)
1456  ret <4 x float> %r
1457}
1458
; Splat +0.0 RHS of a max: the sign-swap is folded away, but the
; cmpordps/blend is retained so a NaN x still yields 0.0 (the number operand)
; rather than the maxps result.
1459define <4 x float> @test_fmaximumnum_vector_zero(<4 x float> %x) {
1460; SSE2-LABEL: test_fmaximumnum_vector_zero:
1461; SSE2:       # %bb.0:
1462; SSE2-NEXT:    xorps %xmm1, %xmm1
1463; SSE2-NEXT:    movaps %xmm0, %xmm2
1464; SSE2-NEXT:    maxps %xmm1, %xmm2
1465; SSE2-NEXT:    movaps %xmm0, %xmm1
1466; SSE2-NEXT:    cmpordps %xmm0, %xmm1
1467; SSE2-NEXT:    andps %xmm1, %xmm0
1468; SSE2-NEXT:    andnps %xmm2, %xmm1
1469; SSE2-NEXT:    orps %xmm1, %xmm0
1470; SSE2-NEXT:    retq
1471;
1472; AVX-LABEL: test_fmaximumnum_vector_zero:
1473; AVX:       # %bb.0:
1474; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1475; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm1
1476; AVX-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
1477; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1478; AVX-NEXT:    retq
1479;
1480; AVX10_2-LABEL: test_fmaximumnum_vector_zero:
1481; AVX10_2:       # %bb.0:
1482; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1483; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
1484; AVX10_2-NEXT:    retq
1485;
1486; X86-LABEL: test_fmaximumnum_vector_zero:
1487; X86:       # %bb.0:
1488; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1489; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm1
1490; X86-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
1491; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1492; X86-NEXT:    retl
1493  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 0., float 0., float 0., float 0.>)
1494  ret <4 x float> %r
1495}
1496
1497; PR77805: Check that signed zeroes are handled correctly in this case (FIXME)
; Non-constant splat of scalar %y: the splat is broadcast and then the full
; generic maximumnum lowering runs (sign-driven operand select, maxps,
; ordered compare + blend); only AVX10.2 collapses it to vminmaxps.
1498define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) {
1499; SSE2-LABEL: test_fmaximumnum_v4f32_splat:
1500; SSE2:       # %bb.0:
1501; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
1502; SSE2-NEXT:    pxor %xmm2, %xmm2
1503; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
1504; SSE2-NEXT:    movdqa %xmm2, %xmm3
1505; SSE2-NEXT:    pandn %xmm0, %xmm3
1506; SSE2-NEXT:    movaps %xmm1, %xmm4
1507; SSE2-NEXT:    andps %xmm2, %xmm4
1508; SSE2-NEXT:    orps %xmm3, %xmm4
1509; SSE2-NEXT:    pand %xmm2, %xmm0
1510; SSE2-NEXT:    andnps %xmm1, %xmm2
1511; SSE2-NEXT:    por %xmm2, %xmm0
1512; SSE2-NEXT:    movdqa %xmm0, %xmm1
1513; SSE2-NEXT:    maxps %xmm4, %xmm1
1514; SSE2-NEXT:    movdqa %xmm0, %xmm2
1515; SSE2-NEXT:    cmpordps %xmm0, %xmm2
1516; SSE2-NEXT:    andps %xmm2, %xmm0
1517; SSE2-NEXT:    andnps %xmm1, %xmm2
1518; SSE2-NEXT:    orps %xmm2, %xmm0
1519; SSE2-NEXT:    retq
1520;
1521; AVX1-LABEL: test_fmaximumnum_v4f32_splat:
1522; AVX1:       # %bb.0:
1523; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
1524; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
1525; AVX1-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
1526; AVX1-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
1527; AVX1-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
1528; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1529; AVX1-NEXT:    retq
1530;
1531; AVX512-LABEL: test_fmaximumnum_v4f32_splat:
1532; AVX512:       # %bb.0:
1533; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1
1534; AVX512-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
1535; AVX512-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
1536; AVX512-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
1537; AVX512-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
1538; AVX512-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1539; AVX512-NEXT:    retq
1540;
1541; AVX10_2-LABEL: test_fmaximumnum_v4f32_splat:
1542; AVX10_2:       # %bb.0:
1543; AVX10_2-NEXT:    vbroadcastss %xmm1, %xmm1
1544; AVX10_2-NEXT:    vminmaxps $17, %xmm1, %xmm0, %xmm0
1545; AVX10_2-NEXT:    retq
1546;
1547; X86-LABEL: test_fmaximumnum_v4f32_splat:
1548; X86:       # %bb.0:
1549; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
1550; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
1551; X86-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
1552; X86-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
1553; X86-NEXT:    vcmpordps %xmm0, %xmm0, %xmm2
1554; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1555; X86-NEXT:    retl
1556  %splatinsert = insertelement <4 x float> poison, float %y, i64 0
1557  %vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1558  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %vec) readnone
1559  ret <4 x float> %r
1560}
1561
1562define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
1563; SSE2-LABEL: test_fmaximumnum_v4f16:
1564; SSE2:       # %bb.0:
1565; SSE2-NEXT:    subq $104, %rsp
1566; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1567; SSE2-NEXT:    psrld $16, %xmm0
1568; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1569; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
1570; SSE2-NEXT:    movdqa %xmm1, %xmm0
1571; SSE2-NEXT:    psrld $16, %xmm0
1572; SSE2-NEXT:    callq __extendhfsf2@PLT
1573; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1574; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1575; SSE2-NEXT:    callq __extendhfsf2@PLT
1576; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
1577; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
1578; SSE2-NEXT:    movdqa %xmm0, %xmm1
1579; SSE2-NEXT:    movd %xmm0, %eax
1580; SSE2-NEXT:    testl %eax, %eax
1581; SSE2-NEXT:    movdqa %xmm0, %xmm2
1582; SSE2-NEXT:    js .LBB33_2
1583; SSE2-NEXT:  # %bb.1:
1584; SSE2-NEXT:    movdqa %xmm4, %xmm2
1585; SSE2-NEXT:  .LBB33_2:
1586; SSE2-NEXT:    movdqa %xmm2, %xmm0
1587; SSE2-NEXT:    cmpordss %xmm2, %xmm0
1588; SSE2-NEXT:    movaps %xmm0, %xmm3
1589; SSE2-NEXT:    andps %xmm2, %xmm3
1590; SSE2-NEXT:    js .LBB33_4
1591; SSE2-NEXT:  # %bb.3:
1592; SSE2-NEXT:    movdqa %xmm1, %xmm4
1593; SSE2-NEXT:  .LBB33_4:
1594; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1595; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
1596; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1597; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
1598; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
1599; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1600; SSE2-NEXT:    maxss %xmm4, %xmm2
1601; SSE2-NEXT:    andnps %xmm2, %xmm0
1602; SSE2-NEXT:    orps %xmm3, %xmm0
1603; SSE2-NEXT:    callq __truncsfhf2@PLT
1604; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1605; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1606; SSE2-NEXT:    callq __extendhfsf2@PLT
1607; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1608; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1609; SSE2-NEXT:    callq __extendhfsf2@PLT
1610; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
1611; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
1612; SSE2-NEXT:    movdqa %xmm0, %xmm1
1613; SSE2-NEXT:    movd %xmm0, %eax
1614; SSE2-NEXT:    testl %eax, %eax
1615; SSE2-NEXT:    movdqa %xmm0, %xmm2
1616; SSE2-NEXT:    js .LBB33_6
1617; SSE2-NEXT:  # %bb.5:
1618; SSE2-NEXT:    movdqa %xmm4, %xmm2
1619; SSE2-NEXT:  .LBB33_6:
1620; SSE2-NEXT:    movdqa %xmm2, %xmm0
1621; SSE2-NEXT:    cmpordss %xmm2, %xmm0
1622; SSE2-NEXT:    movaps %xmm0, %xmm3
1623; SSE2-NEXT:    andps %xmm2, %xmm3
1624; SSE2-NEXT:    js .LBB33_8
1625; SSE2-NEXT:  # %bb.7:
1626; SSE2-NEXT:    movdqa %xmm1, %xmm4
1627; SSE2-NEXT:  .LBB33_8:
1628; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1629; SSE2-NEXT:    psrlq $48, %xmm1
1630; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1631; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
1632; SSE2-NEXT:    psrlq $48, %xmm1
1633; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
1634; SSE2-NEXT:    maxss %xmm4, %xmm2
1635; SSE2-NEXT:    andnps %xmm2, %xmm0
1636; SSE2-NEXT:    orps %xmm3, %xmm0
1637; SSE2-NEXT:    callq __truncsfhf2@PLT
1638; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1639; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1640; SSE2-NEXT:    callq __extendhfsf2@PLT
1641; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1642; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1643; SSE2-NEXT:    callq __extendhfsf2@PLT
1644; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
1645; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
1646; SSE2-NEXT:    movd %xmm0, %eax
1647; SSE2-NEXT:    testl %eax, %eax
1648; SSE2-NEXT:    movdqa %xmm0, %xmm2
1649; SSE2-NEXT:    js .LBB33_10
1650; SSE2-NEXT:  # %bb.9:
1651; SSE2-NEXT:    movdqa %xmm4, %xmm2
1652; SSE2-NEXT:  .LBB33_10:
1653; SSE2-NEXT:    movdqa %xmm2, %xmm1
1654; SSE2-NEXT:    cmpordss %xmm2, %xmm1
1655; SSE2-NEXT:    movaps %xmm1, %xmm3
1656; SSE2-NEXT:    andps %xmm2, %xmm3
1657; SSE2-NEXT:    js .LBB33_12
1658; SSE2-NEXT:  # %bb.11:
1659; SSE2-NEXT:    movdqa %xmm0, %xmm4
1660; SSE2-NEXT:  .LBB33_12:
1661; SSE2-NEXT:    maxss %xmm4, %xmm2
1662; SSE2-NEXT:    andnps %xmm2, %xmm1
1663; SSE2-NEXT:    orps %xmm3, %xmm1
1664; SSE2-NEXT:    movaps %xmm1, %xmm0
1665; SSE2-NEXT:    callq __truncsfhf2@PLT
1666; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1667; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1668; SSE2-NEXT:    callq __extendhfsf2@PLT
1669; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
1670; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1671; SSE2-NEXT:    callq __extendhfsf2@PLT
1672; SSE2-NEXT:    movd (%rsp), %xmm4 # 4-byte Folded Reload
1673; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
1674; SSE2-NEXT:    movdqa %xmm0, %xmm1
1675; SSE2-NEXT:    movd %xmm0, %eax
1676; SSE2-NEXT:    testl %eax, %eax
1677; SSE2-NEXT:    movdqa %xmm0, %xmm2
1678; SSE2-NEXT:    js .LBB33_14
1679; SSE2-NEXT:  # %bb.13:
1680; SSE2-NEXT:    movdqa %xmm4, %xmm2
1681; SSE2-NEXT:  .LBB33_14:
1682; SSE2-NEXT:    movdqa %xmm2, %xmm0
1683; SSE2-NEXT:    cmpordss %xmm2, %xmm0
1684; SSE2-NEXT:    movaps %xmm0, %xmm3
1685; SSE2-NEXT:    andps %xmm2, %xmm3
1686; SSE2-NEXT:    js .LBB33_16
1687; SSE2-NEXT:  # %bb.15:
1688; SSE2-NEXT:    movdqa %xmm1, %xmm4
1689; SSE2-NEXT:  .LBB33_16:
1690; SSE2-NEXT:    maxss %xmm4, %xmm2
1691; SSE2-NEXT:    andnps %xmm2, %xmm0
1692; SSE2-NEXT:    orps %xmm3, %xmm0
1693; SSE2-NEXT:    callq __truncsfhf2@PLT
1694; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1695; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1696; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1697; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1698; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1699; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1700; SSE2-NEXT:    addq $104, %rsp
1701; SSE2-NEXT:    retq
1702;
1703; AVX1-LABEL: test_fmaximumnum_v4f16:
1704; AVX1:       # %bb.0:
1705; AVX1-NEXT:    subq $120, %rsp
1706; AVX1-NEXT:    vmovaps %xmm0, %xmm2
1707; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1708; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1709; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1710; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1711; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1712; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm0
1713; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1714; AVX1-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
1715; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
1716; AVX1-NEXT:    callq __extendhfsf2@PLT
1717; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1718; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1719; AVX1-NEXT:    callq __extendhfsf2@PLT
1720; AVX1-NEXT:    vmovd %xmm0, %eax
1721; AVX1-NEXT:    testl %eax, %eax
1722; AVX1-NEXT:    js .LBB33_1
1723; AVX1-NEXT:  # %bb.2:
1724; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
1725; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1726; AVX1-NEXT:    jmp .LBB33_3
1727; AVX1-NEXT:  .LBB33_1:
1728; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1729; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
1730; AVX1-NEXT:  .LBB33_3:
1731; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1732; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
1733; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1734; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
1735; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
1736; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1737; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
1738; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
1739; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1740; AVX1-NEXT:    callq __truncsfhf2@PLT
1741; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1742; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1743; AVX1-NEXT:    callq __extendhfsf2@PLT
1744; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1745; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1746; AVX1-NEXT:    callq __extendhfsf2@PLT
1747; AVX1-NEXT:    vmovd %xmm0, %eax
1748; AVX1-NEXT:    testl %eax, %eax
1749; AVX1-NEXT:    js .LBB33_4
1750; AVX1-NEXT:  # %bb.5:
1751; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
1752; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
1753; AVX1-NEXT:    jmp .LBB33_6
1754; AVX1-NEXT:  .LBB33_4:
1755; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1756; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
1757; AVX1-NEXT:  .LBB33_6:
1758; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
1759; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
1760; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1761; AVX1-NEXT:    callq __truncsfhf2@PLT
1762; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1763; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1764; AVX1-NEXT:    callq __extendhfsf2@PLT
1765; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1766; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1767; AVX1-NEXT:    callq __extendhfsf2@PLT
1768; AVX1-NEXT:    vmovd %xmm0, %eax
1769; AVX1-NEXT:    testl %eax, %eax
1770; AVX1-NEXT:    js .LBB33_7
1771; AVX1-NEXT:  # %bb.8:
1772; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
1773; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
1774; AVX1-NEXT:    jmp .LBB33_9
1775; AVX1-NEXT:  .LBB33_7:
1776; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1777; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
1778; AVX1-NEXT:  .LBB33_9:
1779; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
1780; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
1781; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1782; AVX1-NEXT:    callq __truncsfhf2@PLT
1783; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1784; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1785; AVX1-NEXT:    callq __extendhfsf2@PLT
1786; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1787; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1788; AVX1-NEXT:    callq __extendhfsf2@PLT
1789; AVX1-NEXT:    vmovd %xmm0, %eax
1790; AVX1-NEXT:    testl %eax, %eax
1791; AVX1-NEXT:    js .LBB33_10
1792; AVX1-NEXT:  # %bb.11:
1793; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
1794; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1795; AVX1-NEXT:    jmp .LBB33_12
1796; AVX1-NEXT:  .LBB33_10:
1797; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1798; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
1799; AVX1-NEXT:  .LBB33_12:
1800; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
1801; AVX1-NEXT:    vcmpordss %xmm2, %xmm2, %xmm1
1802; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1803; AVX1-NEXT:    callq __truncsfhf2@PLT
1804; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1805; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1806; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1807; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
1808; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
1809; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
1810; AVX1-NEXT:    addq $120, %rsp
1811; AVX1-NEXT:    retq
1812;
1813; AVX512-LABEL: test_fmaximumnum_v4f16:
1814; AVX512:       # %bb.0:
1815; AVX512-NEXT:    subq $88, %rsp
1816; AVX512-NEXT:    vmovdqa %xmm1, %xmm4
1817; AVX512-NEXT:    vmovdqa %xmm0, %xmm6
1818; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1819; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1820; AVX512-NEXT:    vucomiss %xmm0, %xmm0
1821; AVX512-NEXT:    setp %al
1822; AVX512-NEXT:    kmovw %eax, %k1
1823; AVX512-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1824; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1825; AVX512-NEXT:    vucomiss %xmm2, %xmm2
1826; AVX512-NEXT:    setp %al
1827; AVX512-NEXT:    kmovw %eax, %k2
1828; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
1829; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
1830; AVX512-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
1831; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm2
1832; AVX512-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
1833; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1834; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1835; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1836; AVX512-NEXT:    vucomiss %xmm0, %xmm2
1837; AVX512-NEXT:    seta %al
1838; AVX512-NEXT:    kmovw %eax, %k1
1839; AVX512-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
1840; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1841; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1842; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1843; AVX512-NEXT:    movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax
1844; AVX512-NEXT:    vmovd %eax, %xmm2
1845; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm9
1846; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
1847; AVX512-NEXT:    vxorps %xmm10, %xmm10, %xmm10
1848; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
1849; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1850; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1851; AVX512-NEXT:    vmovd %xmm0, %eax
1852; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1853; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm4[3,3,3,3]
1854; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1855; AVX512-NEXT:    vucomiss %xmm2, %xmm2
1856; AVX512-NEXT:    setp %al
1857; AVX512-NEXT:    kmovw %eax, %k1
1858; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3]
1859; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1860; AVX512-NEXT:    vucomiss %xmm3, %xmm3
1861; AVX512-NEXT:    setp %al
1862; AVX512-NEXT:    kmovw %eax, %k2
1863; AVX512-NEXT:    vmovss %xmm2, %xmm3, %xmm3 {%k2}
1864; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm1
1865; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1866; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm3
1867; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
1868; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
1869; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1870; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm2
1871; AVX512-NEXT:    vucomiss %xmm2, %xmm3
1872; AVX512-NEXT:    seta %al
1873; AVX512-NEXT:    kmovw %eax, %k1
1874; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
1875; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1876; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1877; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1878; AVX512-NEXT:    vmulss %xmm2, %xmm9, %xmm2
1879; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
1880; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm1
1881; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1882; AVX512-NEXT:    vmovd %xmm1, %eax
1883; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
1884; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
1885; AVX512-NEXT:    vpsrldq {{.*#+}} xmm0 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1886; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1887; AVX512-NEXT:    vucomiss %xmm0, %xmm0
1888; AVX512-NEXT:    setp %al
1889; AVX512-NEXT:    kmovw %eax, %k1
1890; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1891; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1892; AVX512-NEXT:    vucomiss %xmm3, %xmm3
1893; AVX512-NEXT:    setp %al
1894; AVX512-NEXT:    kmovw %eax, %k2
1895; AVX512-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k2}
1896; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm1
1897; AVX512-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1898; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm5
1899; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
1900; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1901; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1902; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm3
1903; AVX512-NEXT:    vucomiss %xmm3, %xmm5
1904; AVX512-NEXT:    seta %al
1905; AVX512-NEXT:    kmovw %eax, %k1
1906; AVX512-NEXT:    vmovss %xmm5, %xmm3, %xmm3 {%k1}
1907; AVX512-NEXT:    vshufpd {{.*#+}} xmm0 = xmm4[1,0]
1908; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1909; AVX512-NEXT:    vucomiss %xmm0, %xmm0
1910; AVX512-NEXT:    setp %al
1911; AVX512-NEXT:    kmovw %eax, %k1
1912; AVX512-NEXT:    vshufpd {{.*#+}} xmm5 = xmm6[1,0]
1913; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1914; AVX512-NEXT:    vucomiss %xmm5, %xmm5
1915; AVX512-NEXT:    setp %al
1916; AVX512-NEXT:    kmovw %eax, %k2
1917; AVX512-NEXT:    vmovss %xmm0, %xmm5, %xmm5 {%k2}
1918; AVX512-NEXT:    vcvtps2ph $4, %xmm5, %xmm15
1919; AVX512-NEXT:    vcvtph2ps %xmm15, %xmm5
1920; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
1921; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1922; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1923; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1924; AVX512-NEXT:    vucomiss %xmm0, %xmm5
1925; AVX512-NEXT:    seta %al
1926; AVX512-NEXT:    kmovw %eax, %k1
1927; AVX512-NEXT:    vmovss %xmm5, %xmm0, %xmm0 {%k1}
1928; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm3
1929; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
1930; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1931; AVX512-NEXT:    vmulss %xmm3, %xmm9, %xmm3
1932; AVX512-NEXT:    vblendps {{.*#+}} xmm3 = xmm3[0],xmm10[1,2,3]
1933; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm1
1934; AVX512-NEXT:    vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1935; AVX512-NEXT:    vmovd %xmm1, %eax
1936; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1937; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1938; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1939; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
1940; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
1941; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1942; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1943; AVX512-NEXT:    vmovd %xmm0, %ecx
1944; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
1945; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm3
1946; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1947; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1948; AVX512-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1949; AVX512-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7]
1950; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1951; AVX512-NEXT:    vucomiss %xmm0, %xmm0
1952; AVX512-NEXT:    setp %al
1953; AVX512-NEXT:    kmovw %eax, %k1
1954; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7]
1955; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1956; AVX512-NEXT:    vucomiss %xmm2, %xmm2
1957; AVX512-NEXT:    setp %al
1958; AVX512-NEXT:    kmovw %eax, %k2
1959; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
1960; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm11
1961; AVX512-NEXT:    vcvtph2ps %xmm11, %xmm3
1962; AVX512-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
1963; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1964; AVX512-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1965; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm2
1966; AVX512-NEXT:    vucomiss %xmm2, %xmm3
1967; AVX512-NEXT:    seta %al
1968; AVX512-NEXT:    kmovw %eax, %k1
1969; AVX512-NEXT:    vmovss %xmm3, %xmm2, %xmm2 {%k1}
1970; AVX512-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3]
1971; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
1972; AVX512-NEXT:    vucomiss %xmm0, %xmm0
1973; AVX512-NEXT:    setp %al
1974; AVX512-NEXT:    kmovw %eax, %k1
1975; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3]
1976; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1977; AVX512-NEXT:    vucomiss %xmm3, %xmm3
1978; AVX512-NEXT:    setp %al
1979; AVX512-NEXT:    kmovw %eax, %k2
1980; AVX512-NEXT:    vmovss %xmm0, %xmm3, %xmm3 {%k2}
1981; AVX512-NEXT:    vcvtps2ph $4, %xmm3, %xmm7
1982; AVX512-NEXT:    vcvtph2ps %xmm7, %xmm3
1983; AVX512-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
1984; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm12
1985; AVX512-NEXT:    vcvtph2ps %xmm12, %xmm0
1986; AVX512-NEXT:    vucomiss %xmm0, %xmm3
1987; AVX512-NEXT:    seta %al
1988; AVX512-NEXT:    kmovw %eax, %k1
1989; AVX512-NEXT:    vmovss %xmm3, %xmm0, %xmm0 {%k1}
1990; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
1991; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
1992; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1993; AVX512-NEXT:    vmulss %xmm2, %xmm9, %xmm2
1994; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
1995; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm14
1996; AVX512-NEXT:    vmovd %xmm14, %eax
1997; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
1998; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
1999; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
2000; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
2001; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
2002; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm13
2003; AVX512-NEXT:    vmovd %xmm13, %ecx
2004; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
2005; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm2
2006; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2007; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm0
2008; AVX512-NEXT:    vucomiss %xmm0, %xmm0
2009; AVX512-NEXT:    setp %al
2010; AVX512-NEXT:    kmovw %eax, %k1
2011; AVX512-NEXT:    vcvtph2ps %xmm6, %xmm2
2012; AVX512-NEXT:    vucomiss %xmm2, %xmm2
2013; AVX512-NEXT:    setp %al
2014; AVX512-NEXT:    kmovw %eax, %k2
2015; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k2}
2016; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm3
2017; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm1
2018; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
2019; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm8
2020; AVX512-NEXT:    vcvtph2ps %xmm8, %xmm2
2021; AVX512-NEXT:    vucomiss %xmm2, %xmm1
2022; AVX512-NEXT:    seta %al
2023; AVX512-NEXT:    kmovw %eax, %k1
2024; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
2025; AVX512-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm4[1,1,1,1,4,5,6,7]
2026; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm1
2027; AVX512-NEXT:    vucomiss %xmm1, %xmm1
2028; AVX512-NEXT:    setp %al
2029; AVX512-NEXT:    kmovw %eax, %k1
2030; AVX512-NEXT:    vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7]
2031; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
2032; AVX512-NEXT:    vucomiss %xmm4, %xmm4
2033; AVX512-NEXT:    setp %al
2034; AVX512-NEXT:    kmovw %eax, %k2
2035; AVX512-NEXT:    vmovss %xmm1, %xmm4, %xmm4 {%k2}
2036; AVX512-NEXT:    vcvtps2ph $4, %xmm4, %xmm4
2037; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm6
2038; AVX512-NEXT:    vmovss %xmm6, %xmm1, %xmm1 {%k1}
2039; AVX512-NEXT:    vcvtps2ph $4, %xmm1, %xmm1
2040; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm0
2041; AVX512-NEXT:    vucomiss %xmm0, %xmm6
2042; AVX512-NEXT:    seta %al
2043; AVX512-NEXT:    kmovw %eax, %k1
2044; AVX512-NEXT:    vmovss %xmm6, %xmm0, %xmm0 {%k1}
2045; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm2
2046; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero
2047; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
2048; AVX512-NEXT:    vmulss %xmm2, %xmm9, %xmm2
2049; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm0
2050; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
2051; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm0
2052; AVX512-NEXT:    vmulss %xmm0, %xmm9, %xmm0
2053; AVX512-NEXT:    vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3]
2054; AVX512-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3]
2055; AVX512-NEXT:    vcvtps2ph $4, %xmm2, %xmm9
2056; AVX512-NEXT:    vmovd %xmm9, %eax
2057; AVX512-NEXT:    vcvtps2ph $4, %xmm0, %xmm10
2058; AVX512-NEXT:    vmovd %xmm10, %ecx
2059; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
2060; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm2
2061; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
2062; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1]
2063; AVX512-NEXT:    vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload
2064; AVX512-NEXT:    # xmm6 = xmm0[0],mem[0]
2065; AVX512-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
2066; AVX512-NEXT:    vmovd %xmm0, %eax
2067; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2068; AVX512-NEXT:    vmovd %xmm0, %ecx
2069; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm0
2070; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm2
2071; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
2072; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
2073; AVX512-NEXT:    vmovd %xmm2, %eax
2074; AVX512-NEXT:    vmovd %xmm15, %ecx
2075; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
2076; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
2077; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
2078; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1]
2079; AVX512-NEXT:    vmovd %xmm11, %eax
2080; AVX512-NEXT:    vmovd %xmm7, %ecx
2081; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm2
2082; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
2083; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
2084; AVX512-NEXT:    vmovd %xmm3, %eax
2085; AVX512-NEXT:    vmovd %xmm4, %ecx
2086; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
2087; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
2088; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
2089; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
2090; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
2091; AVX512-NEXT:    vpxor %xmm0, %xmm0, %xmm0
2092; AVX512-NEXT:    vpcmpeqw %xmm0, %xmm2, %xmm3
2093; AVX512-NEXT:    vpblendvb %xmm3, %xmm2, %xmm6, %xmm2
2094; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2095; AVX512-NEXT:    vmovd %xmm3, %eax
2096; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
2097; AVX512-NEXT:    vmovd %xmm3, %ecx
2098; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm3
2099; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm4
2100; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
2101; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2102; AVX512-NEXT:    vmovd %xmm4, %eax
2103; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2104; AVX512-NEXT:    vmovd %xmm4, %ecx
2105; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
2106; AVX512-NEXT:    vpinsrw $0, %ecx, %xmm0, %xmm5
2107; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
2108; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1]
2109; AVX512-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
2110; AVX512-NEXT:    vmovd %xmm4, %eax
2111; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm4
2112; AVX512-NEXT:    vmovd %xmm12, %eax
2113; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
2114; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
2115; AVX512-NEXT:    vmovd %xmm8, %eax
2116; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm5
2117; AVX512-NEXT:    vmovd %xmm1, %eax
2118; AVX512-NEXT:    vpinsrw $0, %eax, %xmm0, %xmm1
2119; AVX512-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
2120; AVX512-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1]
2121; AVX512-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
2122; AVX512-NEXT:    vpcmpeqw %xmm0, %xmm1, %xmm0
2123; AVX512-NEXT:    vpblendvb %xmm0, %xmm1, %xmm2, %xmm0
2124; AVX512-NEXT:    vcvtph2ps %xmm10, %xmm1
2125; AVX512-NEXT:    xorl %eax, %eax
2126; AVX512-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2127; AVX512-NEXT:    vucomiss %xmm2, %xmm1
2128; AVX512-NEXT:    movl $65535, %ecx # imm = 0xFFFF
2129; AVX512-NEXT:    movl $0, %edx
2130; AVX512-NEXT:    cmovel %ecx, %edx
2131; AVX512-NEXT:    vcvtph2ps %xmm9, %xmm1
2132; AVX512-NEXT:    vucomiss %xmm2, %xmm1
2133; AVX512-NEXT:    movl $0, %esi
2134; AVX512-NEXT:    cmovel %ecx, %esi
2135; AVX512-NEXT:    vcvtph2ps %xmm13, %xmm1
2136; AVX512-NEXT:    vucomiss %xmm2, %xmm1
2137; AVX512-NEXT:    movl $0, %edi
2138; AVX512-NEXT:    cmovel %ecx, %edi
2139; AVX512-NEXT:    vcvtph2ps %xmm14, %xmm1
2140; AVX512-NEXT:    vucomiss %xmm2, %xmm1
2141; AVX512-NEXT:    movl $0, %r8d
2142; AVX512-NEXT:    cmovel %ecx, %r8d
2143; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2144; AVX512-NEXT:    vucomiss %xmm2, %xmm1
2145; AVX512-NEXT:    movl $0, %r9d
2146; AVX512-NEXT:    cmovel %ecx, %r9d
2147; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2148; AVX512-NEXT:    vucomiss %xmm2, %xmm1
2149; AVX512-NEXT:    movl $0, %r10d
2150; AVX512-NEXT:    cmovel %ecx, %r10d
2151; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2152; AVX512-NEXT:    vucomiss %xmm2, %xmm1
2153; AVX512-NEXT:    movl $0, %r11d
2154; AVX512-NEXT:    cmovel %ecx, %r11d
2155; AVX512-NEXT:    vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
2156; AVX512-NEXT:    vucomiss %xmm2, %xmm1
2157; AVX512-NEXT:    vmovd %esi, %xmm1
2158; AVX512-NEXT:    vpinsrw $1, %edx, %xmm1, %xmm1
2159; AVX512-NEXT:    vpinsrw $2, %edi, %xmm1, %xmm1
2160; AVX512-NEXT:    vpinsrw $3, %r8d, %xmm1, %xmm1
2161; AVX512-NEXT:    vpinsrw $4, %r9d, %xmm1, %xmm1
2162; AVX512-NEXT:    vpinsrw $5, %r10d, %xmm1, %xmm1
2163; AVX512-NEXT:    vpinsrw $6, %r11d, %xmm1, %xmm1
2164; AVX512-NEXT:    cmovel %ecx, %eax
2165; AVX512-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
2166; AVX512-NEXT:    vpblendvb %xmm1, %xmm0, %xmm6, %xmm0
2167; AVX512-NEXT:    addq $88, %rsp
2168; AVX512-NEXT:    retq
2169;
2170; AVX10_2-LABEL: test_fmaximumnum_v4f16:
2171; AVX10_2:       # %bb.0:
2172; AVX10_2-NEXT:    vminmaxph $17, %xmm1, %xmm0, %xmm0
2173; AVX10_2-NEXT:    retq
2174;
2175; X86-LABEL: test_fmaximumnum_v4f16:
2176; X86:       # %bb.0:
2177; X86-NEXT:    subl $164, %esp
2178; X86-NEXT:    vmovdqa %xmm0, %xmm2
2179; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2180; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
2181; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2182; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
2183; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2184; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
2185; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2186; X86-NEXT:    vpsrlq $48, %xmm1, %xmm0
2187; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2188; X86-NEXT:    vpsrld $16, %xmm2, %xmm0
2189; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2190; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
2191; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2192; X86-NEXT:    vpextrw $0, %xmm1, (%esp)
2193; X86-NEXT:    calll __extendhfsf2
2194; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
2195; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2196; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2197; X86-NEXT:    calll __extendhfsf2
2198; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
2199; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2200; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2201; X86-NEXT:    calll __extendhfsf2
2202; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2203; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2204; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2205; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
2206; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2207; X86-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2208; X86-NEXT:    vmovd %xmm2, %eax
2209; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2210; X86-NEXT:    testl %eax, %eax
2211; X86-NEXT:    js .LBB33_1
2212; X86-NEXT:  # %bb.2:
2213; X86-NEXT:    vmovdqa %xmm2, %xmm1
2214; X86-NEXT:    jmp .LBB33_3
2215; X86-NEXT:  .LBB33_1:
2216; X86-NEXT:    vmovdqa %xmm0, %xmm1
2217; X86-NEXT:    vmovdqa %xmm2, %xmm0
2218; X86-NEXT:  .LBB33_3:
2219; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2220; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
2221; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2222; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2223; X86-NEXT:    calll __extendhfsf2
2224; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2225; X86-NEXT:    vmovss %xmm0, (%esp)
2226; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2227; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
2228; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2229; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2230; X86-NEXT:    vmovd %xmm1, %eax
2231; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2232; X86-NEXT:    testl %eax, %eax
2233; X86-NEXT:    js .LBB33_4
2234; X86-NEXT:  # %bb.5:
2235; X86-NEXT:    vmovdqa %xmm1, %xmm2
2236; X86-NEXT:    jmp .LBB33_6
2237; X86-NEXT:  .LBB33_4:
2238; X86-NEXT:    vmovdqa %xmm0, %xmm2
2239; X86-NEXT:    vmovdqa %xmm1, %xmm0
2240; X86-NEXT:  .LBB33_6:
2241; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
2242; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
2243; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2244; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2245; X86-NEXT:    calll __truncsfhf2
2246; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2247; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2248; X86-NEXT:    vmovss %xmm0, (%esp)
2249; X86-NEXT:    calll __truncsfhf2
2250; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2251; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2252; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2253; X86-NEXT:    calll __extendhfsf2
2254; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
2255; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2256; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2257; X86-NEXT:    calll __extendhfsf2
2258; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
2259; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2260; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2261; X86-NEXT:    calll __extendhfsf2
2262; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2263; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2264; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2265; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
2266; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2267; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2268; X86-NEXT:    vmovd %xmm1, %eax
2269; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2270; X86-NEXT:    testl %eax, %eax
2271; X86-NEXT:    js .LBB33_7
2272; X86-NEXT:  # %bb.8:
2273; X86-NEXT:    vmovdqa %xmm1, %xmm2
2274; X86-NEXT:    jmp .LBB33_9
2275; X86-NEXT:  .LBB33_7:
2276; X86-NEXT:    vmovdqa %xmm0, %xmm2
2277; X86-NEXT:    vmovdqa %xmm1, %xmm0
2278; X86-NEXT:  .LBB33_9:
2279; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
2280; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
2281; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2282; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2283; X86-NEXT:    calll __extendhfsf2
2284; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2285; X86-NEXT:    vmovss %xmm0, (%esp)
2286; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2287; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
2288; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2289; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2290; X86-NEXT:    vmovd %xmm1, %eax
2291; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2292; X86-NEXT:    testl %eax, %eax
2293; X86-NEXT:    js .LBB33_10
2294; X86-NEXT:  # %bb.11:
2295; X86-NEXT:    vmovdqa %xmm1, %xmm2
2296; X86-NEXT:    jmp .LBB33_12
2297; X86-NEXT:  .LBB33_10:
2298; X86-NEXT:    vmovdqa %xmm0, %xmm2
2299; X86-NEXT:    vmovdqa %xmm1, %xmm0
2300; X86-NEXT:  .LBB33_12:
2301; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
2302; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
2303; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2304; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2305; X86-NEXT:    calll __truncsfhf2
2306; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2307; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2308; X86-NEXT:    vmovd %xmm0, (%esp)
2309; X86-NEXT:    calll __truncsfhf2
2310; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
2311; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2312; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
2313; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2314; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
2315; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
2316; X86-NEXT:    addl $164, %esp
2317; X86-NEXT:    retl
2318  %r = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y)
2319  ret <4 x half> %r
2320}
2321
; Vector maximumnum on <4 x bfloat>. There is no native bf16 max on most of
; these targets, so each lane is widened to f32 (shll $16 into the high half
; of a GPR, then movd), run through the scalar maxss/vmaxss + cmpordss
; NaN/signed-zero fixup sequence, and narrowed back via __truncsfbf2 libcalls.
; AVX10.2 lowers the whole vector directly to a single vminmaxbf16.
; NOTE: CHECK lines below are autogenerated by update_llc_test_checks.py —
; regenerate rather than hand-edit them.
define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_v4bf16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pextrw $0, %xmm1, %r14d
; SSE2-NEXT:    pextrw $0, %xmm0, %r15d
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movdqa %xmm5, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    pextrw $0, %xmm0, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    movd %ecx, %xmm3
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    testl %ecx, %ecx
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    js .LBB34_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:  .LBB34_2:
; SSE2-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1,1,1]
; SSE2-NEXT:    movdqa %xmm5, (%rsp) # 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1,1,1]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    cmpordss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm6
; SSE2-NEXT:    andps %xmm1, %xmm6
; SSE2-NEXT:    js .LBB34_4
; SSE2-NEXT:  # %bb.3:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:  .LBB34_4:
; SSE2-NEXT:    pextrw $0, %xmm4, %ebp
; SSE2-NEXT:    pextrw $0, %xmm5, %ebx
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm6, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    shll $16, %r15d
; SSE2-NEXT:    movd %r15d, %xmm3
; SSE2-NEXT:    shll $16, %r14d
; SSE2-NEXT:    movd %r14d, %xmm2
; SSE2-NEXT:    testl %r15d, %r15d
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    js .LBB34_6
; SSE2-NEXT:  # %bb.5:
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:  .LBB34_6:
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE2-NEXT:    psrlq $48, %xmm5
; SSE2-NEXT:    movdqa (%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT:    psrlq $48, %xmm6
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    cmpordss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    andps %xmm1, %xmm4
; SSE2-NEXT:    js .LBB34_8
; SSE2-NEXT:  # %bb.7:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:  .LBB34_8:
; SSE2-NEXT:    pextrw $0, %xmm5, %r15d
; SSE2-NEXT:    pextrw $0, %xmm6, %r14d
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm4, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd %ebx, %xmm1
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movd %ebp, %xmm3
; SSE2-NEXT:    testl %ebx, %ebx
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    js .LBB34_10
; SSE2-NEXT:  # %bb.9:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:  .LBB34_10:
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    cmpordss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    andps %xmm2, %xmm4
; SSE2-NEXT:    js .LBB34_12
; SSE2-NEXT:  # %bb.11:
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:  .LBB34_12:
; SSE2-NEXT:    maxss %xmm3, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm0
; SSE2-NEXT:    orps %xmm4, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    shll $16, %r14d
; SSE2-NEXT:    movd %r14d, %xmm1
; SSE2-NEXT:    shll $16, %r15d
; SSE2-NEXT:    movd %r15d, %xmm3
; SSE2-NEXT:    testl %r14d, %r14d
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    js .LBB34_14
; SSE2-NEXT:  # %bb.13:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:  .LBB34_14:
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    cmpordss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    andps %xmm2, %xmm4
; SSE2-NEXT:    js .LBB34_16
; SSE2-NEXT:  # %bb.15:
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:  .LBB34_16:
; SSE2-NEXT:    maxss %xmm3, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm0
; SSE2-NEXT:    orps %xmm4, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximumnum_v4bf16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    pushq %r15
; AVX1-NEXT:    pushq %r14
; AVX1-NEXT:    pushq %r13
; AVX1-NEXT:    pushq %r12
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $56, %rsp
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $48, %xmm1, %xmm3
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpextrw $0, %xmm4, %ebx
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpextrw $0, %xmm4, %ebp
; AVX1-NEXT:    vpextrw $0, %xmm0, %r12d
; AVX1-NEXT:    vpextrw $0, %xmm1, %r13d
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpextrw $0, %xmm0, %eax
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
; AVX1-NEXT:    vpextrw $0, %xmm0, %ecx
; AVX1-NEXT:    shll $16, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    shll $16, %eax
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    js .LBB34_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmovdqa %xmm4, %xmm1
; AVX1-NEXT:    jmp .LBB34_3
; AVX1-NEXT:  .LBB34_1:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm4, %xmm0
; AVX1-NEXT:  .LBB34_3:
; AVX1-NEXT:    vpextrw $0, %xmm2, %r14d
; AVX1-NEXT:    vpextrw $0, %xmm3, %r15d
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    callq __truncsfbf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    shll $16, %r13d
; AVX1-NEXT:    vmovd %r13d, %xmm0
; AVX1-NEXT:    shll $16, %r12d
; AVX1-NEXT:    vmovd %r12d, %xmm2
; AVX1-NEXT:    js .LBB34_4
; AVX1-NEXT:  # %bb.5:
; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
; AVX1-NEXT:    jmp .LBB34_6
; AVX1-NEXT:  .LBB34_4:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:  .LBB34_6:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    callq __truncsfbf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    shll $16, %ebp
; AVX1-NEXT:    vmovd %ebp, %xmm0
; AVX1-NEXT:    shll $16, %ebx
; AVX1-NEXT:    vmovd %ebx, %xmm2
; AVX1-NEXT:    js .LBB34_7
; AVX1-NEXT:  # %bb.8:
; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
; AVX1-NEXT:    jmp .LBB34_9
; AVX1-NEXT:  .LBB34_7:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:  .LBB34_9:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    callq __truncsfbf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    shll $16, %r15d
; AVX1-NEXT:    vmovd %r15d, %xmm0
; AVX1-NEXT:    shll $16, %r14d
; AVX1-NEXT:    vmovd %r14d, %xmm2
; AVX1-NEXT:    js .LBB34_10
; AVX1-NEXT:  # %bb.11:
; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
; AVX1-NEXT:    jmp .LBB34_12
; AVX1-NEXT:  .LBB34_10:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:  .LBB34_12:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    callq __truncsfbf2@PLT
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT:    addq $56, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    popq %r12
; AVX1-NEXT:    popq %r13
; AVX1-NEXT:    popq %r14
; AVX1-NEXT:    popq %r15
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximumnum_v4bf16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    pushq %r15
; AVX512-NEXT:    pushq %r14
; AVX512-NEXT:    pushq %r13
; AVX512-NEXT:    pushq %r12
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    pushq %rax
; AVX512-NEXT:    vmovq %xmm1, %r13
; AVX512-NEXT:    movq %r13, %rbx
; AVX512-NEXT:    shrq $32, %rbx
; AVX512-NEXT:    vmovq %xmm0, %rbp
; AVX512-NEXT:    movq %rbp, %r14
; AVX512-NEXT:    shrq $32, %r14
; AVX512-NEXT:    movq %r13, %r15
; AVX512-NEXT:    shrq $48, %r15
; AVX512-NEXT:    movq %rbp, %r12
; AVX512-NEXT:    shrq $48, %r12
; AVX512-NEXT:    movl %ebp, %eax
; AVX512-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
; AVX512-NEXT:    sets %cl
; AVX512-NEXT:    kmovw %ecx, %k1
; AVX512-NEXT:    movl %r13d, %ecx
; AVX512-NEXT:    andl $-65536, %ecx # imm = 0xFFFF0000
; AVX512-NEXT:    vmovd %ecx, %xmm1
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    callq __truncsfbf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    shll $16, %ebp
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    shll $16, %r13d
; AVX512-NEXT:    vmovd %r13d, %xmm1
; AVX512-NEXT:    vmovd %ebp, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    callq __truncsfbf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsp)
; AVX512-NEXT:    shll $16, %r12d
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    shll $16, %r15d
; AVX512-NEXT:    vmovd %r15d, %xmm1
; AVX512-NEXT:    vmovd %r12d, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    callq __truncsfbf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    shll $16, %r14d
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    shll $16, %ebx
; AVX512-NEXT:    vmovd %ebx, %xmm1
; AVX512-NEXT:    vmovd %r14d, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    callq __truncsfbf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %xmm0
; AVX512-NEXT:    addq $8, %rsp
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    popq %r12
; AVX512-NEXT:    popq %r13
; AVX512-NEXT:    popq %r14
; AVX512-NEXT:    popq %r15
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximumnum_v4bf16:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxbf16 $17, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximumnum_v4bf16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    pushl %ebx
; X86-NEXT:    pushl %edi
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $68, %esp
; X86-NEXT:    vpsrlq $48, %xmm0, %xmm2
; X86-NEXT:    vpsrlq $48, %xmm1, %xmm3
; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; X86-NEXT:    vpextrw $0, %xmm4, %esi
; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; X86-NEXT:    vpextrw $0, %xmm4, %ebx
; X86-NEXT:    vpextrw $0, %xmm0, %eax
; X86-NEXT:    vpextrw $0, %xmm1, %ecx
; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, %edx
; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, %edi
; X86-NEXT:    shll $16, %edi
; X86-NEXT:    vmovd %edi, %xmm0
; X86-NEXT:    shll $16, %edx
; X86-NEXT:    vmovd %edx, %xmm4
; X86-NEXT:    js .LBB34_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovdqa %xmm4, %xmm1
; X86-NEXT:    jmp .LBB34_3
; X86-NEXT:  .LBB34_1:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm4, %xmm0
; X86-NEXT:  .LBB34_3:
; X86-NEXT:    vpextrw $0, %xmm2, %edi
; X86-NEXT:    vpextrw $0, %xmm3, %ebp
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    shll $16, %ecx
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm2
; X86-NEXT:    js .LBB34_4
; X86-NEXT:  # %bb.5:
; X86-NEXT:    vmovdqa %xmm2, %xmm1
; X86-NEXT:    jmp .LBB34_6
; X86-NEXT:  .LBB34_4:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm2, %xmm0
; X86-NEXT:  .LBB34_6:
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __truncsfbf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    shll $16, %ebx
; X86-NEXT:    vmovd %ebx, %xmm0
; X86-NEXT:    shll $16, %esi
; X86-NEXT:    vmovd %esi, %xmm2
; X86-NEXT:    js .LBB34_7
; X86-NEXT:  # %bb.8:
; X86-NEXT:    vmovdqa %xmm2, %xmm1
; X86-NEXT:    jmp .LBB34_9
; X86-NEXT:  .LBB34_7:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm2, %xmm0
; X86-NEXT:  .LBB34_9:
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __truncsfbf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    shll $16, %ebp
; X86-NEXT:    vmovd %ebp, %xmm0
; X86-NEXT:    shll $16, %edi
; X86-NEXT:    vmovd %edi, %xmm2
; X86-NEXT:    js .LBB34_10
; X86-NEXT:  # %bb.11:
; X86-NEXT:    vmovdqa %xmm2, %xmm1
; X86-NEXT:    jmp .LBB34_12
; X86-NEXT:  .LBB34_10:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm2, %xmm0
; X86-NEXT:  .LBB34_12:
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __truncsfbf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovd %xmm0, (%esp)
; X86-NEXT:    calll __truncsfbf2
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; X86-NEXT:    addl $68, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    popl %edi
; X86-NEXT:    popl %ebx
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %r = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
  ret <4 x bfloat> %r
}
2766