xref: /llvm-project/llvm/test/CodeGen/X86/fminimum-fmaximum.ll (revision 13c6abfac84fca4bc55c0721d1853ce86a385678)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f  | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx        | FileCheck %s --check-prefixes=X86

declare float @llvm.maximum.f32(float, float)
declare double @llvm.maximum.f64(double, double)
declare float @llvm.minimum.f32(float, float)
declare double @llvm.minimum.f64(double, double)
declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>)
declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>)
declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>)
declare <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat>, <4 x bfloat>)

;
; fmaximum
;

; Scalar f32 maximum: codegen must propagate NaN and order -0.0 < +0.0.
define float @test_fmaximum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximum:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    js .LBB0_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:  .LBB0_2:
; SSE2-NEXT:    movdqa %xmm3, %xmm0
; SSE2-NEXT:    cmpunordss %xmm3, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    andps %xmm3, %xmm4
; SSE2-NEXT:    js .LBB0_4
; SSE2-NEXT:  # %bb.3:
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:  .LBB0_4:
; SSE2-NEXT:    maxss %xmm1, %xmm3
; SSE2-NEXT:    andnps %xmm3, %xmm0
; SSE2-NEXT:    orps %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximum:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB0_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB0_3
; AVX1-NEXT:  .LBB0_1:
; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:  .LBB0_3:
; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximum:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    testl %eax, %eax
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd %xmm2, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB0_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovdqa %xmm2, %xmm1
; X86-NEXT:    jmp .LBB0_3
; X86-NEXT:  .LBB0_1:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm2, %xmm0
; X86-NEXT:  .LBB0_3:
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = tail call float @llvm.maximum.f32(float %x, float %y)
  ret float %1
}

; With no-NaNs and no-signed-zeros the vector maximum folds to a plain maxps.
define <4 x float> @test_fmaximum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
; SSE2-LABEL: test_fmaximum_scalarize:
; SSE2:       # %bb.0:
; SSE2-NEXT:    maxps %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fmaximum_scalarize:
; AVX:       # %bb.0:
; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_scalarize:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_scalarize:
; X86:       # %bb.0:
; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT:    retl
  %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
  ret <4 x float> %r
}

; A constant NaN in operand 0 folds the whole call to NaN at compile time.
define float @test_fmaximum_nan0(float %x, float %y) {
; SSE2-LABEL: test_fmaximum_nan0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fmaximum_nan0:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_nan0:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_nan0:
; X86:       # %bb.0:
; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
; X86-NEXT:    retl
  %1 = tail call float @llvm.maximum.f32(float 0x7fff000000000000, float %y)
  ret float %1
}

; A constant NaN in operand 1 folds the whole call to NaN at compile time.
define float @test_fmaximum_nan1(float %x, float %y) {
; SSE2-LABEL: test_fmaximum_nan1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fmaximum_nan1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_nan1:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_nan1:
; X86:       # %bb.0:
; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
; X86-NEXT:    retl
  %1 = tail call float @llvm.maximum.f32(float %x, float 0x7fff000000000000)
  ret float %1
}

; nnan operands let codegen drop the unordered-compare NaN fixup; only the
; signed-zero handling (sign test / fpclass) remains.
define float @test_fmaximum_nnan(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximum_nnan:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    addss %xmm1, %xmm2
; SSE2-NEXT:    subss %xmm1, %xmm0
; SSE2-NEXT:    movd %xmm2, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    js .LBB4_1
; SSE2-NEXT:  # %bb.2:
; SSE2-NEXT:    maxss %xmm2, %xmm0
; SSE2-NEXT:    retq
; SSE2-NEXT:  .LBB4_1:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximum_nnan:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovd %xmm2, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB4_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
; AVX1-NEXT:  .LBB4_1:
; AVX1-NEXT:    vmovaps %xmm0, %xmm1
; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: test_fmaximum_nnan:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; AVX512F-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm2, %eax
; AVX512F-NEXT:    testl %eax, %eax
; AVX512F-NEXT:    sets %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovaps %xmm2, %xmm1
; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512F-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: test_fmaximum_nnan:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; AVX512DQ-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
; AVX512DQ-NEXT:    kmovw %k0, %k1
; AVX512DQ-NEXT:    vmovaps %xmm2, %xmm1
; AVX512DQ-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512DQ-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512DQ-NEXT:    vmaxss %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_nnan:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vaddss %xmm1, %xmm0, %xmm2
; AVX10_2-NEXT:    vsubss %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT:    vminmaxss $1, %xmm0, %xmm2
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_nnan:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vaddss %xmm0, %xmm2, %xmm1
; X86-NEXT:    vsubss %xmm0, %xmm2, %xmm0
; X86-NEXT:    vmovd %xmm1, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB4_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovaps %xmm1, %xmm2
; X86-NEXT:    jmp .LBB4_3
; X86-NEXT:  .LBB4_1:
; X86-NEXT:    vmovaps %xmm0, %xmm2
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:  .LBB4_3:
; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = fadd nnan float %x, %y
  %2 = fsub nnan float %x, %y
  %3 = tail call float @llvm.maximum.f32(float %1, float %2)
  ret float %3
}

; Constant +0.0 in operand 0: the sign-select disappears, NaN fixup remains.
define double @test_fmaximum_zero0(double %x, double %y) nounwind {
; SSE2-LABEL: test_fmaximum_zero0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    cmpunordsd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm2
; SSE2-NEXT:    andpd %xmm1, %xmm2
; SSE2-NEXT:    xorpd %xmm3, %xmm3
; SSE2-NEXT:    maxsd %xmm3, %xmm1
; SSE2-NEXT:    andnpd %xmm1, %xmm0
; SSE2-NEXT:    orpd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximum_zero0:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximum_zero0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
; AVX512-NEXT:    vmaxsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_zero0:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
; AVX10_2-NEXT:    vminmaxsd $1, %xmm0, %xmm1
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_zero0:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.maximum.f64(double 0.0, double %y)
  ret double %1
}

; Constant +0.0 in operand 1: same simplification as zero0, operands swapped.
define double @test_fmaximum_zero1(double %x, double %y) nounwind {
; SSE2-LABEL: test_fmaximum_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    cmpunordsd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm2
; SSE2-NEXT:    andpd %xmm0, %xmm2
; SSE2-NEXT:    xorpd %xmm3, %xmm3
; SSE2-NEXT:    maxsd %xmm3, %xmm0
; SSE2-NEXT:    andnpd %xmm0, %xmm1
; SSE2-NEXT:    orpd %xmm2, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximum_zero1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximum_zero1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, %xmm0
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_zero1:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; AVX10_2-NEXT:    vminmaxsd $1, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_zero1:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT:    vmaxsd %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.maximum.f64(double %x, double 0.0)
  ret double %1
}

; maximum(+0.0, -0.0) constant-folds to +0.0 (signed zeros are ordered).
define double @test_fmaximum_zero2(double %x, double %y) {
; SSE2-LABEL: test_fmaximum_zero2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    xorps %xmm0, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fmaximum_zero2:
; AVX:       # %bb.0:
; AVX-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_zero2:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_zero2:
; X86:       # %bb.0:
; X86-NEXT:    fldz
; X86-NEXT:    retl
  %1 = tail call double @llvm.maximum.f64(double 0.0, double -0.0)
  ret double %1
}

; no-signed-zeros drops the sign-select; only the NaN fixup around maxss stays.
define float @test_fmaximum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" nounwind {
; SSE2-LABEL: test_fmaximum_nsz:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movaps %xmm0, %xmm2
; SSE2-NEXT:    cmpunordss %xmm0, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm3
; SSE2-NEXT:    andps %xmm0, %xmm3
; SSE2-NEXT:    maxss %xmm1, %xmm0
; SSE2-NEXT:    andnps %xmm0, %xmm2
; SSE2-NEXT:    orps %xmm3, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximum_nsz:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximum_nsz:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_nsz:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_nsz:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
; X86-NEXT:    vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = tail call float @llvm.maximum.f32(float %x, float %y)
  ret float %1
}

; One operand is the result of an nnan fdiv; checks the compare-combining path.
define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximum_combine_cmps:
; SSE2:       # %bb.0:
; SSE2-NEXT:    divss %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    js .LBB9_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:  .LBB9_2:
; SSE2-NEXT:    movaps %xmm3, %xmm2
; SSE2-NEXT:    cmpunordss %xmm3, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm4
; SSE2-NEXT:    andps %xmm3, %xmm4
; SSE2-NEXT:    js .LBB9_4
; SSE2-NEXT:  # %bb.3:
; SSE2-NEXT:    movaps %xmm0, %xmm1
; SSE2-NEXT:  .LBB9_4:
; SSE2-NEXT:    maxss %xmm1, %xmm3
; SSE2-NEXT:    andnps %xmm3, %xmm2
; SSE2-NEXT:    orps %xmm4, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximum_combine_cmps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm1
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB9_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmovaps %xmm0, %xmm2
; AVX1-NEXT:    jmp .LBB9_3
; AVX1-NEXT:  .LBB9_1:
; AVX1-NEXT:    vmovaps %xmm1, %xmm2
; AVX1-NEXT:    vmovaps %xmm0, %xmm1
; AVX1-NEXT:  .LBB9_3:
; AVX1-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    vcmpunordss %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: test_fmaximum_combine_cmps:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vdivss %xmm0, %xmm1, %xmm1
; AVX512F-NEXT:    vmovd %xmm0, %eax
; AVX512F-NEXT:    testl %eax, %eax
; AVX512F-NEXT:    sets %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovaps %xmm0, %xmm2
; AVX512F-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512F-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512F-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: test_fmaximum_combine_cmps:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vdivss %xmm0, %xmm1, %xmm1
; AVX512DQ-NEXT:    vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
; AVX512DQ-NEXT:    kmovw %k0, %k1
; AVX512DQ-NEXT:    vmovaps %xmm1, %xmm2
; AVX512DQ-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512DQ-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512DQ-NEXT:    vmaxss %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_combine_cmps:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
; AVX10_2-NEXT:    vminmaxss $1, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_combine_cmps:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vdivss %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovd %xmm1, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB9_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovaps %xmm1, %xmm2
; X86-NEXT:    jmp .LBB9_3
; X86-NEXT:  .LBB9_1:
; X86-NEXT:    vmovaps %xmm0, %xmm2
; X86-NEXT:    vmovaps %xmm1, %xmm0
; X86-NEXT:  .LBB9_3:
; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = fdiv nnan float %y, %x
  %2 = tail call float @llvm.maximum.f32(float %x, float %1)
  ret float %2
}

;
; fminimum
;

; Scalar f32 minimum: codegen must propagate NaN and order -0.0 < +0.0.
define float @test_fminimum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimum:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    js .LBB10_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:  .LBB10_2:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:    cmpunordss %xmm3, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm4
; SSE2-NEXT:    andps %xmm3, %xmm4
; SSE2-NEXT:    js .LBB10_4
; SSE2-NEXT:  # %bb.3:
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:  .LBB10_4:
; SSE2-NEXT:    minss %xmm0, %xmm3
; SSE2-NEXT:    andnps %xmm3, %xmm2
; SSE2-NEXT:    orps %xmm4, %xmm2
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fminimum:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB10_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmovdqa %xmm1, %xmm2
; AVX1-NEXT:    jmp .LBB10_3
; AVX1-NEXT:  .LBB10_1:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:    vmovdqa %xmm1, %xmm0
; AVX1-NEXT:  .LBB10_3:
; AVX1-NEXT:    vminss %xmm2, %xmm0, %xmm1
; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fminimum:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovd %xmm0, %eax
; AVX512-NEXT:    testl %eax, %eax
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    vmovaps %xmm1, %xmm2
; AVX512-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    vminss %xmm2, %xmm0, %xmm1
; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovaps %xmm1, %xmm0
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimum:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimum:
; X86:       # %bb.0:
; X86-NEXT:    pushl %eax
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd %xmm0, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB10_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovdqa %xmm1, %xmm2
; X86-NEXT:    jmp .LBB10_3
; X86-NEXT:  .LBB10_1:
; X86-NEXT:    vmovdqa %xmm0, %xmm2
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:  .LBB10_3:
; X86-NEXT:    vminss %xmm2, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    flds (%esp)
; X86-NEXT:    popl %eax
; X86-NEXT:    retl
  %1 = tail call float @llvm.minimum.f32(float %x, float %y)
  ret float %1
}

; With no-NaNs and no-signed-zeros the vector minimum folds to a plain minpd.
define <2 x double> @test_fminimum_scalarize(<2 x double> %x, <2 x double> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
; SSE2-LABEL: test_fminimum_scalarize:
; SSE2:       # %bb.0:
; SSE2-NEXT:    minpd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fminimum_scalarize:
; AVX:       # %bb.0:
; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimum_scalarize:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimum_scalarize:
; X86:       # %bb.0:
; X86-NEXT:    vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT:    retl
  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y)
  ret <2 x double> %r
}

; A constant NaN in operand 0 folds the whole call to NaN at compile time.
define float @test_fminimum_nan0(float %x, float %y) {
; SSE2-LABEL: test_fminimum_nan0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fminimum_nan0:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimum_nan0:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimum_nan0:
; X86:       # %bb.0:
; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
; X86-NEXT:    retl
  %1 = tail call float @llvm.minimum.f32(float 0x7fff000000000000, float %y)
  ret float %1
}

; A constant NaN in operand 1 folds the whole call to NaN at compile time.
define float @test_fminimum_nan1(float %x, float %y) {
; SSE2-LABEL: test_fminimum_nan1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; SSE2-NEXT:    retq
;
; AVX-LABEL: test_fminimum_nan1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; AVX-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimum_nan1:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimum_nan1:
; X86:       # %bb.0:
; X86-NEXT:    flds {{\.?LCPI[0-9]+_[0-9]+}}
; X86-NEXT:    retl
  %1 = tail call float @llvm.minimum.f32(float %x, float 0x7fff000000000000)
  ret float %1
}

; no-NaNs function attribute drops the unordered-compare fixup; only the
; signed-zero selection (sign test / fpclass) remains around minsd.
define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true" nounwind {
; SSE2-LABEL: test_fminimum_nnan:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movq %xmm0, %rax
; SSE2-NEXT:    testq %rax, %rax
; SSE2-NEXT:    js .LBB14_1
; SSE2-NEXT:  # %bb.2:
; SSE2-NEXT:    minsd %xmm1, %xmm0
; SSE2-NEXT:    retq
; SSE2-NEXT:  .LBB14_1:
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    minsd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fminimum_nnan:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovq %xmm0, %rax
; AVX1-NEXT:    testq %rax, %rax
; AVX1-NEXT:    js .LBB14_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
; AVX1-NEXT:  .LBB14_1:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:    vminsd %xmm2, %xmm1, %xmm0
; AVX1-NEXT:    retq
;
; AVX512F-LABEL: test_fminimum_nnan:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovq %xmm0, %rax
; AVX512F-NEXT:    testq %rax, %rax
; AVX512F-NEXT:    sets %al
; AVX512F-NEXT:    kmovw %eax, %k1
; AVX512F-NEXT:    vmovapd %xmm1, %xmm2
; AVX512F-NEXT:    vmovsd %xmm0, %xmm2, %xmm2 {%k1}
; AVX512F-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT:    vminsd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512DQ-LABEL: test_fminimum_nnan:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vfpclasssd $5, %xmm1, %k0 # k0 = isQuietNaN(xmm1) | isNegativeZero(xmm1)
; AVX512DQ-NEXT:    kmovw %k0, %k1
; AVX512DQ-NEXT:    vmovapd %xmm0, %xmm2
; AVX512DQ-NEXT:    vmovsd %xmm1, %xmm2, %xmm2 {%k1}
; AVX512DQ-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512DQ-NEXT:    vminsd %xmm2, %xmm1, %xmm0
; AVX512DQ-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimum_nnan:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxsd $0, %xmm1, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimum_nnan:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vextractps $1, %xmm0, %eax
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB14_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovapd %xmm1, %xmm2
; X86-NEXT:    jmp .LBB14_3
; X86-NEXT:  .LBB14_1:
; X86-NEXT:    vmovapd %xmm0, %xmm2
; X86-NEXT:    vmovapd %xmm1, %xmm0
; X86-NEXT:  .LBB14_3:
; X86-NEXT:    vminsd %xmm2, %xmm0, %xmm0
; X86-NEXT:    vmovsd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.minimum.f64(double %x, double %y)
  ret double %1
}

; Constant -0.0 in operand 0: sign-select disappears; minsd against a
; constant-pool -0.0 plus the NaN fixup remains.
define double @test_fminimum_zero0(double %x, double %y) nounwind {
; SSE2-LABEL: test_fminimum_zero0:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    cmpunordsd %xmm1, %xmm0
; SSE2-NEXT:    movapd %xmm0, %xmm2
; SSE2-NEXT:    andpd %xmm1, %xmm2
; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    andnpd %xmm1, %xmm0
; SSE2-NEXT:    orpd %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fminimum_zero0:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpunordsd %xmm1, %xmm1, %xmm0
; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fminimum_zero0:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpunordsd %xmm1, %xmm1, %k1
; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimum_zero0:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimum_zero0:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm1
; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.minimum.f64(double -0.0, double %y)
  ret double %1
}

; Constant -0.0 in operand 1: same simplification as zero0, operands swapped.
define double @test_fminimum_zero1(double %x, double %y) nounwind {
; SSE2-LABEL: test_fminimum_zero1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movapd %xmm0, %xmm1
; SSE2-NEXT:    cmpunordsd %xmm0, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm2
; SSE2-NEXT:    andpd %xmm0, %xmm2
; SSE2-NEXT:    minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    andnpd %xmm0, %xmm1
; SSE2-NEXT:    orpd %xmm2, %xmm1
; SSE2-NEXT:    movapd %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fminimum_zero1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm1
; AVX1-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fminimum_zero1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vcmpunordsd %xmm0, %xmm0, %k1
; AVX512-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512-NEXT:    vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmovapd %xmm1, %xmm0
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fminimum_zero1:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fminimum_zero1:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    movl %esp, %ebp
; X86-NEXT:    andl $-8, %esp
; X86-NEXT:    subl $8, %esp
; X86-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT:    vcmpunordsd %xmm0, %xmm0, %xmm1
; X86-NEXT:    vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT:    vmovlpd %xmm0, (%esp)
; X86-NEXT:    fldl (%esp)
; X86-NEXT:    movl %ebp, %esp
; X86-NEXT:    popl %ebp
; X86-NEXT:    retl
  %1 = tail call double @llvm.minimum.f64(double %x, double -0.0)
  ret double %1
}

; minimum(-0.0, 0.0): both operands constant, so the call constant-folds to
; -0.0 — every SSE/AVX target just materializes the constant, and x87 uses
; fldz+fchs.
909define double @test_fminimum_zero2(double %x, double %y) {
910; SSE2-LABEL: test_fminimum_zero2:
911; SSE2:       # %bb.0:
912; SSE2-NEXT:    movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
913; SSE2-NEXT:    retq
914;
915; AVX-LABEL: test_fminimum_zero2:
916; AVX:       # %bb.0:
917; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
918; AVX-NEXT:    retq
919;
920; AVX10_2-LABEL: test_fminimum_zero2:
921; AVX10_2:       # %bb.0:
922; AVX10_2-NEXT:    vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
923; AVX10_2-NEXT:    retq
924;
925; X86-LABEL: test_fminimum_zero2:
926; X86:       # %bb.0:
927; X86-NEXT:    fldz
928; X86-NEXT:    fchs
929; X86-NEXT:    retl
930  %1 = tail call double @llvm.minimum.f64(double -0.0, double 0.0)
931  ret double %1
932}
933
; minimum with the nsz fast-math flag: the -0.0/+0.0 ordering fixup is
; skipped, leaving only minss plus the unord-compare/blend that keeps a NaN
; from %x (single vminmaxss on AVX10.2).
934define float @test_fminimum_nsz(float %x, float %y) nounwind {
935; SSE2-LABEL: test_fminimum_nsz:
936; SSE2:       # %bb.0:
937; SSE2-NEXT:    movaps %xmm0, %xmm2
938; SSE2-NEXT:    cmpunordss %xmm0, %xmm2
939; SSE2-NEXT:    movaps %xmm2, %xmm3
940; SSE2-NEXT:    andps %xmm0, %xmm3
941; SSE2-NEXT:    minss %xmm1, %xmm0
942; SSE2-NEXT:    andnps %xmm0, %xmm2
943; SSE2-NEXT:    orps %xmm3, %xmm2
944; SSE2-NEXT:    movaps %xmm2, %xmm0
945; SSE2-NEXT:    retq
946;
947; AVX1-LABEL: test_fminimum_nsz:
948; AVX1:       # %bb.0:
949; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
950; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
951; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
952; AVX1-NEXT:    retq
953;
954; AVX512-LABEL: test_fminimum_nsz:
955; AVX512:       # %bb.0:
956; AVX512-NEXT:    vminss %xmm1, %xmm0, %xmm1
957; AVX512-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
958; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
959; AVX512-NEXT:    vmovaps %xmm1, %xmm0
960; AVX512-NEXT:    retq
961;
962; AVX10_2-LABEL: test_fminimum_nsz:
963; AVX10_2:       # %bb.0:
964; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
965; AVX10_2-NEXT:    retq
966;
967; X86-LABEL: test_fminimum_nsz:
968; X86:       # %bb.0:
969; X86-NEXT:    pushl %eax
970; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
971; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm1
972; X86-NEXT:    vminss {{[0-9]+}}(%esp), %xmm0, %xmm2
973; X86-NEXT:    vblendvps %xmm1, %xmm0, %xmm2, %xmm0
974; X86-NEXT:    vmovss %xmm0, (%esp)
975; X86-NEXT:    flds (%esp)
976; X86-NEXT:    popl %eax
977; X86-NEXT:    retl
978  %1 = tail call nsz float @llvm.minimum.f32(float %x, float %y)
979  ret float %1
980}
981
; minimum(%x, %y/%x) where the fdiv is marked nnan: only %x can be NaN, so
; the NaN check is done once on %x. Sign-bit operand ordering is done via a
; sign test + branch (SSE2/AVX1/X86), a mask-register select (AVX512F), or
; vfpclassss (AVX512DQ); AVX10.2 collapses to a single vminmaxss.
982define float @test_fminimum_combine_cmps(float %x, float %y) nounwind {
983; SSE2-LABEL: test_fminimum_combine_cmps:
984; SSE2:       # %bb.0:
985; SSE2-NEXT:    divss %xmm0, %xmm1
986; SSE2-NEXT:    movd %xmm0, %eax
987; SSE2-NEXT:    testl %eax, %eax
988; SSE2-NEXT:    movaps %xmm1, %xmm3
989; SSE2-NEXT:    js .LBB19_2
990; SSE2-NEXT:  # %bb.1:
991; SSE2-NEXT:    movaps %xmm0, %xmm3
992; SSE2-NEXT:  .LBB19_2:
993; SSE2-NEXT:    movaps %xmm3, %xmm2
994; SSE2-NEXT:    cmpunordss %xmm3, %xmm2
995; SSE2-NEXT:    movaps %xmm2, %xmm4
996; SSE2-NEXT:    andps %xmm3, %xmm4
997; SSE2-NEXT:    js .LBB19_4
998; SSE2-NEXT:  # %bb.3:
999; SSE2-NEXT:    movaps %xmm1, %xmm0
1000; SSE2-NEXT:  .LBB19_4:
1001; SSE2-NEXT:    minss %xmm0, %xmm3
1002; SSE2-NEXT:    andnps %xmm3, %xmm2
1003; SSE2-NEXT:    orps %xmm4, %xmm2
1004; SSE2-NEXT:    movaps %xmm2, %xmm0
1005; SSE2-NEXT:    retq
1006;
1007; AVX1-LABEL: test_fminimum_combine_cmps:
1008; AVX1:       # %bb.0:
1009; AVX1-NEXT:    vdivss %xmm0, %xmm1, %xmm2
1010; AVX1-NEXT:    vmovd %xmm0, %eax
1011; AVX1-NEXT:    testl %eax, %eax
1012; AVX1-NEXT:    js .LBB19_1
1013; AVX1-NEXT:  # %bb.2:
1014; AVX1-NEXT:    vmovaps %xmm2, %xmm1
1015; AVX1-NEXT:    jmp .LBB19_3
1016; AVX1-NEXT:  .LBB19_1:
1017; AVX1-NEXT:    vmovaps %xmm0, %xmm1
1018; AVX1-NEXT:    vmovaps %xmm2, %xmm0
1019; AVX1-NEXT:  .LBB19_3:
1020; AVX1-NEXT:    vminss %xmm1, %xmm0, %xmm1
1021; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
1022; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1023; AVX1-NEXT:    retq
1024;
1025; AVX512F-LABEL: test_fminimum_combine_cmps:
1026; AVX512F:       # %bb.0:
1027; AVX512F-NEXT:    vdivss %xmm0, %xmm1, %xmm1
1028; AVX512F-NEXT:    vmovd %xmm0, %eax
1029; AVX512F-NEXT:    testl %eax, %eax
1030; AVX512F-NEXT:    sets %al
1031; AVX512F-NEXT:    kmovw %eax, %k1
1032; AVX512F-NEXT:    vmovaps %xmm1, %xmm2
1033; AVX512F-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
1034; AVX512F-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
1035; AVX512F-NEXT:    vminss %xmm2, %xmm0, %xmm1
1036; AVX512F-NEXT:    vcmpunordss %xmm0, %xmm0, %k1
1037; AVX512F-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
1038; AVX512F-NEXT:    vmovaps %xmm1, %xmm0
1039; AVX512F-NEXT:    retq
1040;
1041; AVX512DQ-LABEL: test_fminimum_combine_cmps:
1042; AVX512DQ:       # %bb.0:
1043; AVX512DQ-NEXT:    vdivss %xmm0, %xmm1, %xmm1
1044; AVX512DQ-NEXT:    vfpclassss $5, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isNegativeZero(xmm0)
1045; AVX512DQ-NEXT:    kmovw %k0, %k1
1046; AVX512DQ-NEXT:    vmovaps %xmm1, %xmm2
1047; AVX512DQ-NEXT:    vmovss %xmm0, %xmm2, %xmm2 {%k1}
1048; AVX512DQ-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
1049; AVX512DQ-NEXT:    vminss %xmm2, %xmm0, %xmm0
1050; AVX512DQ-NEXT:    retq
1051;
1052; AVX10_2-LABEL: test_fminimum_combine_cmps:
1053; AVX10_2:       # %bb.0:
1054; AVX10_2-NEXT:    vdivss %xmm0, %xmm1, %xmm1
1055; AVX10_2-NEXT:    vminmaxss $0, %xmm1, %xmm0
1056; AVX10_2-NEXT:    retq
1057;
1058; X86-LABEL: test_fminimum_combine_cmps:
1059; X86:       # %bb.0:
1060; X86-NEXT:    pushl %eax
1061; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
1062; X86-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
1063; X86-NEXT:    vdivss %xmm0, %xmm1, %xmm2
1064; X86-NEXT:    vmovd %xmm0, %eax
1065; X86-NEXT:    testl %eax, %eax
1066; X86-NEXT:    js .LBB19_1
1067; X86-NEXT:  # %bb.2:
1068; X86-NEXT:    vmovaps %xmm2, %xmm1
1069; X86-NEXT:    jmp .LBB19_3
1070; X86-NEXT:  .LBB19_1:
1071; X86-NEXT:    vmovaps %xmm0, %xmm1
1072; X86-NEXT:    vmovaps %xmm2, %xmm0
1073; X86-NEXT:  .LBB19_3:
1074; X86-NEXT:    vminss %xmm1, %xmm0, %xmm1
1075; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
1076; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1077; X86-NEXT:    vmovss %xmm0, (%esp)
1078; X86-NEXT:    flds (%esp)
1079; X86-NEXT:    popl %eax
1080; X86-NEXT:    retl
1081  %1 = fdiv nnan float %y, %x
1082  %2 = tail call float @llvm.minimum.f32(float %x, float %1)
1083  ret float %2
1084}
1085
; Fully general v2f64 minimum: operands are swapped per-lane by sign bit
; (pcmpgtd mask on SSE2, sign-driven vblendvpd on AVX), then minpd plus an
; unord-compare/blend handles NaNs; AVX10.2 needs only vminmaxpd.
1086define <2 x double> @test_fminimum_vector(<2 x double> %x, <2 x double> %y) {
1087; SSE2-LABEL: test_fminimum_vector:
1088; SSE2:       # %bb.0:
1089; SSE2-NEXT:    movaps %xmm0, %xmm2
1090; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
1091; SSE2-NEXT:    pxor %xmm3, %xmm3
1092; SSE2-NEXT:    pcmpgtd %xmm2, %xmm3
1093; SSE2-NEXT:    movdqa %xmm3, %xmm2
1094; SSE2-NEXT:    pandn %xmm1, %xmm2
1095; SSE2-NEXT:    movdqa %xmm3, %xmm4
1096; SSE2-NEXT:    pandn %xmm0, %xmm4
1097; SSE2-NEXT:    pand %xmm3, %xmm0
1098; SSE2-NEXT:    por %xmm2, %xmm0
1099; SSE2-NEXT:    pand %xmm1, %xmm3
1100; SSE2-NEXT:    por %xmm4, %xmm3
1101; SSE2-NEXT:    movdqa %xmm3, %xmm1
1102; SSE2-NEXT:    minpd %xmm0, %xmm1
1103; SSE2-NEXT:    movdqa %xmm3, %xmm0
1104; SSE2-NEXT:    cmpunordpd %xmm3, %xmm0
1105; SSE2-NEXT:    andpd %xmm0, %xmm3
1106; SSE2-NEXT:    andnpd %xmm1, %xmm0
1107; SSE2-NEXT:    orpd %xmm3, %xmm0
1108; SSE2-NEXT:    retq
1109;
1110; AVX-LABEL: test_fminimum_vector:
1111; AVX:       # %bb.0:
1112; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
1113; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
1114; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
1115; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm2
1116; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1117; AVX-NEXT:    retq
1118;
1119; AVX10_2-LABEL: test_fminimum_vector:
1120; AVX10_2:       # %bb.0:
1121; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
1122; AVX10_2-NEXT:    retq
1123;
1124; X86-LABEL: test_fminimum_vector:
1125; X86:       # %bb.0:
1126; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
1127; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
1128; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
1129; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm2
1130; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1131; X86-NEXT:    retl
1132  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y)
1133  ret <2 x double> %r
1134}
1135
; With the "no-nans-fp-math" and "no-signed-zeros-fp-math" function
; attributes, v4f32 maximum lowers to a bare maxps/vmaxps — no NaN blend
; and no zero-sign fixup.
1136define <4 x float> @test_fmaximum_vector(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
1137; SSE2-LABEL: test_fmaximum_vector:
1138; SSE2:       # %bb.0:
1139; SSE2-NEXT:    maxps %xmm1, %xmm0
1140; SSE2-NEXT:    retq
1141;
1142; AVX-LABEL: test_fmaximum_vector:
1143; AVX:       # %bb.0:
1144; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
1145; AVX-NEXT:    retq
1146;
1147; AVX10_2-LABEL: test_fmaximum_vector:
1148; AVX10_2:       # %bb.0:
1149; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
1150; AVX10_2-NEXT:    retq
1151;
1152; X86-LABEL: test_fmaximum_vector:
1153; X86:       # %bb.0:
1154; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm0
1155; X86-NEXT:    retl
1156  %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
1157  ret <4 x float> %r
1158}
1159
; minimum(%x, <+0.0, +0.0>): the all-zero constant operand reduces the
; lowering to a zeroed register and a single minpd (operands ordered so the
; constant is the min destination) — no unord blend is emitted.
1160define <2 x double> @test_fminimum_vector_zero(<2 x double> %x) {
1161; SSE2-LABEL: test_fminimum_vector_zero:
1162; SSE2:       # %bb.0:
1163; SSE2-NEXT:    xorpd %xmm1, %xmm1
1164; SSE2-NEXT:    minpd %xmm0, %xmm1
1165; SSE2-NEXT:    movapd %xmm1, %xmm0
1166; SSE2-NEXT:    retq
1167;
1168; AVX-LABEL: test_fminimum_vector_zero:
1169; AVX:       # %bb.0:
1170; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1171; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1172; AVX-NEXT:    retq
1173;
1174; AVX10_2-LABEL: test_fminimum_vector_zero:
1175; AVX10_2:       # %bb.0:
1176; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1177; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
1178; AVX10_2-NEXT:    retq
1179;
1180; X86-LABEL: test_fminimum_vector_zero:
1181; X86:       # %bb.0:
1182; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1183; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1184; X86-NEXT:    retl
1185  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>)
1186  ret <2 x double> %r
1187}
1188
; maximum(%x, <-0.0 x4>): a -0.0 splat operand lets codegen emit just a
; constant load/broadcast and one maxps — no NaN blend or sign fixup;
; AVX10.2 folds the broadcast into vminmaxps with a {1to4} memory operand.
1189define <4 x float> @test_fmaximum_vector_signed_zero(<4 x float> %x) {
1190; SSE2-LABEL: test_fmaximum_vector_signed_zero:
1191; SSE2:       # %bb.0:
1192; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1193; SSE2-NEXT:    maxps %xmm0, %xmm1
1194; SSE2-NEXT:    movaps %xmm1, %xmm0
1195; SSE2-NEXT:    retq
1196;
1197; AVX-LABEL: test_fmaximum_vector_signed_zero:
1198; AVX:       # %bb.0:
1199; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1200; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1201; AVX-NEXT:    retq
1202;
1203; AVX10_2-LABEL: test_fmaximum_vector_signed_zero:
1204; AVX10_2:       # %bb.0:
1205; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1206; AVX10_2-NEXT:    retq
1207;
1208; X86-LABEL: test_fmaximum_vector_signed_zero:
1209; X86:       # %bb.0:
1210; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1211; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1212; X86-NEXT:    retl
1213  %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>)
1214  ret <4 x float> %r
1215}
1216
; minimum(%x, <+0.0, 5.0>): only one lane is zero; the constant is built
; with xorpd+movhpd and the whole op still reduces to a single minpd with
; the constant as the destination — no blend needed.
1217define <2 x double> @test_fminimum_vector_partially_zero(<2 x double> %x) {
1218; SSE2-LABEL: test_fminimum_vector_partially_zero:
1219; SSE2:       # %bb.0:
1220; SSE2-NEXT:    xorpd %xmm1, %xmm1
1221; SSE2-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1222; SSE2-NEXT:    minpd %xmm0, %xmm1
1223; SSE2-NEXT:    movapd %xmm1, %xmm0
1224; SSE2-NEXT:    retq
1225;
1226; AVX-LABEL: test_fminimum_vector_partially_zero:
1227; AVX:       # %bb.0:
1228; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1229; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1230; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1231; AVX-NEXT:    retq
1232;
1233; AVX10_2-LABEL: test_fminimum_vector_partially_zero:
1234; AVX10_2:       # %bb.0:
1235; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1236; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1237; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
1238; AVX10_2-NEXT:    retq
1239;
1240; X86-LABEL: test_fminimum_vector_partially_zero:
1241; X86:       # %bb.0:
1242; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1243; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1244; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1245; X86-NEXT:    retl
1246  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>)
1247  ret <2 x double> %r
1248}
1249
; minimum(%x, <+0.0, -0.0>): mixed zero signs per lane defeat the
; single-minpd shortcut — the full sign-driven operand swap plus the
; unord-compare/blend sequence is still required on SSE2/AVX.
1250define <2 x double> @test_fminimum_vector_different_zeros(<2 x double> %x) {
1251; SSE2-LABEL: test_fminimum_vector_different_zeros:
1252; SSE2:       # %bb.0:
1253; SSE2-NEXT:    movaps %xmm0, %xmm1
1254; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3]
1255; SSE2-NEXT:    xorps %xmm2, %xmm2
1256; SSE2-NEXT:    pxor %xmm3, %xmm3
1257; SSE2-NEXT:    pcmpgtd %xmm1, %xmm3
1258; SSE2-NEXT:    movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
1259; SSE2-NEXT:    movdqa %xmm3, %xmm1
1260; SSE2-NEXT:    pandn %xmm2, %xmm1
1261; SSE2-NEXT:    movaps %xmm0, %xmm4
1262; SSE2-NEXT:    andps %xmm3, %xmm4
1263; SSE2-NEXT:    orps %xmm1, %xmm4
1264; SSE2-NEXT:    pand %xmm0, %xmm2
1265; SSE2-NEXT:    pandn %xmm0, %xmm3
1266; SSE2-NEXT:    por %xmm2, %xmm3
1267; SSE2-NEXT:    movdqa %xmm3, %xmm1
1268; SSE2-NEXT:    minpd %xmm4, %xmm1
1269; SSE2-NEXT:    movdqa %xmm3, %xmm0
1270; SSE2-NEXT:    cmpunordpd %xmm3, %xmm0
1271; SSE2-NEXT:    andpd %xmm0, %xmm3
1272; SSE2-NEXT:    andnpd %xmm1, %xmm0
1273; SSE2-NEXT:    orpd %xmm3, %xmm0
1274; SSE2-NEXT:    retq
1275;
1276; AVX-LABEL: test_fminimum_vector_different_zeros:
1277; AVX:       # %bb.0:
1278; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1279; AVX-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1280; AVX-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
1281; AVX-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
1282; AVX-NEXT:    vminpd %xmm2, %xmm0, %xmm1
1283; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm2
1284; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1285; AVX-NEXT:    retq
1286;
1287; AVX10_2-LABEL: test_fminimum_vector_different_zeros:
1288; AVX10_2:       # %bb.0:
1289; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1290; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1291; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
1292; AVX10_2-NEXT:    retq
1293;
1294; X86-LABEL: test_fminimum_vector_different_zeros:
1295; X86:       # %bb.0:
1296; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1297; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1298; X86-NEXT:    vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
1299; X86-NEXT:    vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
1300; X86-NEXT:    vminpd %xmm2, %xmm0, %xmm1
1301; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm2
1302; X86-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
1303; X86-NEXT:    retl
1304  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>)
1305  ret <2 x double> %r
1306}
1307
; maximum(%x, <5,4,3,2>): a non-NaN, non-zero constant operand needs no
; sign fixup and no blend — just a constant load and one maxps.
1308define <4 x float> @test_fmaximum_vector_non_zero(<4 x float> %x) {
1309; SSE2-LABEL: test_fmaximum_vector_non_zero:
1310; SSE2:       # %bb.0:
1311; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
1312; SSE2-NEXT:    maxps %xmm0, %xmm1
1313; SSE2-NEXT:    movaps %xmm1, %xmm0
1314; SSE2-NEXT:    retq
1315;
1316; AVX-LABEL: test_fmaximum_vector_non_zero:
1317; AVX:       # %bb.0:
1318; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
1319; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1320; AVX-NEXT:    retq
1321;
1322; AVX10_2-LABEL: test_fmaximum_vector_non_zero:
1323; AVX10_2:       # %bb.0:
1324; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1325; AVX10_2-NEXT:    retq
1326;
1327; X86-LABEL: test_fmaximum_vector_non_zero:
1328; X86:       # %bb.0:
1329; X86-NEXT:    vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0]
1330; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1331; X86-NEXT:    retl
1332  %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> <float 5., float 4., float 3., float 2.>)
1333  ret <4 x float> %r
1334}
1335
; minimum(%x, <+0.0, NaN>): a known-NaN lane in the constant must
; propagate — x86-64 re-inserts the NaN lane after minpd via unpcklpd,
; while the 32-bit AVX path keeps an explicit unord blend on the constant.
1336define <2 x double> @test_fminimum_vector_nan(<2 x double> %x) {
1337; SSE2-LABEL: test_fminimum_vector_nan:
1338; SSE2:       # %bb.0:
1339; SSE2-NEXT:    movsd {{.*#+}} xmm2 = [NaN,0.0E+0]
1340; SSE2-NEXT:    xorpd %xmm1, %xmm1
1341; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1342; SSE2-NEXT:    minpd %xmm0, %xmm1
1343; SSE2-NEXT:    unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1344; SSE2-NEXT:    movapd %xmm1, %xmm0
1345; SSE2-NEXT:    retq
1346;
1347; AVX-LABEL: test_fminimum_vector_nan:
1348; AVX:       # %bb.0:
1349; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = [NaN,0.0E+0]
1350; AVX-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
1351; AVX-NEXT:    vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0]
1352; AVX-NEXT:    vminpd %xmm0, %xmm2, %xmm0
1353; AVX-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1354; AVX-NEXT:    retq
1355;
1356; AVX10_2-LABEL: test_fminimum_vector_nan:
1357; AVX10_2:       # %bb.0:
1358; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1359; AVX10_2-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1360; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
1361; AVX10_2-NEXT:    retq
1362;
1363; X86-LABEL: test_fminimum_vector_nan:
1364; X86:       # %bb.0:
1365; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1366; X86-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
1367; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1368; X86-NEXT:    vcmpunordpd %xmm1, %xmm1, %xmm2
1369; X86-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
1370; X86-NEXT:    retl
1371  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>)
1372  ret <2 x double> %r
1373}
1374
; Same as test_fminimum_vector_zero but with the +0.0 splat as the FIRST
; operand: commutation should produce identical code — one zeroed register
; and a single minpd.
1375define <2 x double> @test_fminimum_vector_zero_first(<2 x double> %x) {
1376; SSE2-LABEL: test_fminimum_vector_zero_first:
1377; SSE2:       # %bb.0:
1378; SSE2-NEXT:    xorpd %xmm1, %xmm1
1379; SSE2-NEXT:    minpd %xmm0, %xmm1
1380; SSE2-NEXT:    movapd %xmm1, %xmm0
1381; SSE2-NEXT:    retq
1382;
1383; AVX-LABEL: test_fminimum_vector_zero_first:
1384; AVX:       # %bb.0:
1385; AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1386; AVX-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1387; AVX-NEXT:    retq
1388;
1389; AVX10_2-LABEL: test_fminimum_vector_zero_first:
1390; AVX10_2:       # %bb.0:
1391; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1392; AVX10_2-NEXT:    vminmaxpd $0, %xmm1, %xmm0, %xmm0
1393; AVX10_2-NEXT:    retq
1394;
1395; X86-LABEL: test_fminimum_vector_zero_first:
1396; X86:       # %bb.0:
1397; X86-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1398; X86-NEXT:    vminpd %xmm0, %xmm1, %xmm0
1399; X86-NEXT:    retl
1400  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x)
1401  ret <2 x double> %r
1402}
1403
; minimum(%x, <-0.0, -0.0>): -0.0 is already the smallest zero, so the
; sign fixup is skipped; only constant-pool minpd plus the NaN
; unord-compare/blend remains ({1to2}-broadcast vminmaxpd on AVX10.2).
1404define <2 x double> @test_fminimum_vector_signed_zero(<2 x double> %x) {
1405; SSE2-LABEL: test_fminimum_vector_signed_zero:
1406; SSE2:       # %bb.0:
1407; SSE2-NEXT:    movapd %xmm0, %xmm1
1408; SSE2-NEXT:    minpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1409; SSE2-NEXT:    movapd %xmm0, %xmm2
1410; SSE2-NEXT:    cmpunordpd %xmm0, %xmm2
1411; SSE2-NEXT:    andpd %xmm2, %xmm0
1412; SSE2-NEXT:    andnpd %xmm1, %xmm2
1413; SSE2-NEXT:    orpd %xmm2, %xmm0
1414; SSE2-NEXT:    retq
1415;
1416; AVX-LABEL: test_fminimum_vector_signed_zero:
1417; AVX:       # %bb.0:
1418; AVX-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm1
1419; AVX-NEXT:    vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
1420; AVX-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
1421; AVX-NEXT:    retq
1422;
1423; AVX10_2-LABEL: test_fminimum_vector_signed_zero:
1424; AVX10_2:       # %bb.0:
1425; AVX10_2-NEXT:    vminmaxpd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
1426; AVX10_2-NEXT:    retq
1427;
1428; X86-LABEL: test_fminimum_vector_signed_zero:
1429; X86:       # %bb.0:
1430; X86-NEXT:    vcmpunordpd %xmm0, %xmm0, %xmm1
1431; X86-NEXT:    vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
1432; X86-NEXT:    vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
1433; X86-NEXT:    retl
1434  %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double -0., double -0.>)
1435  ret <2 x double> %r
1436}
1437
; Same as test_fmaximum_vector_signed_zero but with the -0.0 splat as the
; FIRST operand: commutation should yield the identical broadcast + maxps
; sequence.
1438define <4 x float> @test_fmaximum_vector_signed_zero_first(<4 x float> %x) {
1439; SSE2-LABEL: test_fmaximum_vector_signed_zero_first:
1440; SSE2:       # %bb.0:
1441; SSE2-NEXT:    movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1442; SSE2-NEXT:    maxps %xmm0, %xmm1
1443; SSE2-NEXT:    movaps %xmm1, %xmm0
1444; SSE2-NEXT:    retq
1445;
1446; AVX-LABEL: test_fmaximum_vector_signed_zero_first:
1447; AVX:       # %bb.0:
1448; AVX-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1449; AVX-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1450; AVX-NEXT:    retq
1451;
1452; AVX10_2-LABEL: test_fmaximum_vector_signed_zero_first:
1453; AVX10_2:       # %bb.0:
1454; AVX10_2-NEXT:    vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1455; AVX10_2-NEXT:    retq
1456;
1457; X86-LABEL: test_fmaximum_vector_signed_zero_first:
1458; X86:       # %bb.0:
1459; X86-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
1460; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm0
1461; X86-NEXT:    retl
1462  %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x)
1463  ret <4 x float> %r
1464}
1465
; maximum(%x, <+0.0 x4>): +0.0 is the largest zero for maximum, so the
; sign fixup is skipped but a NaN in %x must still win — maxps against a
; zeroed register followed by an unord-compare/blend on %x.
1466define <4 x float> @test_fmaximum_vector_zero(<4 x float> %x) {
1467; SSE2-LABEL: test_fmaximum_vector_zero:
1468; SSE2:       # %bb.0:
1469; SSE2-NEXT:    xorps %xmm1, %xmm1
1470; SSE2-NEXT:    movaps %xmm0, %xmm2
1471; SSE2-NEXT:    maxps %xmm1, %xmm2
1472; SSE2-NEXT:    movaps %xmm0, %xmm1
1473; SSE2-NEXT:    cmpunordps %xmm0, %xmm1
1474; SSE2-NEXT:    andps %xmm1, %xmm0
1475; SSE2-NEXT:    andnps %xmm2, %xmm1
1476; SSE2-NEXT:    orps %xmm1, %xmm0
1477; SSE2-NEXT:    retq
1478;
1479; AVX-LABEL: test_fmaximum_vector_zero:
1480; AVX:       # %bb.0:
1481; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1482; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm1
1483; AVX-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
1484; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1485; AVX-NEXT:    retq
1486;
1487; AVX10_2-LABEL: test_fmaximum_vector_zero:
1488; AVX10_2:       # %bb.0:
1489; AVX10_2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1490; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
1491; AVX10_2-NEXT:    retq
1492;
1493; X86-LABEL: test_fmaximum_vector_zero:
1494; X86:       # %bb.0:
1495; X86-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1496; X86-NEXT:    vmaxps %xmm1, %xmm0, %xmm1
1497; X86-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
1498; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1499; X86-NEXT:    retl
1500  %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> <float 0., float 0., float 0., float 0.>)
1501  ret <4 x float> %r
1502}
1503
1504; PR77805: Check that signed zeroes are handled correctly in this case (FIXME)
; Splat of a runtime scalar %y (value unknown at compile time), so the
; generic lowering applies: broadcast/shuffle of %y, sign-driven operand
; swap, maxps, then the unord-compare/blend; AVX10.2 is just
; broadcast + vminmaxps.
1505define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) {
1506; SSE2-LABEL: test_fmaximum_v4f32_splat:
1507; SSE2:       # %bb.0:
1508; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
1509; SSE2-NEXT:    pxor %xmm2, %xmm2
1510; SSE2-NEXT:    pcmpgtd %xmm0, %xmm2
1511; SSE2-NEXT:    movdqa %xmm2, %xmm3
1512; SSE2-NEXT:    pandn %xmm0, %xmm3
1513; SSE2-NEXT:    movaps %xmm1, %xmm4
1514; SSE2-NEXT:    andps %xmm2, %xmm4
1515; SSE2-NEXT:    orps %xmm3, %xmm4
1516; SSE2-NEXT:    pand %xmm2, %xmm0
1517; SSE2-NEXT:    andnps %xmm1, %xmm2
1518; SSE2-NEXT:    por %xmm2, %xmm0
1519; SSE2-NEXT:    movdqa %xmm0, %xmm1
1520; SSE2-NEXT:    maxps %xmm4, %xmm1
1521; SSE2-NEXT:    movdqa %xmm0, %xmm2
1522; SSE2-NEXT:    cmpunordps %xmm0, %xmm2
1523; SSE2-NEXT:    andps %xmm2, %xmm0
1524; SSE2-NEXT:    andnps %xmm1, %xmm2
1525; SSE2-NEXT:    orps %xmm2, %xmm0
1526; SSE2-NEXT:    retq
1527;
1528; AVX1-LABEL: test_fmaximum_v4f32_splat:
1529; AVX1:       # %bb.0:
1530; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
1531; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
1532; AVX1-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
1533; AVX1-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
1534; AVX1-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
1535; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1536; AVX1-NEXT:    retq
1537;
1538; AVX512-LABEL: test_fmaximum_v4f32_splat:
1539; AVX512:       # %bb.0:
1540; AVX512-NEXT:    vbroadcastss %xmm1, %xmm1
1541; AVX512-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
1542; AVX512-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
1543; AVX512-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
1544; AVX512-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
1545; AVX512-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1546; AVX512-NEXT:    retq
1547;
1548; AVX10_2-LABEL: test_fmaximum_v4f32_splat:
1549; AVX10_2:       # %bb.0:
1550; AVX10_2-NEXT:    vbroadcastss %xmm1, %xmm1
1551; AVX10_2-NEXT:    vminmaxps $1, %xmm1, %xmm0, %xmm0
1552; AVX10_2-NEXT:    retq
1553;
1554; X86-LABEL: test_fmaximum_v4f32_splat:
1555; X86:       # %bb.0:
1556; X86-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm1
1557; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm0, %xmm2
1558; X86-NEXT:    vblendvps %xmm0, %xmm0, %xmm1, %xmm0
1559; X86-NEXT:    vmaxps %xmm2, %xmm0, %xmm1
1560; X86-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm2
1561; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
1562; X86-NEXT:    retl
1563  %splatinsert = insertelement <4 x float> poison, float %y, i64 0
1564  %vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer
1565  %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %vec) readnone
1566  ret <4 x float> %r
1567}
1568
1569define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
1570; SSE2-LABEL: test_fmaximum_v4f16:
1571; SSE2:       # %bb.0:
1572; SSE2-NEXT:    subq $104, %rsp
1573; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1574; SSE2-NEXT:    psrld $16, %xmm0
1575; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1576; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
1577; SSE2-NEXT:    movdqa %xmm1, %xmm0
1578; SSE2-NEXT:    psrld $16, %xmm0
1579; SSE2-NEXT:    callq __extendhfsf2@PLT
1580; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
1581; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1582; SSE2-NEXT:    callq __extendhfsf2@PLT
1583; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
1584; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
1585; SSE2-NEXT:    movdqa %xmm0, %xmm1
1586; SSE2-NEXT:    movd %xmm0, %eax
1587; SSE2-NEXT:    testl %eax, %eax
1588; SSE2-NEXT:    movdqa %xmm0, %xmm2
1589; SSE2-NEXT:    js .LBB33_2
1590; SSE2-NEXT:  # %bb.1:
1591; SSE2-NEXT:    movdqa %xmm4, %xmm2
1592; SSE2-NEXT:  .LBB33_2:
1593; SSE2-NEXT:    movdqa %xmm2, %xmm0
1594; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
1595; SSE2-NEXT:    movaps %xmm0, %xmm3
1596; SSE2-NEXT:    andps %xmm2, %xmm3
1597; SSE2-NEXT:    js .LBB33_4
1598; SSE2-NEXT:  # %bb.3:
1599; SSE2-NEXT:    movdqa %xmm1, %xmm4
1600; SSE2-NEXT:  .LBB33_4:
1601; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1602; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
1603; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1604; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
1605; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
1606; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1607; SSE2-NEXT:    maxss %xmm4, %xmm2
1608; SSE2-NEXT:    andnps %xmm2, %xmm0
1609; SSE2-NEXT:    orps %xmm3, %xmm0
1610; SSE2-NEXT:    callq __truncsfhf2@PLT
1611; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1612; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1613; SSE2-NEXT:    callq __extendhfsf2@PLT
1614; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1615; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1616; SSE2-NEXT:    callq __extendhfsf2@PLT
1617; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
1618; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
1619; SSE2-NEXT:    movdqa %xmm0, %xmm1
1620; SSE2-NEXT:    movd %xmm0, %eax
1621; SSE2-NEXT:    testl %eax, %eax
1622; SSE2-NEXT:    movdqa %xmm0, %xmm2
1623; SSE2-NEXT:    js .LBB33_6
1624; SSE2-NEXT:  # %bb.5:
1625; SSE2-NEXT:    movdqa %xmm4, %xmm2
1626; SSE2-NEXT:  .LBB33_6:
1627; SSE2-NEXT:    movdqa %xmm2, %xmm0
1628; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
1629; SSE2-NEXT:    movaps %xmm0, %xmm3
1630; SSE2-NEXT:    andps %xmm2, %xmm3
1631; SSE2-NEXT:    js .LBB33_8
1632; SSE2-NEXT:  # %bb.7:
1633; SSE2-NEXT:    movdqa %xmm1, %xmm4
1634; SSE2-NEXT:  .LBB33_8:
1635; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1636; SSE2-NEXT:    psrlq $48, %xmm1
1637; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1638; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
1639; SSE2-NEXT:    psrlq $48, %xmm1
1640; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
1641; SSE2-NEXT:    maxss %xmm4, %xmm2
1642; SSE2-NEXT:    andnps %xmm2, %xmm0
1643; SSE2-NEXT:    orps %xmm3, %xmm0
1644; SSE2-NEXT:    callq __truncsfhf2@PLT
1645; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1646; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1647; SSE2-NEXT:    callq __extendhfsf2@PLT
1648; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1649; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1650; SSE2-NEXT:    callq __extendhfsf2@PLT
1651; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
1652; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
1653; SSE2-NEXT:    movd %xmm0, %eax
1654; SSE2-NEXT:    testl %eax, %eax
1655; SSE2-NEXT:    movdqa %xmm0, %xmm2
1656; SSE2-NEXT:    js .LBB33_10
1657; SSE2-NEXT:  # %bb.9:
1658; SSE2-NEXT:    movdqa %xmm4, %xmm2
1659; SSE2-NEXT:  .LBB33_10:
1660; SSE2-NEXT:    movdqa %xmm2, %xmm1
1661; SSE2-NEXT:    cmpunordss %xmm2, %xmm1
1662; SSE2-NEXT:    movaps %xmm1, %xmm3
1663; SSE2-NEXT:    andps %xmm2, %xmm3
1664; SSE2-NEXT:    js .LBB33_12
1665; SSE2-NEXT:  # %bb.11:
1666; SSE2-NEXT:    movdqa %xmm0, %xmm4
1667; SSE2-NEXT:  .LBB33_12:
1668; SSE2-NEXT:    maxss %xmm4, %xmm2
1669; SSE2-NEXT:    andnps %xmm2, %xmm1
1670; SSE2-NEXT:    orps %xmm3, %xmm1
1671; SSE2-NEXT:    movaps %xmm1, %xmm0
1672; SSE2-NEXT:    callq __truncsfhf2@PLT
1673; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1674; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
1675; SSE2-NEXT:    callq __extendhfsf2@PLT
1676; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
1677; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1678; SSE2-NEXT:    callq __extendhfsf2@PLT
1679; SSE2-NEXT:    movd (%rsp), %xmm4 # 4-byte Folded Reload
1680; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
1681; SSE2-NEXT:    movdqa %xmm0, %xmm1
1682; SSE2-NEXT:    movd %xmm0, %eax
1683; SSE2-NEXT:    testl %eax, %eax
1684; SSE2-NEXT:    movdqa %xmm0, %xmm2
1685; SSE2-NEXT:    js .LBB33_14
1686; SSE2-NEXT:  # %bb.13:
1687; SSE2-NEXT:    movdqa %xmm4, %xmm2
1688; SSE2-NEXT:  .LBB33_14:
1689; SSE2-NEXT:    movdqa %xmm2, %xmm0
1690; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
1691; SSE2-NEXT:    movaps %xmm0, %xmm3
1692; SSE2-NEXT:    andps %xmm2, %xmm3
1693; SSE2-NEXT:    js .LBB33_16
1694; SSE2-NEXT:  # %bb.15:
1695; SSE2-NEXT:    movdqa %xmm1, %xmm4
1696; SSE2-NEXT:  .LBB33_16:
1697; SSE2-NEXT:    maxss %xmm4, %xmm2
1698; SSE2-NEXT:    andnps %xmm2, %xmm0
1699; SSE2-NEXT:    orps %xmm3, %xmm0
1700; SSE2-NEXT:    callq __truncsfhf2@PLT
1701; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1702; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1703; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1704; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1705; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1706; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1707; SSE2-NEXT:    addq $104, %rsp
1708; SSE2-NEXT:    retq
1709;
1710; AVX1-LABEL: test_fmaximum_v4f16:
1711; AVX1:       # %bb.0:
1712; AVX1-NEXT:    subq $120, %rsp
1713; AVX1-NEXT:    vmovaps %xmm0, %xmm2
1714; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
1715; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1716; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
1717; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1718; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1719; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm0
1720; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1721; AVX1-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
1722; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
1723; AVX1-NEXT:    callq __extendhfsf2@PLT
1724; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1725; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1726; AVX1-NEXT:    callq __extendhfsf2@PLT
1727; AVX1-NEXT:    vmovd %xmm0, %eax
1728; AVX1-NEXT:    testl %eax, %eax
1729; AVX1-NEXT:    js .LBB33_1
1730; AVX1-NEXT:  # %bb.2:
1731; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
1732; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1733; AVX1-NEXT:    jmp .LBB33_3
1734; AVX1-NEXT:  .LBB33_1:
1735; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1736; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
1737; AVX1-NEXT:  .LBB33_3:
1738; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1739; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
1740; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1741; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
1742; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
1743; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1744; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
1745; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm1
1746; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1747; AVX1-NEXT:    callq __truncsfhf2@PLT
1748; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1749; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
1750; AVX1-NEXT:    callq __extendhfsf2@PLT
1751; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1752; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1753; AVX1-NEXT:    callq __extendhfsf2@PLT
1754; AVX1-NEXT:    vmovd %xmm0, %eax
1755; AVX1-NEXT:    testl %eax, %eax
1756; AVX1-NEXT:    js .LBB33_4
1757; AVX1-NEXT:  # %bb.5:
1758; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
1759; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
1760; AVX1-NEXT:    jmp .LBB33_6
1761; AVX1-NEXT:  .LBB33_4:
1762; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1763; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
1764; AVX1-NEXT:  .LBB33_6:
1765; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
1766; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm1
1767; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1768; AVX1-NEXT:    callq __truncsfhf2@PLT
1769; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1770; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1771; AVX1-NEXT:    callq __extendhfsf2@PLT
1772; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1773; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1774; AVX1-NEXT:    callq __extendhfsf2@PLT
1775; AVX1-NEXT:    vmovd %xmm0, %eax
1776; AVX1-NEXT:    testl %eax, %eax
1777; AVX1-NEXT:    js .LBB33_7
1778; AVX1-NEXT:  # %bb.8:
1779; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
1780; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
1781; AVX1-NEXT:    jmp .LBB33_9
1782; AVX1-NEXT:  .LBB33_7:
1783; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1784; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
1785; AVX1-NEXT:  .LBB33_9:
1786; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
1787; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm1
1788; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1789; AVX1-NEXT:    callq __truncsfhf2@PLT
1790; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
1791; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1792; AVX1-NEXT:    callq __extendhfsf2@PLT
1793; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1794; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
1795; AVX1-NEXT:    callq __extendhfsf2@PLT
1796; AVX1-NEXT:    vmovd %xmm0, %eax
1797; AVX1-NEXT:    testl %eax, %eax
1798; AVX1-NEXT:    js .LBB33_10
1799; AVX1-NEXT:  # %bb.11:
1800; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
1801; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
1802; AVX1-NEXT:    jmp .LBB33_12
1803; AVX1-NEXT:  .LBB33_10:
1804; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1805; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
1806; AVX1-NEXT:  .LBB33_12:
1807; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
1808; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm1
1809; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
1810; AVX1-NEXT:    callq __truncsfhf2@PLT
1811; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
1812; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1813; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1814; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
1815; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
1816; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
1817; AVX1-NEXT:    addq $120, %rsp
1818; AVX1-NEXT:    retq
1819;
1820; AVX512-LABEL: test_fmaximum_v4f16:
1821; AVX512:       # %bb.0:
1822; AVX512-NEXT:    pushq %rbp
1823; AVX512-NEXT:    pushq %r15
1824; AVX512-NEXT:    pushq %r14
1825; AVX512-NEXT:    pushq %r13
1826; AVX512-NEXT:    pushq %r12
1827; AVX512-NEXT:    pushq %rbx
1828; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
1829; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1830; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
1831; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1832; AVX512-NEXT:    xorl %eax, %eax
1833; AVX512-NEXT:    vucomiss %xmm2, %xmm3
1834; AVX512-NEXT:    movl $65535, %ecx # imm = 0xFFFF
1835; AVX512-NEXT:    movl $0, %edx
1836; AVX512-NEXT:    cmovpl %ecx, %edx
1837; AVX512-NEXT:    movl $0, %edi
1838; AVX512-NEXT:    cmoval %ecx, %edi
1839; AVX512-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1840; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1841; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1842; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1843; AVX512-NEXT:    vucomiss %xmm2, %xmm3
1844; AVX512-NEXT:    movl $0, %esi
1845; AVX512-NEXT:    cmovpl %ecx, %esi
1846; AVX512-NEXT:    movl $0, %r9d
1847; AVX512-NEXT:    cmoval %ecx, %r9d
1848; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
1849; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1850; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
1851; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1852; AVX512-NEXT:    vucomiss %xmm2, %xmm3
1853; AVX512-NEXT:    movl $0, %r8d
1854; AVX512-NEXT:    cmovpl %ecx, %r8d
1855; AVX512-NEXT:    movl $0, %r11d
1856; AVX512-NEXT:    cmoval %ecx, %r11d
1857; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
1858; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1859; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7]
1860; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1861; AVX512-NEXT:    vucomiss %xmm2, %xmm3
1862; AVX512-NEXT:    movl $0, %r10d
1863; AVX512-NEXT:    cmovpl %ecx, %r10d
1864; AVX512-NEXT:    movl $0, %ebp
1865; AVX512-NEXT:    cmoval %ecx, %ebp
1866; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
1867; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1868; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
1869; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1870; AVX512-NEXT:    vucomiss %xmm2, %xmm3
1871; AVX512-NEXT:    movl $0, %ebx
1872; AVX512-NEXT:    cmovpl %ecx, %ebx
1873; AVX512-NEXT:    movl $0, %r14d
1874; AVX512-NEXT:    cmoval %ecx, %r14d
1875; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
1876; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
1877; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,1,1,1,4,5,6,7]
1878; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1879; AVX512-NEXT:    vucomiss %xmm2, %xmm3
1880; AVX512-NEXT:    movl $0, %r15d
1881; AVX512-NEXT:    cmovpl %ecx, %r15d
1882; AVX512-NEXT:    movl $0, %r12d
1883; AVX512-NEXT:    cmoval %ecx, %r12d
1884; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm2
1885; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm3
1886; AVX512-NEXT:    vucomiss %xmm2, %xmm3
1887; AVX512-NEXT:    movl $0, %r13d
1888; AVX512-NEXT:    cmoval %ecx, %r13d
1889; AVX512-NEXT:    vmovd %r13d, %xmm2
1890; AVX512-NEXT:    vpinsrw $1, %r12d, %xmm2, %xmm2
1891; AVX512-NEXT:    vpinsrw $2, %r14d, %xmm2, %xmm2
1892; AVX512-NEXT:    vpinsrw $3, %ebp, %xmm2, %xmm2
1893; AVX512-NEXT:    vpinsrw $4, %r11d, %xmm2, %xmm2
1894; AVX512-NEXT:    vpinsrw $5, %r9d, %xmm2, %xmm2
1895; AVX512-NEXT:    vpinsrw $6, %edi, %xmm2, %xmm2
1896; AVX512-NEXT:    movl $0, %edi
1897; AVX512-NEXT:    cmovpl %ecx, %edi
1898; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1899; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1900; AVX512-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1901; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
1902; AVX512-NEXT:    vucomiss %xmm3, %xmm4
1903; AVX512-NEXT:    movl $0, %r9d
1904; AVX512-NEXT:    cmoval %ecx, %r9d
1905; AVX512-NEXT:    vpinsrw $7, %r9d, %xmm2, %xmm2
1906; AVX512-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm2
1907; AVX512-NEXT:    vmovd %edi, %xmm3
1908; AVX512-NEXT:    vpinsrw $1, %r15d, %xmm3, %xmm3
1909; AVX512-NEXT:    vpinsrw $2, %ebx, %xmm3, %xmm3
1910; AVX512-NEXT:    vpinsrw $3, %r10d, %xmm3, %xmm3
1911; AVX512-NEXT:    vpinsrw $4, %r8d, %xmm3, %xmm3
1912; AVX512-NEXT:    vpinsrw $5, %esi, %xmm3, %xmm3
1913; AVX512-NEXT:    vpinsrw $6, %edx, %xmm3, %xmm3
1914; AVX512-NEXT:    movl $0, %edx
1915; AVX512-NEXT:    cmovpl %ecx, %edx
1916; AVX512-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm3
1917; AVX512-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
1918; AVX512-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
1919; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[1,1,1,1,4,5,6,7]
1920; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
1921; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1922; AVX512-NEXT:    vucomiss %xmm4, %xmm3
1923; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
1924; AVX512-NEXT:    cmovnel %eax, %edx
1925; AVX512-NEXT:    cmovpl %eax, %edx
1926; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm3
1927; AVX512-NEXT:    vucomiss %xmm4, %xmm3
1928; AVX512-NEXT:    movl $65535, %esi # imm = 0xFFFF
1929; AVX512-NEXT:    cmovnel %eax, %esi
1930; AVX512-NEXT:    cmovpl %eax, %esi
1931; AVX512-NEXT:    vmovd %esi, %xmm3
1932; AVX512-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
1933; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
1934; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1935; AVX512-NEXT:    vucomiss %xmm4, %xmm5
1936; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
1937; AVX512-NEXT:    cmovnel %eax, %edx
1938; AVX512-NEXT:    cmovpl %eax, %edx
1939; AVX512-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
1940; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
1941; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1942; AVX512-NEXT:    vucomiss %xmm4, %xmm5
1943; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
1944; AVX512-NEXT:    cmovnel %eax, %edx
1945; AVX512-NEXT:    cmovpl %eax, %edx
1946; AVX512-NEXT:    vpinsrw $3, %edx, %xmm3, %xmm3
1947; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
1948; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1949; AVX512-NEXT:    vucomiss %xmm4, %xmm5
1950; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
1951; AVX512-NEXT:    cmovnel %eax, %edx
1952; AVX512-NEXT:    cmovpl %eax, %edx
1953; AVX512-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
1954; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1955; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1956; AVX512-NEXT:    vucomiss %xmm4, %xmm5
1957; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
1958; AVX512-NEXT:    cmovnel %eax, %edx
1959; AVX512-NEXT:    cmovpl %eax, %edx
1960; AVX512-NEXT:    vpinsrw $5, %edx, %xmm3, %xmm3
1961; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
1962; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1963; AVX512-NEXT:    vucomiss %xmm4, %xmm5
1964; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
1965; AVX512-NEXT:    cmovnel %eax, %edx
1966; AVX512-NEXT:    cmovpl %eax, %edx
1967; AVX512-NEXT:    vpinsrw $6, %edx, %xmm3, %xmm3
1968; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1969; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
1970; AVX512-NEXT:    vucomiss %xmm4, %xmm5
1971; AVX512-NEXT:    cmovnel %eax, %ecx
1972; AVX512-NEXT:    cmovpl %eax, %ecx
1973; AVX512-NEXT:    vpinsrw $7, %ecx, %xmm3, %xmm3
1974; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
1975; AVX512-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm5
1976; AVX512-NEXT:    vpblendvb %xmm5, %xmm0, %xmm2, %xmm0
1977; AVX512-NEXT:    vpcmpeqw %xmm4, %xmm1, %xmm4
1978; AVX512-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
1979; AVX512-NEXT:    vpblendvb %xmm3, %xmm0, %xmm2, %xmm0
1980; AVX512-NEXT:    popq %rbx
1981; AVX512-NEXT:    popq %r12
1982; AVX512-NEXT:    popq %r13
1983; AVX512-NEXT:    popq %r14
1984; AVX512-NEXT:    popq %r15
1985; AVX512-NEXT:    popq %rbp
1986; AVX512-NEXT:    retq
1987;
1988; AVX10_2-LABEL: test_fmaximum_v4f16:
1989; AVX10_2:       # %bb.0:
1990; AVX10_2-NEXT:    vminmaxph $1, %xmm1, %xmm0, %xmm0
1991; AVX10_2-NEXT:    retq
1992;
1993; X86-LABEL: test_fmaximum_v4f16:
1994; X86:       # %bb.0:
1995; X86-NEXT:    subl $164, %esp
1996; X86-NEXT:    vmovdqa %xmm0, %xmm2
1997; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
1998; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
1999; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2000; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
2001; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2002; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
2003; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2004; X86-NEXT:    vpsrlq $48, %xmm1, %xmm0
2005; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2006; X86-NEXT:    vpsrld $16, %xmm2, %xmm0
2007; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2008; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
2009; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2010; X86-NEXT:    vpextrw $0, %xmm1, (%esp)
2011; X86-NEXT:    calll __extendhfsf2
2012; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
2013; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2014; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2015; X86-NEXT:    calll __extendhfsf2
2016; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
2017; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2018; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2019; X86-NEXT:    calll __extendhfsf2
2020; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2021; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2022; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2023; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
2024; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2025; X86-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
2026; X86-NEXT:    vmovd %xmm2, %eax
2027; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2028; X86-NEXT:    testl %eax, %eax
2029; X86-NEXT:    js .LBB33_1
2030; X86-NEXT:  # %bb.2:
2031; X86-NEXT:    vmovdqa %xmm2, %xmm1
2032; X86-NEXT:    jmp .LBB33_3
2033; X86-NEXT:  .LBB33_1:
2034; X86-NEXT:    vmovdqa %xmm0, %xmm1
2035; X86-NEXT:    vmovdqa %xmm2, %xmm0
2036; X86-NEXT:  .LBB33_3:
2037; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2038; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2039; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2040; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2041; X86-NEXT:    calll __extendhfsf2
2042; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2043; X86-NEXT:    vmovss %xmm0, (%esp)
2044; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2045; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
2046; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2047; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2048; X86-NEXT:    vmovd %xmm1, %eax
2049; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2050; X86-NEXT:    testl %eax, %eax
2051; X86-NEXT:    js .LBB33_4
2052; X86-NEXT:  # %bb.5:
2053; X86-NEXT:    vmovdqa %xmm1, %xmm2
2054; X86-NEXT:    jmp .LBB33_6
2055; X86-NEXT:  .LBB33_4:
2056; X86-NEXT:    vmovdqa %xmm0, %xmm2
2057; X86-NEXT:    vmovdqa %xmm1, %xmm0
2058; X86-NEXT:  .LBB33_6:
2059; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
2060; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2061; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2062; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2063; X86-NEXT:    calll __truncsfhf2
2064; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2065; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2066; X86-NEXT:    vmovss %xmm0, (%esp)
2067; X86-NEXT:    calll __truncsfhf2
2068; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2069; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2070; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2071; X86-NEXT:    calll __extendhfsf2
2072; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
2073; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2074; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2075; X86-NEXT:    calll __extendhfsf2
2076; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
2077; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2078; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2079; X86-NEXT:    calll __extendhfsf2
2080; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2081; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
2082; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2083; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
2084; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2085; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2086; X86-NEXT:    vmovd %xmm1, %eax
2087; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2088; X86-NEXT:    testl %eax, %eax
2089; X86-NEXT:    js .LBB33_7
2090; X86-NEXT:  # %bb.8:
2091; X86-NEXT:    vmovdqa %xmm1, %xmm2
2092; X86-NEXT:    jmp .LBB33_9
2093; X86-NEXT:  .LBB33_7:
2094; X86-NEXT:    vmovdqa %xmm0, %xmm2
2095; X86-NEXT:    vmovdqa %xmm1, %xmm0
2096; X86-NEXT:  .LBB33_9:
2097; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
2098; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2099; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2100; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2101; X86-NEXT:    calll __extendhfsf2
2102; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2103; X86-NEXT:    vmovss %xmm0, (%esp)
2104; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2105; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
2106; X86-NEXT:    fstps {{[0-9]+}}(%esp)
2107; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
2108; X86-NEXT:    vmovd %xmm1, %eax
2109; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
2110; X86-NEXT:    testl %eax, %eax
2111; X86-NEXT:    js .LBB33_10
2112; X86-NEXT:  # %bb.11:
2113; X86-NEXT:    vmovdqa %xmm1, %xmm2
2114; X86-NEXT:    jmp .LBB33_12
2115; X86-NEXT:  .LBB33_10:
2116; X86-NEXT:    vmovdqa %xmm0, %xmm2
2117; X86-NEXT:    vmovdqa %xmm1, %xmm0
2118; X86-NEXT:  .LBB33_12:
2119; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
2120; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2121; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2122; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2123; X86-NEXT:    calll __truncsfhf2
2124; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2125; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2126; X86-NEXT:    vmovd %xmm0, (%esp)
2127; X86-NEXT:    calll __truncsfhf2
2128; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
2129; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2130; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
2131; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2132; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
2133; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
2134; X86-NEXT:    addl $164, %esp
2135; X86-NEXT:    retl
2136  %r = call <4 x half> @llvm.maximum.v4f16(<4 x half> %x, <4 x half> %y)
2137  ret <4 x half> %r
2138}
2139
2140define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
2141; SSE2-LABEL: test_fmaximum_v4bf16:
2142; SSE2:       # %bb.0:
2143; SSE2-NEXT:    pushq %rbp
2144; SSE2-NEXT:    .cfi_def_cfa_offset 16
2145; SSE2-NEXT:    pushq %r15
2146; SSE2-NEXT:    .cfi_def_cfa_offset 24
2147; SSE2-NEXT:    pushq %r14
2148; SSE2-NEXT:    .cfi_def_cfa_offset 32
2149; SSE2-NEXT:    pushq %rbx
2150; SSE2-NEXT:    .cfi_def_cfa_offset 40
2151; SSE2-NEXT:    subq $56, %rsp
2152; SSE2-NEXT:    .cfi_def_cfa_offset 96
2153; SSE2-NEXT:    .cfi_offset %rbx, -40
2154; SSE2-NEXT:    .cfi_offset %r14, -32
2155; SSE2-NEXT:    .cfi_offset %r15, -24
2156; SSE2-NEXT:    .cfi_offset %rbp, -16
2157; SSE2-NEXT:    movdqa %xmm1, %xmm4
2158; SSE2-NEXT:    movdqa %xmm0, %xmm5
2159; SSE2-NEXT:    pextrw $0, %xmm1, %r14d
2160; SSE2-NEXT:    pextrw $0, %xmm0, %r15d
2161; SSE2-NEXT:    movdqa %xmm1, %xmm0
2162; SSE2-NEXT:    psrld $16, %xmm0
2163; SSE2-NEXT:    pextrw $0, %xmm0, %eax
2164; SSE2-NEXT:    movdqa %xmm5, %xmm0
2165; SSE2-NEXT:    psrld $16, %xmm0
2166; SSE2-NEXT:    pextrw $0, %xmm0, %ecx
2167; SSE2-NEXT:    shll $16, %ecx
2168; SSE2-NEXT:    movd %ecx, %xmm3
2169; SSE2-NEXT:    shll $16, %eax
2170; SSE2-NEXT:    movd %eax, %xmm2
2171; SSE2-NEXT:    testl %ecx, %ecx
2172; SSE2-NEXT:    movdqa %xmm3, %xmm1
2173; SSE2-NEXT:    js .LBB34_2
2174; SSE2-NEXT:  # %bb.1:
2175; SSE2-NEXT:    movdqa %xmm2, %xmm1
2176; SSE2-NEXT:  .LBB34_2:
2177; SSE2-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2178; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1,1,1]
2179; SSE2-NEXT:    movdqa %xmm5, (%rsp) # 16-byte Spill
2180; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1,1,1]
2181; SSE2-NEXT:    movdqa %xmm1, %xmm0
2182; SSE2-NEXT:    cmpunordss %xmm1, %xmm0
2183; SSE2-NEXT:    movaps %xmm0, %xmm6
2184; SSE2-NEXT:    andps %xmm1, %xmm6
2185; SSE2-NEXT:    js .LBB34_4
2186; SSE2-NEXT:  # %bb.3:
2187; SSE2-NEXT:    movdqa %xmm3, %xmm2
2188; SSE2-NEXT:  .LBB34_4:
2189; SSE2-NEXT:    pextrw $0, %xmm4, %ebp
2190; SSE2-NEXT:    pextrw $0, %xmm5, %ebx
2191; SSE2-NEXT:    maxss %xmm2, %xmm1
2192; SSE2-NEXT:    andnps %xmm1, %xmm0
2193; SSE2-NEXT:    orps %xmm6, %xmm0
2194; SSE2-NEXT:    callq __truncsfbf2@PLT
2195; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2196; SSE2-NEXT:    shll $16, %r15d
2197; SSE2-NEXT:    movd %r15d, %xmm3
2198; SSE2-NEXT:    shll $16, %r14d
2199; SSE2-NEXT:    movd %r14d, %xmm2
2200; SSE2-NEXT:    testl %r15d, %r15d
2201; SSE2-NEXT:    movdqa %xmm3, %xmm1
2202; SSE2-NEXT:    js .LBB34_6
2203; SSE2-NEXT:  # %bb.5:
2204; SSE2-NEXT:    movdqa %xmm2, %xmm1
2205; SSE2-NEXT:  .LBB34_6:
2206; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
2207; SSE2-NEXT:    psrlq $48, %xmm5
2208; SSE2-NEXT:    movdqa (%rsp), %xmm6 # 16-byte Reload
2209; SSE2-NEXT:    psrlq $48, %xmm6
2210; SSE2-NEXT:    movdqa %xmm1, %xmm0
2211; SSE2-NEXT:    cmpunordss %xmm1, %xmm0
2212; SSE2-NEXT:    movaps %xmm0, %xmm4
2213; SSE2-NEXT:    andps %xmm1, %xmm4
2214; SSE2-NEXT:    js .LBB34_8
2215; SSE2-NEXT:  # %bb.7:
2216; SSE2-NEXT:    movdqa %xmm3, %xmm2
2217; SSE2-NEXT:  .LBB34_8:
2218; SSE2-NEXT:    pextrw $0, %xmm5, %r15d
2219; SSE2-NEXT:    pextrw $0, %xmm6, %r14d
2220; SSE2-NEXT:    maxss %xmm2, %xmm1
2221; SSE2-NEXT:    andnps %xmm1, %xmm0
2222; SSE2-NEXT:    orps %xmm4, %xmm0
2223; SSE2-NEXT:    callq __truncsfbf2@PLT
2224; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2225; SSE2-NEXT:    shll $16, %ebx
2226; SSE2-NEXT:    movd %ebx, %xmm1
2227; SSE2-NEXT:    shll $16, %ebp
2228; SSE2-NEXT:    movd %ebp, %xmm3
2229; SSE2-NEXT:    testl %ebx, %ebx
2230; SSE2-NEXT:    movdqa %xmm1, %xmm2
2231; SSE2-NEXT:    js .LBB34_10
2232; SSE2-NEXT:  # %bb.9:
2233; SSE2-NEXT:    movdqa %xmm3, %xmm2
2234; SSE2-NEXT:  .LBB34_10:
2235; SSE2-NEXT:    movdqa %xmm2, %xmm0
2236; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
2237; SSE2-NEXT:    movaps %xmm0, %xmm4
2238; SSE2-NEXT:    andps %xmm2, %xmm4
2239; SSE2-NEXT:    js .LBB34_12
2240; SSE2-NEXT:  # %bb.11:
2241; SSE2-NEXT:    movdqa %xmm1, %xmm3
2242; SSE2-NEXT:  .LBB34_12:
2243; SSE2-NEXT:    maxss %xmm3, %xmm2
2244; SSE2-NEXT:    andnps %xmm2, %xmm0
2245; SSE2-NEXT:    orps %xmm4, %xmm0
2246; SSE2-NEXT:    callq __truncsfbf2@PLT
2247; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
2248; SSE2-NEXT:    shll $16, %r14d
2249; SSE2-NEXT:    movd %r14d, %xmm1
2250; SSE2-NEXT:    shll $16, %r15d
2251; SSE2-NEXT:    movd %r15d, %xmm3
2252; SSE2-NEXT:    testl %r14d, %r14d
2253; SSE2-NEXT:    movdqa %xmm1, %xmm2
2254; SSE2-NEXT:    js .LBB34_14
2255; SSE2-NEXT:  # %bb.13:
2256; SSE2-NEXT:    movdqa %xmm3, %xmm2
2257; SSE2-NEXT:  .LBB34_14:
2258; SSE2-NEXT:    movdqa %xmm2, %xmm0
2259; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
2260; SSE2-NEXT:    movaps %xmm0, %xmm4
2261; SSE2-NEXT:    andps %xmm2, %xmm4
2262; SSE2-NEXT:    js .LBB34_16
2263; SSE2-NEXT:  # %bb.15:
2264; SSE2-NEXT:    movdqa %xmm1, %xmm3
2265; SSE2-NEXT:  .LBB34_16:
2266; SSE2-NEXT:    maxss %xmm3, %xmm2
2267; SSE2-NEXT:    andnps %xmm2, %xmm0
2268; SSE2-NEXT:    orps %xmm4, %xmm0
2269; SSE2-NEXT:    callq __truncsfbf2@PLT
2270; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
2271; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2272; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
2273; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
2274; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
2275; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
2276; SSE2-NEXT:    addq $56, %rsp
2277; SSE2-NEXT:    .cfi_def_cfa_offset 40
2278; SSE2-NEXT:    popq %rbx
2279; SSE2-NEXT:    .cfi_def_cfa_offset 32
2280; SSE2-NEXT:    popq %r14
2281; SSE2-NEXT:    .cfi_def_cfa_offset 24
2282; SSE2-NEXT:    popq %r15
2283; SSE2-NEXT:    .cfi_def_cfa_offset 16
2284; SSE2-NEXT:    popq %rbp
2285; SSE2-NEXT:    .cfi_def_cfa_offset 8
2286; SSE2-NEXT:    retq
2287;
2288; AVX1-LABEL: test_fmaximum_v4bf16:
2289; AVX1:       # %bb.0:
2290; AVX1-NEXT:    pushq %rbp
2291; AVX1-NEXT:    .cfi_def_cfa_offset 16
2292; AVX1-NEXT:    pushq %r15
2293; AVX1-NEXT:    .cfi_def_cfa_offset 24
2294; AVX1-NEXT:    pushq %r14
2295; AVX1-NEXT:    .cfi_def_cfa_offset 32
2296; AVX1-NEXT:    pushq %r13
2297; AVX1-NEXT:    .cfi_def_cfa_offset 40
2298; AVX1-NEXT:    pushq %r12
2299; AVX1-NEXT:    .cfi_def_cfa_offset 48
2300; AVX1-NEXT:    pushq %rbx
2301; AVX1-NEXT:    .cfi_def_cfa_offset 56
2302; AVX1-NEXT:    subq $56, %rsp
2303; AVX1-NEXT:    .cfi_def_cfa_offset 112
2304; AVX1-NEXT:    .cfi_offset %rbx, -56
2305; AVX1-NEXT:    .cfi_offset %r12, -48
2306; AVX1-NEXT:    .cfi_offset %r13, -40
2307; AVX1-NEXT:    .cfi_offset %r14, -32
2308; AVX1-NEXT:    .cfi_offset %r15, -24
2309; AVX1-NEXT:    .cfi_offset %rbp, -16
2310; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm2
2311; AVX1-NEXT:    vpsrlq $48, %xmm1, %xmm3
2312; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2313; AVX1-NEXT:    vpextrw $0, %xmm4, %ebx
2314; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
2315; AVX1-NEXT:    vpextrw $0, %xmm4, %ebp
2316; AVX1-NEXT:    vpextrw $0, %xmm0, %r12d
2317; AVX1-NEXT:    vpextrw $0, %xmm1, %r13d
2318; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
2319; AVX1-NEXT:    vpextrw $0, %xmm0, %eax
2320; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
2321; AVX1-NEXT:    vpextrw $0, %xmm0, %ecx
2322; AVX1-NEXT:    shll $16, %ecx
2323; AVX1-NEXT:    vmovd %ecx, %xmm0
2324; AVX1-NEXT:    shll $16, %eax
2325; AVX1-NEXT:    vmovd %eax, %xmm4
2326; AVX1-NEXT:    js .LBB34_1
2327; AVX1-NEXT:  # %bb.2:
2328; AVX1-NEXT:    vmovdqa %xmm4, %xmm1
2329; AVX1-NEXT:    jmp .LBB34_3
2330; AVX1-NEXT:  .LBB34_1:
2331; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
2332; AVX1-NEXT:    vmovdqa %xmm4, %xmm0
2333; AVX1-NEXT:  .LBB34_3:
2334; AVX1-NEXT:    vpextrw $0, %xmm2, %r14d
2335; AVX1-NEXT:    vpextrw $0, %xmm3, %r15d
2336; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2337; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2338; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2339; AVX1-NEXT:    callq __truncsfbf2@PLT
2340; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2341; AVX1-NEXT:    shll $16, %r13d
2342; AVX1-NEXT:    vmovd %r13d, %xmm0
2343; AVX1-NEXT:    shll $16, %r12d
2344; AVX1-NEXT:    vmovd %r12d, %xmm2
2345; AVX1-NEXT:    js .LBB34_4
2346; AVX1-NEXT:  # %bb.5:
2347; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
2348; AVX1-NEXT:    jmp .LBB34_6
2349; AVX1-NEXT:  .LBB34_4:
2350; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
2351; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
2352; AVX1-NEXT:  .LBB34_6:
2353; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2354; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2355; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2356; AVX1-NEXT:    callq __truncsfbf2@PLT
2357; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
2358; AVX1-NEXT:    shll $16, %ebp
2359; AVX1-NEXT:    vmovd %ebp, %xmm0
2360; AVX1-NEXT:    shll $16, %ebx
2361; AVX1-NEXT:    vmovd %ebx, %xmm2
2362; AVX1-NEXT:    js .LBB34_7
2363; AVX1-NEXT:  # %bb.8:
2364; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
2365; AVX1-NEXT:    jmp .LBB34_9
2366; AVX1-NEXT:  .LBB34_7:
2367; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
2368; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
2369; AVX1-NEXT:  .LBB34_9:
2370; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2371; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2372; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2373; AVX1-NEXT:    callq __truncsfbf2@PLT
2374; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
2375; AVX1-NEXT:    shll $16, %r15d
2376; AVX1-NEXT:    vmovd %r15d, %xmm0
2377; AVX1-NEXT:    shll $16, %r14d
2378; AVX1-NEXT:    vmovd %r14d, %xmm2
2379; AVX1-NEXT:    js .LBB34_10
2380; AVX1-NEXT:  # %bb.11:
2381; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
2382; AVX1-NEXT:    jmp .LBB34_12
2383; AVX1-NEXT:  .LBB34_10:
2384; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
2385; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
2386; AVX1-NEXT:  .LBB34_12:
2387; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2388; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2389; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2390; AVX1-NEXT:    callq __truncsfbf2@PLT
2391; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
2392; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2393; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
2394; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2395; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
2396; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
2397; AVX1-NEXT:    addq $56, %rsp
2398; AVX1-NEXT:    .cfi_def_cfa_offset 56
2399; AVX1-NEXT:    popq %rbx
2400; AVX1-NEXT:    .cfi_def_cfa_offset 48
2401; AVX1-NEXT:    popq %r12
2402; AVX1-NEXT:    .cfi_def_cfa_offset 40
2403; AVX1-NEXT:    popq %r13
2404; AVX1-NEXT:    .cfi_def_cfa_offset 32
2405; AVX1-NEXT:    popq %r14
2406; AVX1-NEXT:    .cfi_def_cfa_offset 24
2407; AVX1-NEXT:    popq %r15
2408; AVX1-NEXT:    .cfi_def_cfa_offset 16
2409; AVX1-NEXT:    popq %rbp
2410; AVX1-NEXT:    .cfi_def_cfa_offset 8
2411; AVX1-NEXT:    retq
2412;
2413; AVX512-LABEL: test_fmaximum_v4bf16:
2414; AVX512:       # %bb.0:
2415; AVX512-NEXT:    pushq %rbp
2416; AVX512-NEXT:    .cfi_def_cfa_offset 16
2417; AVX512-NEXT:    pushq %r15
2418; AVX512-NEXT:    .cfi_def_cfa_offset 24
2419; AVX512-NEXT:    pushq %r14
2420; AVX512-NEXT:    .cfi_def_cfa_offset 32
2421; AVX512-NEXT:    pushq %r13
2422; AVX512-NEXT:    .cfi_def_cfa_offset 40
2423; AVX512-NEXT:    pushq %r12
2424; AVX512-NEXT:    .cfi_def_cfa_offset 48
2425; AVX512-NEXT:    pushq %rbx
2426; AVX512-NEXT:    .cfi_def_cfa_offset 56
2427; AVX512-NEXT:    pushq %rax
2428; AVX512-NEXT:    .cfi_def_cfa_offset 64
2429; AVX512-NEXT:    .cfi_offset %rbx, -56
2430; AVX512-NEXT:    .cfi_offset %r12, -48
2431; AVX512-NEXT:    .cfi_offset %r13, -40
2432; AVX512-NEXT:    .cfi_offset %r14, -32
2433; AVX512-NEXT:    .cfi_offset %r15, -24
2434; AVX512-NEXT:    .cfi_offset %rbp, -16
2435; AVX512-NEXT:    vmovq %xmm1, %r13
2436; AVX512-NEXT:    movq %r13, %rbx
2437; AVX512-NEXT:    shrq $32, %rbx
2438; AVX512-NEXT:    vmovq %xmm0, %rbp
2439; AVX512-NEXT:    movq %rbp, %r14
2440; AVX512-NEXT:    shrq $32, %r14
2441; AVX512-NEXT:    movq %r13, %r15
2442; AVX512-NEXT:    shrq $48, %r15
2443; AVX512-NEXT:    movq %rbp, %r12
2444; AVX512-NEXT:    shrq $48, %r12
2445; AVX512-NEXT:    movl %ebp, %eax
2446; AVX512-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
2447; AVX512-NEXT:    sets %cl
2448; AVX512-NEXT:    kmovw %ecx, %k1
2449; AVX512-NEXT:    movl %r13d, %ecx
2450; AVX512-NEXT:    andl $-65536, %ecx # imm = 0xFFFF0000
2451; AVX512-NEXT:    vmovd %ecx, %xmm1
2452; AVX512-NEXT:    vmovd %eax, %xmm0
2453; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
2454; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
2455; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
2456; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
2457; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
2458; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
2459; AVX512-NEXT:    callq __truncsfbf2@PLT
2460; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
2461; AVX512-NEXT:    shll $16, %ebp
2462; AVX512-NEXT:    sets %al
2463; AVX512-NEXT:    kmovw %eax, %k1
2464; AVX512-NEXT:    shll $16, %r13d
2465; AVX512-NEXT:    vmovd %r13d, %xmm1
2466; AVX512-NEXT:    vmovd %ebp, %xmm0
2467; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
2468; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
2469; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
2470; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
2471; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
2472; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
2473; AVX512-NEXT:    callq __truncsfbf2@PLT
2474; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsp)
2475; AVX512-NEXT:    shll $16, %r12d
2476; AVX512-NEXT:    sets %al
2477; AVX512-NEXT:    kmovw %eax, %k1
2478; AVX512-NEXT:    shll $16, %r15d
2479; AVX512-NEXT:    vmovd %r15d, %xmm1
2480; AVX512-NEXT:    vmovd %r12d, %xmm0
2481; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
2482; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
2483; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
2484; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
2485; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
2486; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
2487; AVX512-NEXT:    callq __truncsfbf2@PLT
2488; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
2489; AVX512-NEXT:    shll $16, %r14d
2490; AVX512-NEXT:    sets %al
2491; AVX512-NEXT:    kmovw %eax, %k1
2492; AVX512-NEXT:    shll $16, %ebx
2493; AVX512-NEXT:    vmovd %ebx, %xmm1
2494; AVX512-NEXT:    vmovd %r14d, %xmm0
2495; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
2496; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
2497; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
2498; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
2499; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
2500; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
2501; AVX512-NEXT:    callq __truncsfbf2@PLT
2502; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
2503; AVX512-NEXT:    vmovaps (%rsp), %xmm0
2504; AVX512-NEXT:    addq $8, %rsp
2505; AVX512-NEXT:    .cfi_def_cfa_offset 56
2506; AVX512-NEXT:    popq %rbx
2507; AVX512-NEXT:    .cfi_def_cfa_offset 48
2508; AVX512-NEXT:    popq %r12
2509; AVX512-NEXT:    .cfi_def_cfa_offset 40
2510; AVX512-NEXT:    popq %r13
2511; AVX512-NEXT:    .cfi_def_cfa_offset 32
2512; AVX512-NEXT:    popq %r14
2513; AVX512-NEXT:    .cfi_def_cfa_offset 24
2514; AVX512-NEXT:    popq %r15
2515; AVX512-NEXT:    .cfi_def_cfa_offset 16
2516; AVX512-NEXT:    popq %rbp
2517; AVX512-NEXT:    .cfi_def_cfa_offset 8
2518; AVX512-NEXT:    retq
2519;
2520; AVX10_2-LABEL: test_fmaximum_v4bf16:
2521; AVX10_2:       # %bb.0:
2522; AVX10_2-NEXT:    vminmaxbf16 $1, %xmm1, %xmm0, %xmm0
2523; AVX10_2-NEXT:    retq
2524;
2525; X86-LABEL: test_fmaximum_v4bf16:
2526; X86:       # %bb.0:
2527; X86-NEXT:    pushl %ebp
2528; X86-NEXT:    .cfi_def_cfa_offset 8
2529; X86-NEXT:    pushl %ebx
2530; X86-NEXT:    .cfi_def_cfa_offset 12
2531; X86-NEXT:    pushl %edi
2532; X86-NEXT:    .cfi_def_cfa_offset 16
2533; X86-NEXT:    pushl %esi
2534; X86-NEXT:    .cfi_def_cfa_offset 20
2535; X86-NEXT:    subl $68, %esp
2536; X86-NEXT:    .cfi_def_cfa_offset 88
2537; X86-NEXT:    .cfi_offset %esi, -20
2538; X86-NEXT:    .cfi_offset %edi, -16
2539; X86-NEXT:    .cfi_offset %ebx, -12
2540; X86-NEXT:    .cfi_offset %ebp, -8
2541; X86-NEXT:    vpsrlq $48, %xmm0, %xmm2
2542; X86-NEXT:    vpsrlq $48, %xmm1, %xmm3
2543; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
2544; X86-NEXT:    vpextrw $0, %xmm4, %esi
2545; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
2546; X86-NEXT:    vpextrw $0, %xmm4, %ebx
2547; X86-NEXT:    vpextrw $0, %xmm0, %eax
2548; X86-NEXT:    vpextrw $0, %xmm1, %ecx
2549; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
2550; X86-NEXT:    vpextrw $0, %xmm0, %edx
2551; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
2552; X86-NEXT:    vpextrw $0, %xmm0, %edi
2553; X86-NEXT:    shll $16, %edi
2554; X86-NEXT:    vmovd %edi, %xmm0
2555; X86-NEXT:    shll $16, %edx
2556; X86-NEXT:    vmovd %edx, %xmm4
2557; X86-NEXT:    js .LBB34_1
2558; X86-NEXT:  # %bb.2:
2559; X86-NEXT:    vmovdqa %xmm4, %xmm1
2560; X86-NEXT:    jmp .LBB34_3
2561; X86-NEXT:  .LBB34_1:
2562; X86-NEXT:    vmovdqa %xmm0, %xmm1
2563; X86-NEXT:    vmovdqa %xmm4, %xmm0
2564; X86-NEXT:  .LBB34_3:
2565; X86-NEXT:    vpextrw $0, %xmm2, %edi
2566; X86-NEXT:    vpextrw $0, %xmm3, %ebp
2567; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2568; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2569; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2570; X86-NEXT:    vmovss %xmm0, (%esp)
2571; X86-NEXT:    shll $16, %ecx
2572; X86-NEXT:    vmovd %ecx, %xmm0
2573; X86-NEXT:    shll $16, %eax
2574; X86-NEXT:    vmovd %eax, %xmm2
2575; X86-NEXT:    js .LBB34_4
2576; X86-NEXT:  # %bb.5:
2577; X86-NEXT:    vmovdqa %xmm2, %xmm1
2578; X86-NEXT:    jmp .LBB34_6
2579; X86-NEXT:  .LBB34_4:
2580; X86-NEXT:    vmovdqa %xmm0, %xmm1
2581; X86-NEXT:    vmovdqa %xmm2, %xmm0
2582; X86-NEXT:  .LBB34_6:
2583; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2584; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2585; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2586; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2587; X86-NEXT:    calll __truncsfbf2
2588; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2589; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2590; X86-NEXT:    vmovss %xmm0, (%esp)
2591; X86-NEXT:    shll $16, %ebx
2592; X86-NEXT:    vmovd %ebx, %xmm0
2593; X86-NEXT:    shll $16, %esi
2594; X86-NEXT:    vmovd %esi, %xmm2
2595; X86-NEXT:    js .LBB34_7
2596; X86-NEXT:  # %bb.8:
2597; X86-NEXT:    vmovdqa %xmm2, %xmm1
2598; X86-NEXT:    jmp .LBB34_9
2599; X86-NEXT:  .LBB34_7:
2600; X86-NEXT:    vmovdqa %xmm0, %xmm1
2601; X86-NEXT:    vmovdqa %xmm2, %xmm0
2602; X86-NEXT:  .LBB34_9:
2603; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2604; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2605; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2606; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2607; X86-NEXT:    calll __truncsfbf2
2608; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2609; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2610; X86-NEXT:    vmovss %xmm0, (%esp)
2611; X86-NEXT:    shll $16, %ebp
2612; X86-NEXT:    vmovd %ebp, %xmm0
2613; X86-NEXT:    shll $16, %edi
2614; X86-NEXT:    vmovd %edi, %xmm2
2615; X86-NEXT:    js .LBB34_10
2616; X86-NEXT:  # %bb.11:
2617; X86-NEXT:    vmovdqa %xmm2, %xmm1
2618; X86-NEXT:    jmp .LBB34_12
2619; X86-NEXT:  .LBB34_10:
2620; X86-NEXT:    vmovdqa %xmm0, %xmm1
2621; X86-NEXT:    vmovdqa %xmm2, %xmm0
2622; X86-NEXT:  .LBB34_12:
2623; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
2624; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
2625; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
2626; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2627; X86-NEXT:    calll __truncsfbf2
2628; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
2629; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
2630; X86-NEXT:    vmovd %xmm0, (%esp)
2631; X86-NEXT:    calll __truncsfbf2
2632; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
2633; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
2634; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
2635; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
2636; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
2637; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
2638; X86-NEXT:    addl $68, %esp
2639; X86-NEXT:    .cfi_def_cfa_offset 20
2640; X86-NEXT:    popl %esi
2641; X86-NEXT:    .cfi_def_cfa_offset 16
2642; X86-NEXT:    popl %edi
2643; X86-NEXT:    .cfi_def_cfa_offset 12
2644; X86-NEXT:    popl %ebx
2645; X86-NEXT:    .cfi_def_cfa_offset 8
2646; X86-NEXT:    popl %ebp
2647; X86-NEXT:    .cfi_def_cfa_offset 4
2648; X86-NEXT:    retl
2649  %r = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
2650  ret <4 x bfloat> %r
2651}
2652