; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86

; Exercises llc code generation for the llvm.maximumnum and llvm.minimumnum
; intrinsics under each target/feature combination listed in the llc command
; lines above. The expectation lines inside the functions are generated by the
; update script named on the first line; regenerate them with that script
; instead of editing them by hand.

declare float @llvm.maximumnum.f32(float, float)
declare double @llvm.maximumnum.f64(double, double)
declare float @llvm.minimumnum.f32(float, float)
declare double @llvm.minimumnum.f64(double, double)
declare <2 x double> @llvm.minimumnum.v2f64(<2 x double>, <2 x double>)
declare <4 x float> @llvm.maximumnum.v4f32(<4 x float>, <4 x float>)
declare <4 x half> @llvm.maximumnum.v4f16(<4 x half>, <4 x half>)
declare <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat>, <4 x bfloat>)

;
; fmaximumnum
;

; Baseline case, scalar f32 maximumnum of two arbitrary arguments with no
; fast-math flags and no fp-math function attributes.
define float @test_fmaximumnum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximumnum:
; SSE2: # %bb.0:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: js .LBB0_2
; SSE2-NEXT: # %bb.1:
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: .LBB0_2:
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: cmpordss %xmm3, %xmm0
; SSE2-NEXT: movaps %xmm0, %xmm4
; SSE2-NEXT: andps %xmm3, %xmm4
; SSE2-NEXT: js .LBB0_4
; SSE2-NEXT: # %bb.3:
; SSE2-NEXT: movdqa %xmm2, %xmm1
; SSE2-NEXT: .LBB0_4:
; SSE2-NEXT: maxss %xmm1, %xmm3
; SSE2-NEXT: andnps %xmm3, %xmm0
; SSE2-NEXT: orps %xmm4, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: js .LBB0_1
; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: vmovdqa %xmm0, %xmm2
; AVX1-NEXT: jmp .LBB0_3
; AVX1-NEXT: .LBB0_1:
; AVX1-NEXT: vmovdqa %xmm1, %xmm2
; AVX1-NEXT: vmovdqa %xmm0, %xmm1
; AVX1-NEXT: .LBB0_3:
; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: testl %eax, %eax
; AVX512-NEXT: sets %al
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vmovdqa %xmm0, %xmm2
; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxss $17, %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vmovd %xmm2, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB0_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovdqa %xmm2, %xmm1
; X86-NEXT: jmp .LBB0_3
; X86-NEXT: .LBB0_1:
; X86-NEXT: vmovdqa %xmm0, %xmm1
; X86-NEXT: vmovdqa %xmm2, %xmm0
; X86-NEXT: .LBB0_3:
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
  ret float %1
}

; <4 x float> maximumnum in a function carrying both the no-nans-fp-math and
; no-signed-zeros-fp-math attributes; every expectation is a single packed
; max instruction (vminmaxps with immediate 17 in the avx10.2 run).
define <4 x float> @test_fmaximumnum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
; SSE2-LABEL: test_fmaximumnum_scalarize:
; SSE2: # %bb.0:
; SSE2-NEXT: maxps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fmaximumnum_scalarize:
; AVX: # %bb.0:
; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_scalarize:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum_scalarize:
; X86: # %bb.0:
; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0
; X86-NEXT: retl
  %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y)
  ret <4 x float> %r
}

; The first operand is a constant NaN; no max instruction appears in any of
; the expected outputs, only a single register move or load.
define float @test_fmaximumnum_nan0(float %x, float %y) {
; SSE2-LABEL: test_fmaximumnum_nan0:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fmaximumnum_nan0:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_nan0:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum_nan0:
; X86: # %bb.0:
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: retl
  %1 = tail call float @llvm.maximumnum.f32(float 0x7fff000000000000, float %y)
  ret float %1
}

; The second operand is a constant NaN; no max instruction appears in any of
; the expected outputs.
define float @test_fmaximumnum_nan1(float %x, float %y) {
; SSE2-LABEL: test_fmaximumnum_nan1:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fmaximumnum_nan1:
; AVX: # %bb.0:
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_nan1:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum_nan1:
; X86: # %bb.0:
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: retl
  %1 = tail call float @llvm.maximumnum.f32(float %x, float 0x7fff000000000000)
  ret float %1
}

; Both operands are produced by fadd/fsub instructions carrying the nnan flag;
; none of the expected outputs contains a cmpord-based NaN fix-up.
define float @test_fmaximumnum_nnan(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_nnan:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: addss %xmm1, %xmm2
; SSE2-NEXT: subss %xmm1, %xmm0
; SSE2-NEXT: movd %xmm2, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: js .LBB4_1
; SSE2-NEXT: # %bb.2:
; SSE2-NEXT: maxss %xmm2, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB4_1:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: maxss %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_nnan:
; AVX1: # %bb.0:
; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm2, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: js .LBB4_1
; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm0
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB4_1:
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fmaximumnum_nnan:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: vmovd %xmm2, %eax
; AVX512F-NEXT: testl %eax, %eax
; AVX512F-NEXT: sets %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovaps %xmm2, %xmm1
; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512F-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fmaximumnum_nnan:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX512DQ-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
; AVX512DQ-NEXT: kmovw %k0, %k1
; AVX512DQ-NEXT: vmovaps %xmm2, %xmm1
; AVX512DQ-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512DQ-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1}
; AVX512DQ-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_nnan:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vaddss %xmm1, %xmm0, %xmm2
; AVX10_2-NEXT: vsubss %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT: vminmaxss $17, %xmm0, %xmm2
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum_nnan:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT: vaddss %xmm0, %xmm2, %xmm1
; X86-NEXT: vsubss %xmm0, %xmm2, %xmm0
; X86-NEXT: vmovd %xmm1, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB4_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovaps %xmm1, %xmm2
; X86-NEXT: jmp .LBB4_3
; X86-NEXT: .LBB4_1:
; X86-NEXT: vmovaps %xmm0, %xmm2
; X86-NEXT: vmovaps %xmm1, %xmm0
; X86-NEXT: .LBB4_3:
; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %1 = fadd nnan float %x, %y
  %2 = fsub nnan float %x, %y
  %3 = tail call float @llvm.maximumnum.f32(float %1, float %2)
  ret float %3
}

; f64 maximumnum whose first operand is the constant 0.0; the expected outputs
; contain no sign-bit test or branch.
define double @test_fmaximumnum_zero0(double %x, double %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_zero0:
; SSE2: # %bb.0:
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: cmpordsd %xmm1, %xmm0
; SSE2-NEXT: movapd %xmm0, %xmm2
; SSE2-NEXT: andpd %xmm1, %xmm2
; SSE2-NEXT: xorpd %xmm3, %xmm3
; SSE2-NEXT: maxsd %xmm3, %xmm1
; SSE2-NEXT: andnpd %xmm1, %xmm0
; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_zero0:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vcmpordsd %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum_zero0:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0
; AVX512-NEXT: vcmpordsd %xmm1, %xmm1, %k1
; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_zero0:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vxorpd %xmm0, %xmm0, %xmm0
; AVX10_2-NEXT: vminmaxsd $17, %xmm0, %xmm1
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum_zero0:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
  %1 = tail call double @llvm.maximumnum.f64(double 0.0, double %y)
  ret double %1
}

; f64 maximumnum whose second operand is the constant 0.0; the expected
; outputs contain no sign-bit test or branch.
define double @test_fmaximumnum_zero1(double %x, double %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_zero1:
; SSE2: # %bb.0:
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: cmpordsd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm2
; SSE2-NEXT: andpd %xmm0, %xmm2
; SSE2-NEXT: xorpd %xmm3, %xmm3
; SSE2-NEXT: maxsd %xmm3, %xmm0
; SSE2-NEXT: andnpd %xmm0, %xmm1
; SSE2-NEXT: orpd %xmm2, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_zero1:
; AVX1: # %bb.0:
; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX1-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vcmpordsd %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum_zero1:
; AVX512: # %bb.0:
; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vcmpordsd %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_zero1:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; AVX10_2-NEXT: vminmaxsd $17, %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum_zero1:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1
; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
  %1 = tail call double @llvm.maximumnum.f64(double %x, double 0.0)
  ret double %1
}

; Both operands are constants, maximumnum(0.0, -0.0); the expected outputs
; only materialize a zero (register xor, or fldz in the i686 run).
define double @test_fmaximumnum_zero2(double %x, double %y) {
; SSE2-LABEL: test_fmaximumnum_zero2:
; SSE2: # %bb.0:
; SSE2-NEXT: xorps %xmm0, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fmaximumnum_zero2:
; AVX: # %bb.0:
; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_zero2:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vxorps %xmm0, %xmm0, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum_zero2:
; X86: # %bb.0:
; X86-NEXT: fldz
; X86-NEXT: retl
  %1 = tail call double @llvm.maximumnum.f64(double 0.0, double -0.0)
  ret double %1
}

; Scalar f32 maximumnum in a function carrying the no-signed-zeros-fp-math
; attribute; the expected outputs contain no sign-bit test or branch.
define float @test_fmaximumnum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" nounwind {
; SSE2-LABEL: test_fmaximumnum_nsz:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: cmpordss %xmm0, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm3
; SSE2-NEXT: andps %xmm0, %xmm3
; SSE2-NEXT: maxss %xmm1, %xmm0
; SSE2-NEXT: andnps %xmm0, %xmm2
; SSE2-NEXT: orps %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_nsz:
; AVX1: # %bb.0:
; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fmaximumnum_nsz:
; AVX512: # %bb.0:
; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_nsz:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxss $17, %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum_nsz:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1
; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %1 = tail call float @llvm.maximumnum.f32(float %x, float %y)
  ret float %1
}

; maximumnum of %x and the result of an nnan-flagged fdiv of %y by %x, so only
; the second operand carries a no-NaN guarantee.
define float @test_fmaximumnum_combine_cmps(float %x, float %y) nounwind {
; SSE2-LABEL: test_fmaximumnum_combine_cmps:
; SSE2: # %bb.0:
; SSE2-NEXT: divss %xmm0, %xmm1
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: js .LBB9_2
; SSE2-NEXT: # %bb.1:
; SSE2-NEXT: movaps %xmm1, %xmm3
; SSE2-NEXT: .LBB9_2:
; SSE2-NEXT: movaps %xmm3, %xmm2
; SSE2-NEXT: cmpordss %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm4
; SSE2-NEXT: andps %xmm3, %xmm4
; SSE2-NEXT: js .LBB9_4
; SSE2-NEXT: # %bb.3:
; SSE2-NEXT: movaps %xmm0, %xmm1
; SSE2-NEXT: .LBB9_4:
; SSE2-NEXT: maxss %xmm1, %xmm3
; SSE2-NEXT: andnps %xmm3, %xmm2
; SSE2-NEXT: orps %xmm4, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fmaximumnum_combine_cmps:
; AVX1: # %bb.0:
; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: js .LBB9_1
; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: vmovaps %xmm0, %xmm2
; AVX1-NEXT: jmp .LBB9_3
; AVX1-NEXT: .LBB9_1:
; AVX1-NEXT: vmovaps %xmm1, %xmm2
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: .LBB9_3:
; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0
; AVX1-NEXT: vcmpordss %xmm1, %xmm1, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fmaximumnum_combine_cmps:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: testl %eax, %eax
; AVX512F-NEXT: sets %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovaps %xmm0, %xmm2
; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512F-NEXT: vmaxss %xmm2, %xmm1, %xmm0
; AVX512F-NEXT: vcmpordss %xmm1, %xmm1, %k1
; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fmaximumnum_combine_cmps:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0)
; AVX512DQ-NEXT: kmovw %k0, %k1
; AVX512DQ-NEXT: vmovaps %xmm1, %xmm2
; AVX512DQ-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512DQ-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512DQ-NEXT: vmaxss %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX10_2-LABEL: test_fmaximumnum_combine_cmps:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX10_2-NEXT: vminmaxss $17, %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fmaximumnum_combine_cmps:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovd %xmm1, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB9_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovaps %xmm1, %xmm2
; X86-NEXT: jmp .LBB9_3
; X86-NEXT: .LBB9_1:
; X86-NEXT: vmovaps %xmm0, %xmm2
; X86-NEXT: vmovaps %xmm1, %xmm0
; X86-NEXT: .LBB9_3:
; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1
; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %1 = fdiv nnan float %y, %x
  %2 = tail call float @llvm.maximumnum.f32(float %x, float %1)
  ret float %2
}

;
; fminimumnum
;

; Baseline case, scalar f32 minimumnum of two arbitrary arguments with no
; fast-math flags and no fp-math function attributes.
define float @test_fminimumnum(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimumnum:
; SSE2: # %bb.0:
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: movdqa %xmm1, %xmm3
; SSE2-NEXT: js .LBB10_2
; SSE2-NEXT: # %bb.1:
; SSE2-NEXT: movdqa %xmm0, %xmm3
; SSE2-NEXT: .LBB10_2:
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: cmpordss %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm4
; SSE2-NEXT: andps %xmm3, %xmm4
; SSE2-NEXT: js .LBB10_4
; SSE2-NEXT: # %bb.3:
; SSE2-NEXT: movdqa %xmm1, %xmm0
; SSE2-NEXT: .LBB10_4:
; SSE2-NEXT: minss %xmm0, %xmm3
; SSE2-NEXT: andnps %xmm3, %xmm2
; SSE2-NEXT: orps %xmm4, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: js .LBB10_1
; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: vmovdqa %xmm1, %xmm2
; AVX1-NEXT: jmp .LBB10_3
; AVX1-NEXT: .LBB10_1:
; AVX1-NEXT: vmovdqa %xmm0, %xmm2
; AVX1-NEXT: vmovdqa %xmm1, %xmm0
; AVX1-NEXT: .LBB10_3:
; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm1
; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fminimumnum:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovd %xmm0, %eax
; AVX512-NEXT: testl %eax, %eax
; AVX512-NEXT: sets %al
; AVX512-NEXT: kmovw %eax, %k1
; AVX512-NEXT: vmovaps %xmm1, %xmm2
; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm1
; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxss $16, %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB10_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovdqa %xmm1, %xmm2
; X86-NEXT: jmp .LBB10_3
; X86-NEXT: .LBB10_1:
; X86-NEXT: vmovdqa %xmm0, %xmm2
; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: .LBB10_3:
; X86-NEXT: vminss %xmm2, %xmm0, %xmm1
; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %1 = tail call float @llvm.minimumnum.f32(float %x, float %y)
  ret float %1
}

; <2 x double> minimumnum in a function carrying both the no-nans-fp-math and
; no-signed-zeros-fp-math attributes; every expectation is a single packed
; min instruction (vminmaxpd with immediate 16 in the avx10.2 run).
define <2 x double> @test_fminimumnum_scalarize(<2 x double> %x, <2 x double> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
; SSE2-LABEL: test_fminimumnum_scalarize:
; SSE2: # %bb.0:
; SSE2-NEXT: minpd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_scalarize:
; AVX: # %bb.0:
; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_scalarize:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_scalarize:
; X86: # %bb.0:
; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0
; X86-NEXT: retl
  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
  ret <2 x double> %r
}

; The first operand is a constant NaN; no min instruction appears in any of
; the expected outputs, only a single register move or load.
define float @test_fminimumnum_nan0(float %x, float %y) {
; SSE2-LABEL: test_fminimumnum_nan0:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_nan0:
; AVX: # %bb.0:
; AVX-NEXT: vmovaps %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_nan0:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vmovaps %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_nan0:
; X86: # %bb.0:
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: retl
  %1 = tail call float @llvm.minimumnum.f32(float 0x7fff000000000000, float %y)
  ret float %1
}

; The second operand is a constant NaN; no min instruction appears in any of
; the expected outputs.
define float @test_fminimumnum_nan1(float %x, float %y) {
; SSE2-LABEL: test_fminimumnum_nan1:
; SSE2: # %bb.0:
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_nan1:
; AVX: # %bb.0:
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_nan1:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_nan1:
; X86: # %bb.0:
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: retl
  %1 = tail call float @llvm.minimumnum.f32(float %x, float 0x7fff000000000000)
  ret float %1
}

; f64 minimumnum in a function carrying the no-nans-fp-math attribute (no flag
; on the call itself); none of the expected outputs contains a cmpord-based
; NaN fix-up.
define double @test_fminimumnum_nnan(double %x, double %y) "no-nans-fp-math"="true" nounwind {
; SSE2-LABEL: test_fminimumnum_nnan:
; SSE2: # %bb.0:
; SSE2-NEXT: movq %xmm0, %rax
; SSE2-NEXT: testq %rax, %rax
; SSE2-NEXT: js .LBB14_1
; SSE2-NEXT: # %bb.2:
; SSE2-NEXT: minsd %xmm1, %xmm0
; SSE2-NEXT: retq
; SSE2-NEXT: .LBB14_1:
; SSE2-NEXT: movdqa %xmm0, %xmm2
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: minsd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum_nnan:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovq %xmm0, %rax
; AVX1-NEXT: testq %rax, %rax
; AVX1-NEXT: js .LBB14_1
; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: vminsd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB14_1:
; AVX1-NEXT: vmovdqa %xmm0, %xmm2
; AVX1-NEXT: vminsd %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fminimumnum_nnan:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vmovq %xmm0, %rax
; AVX512F-NEXT: testq %rax, %rax
; AVX512F-NEXT: sets %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovapd %xmm1, %xmm2
; AVX512F-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1}
; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT: vminsd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fminimumnum_nnan:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vfpclasssd $5, %xmm1, %k0 # k0 = isQuietNaN(xmm1) | isNegativeZero(xmm1)
; AVX512DQ-NEXT: kmovw %k0, %k1
; AVX512DQ-NEXT: vmovapd %xmm0, %xmm2
; AVX512DQ-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1}
; AVX512DQ-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512DQ-NEXT: vminsd %xmm2, %xmm1, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_nnan:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxsd $16, %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_nnan:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vextractps $1, %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB14_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovapd %xmm1, %xmm2
; X86-NEXT: jmp .LBB14_3
; X86-NEXT: .LBB14_1:
; X86-NEXT: vmovapd %xmm0, %xmm2
; X86-NEXT: vmovapd %xmm1, %xmm0
; X86-NEXT: .LBB14_3:
; X86-NEXT: vminsd %xmm2, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
  %1 = tail call double @llvm.minimumnum.f64(double %x, double %y)
  ret double %1
}

; f64 minimumnum whose first operand is the constant -0.0; the expected
; outputs contain no sign-bit test or branch and take the constant from a
; constant-pool load.
define double @test_fminimumnum_zero0(double %x, double %y) nounwind {
; SSE2-LABEL: test_fminimumnum_zero0:
; SSE2: # %bb.0:
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: cmpordsd %xmm1, %xmm0
; SSE2-NEXT: movapd %xmm0, %xmm2
; SSE2-NEXT: andpd %xmm1, %xmm2
; SSE2-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT: andnpd %xmm1, %xmm0
; SSE2-NEXT: orpd %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum_zero0:
; AVX1: # %bb.0:
; AVX1-NEXT: vcmpordsd %xmm1, %xmm1, %xmm0
; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fminimumnum_zero0:
; AVX512: # %bb.0:
; AVX512-NEXT: vcmpordsd %xmm1, %xmm1, %k1
; AVX512-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_zero0:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_zero0:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm1
; X86-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
  %1 = tail call double @llvm.minimumnum.f64(double -0.0, double %y)
  ret double %1
}

; f64 minimumnum whose second operand is the constant -0.0; the expected
; outputs contain no sign-bit test or branch.
define double @test_fminimumnum_zero1(double %x, double %y) nounwind {
; SSE2-LABEL: test_fminimumnum_zero1:
; SSE2: # %bb.0:
; SSE2-NEXT: movapd %xmm0, %xmm1
; SSE2-NEXT: cmpordsd %xmm0, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm2
; SSE2-NEXT: andpd %xmm0, %xmm2
; SSE2-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT: andnpd %xmm0, %xmm1
; SSE2-NEXT: orpd %xmm2, %xmm1
; SSE2-NEXT: movapd %xmm1, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum_zero1:
; AVX1: # %bb.0:
; AVX1-NEXT: vcmpordsd %xmm0, %xmm0, %xmm1
; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fminimumnum_zero1:
; AVX512: # %bb.0:
; AVX512-NEXT: vcmpordsd %xmm0, %xmm0, %k1
; AVX512-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmovapd %xmm1, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_zero1:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxsd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_zero1:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; X86-NEXT: vcmpordsd %xmm0, %xmm0, %xmm1
; X86-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
  %1 = tail call double @llvm.minimumnum.f64(double %x, double -0.0)
  ret double %1
}

; Both operands are constants, minimumnum(-0.0, 0.0); the expected outputs
; only materialize -0.0 (a constant load, or fldz followed by fchs in the
; i686 run).
define double @test_fminimumnum_zero2(double %x, double %y) {
; SSE2-LABEL: test_fminimumnum_zero2:
; SSE2: # %bb.0:
; SSE2-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_zero2:
; AVX: # %bb.0:
; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_zero2:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_zero2:
; X86: # %bb.0:
; X86-NEXT: fldz
; X86-NEXT: fchs
; X86-NEXT: retl
  %1 = tail call double @llvm.minimumnum.f64(double -0.0, double 0.0)
  ret double %1
}

; Scalar f32 minimumnum with the nsz flag placed on the call itself rather
; than on the function; the expected outputs contain no sign-bit test or
; branch.
define float @test_fminimumnum_nsz(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimumnum_nsz:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: cmpordss %xmm0, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm3
; SSE2-NEXT: andps %xmm0, %xmm3
; SSE2-NEXT: minss %xmm1, %xmm0
; SSE2-NEXT: andnps %xmm0, %xmm2
; SSE2-NEXT: orps %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum_nsz:
; AVX1: # %bb.0:
; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512-LABEL: test_fminimumnum_nsz:
; AVX512: # %bb.0:
; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1
; AVX512-NEXT: vcmpordss %xmm0, %xmm0, %k1
; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT: vmovaps %xmm1, %xmm0
; AVX512-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_nsz:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxss $16, %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_nsz:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm1
; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %1 = tail call nsz float @llvm.minimumnum.f32(float %x, float %y)
  ret float %1
}

; minimumnum of %x and the result of an nnan-flagged fdiv of %y by %x, so only
; the second operand carries a no-NaN guarantee.
define float @test_fminimumnum_combine_cmps(float %x, float %y) nounwind {
; SSE2-LABEL: test_fminimumnum_combine_cmps:
; SSE2: # %bb.0:
; SSE2-NEXT: divss %xmm0, %xmm1
; SSE2-NEXT: movd %xmm0, %eax
; SSE2-NEXT: testl %eax, %eax
; SSE2-NEXT: movaps %xmm1, %xmm3
; SSE2-NEXT: js .LBB19_2
; SSE2-NEXT: # %bb.1:
; SSE2-NEXT: movaps %xmm0, %xmm3
; SSE2-NEXT: .LBB19_2:
; SSE2-NEXT: movaps %xmm3, %xmm2
; SSE2-NEXT: cmpordss %xmm3, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm4
; SSE2-NEXT: andps %xmm3, %xmm4
; SSE2-NEXT: js .LBB19_4
; SSE2-NEXT: # %bb.3:
; SSE2-NEXT: movaps %xmm1, %xmm0
; SSE2-NEXT: .LBB19_4:
; SSE2-NEXT: minss %xmm0, %xmm3
; SSE2-NEXT: andnps %xmm3, %xmm2
; SSE2-NEXT: orps %xmm4, %xmm2
; SSE2-NEXT: movaps %xmm2, %xmm0
; SSE2-NEXT: retq
;
; AVX1-LABEL: test_fminimumnum_combine_cmps:
; AVX1: # %bb.0:
; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vmovd %xmm0, %eax
; AVX1-NEXT: testl %eax, %eax
; AVX1-NEXT: js .LBB19_1
; AVX1-NEXT: # %bb.2:
; AVX1-NEXT: vmovaps %xmm2, %xmm1
; AVX1-NEXT: jmp .LBB19_3
; AVX1-NEXT: .LBB19_1:
; AVX1-NEXT: vmovaps %xmm0, %xmm1
; AVX1-NEXT: vmovaps %xmm2, %xmm0
; AVX1-NEXT: .LBB19_3:
; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX512F-LABEL: test_fminimumnum_combine_cmps:
; AVX512F: # %bb.0:
; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX512F-NEXT: vmovd %xmm0, %eax
; AVX512F-NEXT: testl %eax, %eax
; AVX512F-NEXT: sets %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovaps %xmm1, %xmm2
; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512F-NEXT: vminss %xmm2, %xmm0, %xmm1
; AVX512F-NEXT: vcmpordss %xmm0, %xmm0, %k1
; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512F-NEXT: vmovaps %xmm1, %xmm0
; AVX512F-NEXT: retq
;
; AVX512DQ-LABEL: test_fminimumnum_combine_cmps:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX512DQ-NEXT: vfpclassss $5, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isNegativeZero(xmm0)
; AVX512DQ-NEXT: kmovw %k0, %k1
; AVX512DQ-NEXT: vmovaps %xmm1, %xmm2
; AVX512DQ-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1}
; AVX512DQ-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512DQ-NEXT: vminss %xmm2, %xmm0, %xmm0
; AVX512DQ-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_combine_cmps:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vdivss %xmm0, %xmm1, %xmm1
; AVX10_2-NEXT: vminmaxss $16, %xmm1, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_combine_cmps:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT: vdivss %xmm0, %xmm1, %xmm2
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB19_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovaps %xmm2, %xmm1
; X86-NEXT: jmp .LBB19_3
; X86-NEXT: .LBB19_1:
; X86-NEXT: vmovaps %xmm0, %xmm1
; X86-NEXT: vmovaps %xmm2, %xmm0
; X86-NEXT: .LBB19_3:
; X86-NEXT: vminss %xmm1, %xmm0, %xmm1
; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %1 = fdiv nnan float %y, %x
  %2 = tail call float @llvm.minimumnum.f32(float %x, float %1)
  ret float %2
}

; <2 x double> minimumnum with no fast-math flags and no fp-math function
; attributes; outside the avx10.2 run the expected outputs use blends or
; bitwise selects around a packed min plus a cmpord-based fix-up.
define <2 x double> @test_fminimumnum_vector(<2 x double> %x, <2 x double> %y) {
; SSE2-LABEL: test_fminimumnum_vector:
; SSE2: # %bb.0:
; SSE2-NEXT: movaps %xmm0, %xmm2
; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3]
; SSE2-NEXT: pxor %xmm3, %xmm3
; SSE2-NEXT: pcmpgtd %xmm2, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm2
; SSE2-NEXT: pandn %xmm1, %xmm2
; SSE2-NEXT: movdqa %xmm3, %xmm4
; SSE2-NEXT: pandn %xmm0, %xmm4
; SSE2-NEXT: pand %xmm3, %xmm0
; SSE2-NEXT: por %xmm2, %xmm0
; SSE2-NEXT: pand %xmm1, %xmm3
; SSE2-NEXT: por %xmm4, %xmm3
; SSE2-NEXT: movdqa %xmm3, %xmm1
; SSE2-NEXT: minpd %xmm0, %xmm1
; SSE2-NEXT: movdqa %xmm3, %xmm0
; SSE2-NEXT: cmpordpd %xmm3, %xmm0
; SSE2-NEXT: andpd %xmm0, %xmm3
; SSE2-NEXT: andnpd %xmm1, %xmm0
; SSE2-NEXT: orpd %xmm3, %xmm0
; SSE2-NEXT: retq
;
; AVX-LABEL: test_fminimumnum_vector:
; AVX: # %bb.0:
; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1
; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT: retq
;
; AVX10_2-LABEL: test_fminimumnum_vector:
; AVX10_2: # %bb.0:
; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT: retq
;
; X86-LABEL: test_fminimumnum_vector:
; X86: # %bb.0:
; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2
; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1
; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: retl
  %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> %y)
  ret <2 x double> %r
}

define <4 x float> @test_fmaximumnum_vector(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" {
; SSE2-LABEL: test_fmaximumnum_vector:
; SSE2: # %bb.0:
; SSE2-NEXT: maxps %xmm1, %xmm0
; SSE2-NEXT: retq
;
1136; AVX-LABEL: test_fmaximumnum_vector: 1137; AVX: # %bb.0: 1138; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 1139; AVX-NEXT: retq 1140; 1141; AVX10_2-LABEL: test_fmaximumnum_vector: 1142; AVX10_2: # %bb.0: 1143; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0 1144; AVX10_2-NEXT: retq 1145; 1146; X86-LABEL: test_fmaximumnum_vector: 1147; X86: # %bb.0: 1148; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 1149; X86-NEXT: retl 1150 %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %y) 1151 ret <4 x float> %r 1152} 1153 1154define <2 x double> @test_fminimumnum_vector_zero(<2 x double> %x) { 1155; SSE2-LABEL: test_fminimumnum_vector_zero: 1156; SSE2: # %bb.0: 1157; SSE2-NEXT: xorpd %xmm1, %xmm1 1158; SSE2-NEXT: minpd %xmm0, %xmm1 1159; SSE2-NEXT: movapd %xmm1, %xmm0 1160; SSE2-NEXT: retq 1161; 1162; AVX-LABEL: test_fminimumnum_vector_zero: 1163; AVX: # %bb.0: 1164; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1165; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 1166; AVX-NEXT: retq 1167; 1168; AVX10_2-LABEL: test_fminimumnum_vector_zero: 1169; AVX10_2: # %bb.0: 1170; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1171; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 1172; AVX10_2-NEXT: retq 1173; 1174; X86-LABEL: test_fminimumnum_vector_zero: 1175; X86: # %bb.0: 1176; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1177; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 1178; X86-NEXT: retl 1179 %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>) 1180 ret <2 x double> %r 1181} 1182 1183define <4 x float> @test_fmaximumnum_vector_signed_zero(<4 x float> %x) { 1184; SSE2-LABEL: test_fmaximumnum_vector_signed_zero: 1185; SSE2: # %bb.0: 1186; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1187; SSE2-NEXT: maxps %xmm0, %xmm1 1188; SSE2-NEXT: movaps %xmm1, %xmm0 1189; SSE2-NEXT: retq 1190; 1191; AVX-LABEL: test_fmaximumnum_vector_signed_zero: 1192; AVX: # %bb.0: 1193; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 
1194; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1195; AVX-NEXT: retq 1196; 1197; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero: 1198; AVX10_2: # %bb.0: 1199; AVX10_2-NEXT: vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 1200; AVX10_2-NEXT: retq 1201; 1202; X86-LABEL: test_fmaximumnum_vector_signed_zero: 1203; X86: # %bb.0: 1204; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1205; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1206; X86-NEXT: retl 1207 %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>) 1208 ret <4 x float> %r 1209} 1210 1211define <2 x double> @test_fminimumnum_vector_partially_zero(<2 x double> %x) { 1212; SSE2-LABEL: test_fminimumnum_vector_partially_zero: 1213; SSE2: # %bb.0: 1214; SSE2-NEXT: xorpd %xmm1, %xmm1 1215; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1216; SSE2-NEXT: minpd %xmm0, %xmm1 1217; SSE2-NEXT: movapd %xmm1, %xmm0 1218; SSE2-NEXT: retq 1219; 1220; AVX-LABEL: test_fminimumnum_vector_partially_zero: 1221; AVX: # %bb.0: 1222; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1223; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1224; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 1225; AVX-NEXT: retq 1226; 1227; AVX10_2-LABEL: test_fminimumnum_vector_partially_zero: 1228; AVX10_2: # %bb.0: 1229; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1230; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1231; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 1232; AVX10_2-NEXT: retq 1233; 1234; X86-LABEL: test_fminimumnum_vector_partially_zero: 1235; X86: # %bb.0: 1236; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1237; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1238; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 1239; X86-NEXT: retl 1240 %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>) 1241 ret <2 x double> %r 1242} 1243 1244define <2 x double> @test_fminimumnum_vector_different_zeros(<2 x double> %x) { 1245; 
SSE2-LABEL: test_fminimumnum_vector_different_zeros: 1246; SSE2: # %bb.0: 1247; SSE2-NEXT: movaps %xmm0, %xmm1 1248; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] 1249; SSE2-NEXT: xorps %xmm2, %xmm2 1250; SSE2-NEXT: pxor %xmm3, %xmm3 1251; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 1252; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 1253; SSE2-NEXT: movdqa %xmm3, %xmm1 1254; SSE2-NEXT: pandn %xmm2, %xmm1 1255; SSE2-NEXT: movaps %xmm0, %xmm4 1256; SSE2-NEXT: andps %xmm3, %xmm4 1257; SSE2-NEXT: orps %xmm1, %xmm4 1258; SSE2-NEXT: pand %xmm0, %xmm2 1259; SSE2-NEXT: pandn %xmm0, %xmm3 1260; SSE2-NEXT: por %xmm2, %xmm3 1261; SSE2-NEXT: movdqa %xmm3, %xmm1 1262; SSE2-NEXT: minpd %xmm4, %xmm1 1263; SSE2-NEXT: movdqa %xmm3, %xmm0 1264; SSE2-NEXT: cmpordpd %xmm3, %xmm0 1265; SSE2-NEXT: andpd %xmm0, %xmm3 1266; SSE2-NEXT: andnpd %xmm1, %xmm0 1267; SSE2-NEXT: orpd %xmm3, %xmm0 1268; SSE2-NEXT: retq 1269; 1270; AVX-LABEL: test_fminimumnum_vector_different_zeros: 1271; AVX: # %bb.0: 1272; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1273; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1274; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 1275; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 1276; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1 1277; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm2 1278; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1279; AVX-NEXT: retq 1280; 1281; AVX10_2-LABEL: test_fminimumnum_vector_different_zeros: 1282; AVX10_2: # %bb.0: 1283; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1284; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1285; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 1286; AVX10_2-NEXT: retq 1287; 1288; X86-LABEL: test_fminimumnum_vector_different_zeros: 1289; X86: # %bb.0: 1290; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1291; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1292; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 1293; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 1294; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1 1295; X86-NEXT: vcmpordpd %xmm0, 
%xmm0, %xmm2 1296; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1297; X86-NEXT: retl 1298 %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>) 1299 ret <2 x double> %r 1300} 1301 1302define <4 x float> @test_fmaximumnum_vector_non_zero(<4 x float> %x) { 1303; SSE2-LABEL: test_fmaximumnum_vector_non_zero: 1304; SSE2: # %bb.0: 1305; SSE2-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] 1306; SSE2-NEXT: maxps %xmm0, %xmm1 1307; SSE2-NEXT: movaps %xmm1, %xmm0 1308; SSE2-NEXT: retq 1309; 1310; AVX-LABEL: test_fmaximumnum_vector_non_zero: 1311; AVX: # %bb.0: 1312; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] 1313; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1314; AVX-NEXT: retq 1315; 1316; AVX10_2-LABEL: test_fmaximumnum_vector_non_zero: 1317; AVX10_2: # %bb.0: 1318; AVX10_2-NEXT: vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1319; AVX10_2-NEXT: retq 1320; 1321; X86-LABEL: test_fmaximumnum_vector_non_zero: 1322; X86: # %bb.0: 1323; X86-NEXT: vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] 1324; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1325; X86-NEXT: retl 1326 %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 5., float 4., float 3., float 2.>) 1327 ret <4 x float> %r 1328} 1329 1330define <2 x double> @test_fminimumnum_vector_nan(<2 x double> %x) { 1331; SSE2-LABEL: test_fminimumnum_vector_nan: 1332; SSE2: # %bb.0: 1333; SSE2-NEXT: xorpd %xmm2, %xmm2 1334; SSE2-NEXT: xorpd %xmm1, %xmm1 1335; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1336; SSE2-NEXT: minpd %xmm0, %xmm1 1337; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm2[0],xmm1[1] 1338; SSE2-NEXT: movapd %xmm1, %xmm0 1339; SSE2-NEXT: retq 1340; 1341; AVX-LABEL: test_fminimumnum_vector_nan: 1342; AVX: # %bb.0: 1343; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1344; AVX-NEXT: vmovhpd {{.*#+}} xmm2 = xmm1[0],mem[0] 1345; AVX-NEXT: vminpd %xmm0, %xmm2, %xmm0 1346; AVX-NEXT: vblendpd {{.*#+}} xmm0 = xmm1[0],xmm0[1] 
1347; AVX-NEXT: retq 1348; 1349; AVX10_2-LABEL: test_fminimumnum_vector_nan: 1350; AVX10_2: # %bb.0: 1351; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1352; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1353; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 1354; AVX10_2-NEXT: retq 1355; 1356; X86-LABEL: test_fminimumnum_vector_nan: 1357; X86: # %bb.0: 1358; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1359; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1360; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 1361; X86-NEXT: vcmpordpd %xmm1, %xmm1, %xmm2 1362; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 1363; X86-NEXT: retl 1364 %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>) 1365 ret <2 x double> %r 1366} 1367 1368define <2 x double> @test_fminimumnum_vector_zero_first(<2 x double> %x) { 1369; SSE2-LABEL: test_fminimumnum_vector_zero_first: 1370; SSE2: # %bb.0: 1371; SSE2-NEXT: xorpd %xmm1, %xmm1 1372; SSE2-NEXT: minpd %xmm0, %xmm1 1373; SSE2-NEXT: movapd %xmm1, %xmm0 1374; SSE2-NEXT: retq 1375; 1376; AVX-LABEL: test_fminimumnum_vector_zero_first: 1377; AVX: # %bb.0: 1378; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1379; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 1380; AVX-NEXT: retq 1381; 1382; AVX10_2-LABEL: test_fminimumnum_vector_zero_first: 1383; AVX10_2: # %bb.0: 1384; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1385; AVX10_2-NEXT: vminmaxpd $16, %xmm1, %xmm0, %xmm0 1386; AVX10_2-NEXT: retq 1387; 1388; X86-LABEL: test_fminimumnum_vector_zero_first: 1389; X86: # %bb.0: 1390; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1391; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 1392; X86-NEXT: retl 1393 %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x) 1394 ret <2 x double> %r 1395} 1396 1397define <2 x double> @test_fminimumnum_vector_signed_zero(<2 x double> %x) { 1398; SSE2-LABEL: test_fminimumnum_vector_signed_zero: 1399; SSE2: # %bb.0: 1400; SSE2-NEXT: movapd %xmm0, %xmm1 1401; SSE2-NEXT: 
minpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1402; SSE2-NEXT: movapd %xmm0, %xmm2 1403; SSE2-NEXT: cmpordpd %xmm0, %xmm2 1404; SSE2-NEXT: andpd %xmm2, %xmm0 1405; SSE2-NEXT: andnpd %xmm1, %xmm2 1406; SSE2-NEXT: orpd %xmm2, %xmm0 1407; SSE2-NEXT: retq 1408; 1409; AVX-LABEL: test_fminimumnum_vector_signed_zero: 1410; AVX: # %bb.0: 1411; AVX-NEXT: vcmpordpd %xmm0, %xmm0, %xmm1 1412; AVX-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 1413; AVX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 1414; AVX-NEXT: retq 1415; 1416; AVX10_2-LABEL: test_fminimumnum_vector_signed_zero: 1417; AVX10_2: # %bb.0: 1418; AVX10_2-NEXT: vminmaxpd $16, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 1419; AVX10_2-NEXT: retq 1420; 1421; X86-LABEL: test_fminimumnum_vector_signed_zero: 1422; X86: # %bb.0: 1423; X86-NEXT: vcmpordpd %xmm0, %xmm0, %xmm1 1424; X86-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 1425; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 1426; X86-NEXT: retl 1427 %r = call <2 x double> @llvm.minimumnum.v2f64(<2 x double> %x, <2 x double> <double -0., double -0.>) 1428 ret <2 x double> %r 1429} 1430 1431define <4 x float> @test_fmaximumnum_vector_signed_zero_first(<4 x float> %x) { 1432; SSE2-LABEL: test_fmaximumnum_vector_signed_zero_first: 1433; SSE2: # %bb.0: 1434; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1435; SSE2-NEXT: maxps %xmm0, %xmm1 1436; SSE2-NEXT: movaps %xmm1, %xmm0 1437; SSE2-NEXT: retq 1438; 1439; AVX-LABEL: test_fmaximumnum_vector_signed_zero_first: 1440; AVX: # %bb.0: 1441; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1442; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1443; AVX-NEXT: retq 1444; 1445; AVX10_2-LABEL: test_fmaximumnum_vector_signed_zero_first: 1446; AVX10_2: # %bb.0: 1447; AVX10_2-NEXT: vminmaxps $17, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 1448; AVX10_2-NEXT: retq 1449; 1450; X86-LABEL: test_fmaximumnum_vector_signed_zero_first: 1451; X86: # %bb.0: 1452; X86-NEXT: 
vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1453; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1454; X86-NEXT: retl 1455 %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x) 1456 ret <4 x float> %r 1457} 1458 1459define <4 x float> @test_fmaximumnum_vector_zero(<4 x float> %x) { 1460; SSE2-LABEL: test_fmaximumnum_vector_zero: 1461; SSE2: # %bb.0: 1462; SSE2-NEXT: xorps %xmm1, %xmm1 1463; SSE2-NEXT: movaps %xmm0, %xmm2 1464; SSE2-NEXT: maxps %xmm1, %xmm2 1465; SSE2-NEXT: movaps %xmm0, %xmm1 1466; SSE2-NEXT: cmpordps %xmm0, %xmm1 1467; SSE2-NEXT: andps %xmm1, %xmm0 1468; SSE2-NEXT: andnps %xmm2, %xmm1 1469; SSE2-NEXT: orps %xmm1, %xmm0 1470; SSE2-NEXT: retq 1471; 1472; AVX-LABEL: test_fmaximumnum_vector_zero: 1473; AVX: # %bb.0: 1474; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1475; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm1 1476; AVX-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 1477; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1478; AVX-NEXT: retq 1479; 1480; AVX10_2-LABEL: test_fmaximumnum_vector_zero: 1481; AVX10_2: # %bb.0: 1482; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1483; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0 1484; AVX10_2-NEXT: retq 1485; 1486; X86-LABEL: test_fmaximumnum_vector_zero: 1487; X86: # %bb.0: 1488; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1489; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm1 1490; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 1491; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1492; X86-NEXT: retl 1493 %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> <float 0., float 0., float 0., float 0.>) 1494 ret <4 x float> %r 1495} 1496 1497; PR77805: Check that signed zeroes are handled correctly in this case (FIXME) 1498define <4 x float> @test_fmaximumnum_v4f32_splat(<4 x float> %x, float %y) { 1499; SSE2-LABEL: test_fmaximumnum_v4f32_splat: 1500; SSE2: # %bb.0: 1501; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] 1502; SSE2-NEXT: pxor %xmm2, %xmm2 1503; 
SSE2-NEXT: pcmpgtd %xmm0, %xmm2 1504; SSE2-NEXT: movdqa %xmm2, %xmm3 1505; SSE2-NEXT: pandn %xmm0, %xmm3 1506; SSE2-NEXT: movaps %xmm1, %xmm4 1507; SSE2-NEXT: andps %xmm2, %xmm4 1508; SSE2-NEXT: orps %xmm3, %xmm4 1509; SSE2-NEXT: pand %xmm2, %xmm0 1510; SSE2-NEXT: andnps %xmm1, %xmm2 1511; SSE2-NEXT: por %xmm2, %xmm0 1512; SSE2-NEXT: movdqa %xmm0, %xmm1 1513; SSE2-NEXT: maxps %xmm4, %xmm1 1514; SSE2-NEXT: movdqa %xmm0, %xmm2 1515; SSE2-NEXT: cmpordps %xmm0, %xmm2 1516; SSE2-NEXT: andps %xmm2, %xmm0 1517; SSE2-NEXT: andnps %xmm1, %xmm2 1518; SSE2-NEXT: orps %xmm2, %xmm0 1519; SSE2-NEXT: retq 1520; 1521; AVX1-LABEL: test_fmaximumnum_v4f32_splat: 1522; AVX1: # %bb.0: 1523; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] 1524; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 1525; AVX1-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 1526; AVX1-NEXT: vmaxps %xmm2, %xmm0, %xmm1 1527; AVX1-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 1528; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1529; AVX1-NEXT: retq 1530; 1531; AVX512-LABEL: test_fmaximumnum_v4f32_splat: 1532; AVX512: # %bb.0: 1533; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 1534; AVX512-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 1535; AVX512-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 1536; AVX512-NEXT: vmaxps %xmm2, %xmm0, %xmm1 1537; AVX512-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 1538; AVX512-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1539; AVX512-NEXT: retq 1540; 1541; AVX10_2-LABEL: test_fmaximumnum_v4f32_splat: 1542; AVX10_2: # %bb.0: 1543; AVX10_2-NEXT: vbroadcastss %xmm1, %xmm1 1544; AVX10_2-NEXT: vminmaxps $17, %xmm1, %xmm0, %xmm0 1545; AVX10_2-NEXT: retq 1546; 1547; X86-LABEL: test_fmaximumnum_v4f32_splat: 1548; X86: # %bb.0: 1549; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1 1550; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 1551; X86-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 1552; X86-NEXT: vmaxps %xmm2, %xmm0, %xmm1 1553; X86-NEXT: vcmpordps %xmm0, %xmm0, %xmm2 1554; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 
1555; X86-NEXT: retl 1556 %splatinsert = insertelement <4 x float> poison, float %y, i64 0 1557 %vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer 1558 %r = call <4 x float> @llvm.maximumnum.v4f32(<4 x float> %x, <4 x float> %vec) readnone 1559 ret <4 x float> %r 1560} 1561 1562define <4 x half> @test_fmaximumnum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { 1563; SSE2-LABEL: test_fmaximumnum_v4f16: 1564; SSE2: # %bb.0: 1565; SSE2-NEXT: subq $104, %rsp 1566; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1567; SSE2-NEXT: psrld $16, %xmm0 1568; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1569; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill 1570; SSE2-NEXT: movdqa %xmm1, %xmm0 1571; SSE2-NEXT: psrld $16, %xmm0 1572; SSE2-NEXT: callq __extendhfsf2@PLT 1573; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 1574; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1575; SSE2-NEXT: callq __extendhfsf2@PLT 1576; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload 1577; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero 1578; SSE2-NEXT: movdqa %xmm0, %xmm1 1579; SSE2-NEXT: movd %xmm0, %eax 1580; SSE2-NEXT: testl %eax, %eax 1581; SSE2-NEXT: movdqa %xmm0, %xmm2 1582; SSE2-NEXT: js .LBB33_2 1583; SSE2-NEXT: # %bb.1: 1584; SSE2-NEXT: movdqa %xmm4, %xmm2 1585; SSE2-NEXT: .LBB33_2: 1586; SSE2-NEXT: movdqa %xmm2, %xmm0 1587; SSE2-NEXT: cmpordss %xmm2, %xmm0 1588; SSE2-NEXT: movaps %xmm0, %xmm3 1589; SSE2-NEXT: andps %xmm2, %xmm3 1590; SSE2-NEXT: js .LBB33_4 1591; SSE2-NEXT: # %bb.3: 1592; SSE2-NEXT: movdqa %xmm1, %xmm4 1593; SSE2-NEXT: .LBB33_4: 1594; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1595; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] 1596; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1597; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload 1598; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] 
1599; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1600; SSE2-NEXT: maxss %xmm4, %xmm2 1601; SSE2-NEXT: andnps %xmm2, %xmm0 1602; SSE2-NEXT: orps %xmm3, %xmm0 1603; SSE2-NEXT: callq __truncsfhf2@PLT 1604; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1605; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 1606; SSE2-NEXT: callq __extendhfsf2@PLT 1607; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1608; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1609; SSE2-NEXT: callq __extendhfsf2@PLT 1610; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload 1611; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero 1612; SSE2-NEXT: movdqa %xmm0, %xmm1 1613; SSE2-NEXT: movd %xmm0, %eax 1614; SSE2-NEXT: testl %eax, %eax 1615; SSE2-NEXT: movdqa %xmm0, %xmm2 1616; SSE2-NEXT: js .LBB33_6 1617; SSE2-NEXT: # %bb.5: 1618; SSE2-NEXT: movdqa %xmm4, %xmm2 1619; SSE2-NEXT: .LBB33_6: 1620; SSE2-NEXT: movdqa %xmm2, %xmm0 1621; SSE2-NEXT: cmpordss %xmm2, %xmm0 1622; SSE2-NEXT: movaps %xmm0, %xmm3 1623; SSE2-NEXT: andps %xmm2, %xmm3 1624; SSE2-NEXT: js .LBB33_8 1625; SSE2-NEXT: # %bb.7: 1626; SSE2-NEXT: movdqa %xmm1, %xmm4 1627; SSE2-NEXT: .LBB33_8: 1628; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1629; SSE2-NEXT: psrlq $48, %xmm1 1630; SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1631; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload 1632; SSE2-NEXT: psrlq $48, %xmm1 1633; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill 1634; SSE2-NEXT: maxss %xmm4, %xmm2 1635; SSE2-NEXT: andnps %xmm2, %xmm0 1636; SSE2-NEXT: orps %xmm3, %xmm0 1637; SSE2-NEXT: callq __truncsfhf2@PLT 1638; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1639; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1640; SSE2-NEXT: callq __extendhfsf2@PLT 1641; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1642; SSE2-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1643; SSE2-NEXT: callq __extendhfsf2@PLT 1644; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload 1645; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero 1646; SSE2-NEXT: movd %xmm0, %eax 1647; SSE2-NEXT: testl %eax, %eax 1648; SSE2-NEXT: movdqa %xmm0, %xmm2 1649; SSE2-NEXT: js .LBB33_10 1650; SSE2-NEXT: # %bb.9: 1651; SSE2-NEXT: movdqa %xmm4, %xmm2 1652; SSE2-NEXT: .LBB33_10: 1653; SSE2-NEXT: movdqa %xmm2, %xmm1 1654; SSE2-NEXT: cmpordss %xmm2, %xmm1 1655; SSE2-NEXT: movaps %xmm1, %xmm3 1656; SSE2-NEXT: andps %xmm2, %xmm3 1657; SSE2-NEXT: js .LBB33_12 1658; SSE2-NEXT: # %bb.11: 1659; SSE2-NEXT: movdqa %xmm0, %xmm4 1660; SSE2-NEXT: .LBB33_12: 1661; SSE2-NEXT: maxss %xmm4, %xmm2 1662; SSE2-NEXT: andnps %xmm2, %xmm1 1663; SSE2-NEXT: orps %xmm3, %xmm1 1664; SSE2-NEXT: movaps %xmm1, %xmm0 1665; SSE2-NEXT: callq __truncsfhf2@PLT 1666; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1667; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 1668; SSE2-NEXT: callq __extendhfsf2@PLT 1669; SSE2-NEXT: movss %xmm0, (%rsp) # 4-byte Spill 1670; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1671; SSE2-NEXT: callq __extendhfsf2@PLT 1672; SSE2-NEXT: movd (%rsp), %xmm4 # 4-byte Folded Reload 1673; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero 1674; SSE2-NEXT: movdqa %xmm0, %xmm1 1675; SSE2-NEXT: movd %xmm0, %eax 1676; SSE2-NEXT: testl %eax, %eax 1677; SSE2-NEXT: movdqa %xmm0, %xmm2 1678; SSE2-NEXT: js .LBB33_14 1679; SSE2-NEXT: # %bb.13: 1680; SSE2-NEXT: movdqa %xmm4, %xmm2 1681; SSE2-NEXT: .LBB33_14: 1682; SSE2-NEXT: movdqa %xmm2, %xmm0 1683; SSE2-NEXT: cmpordss %xmm2, %xmm0 1684; SSE2-NEXT: movaps %xmm0, %xmm3 1685; SSE2-NEXT: andps %xmm2, %xmm3 1686; SSE2-NEXT: js .LBB33_16 1687; SSE2-NEXT: # %bb.15: 1688; SSE2-NEXT: movdqa %xmm1, %xmm4 1689; SSE2-NEXT: .LBB33_16: 1690; SSE2-NEXT: maxss %xmm4, %xmm2 1691; SSE2-NEXT: andnps %xmm2, %xmm0 1692; SSE2-NEXT: orps %xmm3, %xmm0 1693; SSE2-NEXT: 
callq __truncsfhf2@PLT 1694; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1695; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1696; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1697; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 1698; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 1699; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 1700; SSE2-NEXT: addq $104, %rsp 1701; SSE2-NEXT: retq 1702; 1703; AVX1-LABEL: test_fmaximumnum_v4f16: 1704; AVX1: # %bb.0: 1705; AVX1-NEXT: subq $120, %rsp 1706; AVX1-NEXT: vmovaps %xmm0, %xmm2 1707; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3] 1708; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1709; AVX1-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 1710; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1711; AVX1-NEXT: vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1712; AVX1-NEXT: vpsrld $16, %xmm2, %xmm0 1713; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1714; AVX1-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 1715; AVX1-NEXT: vpsrld $16, %xmm1, %xmm0 1716; AVX1-NEXT: callq __extendhfsf2@PLT 1717; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1718; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1719; AVX1-NEXT: callq __extendhfsf2@PLT 1720; AVX1-NEXT: vmovd %xmm0, %eax 1721; AVX1-NEXT: testl %eax, %eax 1722; AVX1-NEXT: js .LBB33_1 1723; AVX1-NEXT: # %bb.2: 1724; AVX1-NEXT: vmovdqa %xmm0, %xmm1 1725; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1726; AVX1-NEXT: jmp .LBB33_3 1727; AVX1-NEXT: .LBB33_1: 1728; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1729; AVX1-NEXT: vmovdqa %xmm0, %xmm2 1730; AVX1-NEXT: .LBB33_3: 1731; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 
1732; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 1733; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1734; AVX1-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 1735; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm0 1736; AVX1-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1737; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 1738; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 1739; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 1740; AVX1-NEXT: callq __truncsfhf2@PLT 1741; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1742; AVX1-NEXT: vmovaps (%rsp), %xmm0 # 16-byte Reload 1743; AVX1-NEXT: callq __extendhfsf2@PLT 1744; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1745; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1746; AVX1-NEXT: callq __extendhfsf2@PLT 1747; AVX1-NEXT: vmovd %xmm0, %eax 1748; AVX1-NEXT: testl %eax, %eax 1749; AVX1-NEXT: js .LBB33_4 1750; AVX1-NEXT: # %bb.5: 1751; AVX1-NEXT: vmovdqa %xmm0, %xmm1 1752; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload 1753; AVX1-NEXT: jmp .LBB33_6 1754; AVX1-NEXT: .LBB33_4: 1755; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 1756; AVX1-NEXT: vmovdqa %xmm0, %xmm2 1757; AVX1-NEXT: .LBB33_6: 1758; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 1759; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 1760; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 1761; AVX1-NEXT: callq __truncsfhf2@PLT 1762; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1763; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1764; AVX1-NEXT: callq __extendhfsf2@PLT 1765; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1766; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1767; AVX1-NEXT: callq __extendhfsf2@PLT 1768; AVX1-NEXT: vmovd %xmm0, %eax 1769; AVX1-NEXT: testl %eax, %eax 1770; AVX1-NEXT: js .LBB33_7 1771; AVX1-NEXT: # %bb.8: 1772; AVX1-NEXT: vmovdqa %xmm0, %xmm1 1773; AVX1-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload 1774; AVX1-NEXT: jmp .LBB33_9 
1775; AVX1-NEXT: .LBB33_7: 1776; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 1777; AVX1-NEXT: vmovdqa %xmm0, %xmm2 1778; AVX1-NEXT: .LBB33_9: 1779; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 1780; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 1781; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 1782; AVX1-NEXT: callq __truncsfhf2@PLT 1783; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 1784; AVX1-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1785; AVX1-NEXT: callq __extendhfsf2@PLT 1786; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1787; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1788; AVX1-NEXT: callq __extendhfsf2@PLT 1789; AVX1-NEXT: vmovd %xmm0, %eax 1790; AVX1-NEXT: testl %eax, %eax 1791; AVX1-NEXT: js .LBB33_10 1792; AVX1-NEXT: # %bb.11: 1793; AVX1-NEXT: vmovdqa %xmm0, %xmm1 1794; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 1795; AVX1-NEXT: jmp .LBB33_12 1796; AVX1-NEXT: .LBB33_10: 1797; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1798; AVX1-NEXT: vmovdqa %xmm0, %xmm2 1799; AVX1-NEXT: .LBB33_12: 1800; AVX1-NEXT: vmaxss %xmm1, %xmm2, %xmm0 1801; AVX1-NEXT: vcmpordss %xmm2, %xmm2, %xmm1 1802; AVX1-NEXT: vblendvps %xmm1, %xmm2, %xmm0, %xmm0 1803; AVX1-NEXT: callq __truncsfhf2@PLT 1804; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 1805; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 1806; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1807; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 1808; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 1809; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 1810; AVX1-NEXT: addq $120, %rsp 1811; AVX1-NEXT: retq 1812; 1813; AVX512-LABEL: test_fmaximumnum_v4f16: 1814; AVX512: # %bb.0: 1815; AVX512-NEXT: subq $88, %rsp 1816; AVX512-NEXT: vmovdqa %xmm1, %xmm4 
1817; AVX512-NEXT: vmovdqa %xmm0, %xmm6 1818; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1819; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1820; AVX512-NEXT: vucomiss %xmm0, %xmm0 1821; AVX512-NEXT: setp %al 1822; AVX512-NEXT: kmovw %eax, %k1 1823; AVX512-NEXT: vpsrldq {{.*#+}} xmm2 = xmm6[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1824; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1825; AVX512-NEXT: vucomiss %xmm2, %xmm2 1826; AVX512-NEXT: setp %al 1827; AVX512-NEXT: kmovw %eax, %k2 1828; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2} 1829; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 1830; AVX512-NEXT: vmovaps %xmm1, (%rsp) # 16-byte Spill 1831; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2 1832; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} 1833; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1834; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1835; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1836; AVX512-NEXT: vucomiss %xmm0, %xmm2 1837; AVX512-NEXT: seta %al 1838; AVX512-NEXT: kmovw %eax, %k1 1839; AVX512-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} 1840; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1841; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1842; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1843; AVX512-NEXT: movzwl {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %eax 1844; AVX512-NEXT: vmovd %eax, %xmm2 1845; AVX512-NEXT: vcvtph2ps %xmm2, %xmm9 1846; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 1847; AVX512-NEXT: vxorps %xmm10, %xmm10, %xmm10 1848; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3] 1849; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1850; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1851; AVX512-NEXT: vmovd %xmm0, %eax 1852; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1853; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[3,3,3,3] 1854; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1855; AVX512-NEXT: vucomiss %xmm2, %xmm2 1856; AVX512-NEXT: 
setp %al 1857; AVX512-NEXT: kmovw %eax, %k1 1858; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm6[3,3,3,3] 1859; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1860; AVX512-NEXT: vucomiss %xmm3, %xmm3 1861; AVX512-NEXT: setp %al 1862; AVX512-NEXT: kmovw %eax, %k2 1863; AVX512-NEXT: vmovss %xmm2, %xmm3, %xmm3 {%k2} 1864; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1 1865; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1866; AVX512-NEXT: vcvtph2ps %xmm1, %xmm3 1867; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} 1868; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 1869; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1870; AVX512-NEXT: vcvtph2ps %xmm1, %xmm2 1871; AVX512-NEXT: vucomiss %xmm2, %xmm3 1872; AVX512-NEXT: seta %al 1873; AVX512-NEXT: kmovw %eax, %k1 1874; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} 1875; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1876; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1877; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1878; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2 1879; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3] 1880; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm1 1881; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1882; AVX512-NEXT: vmovd %xmm1, %eax 1883; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 1884; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 1885; AVX512-NEXT: vpsrldq {{.*#+}} xmm0 = xmm4[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1886; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1887; AVX512-NEXT: vucomiss %xmm0, %xmm0 1888; AVX512-NEXT: setp %al 1889; AVX512-NEXT: kmovw %eax, %k1 1890; AVX512-NEXT: vpsrldq {{.*#+}} xmm3 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero 1891; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1892; AVX512-NEXT: vucomiss %xmm3, %xmm3 1893; AVX512-NEXT: setp %al 1894; AVX512-NEXT: kmovw %eax, %k2 1895; AVX512-NEXT: vmovss %xmm0, 
%xmm3, %xmm3 {%k2} 1896; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1 1897; AVX512-NEXT: vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1898; AVX512-NEXT: vcvtph2ps %xmm1, %xmm5 1899; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} 1900; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1901; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1902; AVX512-NEXT: vcvtph2ps %xmm0, %xmm3 1903; AVX512-NEXT: vucomiss %xmm3, %xmm5 1904; AVX512-NEXT: seta %al 1905; AVX512-NEXT: kmovw %eax, %k1 1906; AVX512-NEXT: vmovss %xmm5, %xmm3, %xmm3 {%k1} 1907; AVX512-NEXT: vshufpd {{.*#+}} xmm0 = xmm4[1,0] 1908; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1909; AVX512-NEXT: vucomiss %xmm0, %xmm0 1910; AVX512-NEXT: setp %al 1911; AVX512-NEXT: kmovw %eax, %k1 1912; AVX512-NEXT: vshufpd {{.*#+}} xmm5 = xmm6[1,0] 1913; AVX512-NEXT: vcvtph2ps %xmm5, %xmm5 1914; AVX512-NEXT: vucomiss %xmm5, %xmm5 1915; AVX512-NEXT: setp %al 1916; AVX512-NEXT: kmovw %eax, %k2 1917; AVX512-NEXT: vmovss %xmm0, %xmm5, %xmm5 {%k2} 1918; AVX512-NEXT: vcvtps2ph $4, %xmm5, %xmm15 1919; AVX512-NEXT: vcvtph2ps %xmm15, %xmm5 1920; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} 1921; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1922; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1923; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1924; AVX512-NEXT: vucomiss %xmm0, %xmm5 1925; AVX512-NEXT: seta %al 1926; AVX512-NEXT: kmovw %eax, %k1 1927; AVX512-NEXT: vmovss %xmm5, %xmm0, %xmm0 {%k1} 1928; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm3 1929; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero 1930; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1931; AVX512-NEXT: vmulss %xmm3, %xmm9, %xmm3 1932; AVX512-NEXT: vblendps {{.*#+}} xmm3 = xmm3[0],xmm10[1,2,3] 1933; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm1 1934; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1935; AVX512-NEXT: vmovd %xmm1, %eax 1936; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1937; AVX512-NEXT: vpmovzxwq {{.*#+}} 
xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1938; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1939; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 1940; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3] 1941; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1942; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1943; AVX512-NEXT: vmovd %xmm0, %ecx 1944; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 1945; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm3 1946; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] 1947; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] 1948; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1949; AVX512-NEXT: vpshuflw {{.*#+}} xmm0 = xmm4[3,3,3,3,4,5,6,7] 1950; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1951; AVX512-NEXT: vucomiss %xmm0, %xmm0 1952; AVX512-NEXT: setp %al 1953; AVX512-NEXT: kmovw %eax, %k1 1954; AVX512-NEXT: vpshuflw {{.*#+}} xmm2 = xmm6[3,3,3,3,4,5,6,7] 1955; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1956; AVX512-NEXT: vucomiss %xmm2, %xmm2 1957; AVX512-NEXT: setp %al 1958; AVX512-NEXT: kmovw %eax, %k2 1959; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2} 1960; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm11 1961; AVX512-NEXT: vcvtph2ps %xmm11, %xmm3 1962; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} 1963; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1964; AVX512-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1965; AVX512-NEXT: vcvtph2ps %xmm0, %xmm2 1966; AVX512-NEXT: vucomiss %xmm2, %xmm3 1967; AVX512-NEXT: seta %al 1968; AVX512-NEXT: kmovw %eax, %k1 1969; AVX512-NEXT: vmovss %xmm3, %xmm2, %xmm2 {%k1} 1970; AVX512-NEXT: vmovshdup {{.*#+}} xmm0 = xmm4[1,1,3,3] 1971; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 1972; AVX512-NEXT: vucomiss %xmm0, %xmm0 1973; AVX512-NEXT: setp %al 1974; AVX512-NEXT: kmovw %eax, %k1 1975; AVX512-NEXT: vmovshdup {{.*#+}} xmm3 = xmm6[1,1,3,3] 1976; AVX512-NEXT: vcvtph2ps %xmm3, %xmm3 1977; AVX512-NEXT: vucomiss 
%xmm3, %xmm3 1978; AVX512-NEXT: setp %al 1979; AVX512-NEXT: kmovw %eax, %k2 1980; AVX512-NEXT: vmovss %xmm0, %xmm3, %xmm3 {%k2} 1981; AVX512-NEXT: vcvtps2ph $4, %xmm3, %xmm7 1982; AVX512-NEXT: vcvtph2ps %xmm7, %xmm3 1983; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} 1984; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm12 1985; AVX512-NEXT: vcvtph2ps %xmm12, %xmm0 1986; AVX512-NEXT: vucomiss %xmm0, %xmm3 1987; AVX512-NEXT: seta %al 1988; AVX512-NEXT: kmovw %eax, %k1 1989; AVX512-NEXT: vmovss %xmm3, %xmm0, %xmm0 {%k1} 1990; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 1991; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 1992; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 1993; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2 1994; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3] 1995; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm14 1996; AVX512-NEXT: vmovd %xmm14, %eax 1997; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 1998; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 1999; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 2000; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 2001; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3] 2002; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm13 2003; AVX512-NEXT: vmovd %xmm13, %ecx 2004; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 2005; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm2 2006; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 2007; AVX512-NEXT: vcvtph2ps %xmm4, %xmm0 2008; AVX512-NEXT: vucomiss %xmm0, %xmm0 2009; AVX512-NEXT: setp %al 2010; AVX512-NEXT: kmovw %eax, %k1 2011; AVX512-NEXT: vcvtph2ps %xmm6, %xmm2 2012; AVX512-NEXT: vucomiss %xmm2, %xmm2 2013; AVX512-NEXT: setp %al 2014; AVX512-NEXT: kmovw %eax, %k2 2015; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k2} 2016; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm3 2017; AVX512-NEXT: vcvtph2ps %xmm3, %xmm1 2018; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 2019; AVX512-NEXT: vcvtps2ph $4, %xmm0, 
%xmm8 2020; AVX512-NEXT: vcvtph2ps %xmm8, %xmm2 2021; AVX512-NEXT: vucomiss %xmm2, %xmm1 2022; AVX512-NEXT: seta %al 2023; AVX512-NEXT: kmovw %eax, %k1 2024; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} 2025; AVX512-NEXT: vpshuflw {{.*#+}} xmm1 = xmm4[1,1,1,1,4,5,6,7] 2026; AVX512-NEXT: vcvtph2ps %xmm1, %xmm1 2027; AVX512-NEXT: vucomiss %xmm1, %xmm1 2028; AVX512-NEXT: setp %al 2029; AVX512-NEXT: kmovw %eax, %k1 2030; AVX512-NEXT: vpshuflw {{.*#+}} xmm4 = xmm6[1,1,1,1,4,5,6,7] 2031; AVX512-NEXT: vcvtph2ps %xmm4, %xmm4 2032; AVX512-NEXT: vucomiss %xmm4, %xmm4 2033; AVX512-NEXT: setp %al 2034; AVX512-NEXT: kmovw %eax, %k2 2035; AVX512-NEXT: vmovss %xmm1, %xmm4, %xmm4 {%k2} 2036; AVX512-NEXT: vcvtps2ph $4, %xmm4, %xmm4 2037; AVX512-NEXT: vcvtph2ps %xmm4, %xmm6 2038; AVX512-NEXT: vmovss %xmm6, %xmm1, %xmm1 {%k1} 2039; AVX512-NEXT: vcvtps2ph $4, %xmm1, %xmm1 2040; AVX512-NEXT: vcvtph2ps %xmm1, %xmm0 2041; AVX512-NEXT: vucomiss %xmm0, %xmm6 2042; AVX512-NEXT: seta %al 2043; AVX512-NEXT: kmovw %eax, %k1 2044; AVX512-NEXT: vmovss %xmm6, %xmm0, %xmm0 {%k1} 2045; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm2 2046; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero 2047; AVX512-NEXT: vcvtph2ps %xmm2, %xmm2 2048; AVX512-NEXT: vmulss %xmm2, %xmm9, %xmm2 2049; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 2050; AVX512-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero 2051; AVX512-NEXT: vcvtph2ps %xmm0, %xmm0 2052; AVX512-NEXT: vmulss %xmm0, %xmm9, %xmm0 2053; AVX512-NEXT: vblendps {{.*#+}} xmm2 = xmm2[0],xmm10[1,2,3] 2054; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm10[1,2,3] 2055; AVX512-NEXT: vcvtps2ph $4, %xmm2, %xmm9 2056; AVX512-NEXT: vmovd %xmm9, %eax 2057; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm10 2058; AVX512-NEXT: vmovd %xmm10, %ecx 2059; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 2060; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm2 2061; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] 2062; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1] 2063; AVX512-NEXT: vpunpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload 2064; AVX512-NEXT: # xmm6 = xmm0[0],mem[0] 2065; AVX512-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload 2066; AVX512-NEXT: vmovd %xmm0, %eax 2067; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2068; AVX512-NEXT: vmovd %xmm0, %ecx 2069; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 2070; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm2 2071; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] 2072; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload 2073; AVX512-NEXT: vmovd %xmm2, %eax 2074; AVX512-NEXT: vmovd %xmm15, %ecx 2075; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 2076; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm5 2077; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 2078; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] 2079; AVX512-NEXT: vmovd %xmm11, %eax 2080; AVX512-NEXT: vmovd %xmm7, %ecx 2081; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm2 2082; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm5 2083; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] 2084; AVX512-NEXT: vmovd %xmm3, %eax 2085; AVX512-NEXT: vmovd %xmm4, %ecx 2086; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 2087; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm4 2088; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] 2089; AVX512-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] 2090; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] 2091; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0 2092; AVX512-NEXT: vpcmpeqw %xmm0, %xmm2, %xmm3 2093; AVX512-NEXT: vpblendvb %xmm3, %xmm2, %xmm6, %xmm2 
2094; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2095; AVX512-NEXT: vmovd %xmm3, %eax 2096; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload 2097; AVX512-NEXT: vmovd %xmm3, %ecx 2098; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm3 2099; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm4 2100; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] 2101; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2102; AVX512-NEXT: vmovd %xmm4, %eax 2103; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2104; AVX512-NEXT: vmovd %xmm4, %ecx 2105; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 2106; AVX512-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm5 2107; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 2108; AVX512-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] 2109; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload 2110; AVX512-NEXT: vmovd %xmm4, %eax 2111; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm4 2112; AVX512-NEXT: vmovd %xmm12, %eax 2113; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm5 2114; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] 2115; AVX512-NEXT: vmovd %xmm8, %eax 2116; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm5 2117; AVX512-NEXT: vmovd %xmm1, %eax 2118; AVX512-NEXT: vpinsrw $0, %eax, %xmm0, %xmm1 2119; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3] 2120; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] 2121; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] 2122; AVX512-NEXT: vpcmpeqw %xmm0, %xmm1, %xmm0 2123; AVX512-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 2124; AVX512-NEXT: vcvtph2ps %xmm10, %xmm1 2125; AVX512-NEXT: xorl %eax, %eax 2126; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 2127; AVX512-NEXT: vucomiss %xmm2, 
%xmm1 2128; AVX512-NEXT: movl $65535, %ecx # imm = 0xFFFF 2129; AVX512-NEXT: movl $0, %edx 2130; AVX512-NEXT: cmovel %ecx, %edx 2131; AVX512-NEXT: vcvtph2ps %xmm9, %xmm1 2132; AVX512-NEXT: vucomiss %xmm2, %xmm1 2133; AVX512-NEXT: movl $0, %esi 2134; AVX512-NEXT: cmovel %ecx, %esi 2135; AVX512-NEXT: vcvtph2ps %xmm13, %xmm1 2136; AVX512-NEXT: vucomiss %xmm2, %xmm1 2137; AVX512-NEXT: movl $0, %edi 2138; AVX512-NEXT: cmovel %ecx, %edi 2139; AVX512-NEXT: vcvtph2ps %xmm14, %xmm1 2140; AVX512-NEXT: vucomiss %xmm2, %xmm1 2141; AVX512-NEXT: movl $0, %r8d 2142; AVX512-NEXT: cmovel %ecx, %r8d 2143; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2144; AVX512-NEXT: vucomiss %xmm2, %xmm1 2145; AVX512-NEXT: movl $0, %r9d 2146; AVX512-NEXT: cmovel %ecx, %r9d 2147; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2148; AVX512-NEXT: vucomiss %xmm2, %xmm1 2149; AVX512-NEXT: movl $0, %r10d 2150; AVX512-NEXT: cmovel %ecx, %r10d 2151; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2152; AVX512-NEXT: vucomiss %xmm2, %xmm1 2153; AVX512-NEXT: movl $0, %r11d 2154; AVX512-NEXT: cmovel %ecx, %r11d 2155; AVX512-NEXT: vcvtph2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload 2156; AVX512-NEXT: vucomiss %xmm2, %xmm1 2157; AVX512-NEXT: vmovd %esi, %xmm1 2158; AVX512-NEXT: vpinsrw $1, %edx, %xmm1, %xmm1 2159; AVX512-NEXT: vpinsrw $2, %edi, %xmm1, %xmm1 2160; AVX512-NEXT: vpinsrw $3, %r8d, %xmm1, %xmm1 2161; AVX512-NEXT: vpinsrw $4, %r9d, %xmm1, %xmm1 2162; AVX512-NEXT: vpinsrw $5, %r10d, %xmm1, %xmm1 2163; AVX512-NEXT: vpinsrw $6, %r11d, %xmm1, %xmm1 2164; AVX512-NEXT: cmovel %ecx, %eax 2165; AVX512-NEXT: vpinsrw $7, %eax, %xmm1, %xmm1 2166; AVX512-NEXT: vpblendvb %xmm1, %xmm0, %xmm6, %xmm0 2167; AVX512-NEXT: addq $88, %rsp 2168; AVX512-NEXT: retq 2169; 2170; AVX10_2-LABEL: test_fmaximumnum_v4f16: 2171; AVX10_2: # %bb.0: 2172; AVX10_2-NEXT: vminmaxph $17, %xmm1, %xmm0, %xmm0 2173; AVX10_2-NEXT: 
retq 2174; 2175; X86-LABEL: test_fmaximumnum_v4f16: 2176; X86: # %bb.0: 2177; X86-NEXT: subl $164, %esp 2178; X86-NEXT: vmovdqa %xmm0, %xmm2 2179; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2180; X86-NEXT: vpsrlq $48, %xmm0, %xmm0 2181; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2182; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3] 2183; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2184; X86-NEXT: vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3] 2185; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2186; X86-NEXT: vpsrlq $48, %xmm1, %xmm0 2187; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2188; X86-NEXT: vpsrld $16, %xmm2, %xmm0 2189; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2190; X86-NEXT: vpsrld $16, %xmm1, %xmm0 2191; X86-NEXT: vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2192; X86-NEXT: vpextrw $0, %xmm1, (%esp) 2193; X86-NEXT: calll __extendhfsf2 2194; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill 2195; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2196; X86-NEXT: vpextrw $0, %xmm0, (%esp) 2197; X86-NEXT: calll __extendhfsf2 2198; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill 2199; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2200; X86-NEXT: vpextrw $0, %xmm0, (%esp) 2201; X86-NEXT: calll __extendhfsf2 2202; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2203; X86-NEXT: vpextrw $0, %xmm0, (%esp) 2204; X86-NEXT: fstps {{[0-9]+}}(%esp) 2205; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload 2206; X86-NEXT: fstps {{[0-9]+}}(%esp) 2207; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 2208; X86-NEXT: vmovd %xmm2, %eax 2209; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2210; X86-NEXT: testl %eax, %eax 2211; X86-NEXT: js .LBB33_1 2212; X86-NEXT: # %bb.2: 2213; X86-NEXT: vmovdqa %xmm2, %xmm1 2214; X86-NEXT: 
jmp .LBB33_3 2215; X86-NEXT: .LBB33_1: 2216; X86-NEXT: vmovdqa %xmm0, %xmm1 2217; X86-NEXT: vmovdqa %xmm2, %xmm0 2218; X86-NEXT: .LBB33_3: 2219; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 2220; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2221; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2222; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2223; X86-NEXT: calll __extendhfsf2 2224; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2225; X86-NEXT: vmovss %xmm0, (%esp) 2226; X86-NEXT: fstps {{[0-9]+}}(%esp) 2227; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload 2228; X86-NEXT: fstps {{[0-9]+}}(%esp) 2229; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2230; X86-NEXT: vmovd %xmm1, %eax 2231; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2232; X86-NEXT: testl %eax, %eax 2233; X86-NEXT: js .LBB33_4 2234; X86-NEXT: # %bb.5: 2235; X86-NEXT: vmovdqa %xmm1, %xmm2 2236; X86-NEXT: jmp .LBB33_6 2237; X86-NEXT: .LBB33_4: 2238; X86-NEXT: vmovdqa %xmm0, %xmm2 2239; X86-NEXT: vmovdqa %xmm1, %xmm0 2240; X86-NEXT: .LBB33_6: 2241; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 2242; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2243; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2244; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2245; X86-NEXT: calll __truncsfhf2 2246; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2247; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2248; X86-NEXT: vmovss %xmm0, (%esp) 2249; X86-NEXT: calll __truncsfhf2 2250; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2251; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2252; X86-NEXT: vpextrw $0, %xmm0, (%esp) 2253; X86-NEXT: calll __extendhfsf2 2254; X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill 2255; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2256; X86-NEXT: vpextrw $0, %xmm0, (%esp) 2257; X86-NEXT: calll __extendhfsf2 2258; 
X86-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill 2259; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2260; X86-NEXT: vpextrw $0, %xmm0, (%esp) 2261; X86-NEXT: calll __extendhfsf2 2262; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2263; X86-NEXT: vpextrw $0, %xmm0, (%esp) 2264; X86-NEXT: fstps {{[0-9]+}}(%esp) 2265; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload 2266; X86-NEXT: fstps {{[0-9]+}}(%esp) 2267; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2268; X86-NEXT: vmovd %xmm1, %eax 2269; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2270; X86-NEXT: testl %eax, %eax 2271; X86-NEXT: js .LBB33_7 2272; X86-NEXT: # %bb.8: 2273; X86-NEXT: vmovdqa %xmm1, %xmm2 2274; X86-NEXT: jmp .LBB33_9 2275; X86-NEXT: .LBB33_7: 2276; X86-NEXT: vmovdqa %xmm0, %xmm2 2277; X86-NEXT: vmovdqa %xmm1, %xmm0 2278; X86-NEXT: .LBB33_9: 2279; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 2280; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2281; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2282; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2283; X86-NEXT: calll __extendhfsf2 2284; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2285; X86-NEXT: vmovss %xmm0, (%esp) 2286; X86-NEXT: fstps {{[0-9]+}}(%esp) 2287; X86-NEXT: fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload 2288; X86-NEXT: fstps {{[0-9]+}}(%esp) 2289; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 2290; X86-NEXT: vmovd %xmm1, %eax 2291; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 2292; X86-NEXT: testl %eax, %eax 2293; X86-NEXT: js .LBB33_10 2294; X86-NEXT: # %bb.11: 2295; X86-NEXT: vmovdqa %xmm1, %xmm2 2296; X86-NEXT: jmp .LBB33_12 2297; X86-NEXT: .LBB33_10: 2298; X86-NEXT: vmovdqa %xmm0, %xmm2 2299; X86-NEXT: vmovdqa %xmm1, %xmm0 2300; X86-NEXT: .LBB33_12: 2301; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 2302; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2303; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 
2304; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2305; X86-NEXT: calll __truncsfhf2 2306; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2307; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2308; X86-NEXT: vmovd %xmm0, (%esp) 2309; X86-NEXT: calll __truncsfhf2 2310; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload 2311; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2312; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload 2313; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2314; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 2315; X86-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 2316; X86-NEXT: addl $164, %esp 2317; X86-NEXT: retl 2318 %r = call <4 x half> @llvm.maximumnum.v4f16(<4 x half> %x, <4 x half> %y) 2319 ret <4 x half> %r 2320} 2321 2322define <4 x bfloat> @test_fmaximumnum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) nounwind { 2323; SSE2-LABEL: test_fmaximumnum_v4bf16: 2324; SSE2: # %bb.0: 2325; SSE2-NEXT: pushq %rbp 2326; SSE2-NEXT: pushq %r15 2327; SSE2-NEXT: pushq %r14 2328; SSE2-NEXT: pushq %rbx 2329; SSE2-NEXT: subq $56, %rsp 2330; SSE2-NEXT: movdqa %xmm1, %xmm4 2331; SSE2-NEXT: movdqa %xmm0, %xmm5 2332; SSE2-NEXT: pextrw $0, %xmm1, %r14d 2333; SSE2-NEXT: pextrw $0, %xmm0, %r15d 2334; SSE2-NEXT: movdqa %xmm1, %xmm0 2335; SSE2-NEXT: psrld $16, %xmm0 2336; SSE2-NEXT: pextrw $0, %xmm0, %eax 2337; SSE2-NEXT: movdqa %xmm5, %xmm0 2338; SSE2-NEXT: psrld $16, %xmm0 2339; SSE2-NEXT: pextrw $0, %xmm0, %ecx 2340; SSE2-NEXT: shll $16, %ecx 2341; SSE2-NEXT: movd %ecx, %xmm3 2342; SSE2-NEXT: shll $16, %eax 2343; SSE2-NEXT: movd %eax, %xmm2 2344; SSE2-NEXT: testl %ecx, %ecx 2345; SSE2-NEXT: movdqa %xmm3, %xmm1 2346; SSE2-NEXT: js .LBB34_2 2347; SSE2-NEXT: # %bb.1: 2348; SSE2-NEXT: movdqa %xmm2, %xmm1 2349; SSE2-NEXT: .LBB34_2: 2350; 
SSE2-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2351; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,1,1,1] 2352; SSE2-NEXT: movdqa %xmm5, (%rsp) # 16-byte Spill 2353; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,1,1,1] 2354; SSE2-NEXT: movdqa %xmm1, %xmm0 2355; SSE2-NEXT: cmpordss %xmm1, %xmm0 2356; SSE2-NEXT: movaps %xmm0, %xmm6 2357; SSE2-NEXT: andps %xmm1, %xmm6 2358; SSE2-NEXT: js .LBB34_4 2359; SSE2-NEXT: # %bb.3: 2360; SSE2-NEXT: movdqa %xmm3, %xmm2 2361; SSE2-NEXT: .LBB34_4: 2362; SSE2-NEXT: pextrw $0, %xmm4, %ebp 2363; SSE2-NEXT: pextrw $0, %xmm5, %ebx 2364; SSE2-NEXT: maxss %xmm2, %xmm1 2365; SSE2-NEXT: andnps %xmm1, %xmm0 2366; SSE2-NEXT: orps %xmm6, %xmm0 2367; SSE2-NEXT: callq __truncsfbf2@PLT 2368; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2369; SSE2-NEXT: shll $16, %r15d 2370; SSE2-NEXT: movd %r15d, %xmm3 2371; SSE2-NEXT: shll $16, %r14d 2372; SSE2-NEXT: movd %r14d, %xmm2 2373; SSE2-NEXT: testl %r15d, %r15d 2374; SSE2-NEXT: movdqa %xmm3, %xmm1 2375; SSE2-NEXT: js .LBB34_6 2376; SSE2-NEXT: # %bb.5: 2377; SSE2-NEXT: movdqa %xmm2, %xmm1 2378; SSE2-NEXT: .LBB34_6: 2379; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload 2380; SSE2-NEXT: psrlq $48, %xmm5 2381; SSE2-NEXT: movdqa (%rsp), %xmm6 # 16-byte Reload 2382; SSE2-NEXT: psrlq $48, %xmm6 2383; SSE2-NEXT: movdqa %xmm1, %xmm0 2384; SSE2-NEXT: cmpordss %xmm1, %xmm0 2385; SSE2-NEXT: movaps %xmm0, %xmm4 2386; SSE2-NEXT: andps %xmm1, %xmm4 2387; SSE2-NEXT: js .LBB34_8 2388; SSE2-NEXT: # %bb.7: 2389; SSE2-NEXT: movdqa %xmm3, %xmm2 2390; SSE2-NEXT: .LBB34_8: 2391; SSE2-NEXT: pextrw $0, %xmm5, %r15d 2392; SSE2-NEXT: pextrw $0, %xmm6, %r14d 2393; SSE2-NEXT: maxss %xmm2, %xmm1 2394; SSE2-NEXT: andnps %xmm1, %xmm0 2395; SSE2-NEXT: orps %xmm4, %xmm0 2396; SSE2-NEXT: callq __truncsfbf2@PLT 2397; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2398; SSE2-NEXT: shll $16, %ebx 2399; SSE2-NEXT: movd %ebx, %xmm1 2400; SSE2-NEXT: shll $16, %ebp 2401; 
SSE2-NEXT: movd %ebp, %xmm3 2402; SSE2-NEXT: testl %ebx, %ebx 2403; SSE2-NEXT: movdqa %xmm1, %xmm2 2404; SSE2-NEXT: js .LBB34_10 2405; SSE2-NEXT: # %bb.9: 2406; SSE2-NEXT: movdqa %xmm3, %xmm2 2407; SSE2-NEXT: .LBB34_10: 2408; SSE2-NEXT: movdqa %xmm2, %xmm0 2409; SSE2-NEXT: cmpordss %xmm2, %xmm0 2410; SSE2-NEXT: movaps %xmm0, %xmm4 2411; SSE2-NEXT: andps %xmm2, %xmm4 2412; SSE2-NEXT: js .LBB34_12 2413; SSE2-NEXT: # %bb.11: 2414; SSE2-NEXT: movdqa %xmm1, %xmm3 2415; SSE2-NEXT: .LBB34_12: 2416; SSE2-NEXT: maxss %xmm3, %xmm2 2417; SSE2-NEXT: andnps %xmm2, %xmm0 2418; SSE2-NEXT: orps %xmm4, %xmm0 2419; SSE2-NEXT: callq __truncsfbf2@PLT 2420; SSE2-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill 2421; SSE2-NEXT: shll $16, %r14d 2422; SSE2-NEXT: movd %r14d, %xmm1 2423; SSE2-NEXT: shll $16, %r15d 2424; SSE2-NEXT: movd %r15d, %xmm3 2425; SSE2-NEXT: testl %r14d, %r14d 2426; SSE2-NEXT: movdqa %xmm1, %xmm2 2427; SSE2-NEXT: js .LBB34_14 2428; SSE2-NEXT: # %bb.13: 2429; SSE2-NEXT: movdqa %xmm3, %xmm2 2430; SSE2-NEXT: .LBB34_14: 2431; SSE2-NEXT: movdqa %xmm2, %xmm0 2432; SSE2-NEXT: cmpordss %xmm2, %xmm0 2433; SSE2-NEXT: movaps %xmm0, %xmm4 2434; SSE2-NEXT: andps %xmm2, %xmm4 2435; SSE2-NEXT: js .LBB34_16 2436; SSE2-NEXT: # %bb.15: 2437; SSE2-NEXT: movdqa %xmm1, %xmm3 2438; SSE2-NEXT: .LBB34_16: 2439; SSE2-NEXT: maxss %xmm3, %xmm2 2440; SSE2-NEXT: andnps %xmm2, %xmm0 2441; SSE2-NEXT: orps %xmm4, %xmm0 2442; SSE2-NEXT: callq __truncsfbf2@PLT 2443; SSE2-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload 2444; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2445; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 2446; SSE2-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload 2447; SSE2-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] 2448; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] 2449; SSE2-NEXT: addq $56, %rsp 2450; SSE2-NEXT: popq %rbx 2451; SSE2-NEXT: 
popq %r14 2452; SSE2-NEXT: popq %r15 2453; SSE2-NEXT: popq %rbp 2454; SSE2-NEXT: retq 2455; 2456; AVX1-LABEL: test_fmaximumnum_v4bf16: 2457; AVX1: # %bb.0: 2458; AVX1-NEXT: pushq %rbp 2459; AVX1-NEXT: pushq %r15 2460; AVX1-NEXT: pushq %r14 2461; AVX1-NEXT: pushq %r13 2462; AVX1-NEXT: pushq %r12 2463; AVX1-NEXT: pushq %rbx 2464; AVX1-NEXT: subq $56, %rsp 2465; AVX1-NEXT: vpsrlq $48, %xmm0, %xmm2 2466; AVX1-NEXT: vpsrlq $48, %xmm1, %xmm3 2467; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2468; AVX1-NEXT: vpextrw $0, %xmm4, %ebx 2469; AVX1-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 2470; AVX1-NEXT: vpextrw $0, %xmm4, %ebp 2471; AVX1-NEXT: vpextrw $0, %xmm0, %r12d 2472; AVX1-NEXT: vpextrw $0, %xmm1, %r13d 2473; AVX1-NEXT: vpsrld $16, %xmm0, %xmm0 2474; AVX1-NEXT: vpextrw $0, %xmm0, %eax 2475; AVX1-NEXT: vpsrld $16, %xmm1, %xmm0 2476; AVX1-NEXT: vpextrw $0, %xmm0, %ecx 2477; AVX1-NEXT: shll $16, %ecx 2478; AVX1-NEXT: vmovd %ecx, %xmm0 2479; AVX1-NEXT: shll $16, %eax 2480; AVX1-NEXT: vmovd %eax, %xmm4 2481; AVX1-NEXT: js .LBB34_1 2482; AVX1-NEXT: # %bb.2: 2483; AVX1-NEXT: vmovdqa %xmm4, %xmm1 2484; AVX1-NEXT: jmp .LBB34_3 2485; AVX1-NEXT: .LBB34_1: 2486; AVX1-NEXT: vmovdqa %xmm0, %xmm1 2487; AVX1-NEXT: vmovdqa %xmm4, %xmm0 2488; AVX1-NEXT: .LBB34_3: 2489; AVX1-NEXT: vpextrw $0, %xmm2, %r14d 2490; AVX1-NEXT: vpextrw $0, %xmm3, %r15d 2491; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 2492; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2493; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2494; AVX1-NEXT: callq __truncsfbf2@PLT 2495; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2496; AVX1-NEXT: shll $16, %r13d 2497; AVX1-NEXT: vmovd %r13d, %xmm0 2498; AVX1-NEXT: shll $16, %r12d 2499; AVX1-NEXT: vmovd %r12d, %xmm2 2500; AVX1-NEXT: js .LBB34_4 2501; AVX1-NEXT: # %bb.5: 2502; AVX1-NEXT: vmovdqa %xmm2, %xmm1 2503; AVX1-NEXT: jmp .LBB34_6 2504; AVX1-NEXT: .LBB34_4: 2505; AVX1-NEXT: vmovdqa %xmm0, %xmm1 2506; AVX1-NEXT: vmovdqa %xmm2, %xmm0 2507; AVX1-NEXT: 
.LBB34_6: 2508; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 2509; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2510; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2511; AVX1-NEXT: callq __truncsfbf2@PLT 2512; AVX1-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 2513; AVX1-NEXT: shll $16, %ebp 2514; AVX1-NEXT: vmovd %ebp, %xmm0 2515; AVX1-NEXT: shll $16, %ebx 2516; AVX1-NEXT: vmovd %ebx, %xmm2 2517; AVX1-NEXT: js .LBB34_7 2518; AVX1-NEXT: # %bb.8: 2519; AVX1-NEXT: vmovdqa %xmm2, %xmm1 2520; AVX1-NEXT: jmp .LBB34_9 2521; AVX1-NEXT: .LBB34_7: 2522; AVX1-NEXT: vmovdqa %xmm0, %xmm1 2523; AVX1-NEXT: vmovdqa %xmm2, %xmm0 2524; AVX1-NEXT: .LBB34_9: 2525; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 2526; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2527; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2528; AVX1-NEXT: callq __truncsfbf2@PLT 2529; AVX1-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill 2530; AVX1-NEXT: shll $16, %r15d 2531; AVX1-NEXT: vmovd %r15d, %xmm0 2532; AVX1-NEXT: shll $16, %r14d 2533; AVX1-NEXT: vmovd %r14d, %xmm2 2534; AVX1-NEXT: js .LBB34_10 2535; AVX1-NEXT: # %bb.11: 2536; AVX1-NEXT: vmovdqa %xmm2, %xmm1 2537; AVX1-NEXT: jmp .LBB34_12 2538; AVX1-NEXT: .LBB34_10: 2539; AVX1-NEXT: vmovdqa %xmm0, %xmm1 2540; AVX1-NEXT: vmovdqa %xmm2, %xmm0 2541; AVX1-NEXT: .LBB34_12: 2542; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 2543; AVX1-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2544; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2545; AVX1-NEXT: callq __truncsfbf2@PLT 2546; AVX1-NEXT: vmovdqa (%rsp), %xmm1 # 16-byte Reload 2547; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2548; AVX1-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 2549; AVX1-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2550; AVX1-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 2551; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 2552; AVX1-NEXT: addq $56, %rsp 2553; 
AVX1-NEXT: popq %rbx 2554; AVX1-NEXT: popq %r12 2555; AVX1-NEXT: popq %r13 2556; AVX1-NEXT: popq %r14 2557; AVX1-NEXT: popq %r15 2558; AVX1-NEXT: popq %rbp 2559; AVX1-NEXT: retq 2560; 2561; AVX512-LABEL: test_fmaximumnum_v4bf16: 2562; AVX512: # %bb.0: 2563; AVX512-NEXT: pushq %rbp 2564; AVX512-NEXT: pushq %r15 2565; AVX512-NEXT: pushq %r14 2566; AVX512-NEXT: pushq %r13 2567; AVX512-NEXT: pushq %r12 2568; AVX512-NEXT: pushq %rbx 2569; AVX512-NEXT: pushq %rax 2570; AVX512-NEXT: vmovq %xmm1, %r13 2571; AVX512-NEXT: movq %r13, %rbx 2572; AVX512-NEXT: shrq $32, %rbx 2573; AVX512-NEXT: vmovq %xmm0, %rbp 2574; AVX512-NEXT: movq %rbp, %r14 2575; AVX512-NEXT: shrq $32, %r14 2576; AVX512-NEXT: movq %r13, %r15 2577; AVX512-NEXT: shrq $48, %r15 2578; AVX512-NEXT: movq %rbp, %r12 2579; AVX512-NEXT: shrq $48, %r12 2580; AVX512-NEXT: movl %ebp, %eax 2581; AVX512-NEXT: andl $-65536, %eax # imm = 0xFFFF0000 2582; AVX512-NEXT: sets %cl 2583; AVX512-NEXT: kmovw %ecx, %k1 2584; AVX512-NEXT: movl %r13d, %ecx 2585; AVX512-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 2586; AVX512-NEXT: vmovd %ecx, %xmm1 2587; AVX512-NEXT: vmovd %eax, %xmm0 2588; AVX512-NEXT: vmovdqa %xmm0, %xmm2 2589; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} 2590; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 2591; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 2592; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 2593; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 2594; AVX512-NEXT: callq __truncsfbf2@PLT 2595; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) 2596; AVX512-NEXT: shll $16, %ebp 2597; AVX512-NEXT: sets %al 2598; AVX512-NEXT: kmovw %eax, %k1 2599; AVX512-NEXT: shll $16, %r13d 2600; AVX512-NEXT: vmovd %r13d, %xmm1 2601; AVX512-NEXT: vmovd %ebp, %xmm0 2602; AVX512-NEXT: vmovdqa %xmm0, %xmm2 2603; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} 2604; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 2605; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 2606; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 2607; AVX512-NEXT: vmovss %xmm1, 
%xmm0, %xmm0 {%k1} 2608; AVX512-NEXT: callq __truncsfbf2@PLT 2609; AVX512-NEXT: vpextrw $0, %xmm0, (%rsp) 2610; AVX512-NEXT: shll $16, %r12d 2611; AVX512-NEXT: sets %al 2612; AVX512-NEXT: kmovw %eax, %k1 2613; AVX512-NEXT: shll $16, %r15d 2614; AVX512-NEXT: vmovd %r15d, %xmm1 2615; AVX512-NEXT: vmovd %r12d, %xmm0 2616; AVX512-NEXT: vmovdqa %xmm0, %xmm2 2617; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} 2618; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 2619; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 2620; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 2621; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 2622; AVX512-NEXT: callq __truncsfbf2@PLT 2623; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) 2624; AVX512-NEXT: shll $16, %r14d 2625; AVX512-NEXT: sets %al 2626; AVX512-NEXT: kmovw %eax, %k1 2627; AVX512-NEXT: shll $16, %ebx 2628; AVX512-NEXT: vmovd %ebx, %xmm1 2629; AVX512-NEXT: vmovd %r14d, %xmm0 2630; AVX512-NEXT: vmovdqa %xmm0, %xmm2 2631; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} 2632; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 2633; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 2634; AVX512-NEXT: vcmpordss %xmm1, %xmm1, %k1 2635; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 2636; AVX512-NEXT: callq __truncsfbf2@PLT 2637; AVX512-NEXT: vpextrw $0, %xmm0, {{[0-9]+}}(%rsp) 2638; AVX512-NEXT: vmovaps (%rsp), %xmm0 2639; AVX512-NEXT: addq $8, %rsp 2640; AVX512-NEXT: popq %rbx 2641; AVX512-NEXT: popq %r12 2642; AVX512-NEXT: popq %r13 2643; AVX512-NEXT: popq %r14 2644; AVX512-NEXT: popq %r15 2645; AVX512-NEXT: popq %rbp 2646; AVX512-NEXT: retq 2647; 2648; AVX10_2-LABEL: test_fmaximumnum_v4bf16: 2649; AVX10_2: # %bb.0: 2650; AVX10_2-NEXT: vminmaxbf16 $17, %xmm1, %xmm0, %xmm0 2651; AVX10_2-NEXT: retq 2652; 2653; X86-LABEL: test_fmaximumnum_v4bf16: 2654; X86: # %bb.0: 2655; X86-NEXT: pushl %ebp 2656; X86-NEXT: pushl %ebx 2657; X86-NEXT: pushl %edi 2658; X86-NEXT: pushl %esi 2659; X86-NEXT: subl $68, %esp 2660; X86-NEXT: vpsrlq $48, %xmm0, %xmm2 2661; X86-NEXT: vpsrlq $48, 
%xmm1, %xmm3 2662; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] 2663; X86-NEXT: vpextrw $0, %xmm4, %esi 2664; X86-NEXT: vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3] 2665; X86-NEXT: vpextrw $0, %xmm4, %ebx 2666; X86-NEXT: vpextrw $0, %xmm0, %eax 2667; X86-NEXT: vpextrw $0, %xmm1, %ecx 2668; X86-NEXT: vpsrld $16, %xmm0, %xmm0 2669; X86-NEXT: vpextrw $0, %xmm0, %edx 2670; X86-NEXT: vpsrld $16, %xmm1, %xmm0 2671; X86-NEXT: vpextrw $0, %xmm0, %edi 2672; X86-NEXT: shll $16, %edi 2673; X86-NEXT: vmovd %edi, %xmm0 2674; X86-NEXT: shll $16, %edx 2675; X86-NEXT: vmovd %edx, %xmm4 2676; X86-NEXT: js .LBB34_1 2677; X86-NEXT: # %bb.2: 2678; X86-NEXT: vmovdqa %xmm4, %xmm1 2679; X86-NEXT: jmp .LBB34_3 2680; X86-NEXT: .LBB34_1: 2681; X86-NEXT: vmovdqa %xmm0, %xmm1 2682; X86-NEXT: vmovdqa %xmm4, %xmm0 2683; X86-NEXT: .LBB34_3: 2684; X86-NEXT: vpextrw $0, %xmm2, %edi 2685; X86-NEXT: vpextrw $0, %xmm3, %ebp 2686; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 2687; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2688; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2689; X86-NEXT: vmovss %xmm0, (%esp) 2690; X86-NEXT: shll $16, %ecx 2691; X86-NEXT: vmovd %ecx, %xmm0 2692; X86-NEXT: shll $16, %eax 2693; X86-NEXT: vmovd %eax, %xmm2 2694; X86-NEXT: js .LBB34_4 2695; X86-NEXT: # %bb.5: 2696; X86-NEXT: vmovdqa %xmm2, %xmm1 2697; X86-NEXT: jmp .LBB34_6 2698; X86-NEXT: .LBB34_4: 2699; X86-NEXT: vmovdqa %xmm0, %xmm1 2700; X86-NEXT: vmovdqa %xmm2, %xmm0 2701; X86-NEXT: .LBB34_6: 2702; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 2703; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2704; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2705; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2706; X86-NEXT: calll __truncsfbf2 2707; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2708; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2709; X86-NEXT: vmovss %xmm0, (%esp) 2710; X86-NEXT: shll $16, %ebx 2711; X86-NEXT: vmovd %ebx, %xmm0 2712; X86-NEXT: shll $16, %esi 2713; X86-NEXT: 
vmovd %esi, %xmm2 2714; X86-NEXT: js .LBB34_7 2715; X86-NEXT: # %bb.8: 2716; X86-NEXT: vmovdqa %xmm2, %xmm1 2717; X86-NEXT: jmp .LBB34_9 2718; X86-NEXT: .LBB34_7: 2719; X86-NEXT: vmovdqa %xmm0, %xmm1 2720; X86-NEXT: vmovdqa %xmm2, %xmm0 2721; X86-NEXT: .LBB34_9: 2722; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 2723; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2724; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2725; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2726; X86-NEXT: calll __truncsfbf2 2727; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2728; X86-NEXT: vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2729; X86-NEXT: vmovss %xmm0, (%esp) 2730; X86-NEXT: shll $16, %ebp 2731; X86-NEXT: vmovd %ebp, %xmm0 2732; X86-NEXT: shll $16, %edi 2733; X86-NEXT: vmovd %edi, %xmm2 2734; X86-NEXT: js .LBB34_10 2735; X86-NEXT: # %bb.11: 2736; X86-NEXT: vmovdqa %xmm2, %xmm1 2737; X86-NEXT: jmp .LBB34_12 2738; X86-NEXT: .LBB34_10: 2739; X86-NEXT: vmovdqa %xmm0, %xmm1 2740; X86-NEXT: vmovdqa %xmm2, %xmm0 2741; X86-NEXT: .LBB34_12: 2742; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 2743; X86-NEXT: vcmpordss %xmm0, %xmm0, %xmm2 2744; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 2745; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2746; X86-NEXT: calll __truncsfbf2 2747; X86-NEXT: vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill 2748; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload 2749; X86-NEXT: vmovd %xmm0, (%esp) 2750; X86-NEXT: calll __truncsfbf2 2751; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload 2752; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] 2753; X86-NEXT: vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload 2754; X86-NEXT: vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload 2755; X86-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] 2756; X86-NEXT: vinsertps 
{{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero 2757; X86-NEXT: addl $68, %esp 2758; X86-NEXT: popl %esi 2759; X86-NEXT: popl %edi 2760; X86-NEXT: popl %ebx 2761; X86-NEXT: popl %ebp 2762; X86-NEXT: retl 2763 %r = call <4 x bfloat> @llvm.maximumnum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) 2764 ret <4 x bfloat> %r 2765} 2766