1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py 2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2 3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F 5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX,AVX512,AVX512DQ 6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.2-256 | FileCheck %s --check-prefixes=AVX10_2 7; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86 8 9declare float @llvm.maximum.f32(float, float) 10declare double @llvm.maximum.f64(double, double) 11declare float @llvm.minimum.f32(float, float) 12declare double @llvm.minimum.f64(double, double) 13declare <2 x double> @llvm.minimum.v2f64(<2 x double>, <2 x double>) 14declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>) 15declare <4 x half> @llvm.maximum.v4f16(<4 x half>, <4 x half>) 16declare <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat>, <4 x bfloat>) 17 18; 19; fmaximum 20; 21 22define float @test_fmaximum(float %x, float %y) nounwind { 23; SSE2-LABEL: test_fmaximum: 24; SSE2: # %bb.0: 25; SSE2-NEXT: movdqa %xmm0, %xmm2 26; SSE2-NEXT: movd %xmm0, %eax 27; SSE2-NEXT: testl %eax, %eax 28; SSE2-NEXT: movdqa %xmm0, %xmm3 29; SSE2-NEXT: js .LBB0_2 30; SSE2-NEXT: # %bb.1: 31; SSE2-NEXT: movdqa %xmm1, %xmm3 32; SSE2-NEXT: .LBB0_2: 33; SSE2-NEXT: movdqa %xmm3, %xmm0 34; SSE2-NEXT: cmpunordss %xmm3, %xmm0 35; SSE2-NEXT: movaps %xmm0, %xmm4 36; SSE2-NEXT: andps %xmm3, %xmm4 37; SSE2-NEXT: js .LBB0_4 38; SSE2-NEXT: # %bb.3: 39; SSE2-NEXT: movdqa %xmm2, %xmm1 40; SSE2-NEXT: .LBB0_4: 41; SSE2-NEXT: maxss %xmm1, %xmm3 42; SSE2-NEXT: andnps %xmm3, %xmm0 43; SSE2-NEXT: orps %xmm4, %xmm0 44; SSE2-NEXT: retq 45; 46; AVX1-LABEL: test_fmaximum: 47; AVX1: # %bb.0: 48; AVX1-NEXT: vmovd %xmm0, %eax 49; AVX1-NEXT: testl %eax, %eax 50; AVX1-NEXT: js .LBB0_1 51; AVX1-NEXT: # %bb.2: 52; AVX1-NEXT: vmovdqa %xmm0, %xmm2 53; AVX1-NEXT: jmp .LBB0_3 54; AVX1-NEXT: .LBB0_1: 55; AVX1-NEXT: vmovdqa %xmm1, %xmm2 56; AVX1-NEXT: vmovdqa %xmm0, %xmm1 57; AVX1-NEXT: .LBB0_3: 58; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0 59; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 60; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 61; AVX1-NEXT: retq 62; 63; AVX512-LABEL: test_fmaximum: 64; AVX512: # %bb.0: 65; AVX512-NEXT: vmovd %xmm0, %eax 66; AVX512-NEXT: testl %eax, %eax 67; AVX512-NEXT: sets %al 68; AVX512-NEXT: kmovw %eax, %k1 69; AVX512-NEXT: vmovdqa %xmm0, %xmm2 70; AVX512-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} 71; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 72; AVX512-NEXT: vmaxss %xmm2, %xmm1, %xmm0 73; AVX512-NEXT: vcmpunordss %xmm1, %xmm1, %k1 74; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 75; AVX512-NEXT: retq 76; 77; AVX10_2-LABEL: test_fmaximum: 78; AVX10_2: # %bb.0: 79; AVX10_2-NEXT: vminmaxss $1, %xmm1, %xmm0 80; AVX10_2-NEXT: retq 81; 82; X86-LABEL: test_fmaximum: 83; X86: # %bb.0: 84; X86-NEXT: pushl %eax 85; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 86; X86-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero 87; X86-NEXT: vmovd %xmm2, %eax 88; X86-NEXT: testl %eax, %eax 89; X86-NEXT: js .LBB0_1 90; X86-NEXT: # %bb.2: 91; X86-NEXT: vmovdqa %xmm2, %xmm1 92; X86-NEXT: jmp .LBB0_3 93; X86-NEXT: .LBB0_1: 94; X86-NEXT: vmovdqa %xmm0, %xmm1 95; X86-NEXT: vmovdqa %xmm2, %xmm0 96; X86-NEXT: 
.LBB0_3: 97; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm1 98; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 99; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 100; X86-NEXT: vmovss %xmm0, (%esp) 101; X86-NEXT: flds (%esp) 102; X86-NEXT: popl %eax 103; X86-NEXT: retl 104 %1 = tail call float @llvm.maximum.f32(float %x, float %y) 105 ret float %1 106} 107 108define <4 x float> @test_fmaximum_scalarize(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { 109; SSE2-LABEL: test_fmaximum_scalarize: 110; SSE2: # %bb.0: 111; SSE2-NEXT: maxps %xmm1, %xmm0 112; SSE2-NEXT: retq 113; 114; AVX-LABEL: test_fmaximum_scalarize: 115; AVX: # %bb.0: 116; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 117; AVX-NEXT: retq 118; 119; AVX10_2-LABEL: test_fmaximum_scalarize: 120; AVX10_2: # %bb.0: 121; AVX10_2-NEXT: vminmaxps $1, %xmm1, %xmm0, %xmm0 122; AVX10_2-NEXT: retq 123; 124; X86-LABEL: test_fmaximum_scalarize: 125; X86: # %bb.0: 126; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 127; X86-NEXT: retl 128 %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y) 129 ret <4 x float> %r 130} 131 132define float @test_fmaximum_nan0(float %x, float %y) { 133; SSE2-LABEL: test_fmaximum_nan0: 134; SSE2: # %bb.0: 135; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 136; SSE2-NEXT: retq 137; 138; AVX-LABEL: test_fmaximum_nan0: 139; AVX: # %bb.0: 140; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 141; AVX-NEXT: retq 142; 143; AVX10_2-LABEL: test_fmaximum_nan0: 144; AVX10_2: # %bb.0: 145; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 146; AVX10_2-NEXT: retq 147; 148; X86-LABEL: test_fmaximum_nan0: 149; X86: # %bb.0: 150; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} 151; X86-NEXT: retl 152 %1 = tail call float @llvm.maximum.f32(float 0x7fff000000000000, float %y) 153 ret float %1 154} 155 156define float @test_fmaximum_nan1(float %x, float %y) { 157; SSE2-LABEL: test_fmaximum_nan1: 158; SSE2: # %bb.0: 159; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 160; SSE2-NEXT: retq 161; 162; AVX-LABEL: test_fmaximum_nan1: 163; AVX: # %bb.0: 164; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 165; AVX-NEXT: retq 166; 167; AVX10_2-LABEL: test_fmaximum_nan1: 168; AVX10_2: # %bb.0: 169; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 170; AVX10_2-NEXT: retq 171; 172; X86-LABEL: test_fmaximum_nan1: 173; X86: # %bb.0: 174; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} 175; X86-NEXT: retl 176 %1 = tail call float @llvm.maximum.f32(float %x, float 0x7fff000000000000) 177 ret float %1 178} 179 180define float @test_fmaximum_nnan(float %x, float %y) nounwind { 181; SSE2-LABEL: test_fmaximum_nnan: 182; SSE2: # %bb.0: 183; SSE2-NEXT: movaps %xmm0, %xmm2 184; SSE2-NEXT: addss %xmm1, %xmm2 185; SSE2-NEXT: subss %xmm1, %xmm0 186; SSE2-NEXT: movd %xmm2, %eax 187; SSE2-NEXT: testl %eax, %eax 188; SSE2-NEXT: js .LBB4_1 189; SSE2-NEXT: # %bb.2: 190; SSE2-NEXT: maxss %xmm2, %xmm0 191; SSE2-NEXT: retq 192; SSE2-NEXT: .LBB4_1: 193; SSE2-NEXT: movaps %xmm0, %xmm1 194; SSE2-NEXT: movaps %xmm2, %xmm0 195; SSE2-NEXT: maxss %xmm1, %xmm0 196; SSE2-NEXT: retq 197; 198; AVX1-LABEL: test_fmaximum_nnan: 199; AVX1: # %bb.0: 200; AVX1-NEXT: vaddss %xmm1, %xmm0, %xmm2 201; AVX1-NEXT: vsubss %xmm1, %xmm0, %xmm0 202; AVX1-NEXT: vmovd %xmm2, %eax 203; AVX1-NEXT: testl %eax, %eax 204; AVX1-NEXT: js .LBB4_1 205; AVX1-NEXT: # %bb.2: 206; AVX1-NEXT: vmaxss %xmm2, %xmm0, %xmm0 207; AVX1-NEXT: retq 208; AVX1-NEXT: .LBB4_1: 209; AVX1-NEXT: vmovaps %xmm0, %xmm1 210; AVX1-NEXT: 
vmaxss %xmm1, %xmm2, %xmm0 211; AVX1-NEXT: retq 212; 213; AVX512F-LABEL: test_fmaximum_nnan: 214; AVX512F: # %bb.0: 215; AVX512F-NEXT: vaddss %xmm1, %xmm0, %xmm2 216; AVX512F-NEXT: vsubss %xmm1, %xmm0, %xmm0 217; AVX512F-NEXT: vmovd %xmm2, %eax 218; AVX512F-NEXT: testl %eax, %eax 219; AVX512F-NEXT: sets %al 220; AVX512F-NEXT: kmovw %eax, %k1 221; AVX512F-NEXT: vmovaps %xmm2, %xmm1 222; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 223; AVX512F-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} 224; AVX512F-NEXT: vmaxss %xmm1, %xmm0, %xmm0 225; AVX512F-NEXT: retq 226; 227; AVX512DQ-LABEL: test_fmaximum_nnan: 228; AVX512DQ: # %bb.0: 229; AVX512DQ-NEXT: vaddss %xmm1, %xmm0, %xmm2 230; AVX512DQ-NEXT: vsubss %xmm1, %xmm0, %xmm0 231; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0) 232; AVX512DQ-NEXT: kmovw %k0, %k1 233; AVX512DQ-NEXT: vmovaps %xmm2, %xmm1 234; AVX512DQ-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 235; AVX512DQ-NEXT: vmovss %xmm2, %xmm0, %xmm0 {%k1} 236; AVX512DQ-NEXT: vmaxss %xmm1, %xmm0, %xmm0 237; AVX512DQ-NEXT: retq 238; 239; AVX10_2-LABEL: test_fmaximum_nnan: 240; AVX10_2: # %bb.0: 241; AVX10_2-NEXT: vaddss %xmm1, %xmm0, %xmm2 242; AVX10_2-NEXT: vsubss %xmm1, %xmm0, %xmm0 243; AVX10_2-NEXT: vminmaxss $1, %xmm0, %xmm2 244; AVX10_2-NEXT: retq 245; 246; X86-LABEL: test_fmaximum_nnan: 247; X86: # %bb.0: 248; X86-NEXT: pushl %eax 249; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 250; X86-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero 251; X86-NEXT: vaddss %xmm0, %xmm2, %xmm1 252; X86-NEXT: vsubss %xmm0, %xmm2, %xmm0 253; X86-NEXT: vmovd %xmm1, %eax 254; X86-NEXT: testl %eax, %eax 255; X86-NEXT: js .LBB4_1 256; X86-NEXT: # %bb.2: 257; X86-NEXT: vmovaps %xmm1, %xmm2 258; X86-NEXT: jmp .LBB4_3 259; X86-NEXT: .LBB4_1: 260; X86-NEXT: vmovaps %xmm0, %xmm2 261; X86-NEXT: vmovaps %xmm1, %xmm0 262; X86-NEXT: .LBB4_3: 263; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm0 264; X86-NEXT: vmovss %xmm0, (%esp) 265; X86-NEXT: flds (%esp) 266; X86-NEXT: popl %eax 267; X86-NEXT: retl 268 %1 = fadd nnan float %x, %y 269 %2 = fsub nnan float %x, %y 270 %3 = tail call float @llvm.maximum.f32(float %1, float %2) 271 ret float %3 272} 273 274define double @test_fmaximum_zero0(double %x, double %y) nounwind { 275; SSE2-LABEL: test_fmaximum_zero0: 276; SSE2: # %bb.0: 277; SSE2-NEXT: movapd %xmm1, %xmm0 278; SSE2-NEXT: cmpunordsd %xmm1, %xmm0 279; SSE2-NEXT: movapd %xmm0, %xmm2 280; SSE2-NEXT: andpd %xmm1, %xmm2 281; SSE2-NEXT: xorpd %xmm3, %xmm3 282; SSE2-NEXT: maxsd %xmm3, %xmm1 283; SSE2-NEXT: andnpd %xmm1, %xmm0 284; SSE2-NEXT: orpd %xmm2, %xmm0 285; SSE2-NEXT: retq 286; 287; AVX1-LABEL: test_fmaximum_zero0: 288; AVX1: # %bb.0: 289; AVX1-NEXT: vxorpd %xmm0, %xmm0, %xmm0 290; AVX1-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 291; AVX1-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2 292; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 293; AVX1-NEXT: retq 294; 295; AVX512-LABEL: test_fmaximum_zero0: 296; AVX512: # %bb.0: 297; AVX512-NEXT: vxorpd %xmm0, %xmm0, %xmm0 298; AVX512-NEXT: vmaxsd %xmm0, %xmm1, %xmm0 299; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 300; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} 301; AVX512-NEXT: retq 302; 303; AVX10_2-LABEL: test_fmaximum_zero0: 304; AVX10_2: # %bb.0: 305; AVX10_2-NEXT: vxorpd %xmm0, %xmm0, %xmm0 306; AVX10_2-NEXT: vminmaxsd $1, %xmm0, %xmm1 307; AVX10_2-NEXT: retq 308; 309; X86-LABEL: test_fmaximum_zero0: 310; X86: # %bb.0: 311; X86-NEXT: pushl %ebp 312; X86-NEXT: movl %esp, %ebp 313; X86-NEXT: andl $-8, %esp 314; X86-NEXT: subl $8, %esp 315; X86-NEXT: 
vmovsd {{.*#+}} xmm0 = mem[0],zero 316; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 317; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 318; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2 319; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 320; X86-NEXT: vmovlpd %xmm0, (%esp) 321; X86-NEXT: fldl (%esp) 322; X86-NEXT: movl %ebp, %esp 323; X86-NEXT: popl %ebp 324; X86-NEXT: retl 325 %1 = tail call double @llvm.maximum.f64(double 0.0, double %y) 326 ret double %1 327} 328 329define double @test_fmaximum_zero1(double %x, double %y) nounwind { 330; SSE2-LABEL: test_fmaximum_zero1: 331; SSE2: # %bb.0: 332; SSE2-NEXT: movapd %xmm0, %xmm1 333; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 334; SSE2-NEXT: movapd %xmm1, %xmm2 335; SSE2-NEXT: andpd %xmm0, %xmm2 336; SSE2-NEXT: xorpd %xmm3, %xmm3 337; SSE2-NEXT: maxsd %xmm3, %xmm0 338; SSE2-NEXT: andnpd %xmm0, %xmm1 339; SSE2-NEXT: orpd %xmm2, %xmm1 340; SSE2-NEXT: movapd %xmm1, %xmm0 341; SSE2-NEXT: retq 342; 343; AVX1-LABEL: test_fmaximum_zero1: 344; AVX1: # %bb.0: 345; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 346; AVX1-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 347; AVX1-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2 348; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 349; AVX1-NEXT: retq 350; 351; AVX512-LABEL: test_fmaximum_zero1: 352; AVX512: # %bb.0: 353; AVX512-NEXT: vxorpd %xmm1, %xmm1, %xmm1 354; AVX512-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 355; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 356; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} 357; AVX512-NEXT: vmovapd %xmm1, %xmm0 358; AVX512-NEXT: retq 359; 360; AVX10_2-LABEL: test_fmaximum_zero1: 361; AVX10_2: # %bb.0: 362; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 363; AVX10_2-NEXT: vminmaxsd $1, %xmm1, %xmm0 364; AVX10_2-NEXT: retq 365; 366; X86-LABEL: test_fmaximum_zero1: 367; X86: # %bb.0: 368; X86-NEXT: pushl %ebp 369; X86-NEXT: movl %esp, %ebp 370; X86-NEXT: andl $-8, %esp 371; X86-NEXT: subl $8, %esp 372; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 373; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 374; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm1 375; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2 376; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 377; X86-NEXT: vmovlpd %xmm0, (%esp) 378; X86-NEXT: fldl (%esp) 379; X86-NEXT: movl %ebp, %esp 380; X86-NEXT: popl %ebp 381; X86-NEXT: retl 382 %1 = tail call double @llvm.maximum.f64(double %x, double 0.0) 383 ret double %1 384} 385 386define double @test_fmaximum_zero2(double %x, double %y) { 387; SSE2-LABEL: test_fmaximum_zero2: 388; SSE2: # %bb.0: 389; SSE2-NEXT: xorps %xmm0, %xmm0 390; SSE2-NEXT: retq 391; 392; AVX-LABEL: test_fmaximum_zero2: 393; AVX: # %bb.0: 394; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 395; AVX-NEXT: retq 396; 397; AVX10_2-LABEL: test_fmaximum_zero2: 398; AVX10_2: # %bb.0: 399; AVX10_2-NEXT: vxorps %xmm0, %xmm0, %xmm0 400; AVX10_2-NEXT: retq 401; 402; X86-LABEL: test_fmaximum_zero2: 403; X86: # %bb.0: 404; X86-NEXT: fldz 405; X86-NEXT: retl 406 %1 = tail call double @llvm.maximum.f64(double 0.0, double -0.0) 407 ret double %1 408} 409 410define float @test_fmaximum_nsz(float %x, float %y) "no-signed-zeros-fp-math"="true" nounwind { 411; SSE2-LABEL: test_fmaximum_nsz: 412; SSE2: # %bb.0: 413; SSE2-NEXT: movaps %xmm0, %xmm2 414; SSE2-NEXT: cmpunordss %xmm0, %xmm2 415; SSE2-NEXT: movaps %xmm2, %xmm3 416; SSE2-NEXT: andps %xmm0, %xmm3 417; SSE2-NEXT: maxss %xmm1, %xmm0 418; SSE2-NEXT: andnps %xmm0, %xmm2 419; SSE2-NEXT: orps %xmm3, %xmm2 420; SSE2-NEXT: movaps %xmm2, %xmm0 421; SSE2-NEXT: retq 422; 423; AVX1-LABEL: test_fmaximum_nsz: 424; AVX1: # %bb.0: 425; AVX1-NEXT: vmaxss %xmm1, %xmm0, %xmm1 426; 
AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 427; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 428; AVX1-NEXT: retq 429; 430; AVX512-LABEL: test_fmaximum_nsz: 431; AVX512: # %bb.0: 432; AVX512-NEXT: vmaxss %xmm1, %xmm0, %xmm1 433; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 434; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 435; AVX512-NEXT: vmovaps %xmm1, %xmm0 436; AVX512-NEXT: retq 437; 438; AVX10_2-LABEL: test_fmaximum_nsz: 439; AVX10_2: # %bb.0: 440; AVX10_2-NEXT: vminmaxss $1, %xmm1, %xmm0 441; AVX10_2-NEXT: retq 442; 443; X86-LABEL: test_fmaximum_nsz: 444; X86: # %bb.0: 445; X86-NEXT: pushl %eax 446; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 447; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 448; X86-NEXT: vmaxss {{[0-9]+}}(%esp), %xmm0, %xmm2 449; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 450; X86-NEXT: vmovss %xmm0, (%esp) 451; X86-NEXT: flds (%esp) 452; X86-NEXT: popl %eax 453; X86-NEXT: retl 454 %1 = tail call float @llvm.maximum.f32(float %x, float %y) 455 ret float %1 456} 457 458define float @test_fmaximum_combine_cmps(float %x, float %y) nounwind { 459; SSE2-LABEL: test_fmaximum_combine_cmps: 460; SSE2: # %bb.0: 461; SSE2-NEXT: divss %xmm0, %xmm1 462; SSE2-NEXT: movd %xmm0, %eax 463; SSE2-NEXT: testl %eax, %eax 464; SSE2-NEXT: movaps %xmm0, %xmm3 465; SSE2-NEXT: js .LBB9_2 466; SSE2-NEXT: # %bb.1: 467; SSE2-NEXT: movaps %xmm1, %xmm3 468; SSE2-NEXT: .LBB9_2: 469; SSE2-NEXT: movaps %xmm3, %xmm2 470; SSE2-NEXT: cmpunordss %xmm3, %xmm2 471; SSE2-NEXT: movaps %xmm2, %xmm4 472; SSE2-NEXT: andps %xmm3, %xmm4 473; SSE2-NEXT: js .LBB9_4 474; SSE2-NEXT: # %bb.3: 475; SSE2-NEXT: movaps %xmm0, %xmm1 476; SSE2-NEXT: .LBB9_4: 477; SSE2-NEXT: maxss %xmm1, %xmm3 478; SSE2-NEXT: andnps %xmm3, %xmm2 479; SSE2-NEXT: orps %xmm4, %xmm2 480; SSE2-NEXT: movaps %xmm2, %xmm0 481; SSE2-NEXT: retq 482; 483; AVX1-LABEL: test_fmaximum_combine_cmps: 484; AVX1: # %bb.0: 485; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm1 486; AVX1-NEXT: vmovd %xmm0, %eax 487; AVX1-NEXT: testl %eax, %eax 488; AVX1-NEXT: js .LBB9_1 489; AVX1-NEXT: # %bb.2: 490; AVX1-NEXT: vmovaps %xmm0, %xmm2 491; AVX1-NEXT: jmp .LBB9_3 492; AVX1-NEXT: .LBB9_1: 493; AVX1-NEXT: vmovaps %xmm1, %xmm2 494; AVX1-NEXT: vmovaps %xmm0, %xmm1 495; AVX1-NEXT: .LBB9_3: 496; AVX1-NEXT: vmaxss %xmm2, %xmm1, %xmm0 497; AVX1-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2 498; AVX1-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0 499; AVX1-NEXT: retq 500; 501; AVX512F-LABEL: test_fmaximum_combine_cmps: 502; AVX512F: # %bb.0: 503; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1 504; AVX512F-NEXT: vmovd %xmm0, %eax 505; AVX512F-NEXT: testl %eax, %eax 506; AVX512F-NEXT: sets %al 507; AVX512F-NEXT: kmovw %eax, %k1 508; AVX512F-NEXT: vmovaps %xmm0, %xmm2 509; AVX512F-NEXT: vmovss %xmm1, %xmm2, %xmm2 {%k1} 510; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 511; AVX512F-NEXT: vmaxss %xmm2, %xmm1, %xmm0 512; AVX512F-NEXT: vcmpunordss %xmm1, %xmm1, %k1 513; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 514; AVX512F-NEXT: retq 515; 516; AVX512DQ-LABEL: test_fmaximum_combine_cmps: 517; AVX512DQ: # %bb.0: 518; AVX512DQ-NEXT: vdivss %xmm0, %xmm1, %xmm1 519; AVX512DQ-NEXT: vfpclassss $3, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isPositiveZero(xmm0) 520; AVX512DQ-NEXT: kmovw %k0, %k1 521; AVX512DQ-NEXT: vmovaps %xmm1, %xmm2 522; AVX512DQ-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} 523; AVX512DQ-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 524; AVX512DQ-NEXT: vmaxss %xmm2, %xmm0, %xmm0 525; AVX512DQ-NEXT: retq 526; 527; AVX10_2-LABEL: test_fmaximum_combine_cmps: 528; AVX10_2: # %bb.0: 529; 
AVX10_2-NEXT: vdivss %xmm0, %xmm1, %xmm1 530; AVX10_2-NEXT: vminmaxss $1, %xmm1, %xmm0 531; AVX10_2-NEXT: retq 532; 533; X86-LABEL: test_fmaximum_combine_cmps: 534; X86: # %bb.0: 535; X86-NEXT: pushl %eax 536; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 537; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 538; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0 539; X86-NEXT: vmovd %xmm1, %eax 540; X86-NEXT: testl %eax, %eax 541; X86-NEXT: js .LBB9_1 542; X86-NEXT: # %bb.2: 543; X86-NEXT: vmovaps %xmm1, %xmm2 544; X86-NEXT: jmp .LBB9_3 545; X86-NEXT: .LBB9_1: 546; X86-NEXT: vmovaps %xmm0, %xmm2 547; X86-NEXT: vmovaps %xmm1, %xmm0 548; X86-NEXT: .LBB9_3: 549; X86-NEXT: vmaxss %xmm2, %xmm0, %xmm1 550; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 551; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 552; X86-NEXT: vmovss %xmm0, (%esp) 553; X86-NEXT: flds (%esp) 554; X86-NEXT: popl %eax 555; X86-NEXT: retl 556 %1 = fdiv nnan float %y, %x 557 %2 = tail call float @llvm.maximum.f32(float %x, float %1) 558 ret float %2 559} 560 561; 562; fminimum 563; 564 565define float @test_fminimum(float %x, float %y) nounwind { 566; SSE2-LABEL: test_fminimum: 567; SSE2: # %bb.0: 568; SSE2-NEXT: movd %xmm0, %eax 569; SSE2-NEXT: testl %eax, %eax 570; SSE2-NEXT: movdqa %xmm1, %xmm3 571; SSE2-NEXT: js .LBB10_2 572; SSE2-NEXT: # %bb.1: 573; SSE2-NEXT: movdqa %xmm0, %xmm3 574; SSE2-NEXT: .LBB10_2: 575; SSE2-NEXT: movdqa %xmm3, %xmm2 576; SSE2-NEXT: cmpunordss %xmm3, %xmm2 577; SSE2-NEXT: movaps %xmm2, %xmm4 578; SSE2-NEXT: andps %xmm3, %xmm4 579; SSE2-NEXT: js .LBB10_4 580; SSE2-NEXT: # %bb.3: 581; SSE2-NEXT: movdqa %xmm1, %xmm0 582; SSE2-NEXT: .LBB10_4: 583; SSE2-NEXT: minss %xmm0, %xmm3 584; SSE2-NEXT: andnps %xmm3, %xmm2 585; SSE2-NEXT: orps %xmm4, %xmm2 586; SSE2-NEXT: movaps %xmm2, %xmm0 587; SSE2-NEXT: retq 588; 589; AVX1-LABEL: test_fminimum: 590; AVX1: # %bb.0: 591; AVX1-NEXT: vmovd %xmm0, %eax 592; AVX1-NEXT: testl %eax, %eax 593; AVX1-NEXT: js .LBB10_1 594; AVX1-NEXT: # %bb.2: 595; AVX1-NEXT: vmovdqa %xmm1, %xmm2 596; AVX1-NEXT: jmp .LBB10_3 597; AVX1-NEXT: .LBB10_1: 598; AVX1-NEXT: vmovdqa %xmm0, %xmm2 599; AVX1-NEXT: vmovdqa %xmm1, %xmm0 600; AVX1-NEXT: .LBB10_3: 601; AVX1-NEXT: vminss %xmm2, %xmm0, %xmm1 602; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 603; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 604; AVX1-NEXT: retq 605; 606; AVX512-LABEL: test_fminimum: 607; AVX512: # %bb.0: 608; AVX512-NEXT: vmovd %xmm0, %eax 609; AVX512-NEXT: testl %eax, %eax 610; AVX512-NEXT: sets %al 611; AVX512-NEXT: kmovw %eax, %k1 612; AVX512-NEXT: vmovaps %xmm1, %xmm2 613; AVX512-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} 614; AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 615; AVX512-NEXT: vminss %xmm2, %xmm0, %xmm1 616; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 617; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 618; AVX512-NEXT: vmovaps %xmm1, %xmm0 619; AVX512-NEXT: retq 620; 621; AVX10_2-LABEL: test_fminimum: 622; AVX10_2: # %bb.0: 623; AVX10_2-NEXT: vminmaxss $0, %xmm1, %xmm0 624; AVX10_2-NEXT: retq 625; 626; X86-LABEL: test_fminimum: 627; X86: # %bb.0: 628; X86-NEXT: pushl %eax 629; X86-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero 630; X86-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero 631; X86-NEXT: vmovd %xmm0, %eax 632; X86-NEXT: testl %eax, %eax 633; X86-NEXT: js .LBB10_1 634; X86-NEXT: # %bb.2: 635; X86-NEXT: vmovdqa %xmm1, %xmm2 636; X86-NEXT: jmp .LBB10_3 637; X86-NEXT: .LBB10_1: 638; X86-NEXT: vmovdqa %xmm0, %xmm2 639; X86-NEXT: vmovdqa %xmm1, %xmm0 640; X86-NEXT: .LBB10_3: 641; X86-NEXT: vminss %xmm2, 
%xmm0, %xmm1 642; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 643; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 644; X86-NEXT: vmovss %xmm0, (%esp) 645; X86-NEXT: flds (%esp) 646; X86-NEXT: popl %eax 647; X86-NEXT: retl 648 %1 = tail call float @llvm.minimum.f32(float %x, float %y) 649 ret float %1 650} 651 652define <2 x double> @test_fminimum_scalarize(<2 x double> %x, <2 x double> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { 653; SSE2-LABEL: test_fminimum_scalarize: 654; SSE2: # %bb.0: 655; SSE2-NEXT: minpd %xmm1, %xmm0 656; SSE2-NEXT: retq 657; 658; AVX-LABEL: test_fminimum_scalarize: 659; AVX: # %bb.0: 660; AVX-NEXT: vminpd %xmm1, %xmm0, %xmm0 661; AVX-NEXT: retq 662; 663; AVX10_2-LABEL: test_fminimum_scalarize: 664; AVX10_2: # %bb.0: 665; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 666; AVX10_2-NEXT: retq 667; 668; X86-LABEL: test_fminimum_scalarize: 669; X86: # %bb.0: 670; X86-NEXT: vminpd %xmm1, %xmm0, %xmm0 671; X86-NEXT: retl 672 %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y) 673 ret <2 x double> %r 674} 675 676define float @test_fminimum_nan0(float %x, float %y) { 677; SSE2-LABEL: test_fminimum_nan0: 678; SSE2: # %bb.0: 679; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 680; SSE2-NEXT: retq 681; 682; AVX-LABEL: test_fminimum_nan0: 683; AVX: # %bb.0: 684; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 685; AVX-NEXT: retq 686; 687; AVX10_2-LABEL: test_fminimum_nan0: 688; AVX10_2: # %bb.0: 689; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 690; AVX10_2-NEXT: retq 691; 692; X86-LABEL: test_fminimum_nan0: 693; X86: # %bb.0: 694; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} 695; X86-NEXT: retl 696 %1 = tail call float @llvm.minimum.f32(float 0x7fff000000000000, float %y) 697 ret float %1 698} 699 700define float @test_fminimum_nan1(float %x, float %y) { 701; SSE2-LABEL: test_fminimum_nan1: 702; SSE2: # %bb.0: 703; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 704; SSE2-NEXT: retq 705; 706; AVX-LABEL: test_fminimum_nan1: 707; AVX: # %bb.0: 708; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 709; AVX-NEXT: retq 710; 711; AVX10_2-LABEL: test_fminimum_nan1: 712; AVX10_2: # %bb.0: 713; AVX10_2-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] 714; AVX10_2-NEXT: retq 715; 716; X86-LABEL: test_fminimum_nan1: 717; X86: # %bb.0: 718; X86-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} 719; X86-NEXT: retl 720 %1 = tail call float @llvm.minimum.f32(float %x, float 0x7fff000000000000) 721 ret float %1 722} 723 724define double @test_fminimum_nnan(double %x, double %y) "no-nans-fp-math"="true" nounwind { 725; SSE2-LABEL: test_fminimum_nnan: 726; SSE2: # %bb.0: 727; SSE2-NEXT: movq %xmm0, %rax 728; SSE2-NEXT: testq %rax, %rax 729; SSE2-NEXT: js .LBB14_1 730; SSE2-NEXT: # %bb.2: 731; SSE2-NEXT: minsd %xmm1, %xmm0 732; SSE2-NEXT: retq 733; SSE2-NEXT: .LBB14_1: 734; SSE2-NEXT: movdqa %xmm0, %xmm2 735; SSE2-NEXT: movapd %xmm1, %xmm0 736; SSE2-NEXT: minsd %xmm2, %xmm0 737; SSE2-NEXT: retq 738; 739; AVX1-LABEL: test_fminimum_nnan: 740; AVX1: # %bb.0: 741; AVX1-NEXT: vmovq %xmm0, %rax 742; AVX1-NEXT: testq %rax, %rax 743; AVX1-NEXT: js .LBB14_1 744; AVX1-NEXT: # %bb.2: 745; AVX1-NEXT: vminsd %xmm1, %xmm0, %xmm0 746; AVX1-NEXT: retq 747; AVX1-NEXT: .LBB14_1: 748; AVX1-NEXT: vmovdqa %xmm0, %xmm2 749; AVX1-NEXT: vminsd %xmm2, %xmm1, %xmm0 750; AVX1-NEXT: retq 751; 752; AVX512F-LABEL: test_fminimum_nnan: 753; AVX512F: # %bb.0: 754; AVX512F-NEXT: vmovq %xmm0, %rax 755; AVX512F-NEXT: testq %rax, 
%rax 756; AVX512F-NEXT: sets %al 757; AVX512F-NEXT: kmovw %eax, %k1 758; AVX512F-NEXT: vmovapd %xmm1, %xmm2 759; AVX512F-NEXT: vmovsd %xmm0, %xmm2, %xmm2 {%k1} 760; AVX512F-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} 761; AVX512F-NEXT: vminsd %xmm2, %xmm0, %xmm0 762; AVX512F-NEXT: retq 763; 764; AVX512DQ-LABEL: test_fminimum_nnan: 765; AVX512DQ: # %bb.0: 766; AVX512DQ-NEXT: vfpclasssd $5, %xmm1, %k0 # k0 = isQuietNaN(xmm1) | isNegativeZero(xmm1) 767; AVX512DQ-NEXT: kmovw %k0, %k1 768; AVX512DQ-NEXT: vmovapd %xmm0, %xmm2 769; AVX512DQ-NEXT: vmovsd %xmm1, %xmm2, %xmm2 {%k1} 770; AVX512DQ-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} 771; AVX512DQ-NEXT: vminsd %xmm2, %xmm1, %xmm0 772; AVX512DQ-NEXT: retq 773; 774; AVX10_2-LABEL: test_fminimum_nnan: 775; AVX10_2: # %bb.0: 776; AVX10_2-NEXT: vminmaxsd $0, %xmm1, %xmm0 777; AVX10_2-NEXT: retq 778; 779; X86-LABEL: test_fminimum_nnan: 780; X86: # %bb.0: 781; X86-NEXT: pushl %ebp 782; X86-NEXT: movl %esp, %ebp 783; X86-NEXT: andl $-8, %esp 784; X86-NEXT: subl $8, %esp 785; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero 786; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 787; X86-NEXT: vextractps $1, %xmm0, %eax 788; X86-NEXT: testl %eax, %eax 789; X86-NEXT: js .LBB14_1 790; X86-NEXT: # %bb.2: 791; X86-NEXT: vmovapd %xmm1, %xmm2 792; X86-NEXT: jmp .LBB14_3 793; X86-NEXT: .LBB14_1: 794; X86-NEXT: vmovapd %xmm0, %xmm2 795; X86-NEXT: vmovapd %xmm1, %xmm0 796; X86-NEXT: .LBB14_3: 797; X86-NEXT: vminsd %xmm2, %xmm0, %xmm0 798; X86-NEXT: vmovsd %xmm0, (%esp) 799; X86-NEXT: fldl (%esp) 800; X86-NEXT: movl %ebp, %esp 801; X86-NEXT: popl %ebp 802; X86-NEXT: retl 803 %1 = tail call double @llvm.minimum.f64(double %x, double %y) 804 ret double %1 805} 806 807define double @test_fminimum_zero0(double %x, double %y) nounwind { 808; SSE2-LABEL: test_fminimum_zero0: 809; SSE2: # %bb.0: 810; SSE2-NEXT: movapd %xmm1, %xmm0 811; SSE2-NEXT: cmpunordsd %xmm1, %xmm0 812; SSE2-NEXT: movapd %xmm0, %xmm2 813; SSE2-NEXT: andpd %xmm1, %xmm2 814; SSE2-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 815; SSE2-NEXT: andnpd %xmm1, %xmm0 816; SSE2-NEXT: orpd %xmm2, %xmm0 817; SSE2-NEXT: retq 818; 819; AVX1-LABEL: test_fminimum_zero0: 820; AVX1: # %bb.0: 821; AVX1-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm0 822; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 823; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0 824; AVX1-NEXT: retq 825; 826; AVX512-LABEL: test_fminimum_zero0: 827; AVX512: # %bb.0: 828; AVX512-NEXT: vcmpunordsd %xmm1, %xmm1, %k1 829; AVX512-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 830; AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} 831; AVX512-NEXT: retq 832; 833; AVX10_2-LABEL: test_fminimum_zero0: 834; AVX10_2: # %bb.0: 835; AVX10_2-NEXT: vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 836; AVX10_2-NEXT: retq 837; 838; X86-LABEL: test_fminimum_zero0: 839; X86: # %bb.0: 840; X86-NEXT: pushl %ebp 841; X86-NEXT: movl %esp, %ebp 842; X86-NEXT: andl $-8, %esp 843; X86-NEXT: subl $8, %esp 844; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 845; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 846; X86-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 847; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 848; X86-NEXT: vmovlpd %xmm0, (%esp) 849; X86-NEXT: fldl (%esp) 850; X86-NEXT: movl %ebp, %esp 851; X86-NEXT: popl %ebp 852; X86-NEXT: retl 853 %1 = tail call double @llvm.minimum.f64(double -0.0, double %y) 854 ret double %1 855} 856 857define double @test_fminimum_zero1(double %x, double %y) nounwind { 858; SSE2-LABEL: test_fminimum_zero1: 859; SSE2: # %bb.0: 
860; SSE2-NEXT: movapd %xmm0, %xmm1 861; SSE2-NEXT: cmpunordsd %xmm0, %xmm1 862; SSE2-NEXT: movapd %xmm1, %xmm2 863; SSE2-NEXT: andpd %xmm0, %xmm2 864; SSE2-NEXT: minsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 865; SSE2-NEXT: andnpd %xmm0, %xmm1 866; SSE2-NEXT: orpd %xmm2, %xmm1 867; SSE2-NEXT: movapd %xmm1, %xmm0 868; SSE2-NEXT: retq 869; 870; AVX1-LABEL: test_fminimum_zero1: 871; AVX1: # %bb.0: 872; AVX1-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 873; AVX1-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 874; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 875; AVX1-NEXT: retq 876; 877; AVX512-LABEL: test_fminimum_zero1: 878; AVX512: # %bb.0: 879; AVX512-NEXT: vcmpunordsd %xmm0, %xmm0, %k1 880; AVX512-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 881; AVX512-NEXT: vmovsd %xmm0, %xmm1, %xmm1 {%k1} 882; AVX512-NEXT: vmovapd %xmm1, %xmm0 883; AVX512-NEXT: retq 884; 885; AVX10_2-LABEL: test_fminimum_zero1: 886; AVX10_2: # %bb.0: 887; AVX10_2-NEXT: vminmaxsd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 888; AVX10_2-NEXT: retq 889; 890; X86-LABEL: test_fminimum_zero1: 891; X86: # %bb.0: 892; X86-NEXT: pushl %ebp 893; X86-NEXT: movl %esp, %ebp 894; X86-NEXT: andl $-8, %esp 895; X86-NEXT: subl $8, %esp 896; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero 897; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm1 898; X86-NEXT: vminsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 899; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 900; X86-NEXT: vmovlpd %xmm0, (%esp) 901; X86-NEXT: fldl (%esp) 902; X86-NEXT: movl %ebp, %esp 903; X86-NEXT: popl %ebp 904; X86-NEXT: retl 905 %1 = tail call double @llvm.minimum.f64(double %x, double -0.0) 906 ret double %1 907} 908 909define double @test_fminimum_zero2(double %x, double %y) { 910; SSE2-LABEL: test_fminimum_zero2: 911; SSE2: # %bb.0: 912; SSE2-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] 913; SSE2-NEXT: retq 914; 915; AVX-LABEL: test_fminimum_zero2: 916; AVX: # %bb.0: 917; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] 918; AVX-NEXT: retq 919; 920; AVX10_2-LABEL: test_fminimum_zero2: 921; AVX10_2: # %bb.0: 922; AVX10_2-NEXT: vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] 923; AVX10_2-NEXT: retq 924; 925; X86-LABEL: test_fminimum_zero2: 926; X86: # %bb.0: 927; X86-NEXT: fldz 928; X86-NEXT: fchs 929; X86-NEXT: retl 930 %1 = tail call double @llvm.minimum.f64(double -0.0, double 0.0) 931 ret double %1 932} 933 934define float @test_fminimum_nsz(float %x, float %y) nounwind { 935; SSE2-LABEL: test_fminimum_nsz: 936; SSE2: # %bb.0: 937; SSE2-NEXT: movaps %xmm0, %xmm2 938; SSE2-NEXT: cmpunordss %xmm0, %xmm2 939; SSE2-NEXT: movaps %xmm2, %xmm3 940; SSE2-NEXT: andps %xmm0, %xmm3 941; SSE2-NEXT: minss %xmm1, %xmm0 942; SSE2-NEXT: andnps %xmm0, %xmm2 943; SSE2-NEXT: orps %xmm3, %xmm2 944; SSE2-NEXT: movaps %xmm2, %xmm0 945; SSE2-NEXT: retq 946; 947; AVX1-LABEL: test_fminimum_nsz: 948; AVX1: # %bb.0: 949; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1 950; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 951; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 952; AVX1-NEXT: retq 953; 954; AVX512-LABEL: test_fminimum_nsz: 955; AVX512: # %bb.0: 956; AVX512-NEXT: vminss %xmm1, %xmm0, %xmm1 957; AVX512-NEXT: vcmpunordss %xmm0, %xmm0, %k1 958; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 959; AVX512-NEXT: vmovaps %xmm1, %xmm0 960; AVX512-NEXT: retq 961; 962; AVX10_2-LABEL: test_fminimum_nsz: 963; AVX10_2: # %bb.0: 964; AVX10_2-NEXT: vminmaxss $0, %xmm1, %xmm0 965; AVX10_2-NEXT: retq 966; 967; X86-LABEL: test_fminimum_nsz: 968; X86: # %bb.0: 969; X86-NEXT: pushl %eax 970; X86-NEXT: vmovss {{.*#+}} 
xmm0 = mem[0],zero,zero,zero 971; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm1 972; X86-NEXT: vminss {{[0-9]+}}(%esp), %xmm0, %xmm2 973; X86-NEXT: vblendvps %xmm1, %xmm0, %xmm2, %xmm0 974; X86-NEXT: vmovss %xmm0, (%esp) 975; X86-NEXT: flds (%esp) 976; X86-NEXT: popl %eax 977; X86-NEXT: retl 978 %1 = tail call nsz float @llvm.minimum.f32(float %x, float %y) 979 ret float %1 980} 981 982define float @test_fminimum_combine_cmps(float %x, float %y) nounwind { 983; SSE2-LABEL: test_fminimum_combine_cmps: 984; SSE2: # %bb.0: 985; SSE2-NEXT: divss %xmm0, %xmm1 986; SSE2-NEXT: movd %xmm0, %eax 987; SSE2-NEXT: testl %eax, %eax 988; SSE2-NEXT: movaps %xmm1, %xmm3 989; SSE2-NEXT: js .LBB19_2 990; SSE2-NEXT: # %bb.1: 991; SSE2-NEXT: movaps %xmm0, %xmm3 992; SSE2-NEXT: .LBB19_2: 993; SSE2-NEXT: movaps %xmm3, %xmm2 994; SSE2-NEXT: cmpunordss %xmm3, %xmm2 995; SSE2-NEXT: movaps %xmm2, %xmm4 996; SSE2-NEXT: andps %xmm3, %xmm4 997; SSE2-NEXT: js .LBB19_4 998; SSE2-NEXT: # %bb.3: 999; SSE2-NEXT: movaps %xmm1, %xmm0 1000; SSE2-NEXT: .LBB19_4: 1001; SSE2-NEXT: minss %xmm0, %xmm3 1002; SSE2-NEXT: andnps %xmm3, %xmm2 1003; SSE2-NEXT: orps %xmm4, %xmm2 1004; SSE2-NEXT: movaps %xmm2, %xmm0 1005; SSE2-NEXT: retq 1006; 1007; AVX1-LABEL: test_fminimum_combine_cmps: 1008; AVX1: # %bb.0: 1009; AVX1-NEXT: vdivss %xmm0, %xmm1, %xmm2 1010; AVX1-NEXT: vmovd %xmm0, %eax 1011; AVX1-NEXT: testl %eax, %eax 1012; AVX1-NEXT: js .LBB19_1 1013; AVX1-NEXT: # %bb.2: 1014; AVX1-NEXT: vmovaps %xmm2, %xmm1 1015; AVX1-NEXT: jmp .LBB19_3 1016; AVX1-NEXT: .LBB19_1: 1017; AVX1-NEXT: vmovaps %xmm0, %xmm1 1018; AVX1-NEXT: vmovaps %xmm2, %xmm0 1019; AVX1-NEXT: .LBB19_3: 1020; AVX1-NEXT: vminss %xmm1, %xmm0, %xmm1 1021; AVX1-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 1022; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1023; AVX1-NEXT: retq 1024; 1025; AVX512F-LABEL: test_fminimum_combine_cmps: 1026; AVX512F: # %bb.0: 1027; AVX512F-NEXT: vdivss %xmm0, %xmm1, %xmm1 1028; AVX512F-NEXT: vmovd %xmm0, %eax 1029; AVX512F-NEXT: testl %eax, %eax 1030; AVX512F-NEXT: sets %al 1031; AVX512F-NEXT: kmovw %eax, %k1 1032; AVX512F-NEXT: vmovaps %xmm1, %xmm2 1033; AVX512F-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} 1034; AVX512F-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 1035; AVX512F-NEXT: vminss %xmm2, %xmm0, %xmm1 1036; AVX512F-NEXT: vcmpunordss %xmm0, %xmm0, %k1 1037; AVX512F-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1} 1038; AVX512F-NEXT: vmovaps %xmm1, %xmm0 1039; AVX512F-NEXT: retq 1040; 1041; AVX512DQ-LABEL: test_fminimum_combine_cmps: 1042; AVX512DQ: # %bb.0: 1043; AVX512DQ-NEXT: vdivss %xmm0, %xmm1, %xmm1 1044; AVX512DQ-NEXT: vfpclassss $5, %xmm0, %k0 # k0 = isQuietNaN(xmm0) | isNegativeZero(xmm0) 1045; AVX512DQ-NEXT: kmovw %k0, %k1 1046; AVX512DQ-NEXT: vmovaps %xmm1, %xmm2 1047; AVX512DQ-NEXT: vmovss %xmm0, %xmm2, %xmm2 {%k1} 1048; AVX512DQ-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1} 1049; AVX512DQ-NEXT: vminss %xmm2, %xmm0, %xmm0 1050; AVX512DQ-NEXT: retq 1051; 1052; AVX10_2-LABEL: test_fminimum_combine_cmps: 1053; AVX10_2: # %bb.0: 1054; AVX10_2-NEXT: vdivss %xmm0, %xmm1, %xmm1 1055; AVX10_2-NEXT: vminmaxss $0, %xmm1, %xmm0 1056; AVX10_2-NEXT: retq 1057; 1058; X86-LABEL: test_fminimum_combine_cmps: 1059; X86: # %bb.0: 1060; X86-NEXT: pushl %eax 1061; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero 1062; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero 1063; X86-NEXT: vdivss %xmm0, %xmm1, %xmm2 1064; X86-NEXT: vmovd %xmm0, %eax 1065; X86-NEXT: testl %eax, %eax 1066; X86-NEXT: js .LBB19_1 1067; X86-NEXT: # %bb.2: 1068; X86-NEXT: vmovaps %xmm2, %xmm1 1069; 
X86-NEXT: jmp .LBB19_3 1070; X86-NEXT: .LBB19_1: 1071; X86-NEXT: vmovaps %xmm0, %xmm1 1072; X86-NEXT: vmovaps %xmm2, %xmm0 1073; X86-NEXT: .LBB19_3: 1074; X86-NEXT: vminss %xmm1, %xmm0, %xmm1 1075; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2 1076; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1077; X86-NEXT: vmovss %xmm0, (%esp) 1078; X86-NEXT: flds (%esp) 1079; X86-NEXT: popl %eax 1080; X86-NEXT: retl 1081 %1 = fdiv nnan float %y, %x 1082 %2 = tail call float @llvm.minimum.f32(float %x, float %1) 1083 ret float %2 1084} 1085 1086define <2 x double> @test_fminimum_vector(<2 x double> %x, <2 x double> %y) { 1087; SSE2-LABEL: test_fminimum_vector: 1088; SSE2: # %bb.0: 1089; SSE2-NEXT: movaps %xmm0, %xmm2 1090; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm0[3,3] 1091; SSE2-NEXT: pxor %xmm3, %xmm3 1092; SSE2-NEXT: pcmpgtd %xmm2, %xmm3 1093; SSE2-NEXT: movdqa %xmm3, %xmm2 1094; SSE2-NEXT: pandn %xmm1, %xmm2 1095; SSE2-NEXT: movdqa %xmm3, %xmm4 1096; SSE2-NEXT: pandn %xmm0, %xmm4 1097; SSE2-NEXT: pand %xmm3, %xmm0 1098; SSE2-NEXT: por %xmm2, %xmm0 1099; SSE2-NEXT: pand %xmm1, %xmm3 1100; SSE2-NEXT: por %xmm4, %xmm3 1101; SSE2-NEXT: movdqa %xmm3, %xmm1 1102; SSE2-NEXT: minpd %xmm0, %xmm1 1103; SSE2-NEXT: movdqa %xmm3, %xmm0 1104; SSE2-NEXT: cmpunordpd %xmm3, %xmm0 1105; SSE2-NEXT: andpd %xmm0, %xmm3 1106; SSE2-NEXT: andnpd %xmm1, %xmm0 1107; SSE2-NEXT: orpd %xmm3, %xmm0 1108; SSE2-NEXT: retq 1109; 1110; AVX-LABEL: test_fminimum_vector: 1111; AVX: # %bb.0: 1112; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 1113; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 1114; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1 1115; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 1116; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1117; AVX-NEXT: retq 1118; 1119; AVX10_2-LABEL: test_fminimum_vector: 1120; AVX10_2: # %bb.0: 1121; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 1122; AVX10_2-NEXT: retq 1123; 1124; X86-LABEL: test_fminimum_vector: 1125; X86: # %bb.0: 1126; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 1127; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 1128; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1 1129; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 1130; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1131; X86-NEXT: retl 1132 %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> %y) 1133 ret <2 x double> %r 1134} 1135 1136define <4 x float> @test_fmaximum_vector(<4 x float> %x, <4 x float> %y) "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" { 1137; SSE2-LABEL: test_fmaximum_vector: 1138; SSE2: # %bb.0: 1139; SSE2-NEXT: maxps %xmm1, %xmm0 1140; SSE2-NEXT: retq 1141; 1142; AVX-LABEL: test_fmaximum_vector: 1143; AVX: # %bb.0: 1144; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm0 1145; AVX-NEXT: retq 1146; 1147; AVX10_2-LABEL: test_fmaximum_vector: 1148; AVX10_2: # %bb.0: 1149; AVX10_2-NEXT: vminmaxps $1, %xmm1, %xmm0, %xmm0 1150; AVX10_2-NEXT: retq 1151; 1152; X86-LABEL: test_fmaximum_vector: 1153; X86: # %bb.0: 1154; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm0 1155; X86-NEXT: retl 1156 %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y) 1157 ret <4 x float> %r 1158} 1159 1160define <2 x double> @test_fminimum_vector_zero(<2 x double> %x) { 1161; SSE2-LABEL: test_fminimum_vector_zero: 1162; SSE2: # %bb.0: 1163; SSE2-NEXT: xorpd %xmm1, %xmm1 1164; SSE2-NEXT: minpd %xmm0, %xmm1 1165; SSE2-NEXT: movapd %xmm1, %xmm0 1166; SSE2-NEXT: retq 1167; 1168; AVX-LABEL: test_fminimum_vector_zero: 1169; AVX: # %bb.0: 1170; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1171; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 
1172; AVX-NEXT: retq 1173; 1174; AVX10_2-LABEL: test_fminimum_vector_zero: 1175; AVX10_2: # %bb.0: 1176; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1177; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 1178; AVX10_2-NEXT: retq 1179; 1180; X86-LABEL: test_fminimum_vector_zero: 1181; X86: # %bb.0: 1182; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1183; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 1184; X86-NEXT: retl 1185 %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0.>) 1186 ret <2 x double> %r 1187} 1188 1189define <4 x float> @test_fmaximum_vector_signed_zero(<4 x float> %x) { 1190; SSE2-LABEL: test_fmaximum_vector_signed_zero: 1191; SSE2: # %bb.0: 1192; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1193; SSE2-NEXT: maxps %xmm0, %xmm1 1194; SSE2-NEXT: movaps %xmm1, %xmm0 1195; SSE2-NEXT: retq 1196; 1197; AVX-LABEL: test_fmaximum_vector_signed_zero: 1198; AVX: # %bb.0: 1199; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1200; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1201; AVX-NEXT: retq 1202; 1203; AVX10_2-LABEL: test_fmaximum_vector_signed_zero: 1204; AVX10_2: # %bb.0: 1205; AVX10_2-NEXT: vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 1206; AVX10_2-NEXT: retq 1207; 1208; X86-LABEL: test_fmaximum_vector_signed_zero: 1209; X86: # %bb.0: 1210; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1211; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1212; X86-NEXT: retl 1213 %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> <float -0., float -0., float -0., float -0.>) 1214 ret <4 x float> %r 1215} 1216 1217define <2 x double> @test_fminimum_vector_partially_zero(<2 x double> %x) { 1218; SSE2-LABEL: test_fminimum_vector_partially_zero: 1219; SSE2: # %bb.0: 1220; SSE2-NEXT: xorpd %xmm1, %xmm1 1221; SSE2-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1222; SSE2-NEXT: minpd %xmm0, %xmm1 1223; SSE2-NEXT: movapd %xmm1, %xmm0 1224; SSE2-NEXT: retq 1225; 1226; AVX-LABEL: test_fminimum_vector_partially_zero: 1227; AVX: # %bb.0: 1228; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1229; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1230; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 1231; AVX-NEXT: retq 1232; 1233; AVX10_2-LABEL: test_fminimum_vector_partially_zero: 1234; AVX10_2: # %bb.0: 1235; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1236; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1237; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 1238; AVX10_2-NEXT: retq 1239; 1240; X86-LABEL: test_fminimum_vector_partially_zero: 1241; X86: # %bb.0: 1242; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1243; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1244; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 1245; X86-NEXT: retl 1246 %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0., double 5.>) 1247 ret <2 x double> %r 1248} 1249 1250define <2 x double> @test_fminimum_vector_different_zeros(<2 x double> %x) { 1251; SSE2-LABEL: test_fminimum_vector_different_zeros: 1252; SSE2: # %bb.0: 1253; SSE2-NEXT: movaps %xmm0, %xmm1 1254; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[3,3] 1255; SSE2-NEXT: xorps %xmm2, %xmm2 1256; SSE2-NEXT: pxor %xmm3, %xmm3 1257; SSE2-NEXT: pcmpgtd %xmm1, %xmm3 1258; SSE2-NEXT: movhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1] 1259; SSE2-NEXT: movdqa %xmm3, %xmm1 1260; SSE2-NEXT: pandn %xmm2, %xmm1 1261; SSE2-NEXT: movaps %xmm0, %xmm4 1262; SSE2-NEXT: andps %xmm3, %xmm4 1263; SSE2-NEXT: orps %xmm1, %xmm4 1264; SSE2-NEXT: pand %xmm0, %xmm2 1265; SSE2-NEXT: pandn %xmm0, 
%xmm3 1266; SSE2-NEXT: por %xmm2, %xmm3 1267; SSE2-NEXT: movdqa %xmm3, %xmm1 1268; SSE2-NEXT: minpd %xmm4, %xmm1 1269; SSE2-NEXT: movdqa %xmm3, %xmm0 1270; SSE2-NEXT: cmpunordpd %xmm3, %xmm0 1271; SSE2-NEXT: andpd %xmm0, %xmm3 1272; SSE2-NEXT: andnpd %xmm1, %xmm0 1273; SSE2-NEXT: orpd %xmm3, %xmm0 1274; SSE2-NEXT: retq 1275; 1276; AVX-LABEL: test_fminimum_vector_different_zeros: 1277; AVX: # %bb.0: 1278; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1279; AVX-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1280; AVX-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 1281; AVX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 1282; AVX-NEXT: vminpd %xmm2, %xmm0, %xmm1 1283; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 1284; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1285; AVX-NEXT: retq 1286; 1287; AVX10_2-LABEL: test_fminimum_vector_different_zeros: 1288; AVX10_2: # %bb.0: 1289; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1290; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1291; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 1292; AVX10_2-NEXT: retq 1293; 1294; X86-LABEL: test_fminimum_vector_different_zeros: 1295; X86: # %bb.0: 1296; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1297; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1298; X86-NEXT: vblendvpd %xmm0, %xmm0, %xmm1, %xmm2 1299; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0 1300; X86-NEXT: vminpd %xmm2, %xmm0, %xmm1 1301; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm2 1302; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0 1303; X86-NEXT: retl 1304 %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0., double -0.>) 1305 ret <2 x double> %r 1306} 1307 1308define <4 x float> @test_fmaximum_vector_non_zero(<4 x float> %x) { 1309; SSE2-LABEL: test_fmaximum_vector_non_zero: 1310; SSE2: # %bb.0: 1311; SSE2-NEXT: movaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] 1312; SSE2-NEXT: maxps %xmm0, %xmm1 1313; SSE2-NEXT: movaps %xmm1, %xmm0 1314; SSE2-NEXT: retq 1315; 1316; AVX-LABEL: test_fmaximum_vector_non_zero: 1317; AVX: # %bb.0: 1318; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] 1319; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1320; AVX-NEXT: retq 1321; 1322; AVX10_2-LABEL: test_fmaximum_vector_non_zero: 1323; AVX10_2: # %bb.0: 1324; AVX10_2-NEXT: vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 1325; AVX10_2-NEXT: retq 1326; 1327; X86-LABEL: test_fmaximum_vector_non_zero: 1328; X86: # %bb.0: 1329; X86-NEXT: vmovaps {{.*#+}} xmm1 = [5.0E+0,4.0E+0,3.0E+0,2.0E+0] 1330; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1331; X86-NEXT: retl 1332 %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> <float 5., float 4., float 3., float 2.>) 1333 ret <4 x float> %r 1334} 1335 1336define <2 x double> @test_fminimum_vector_nan(<2 x double> %x) { 1337; SSE2-LABEL: test_fminimum_vector_nan: 1338; SSE2: # %bb.0: 1339; SSE2-NEXT: movsd {{.*#+}} xmm2 = [NaN,0.0E+0] 1340; SSE2-NEXT: xorpd %xmm1, %xmm1 1341; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1342; SSE2-NEXT: minpd %xmm0, %xmm1 1343; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] 1344; SSE2-NEXT: movapd %xmm1, %xmm0 1345; SSE2-NEXT: retq 1346; 1347; AVX-LABEL: test_fminimum_vector_nan: 1348; AVX: # %bb.0: 1349; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [NaN,0.0E+0] 1350; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 1351; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] 1352; AVX-NEXT: vminpd %xmm0, %xmm2, %xmm0 1353; AVX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] 1354; AVX-NEXT: retq 1355; 1356; AVX10_2-LABEL: test_fminimum_vector_nan: 1357; AVX10_2: # %bb.0: 1358; 
AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1359; AVX10_2-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1360; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 1361; AVX10_2-NEXT: retq 1362; 1363; X86-LABEL: test_fminimum_vector_nan: 1364; X86: # %bb.0: 1365; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1366; X86-NEXT: vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0] 1367; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 1368; X86-NEXT: vcmpunordpd %xmm1, %xmm1, %xmm2 1369; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 1370; X86-NEXT: retl 1371 %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double 0., double 0x7fff000000000000>) 1372 ret <2 x double> %r 1373} 1374 1375define <2 x double> @test_fminimum_vector_zero_first(<2 x double> %x) { 1376; SSE2-LABEL: test_fminimum_vector_zero_first: 1377; SSE2: # %bb.0: 1378; SSE2-NEXT: xorpd %xmm1, %xmm1 1379; SSE2-NEXT: minpd %xmm0, %xmm1 1380; SSE2-NEXT: movapd %xmm1, %xmm0 1381; SSE2-NEXT: retq 1382; 1383; AVX-LABEL: test_fminimum_vector_zero_first: 1384; AVX: # %bb.0: 1385; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1386; AVX-NEXT: vminpd %xmm0, %xmm1, %xmm0 1387; AVX-NEXT: retq 1388; 1389; AVX10_2-LABEL: test_fminimum_vector_zero_first: 1390; AVX10_2: # %bb.0: 1391; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1392; AVX10_2-NEXT: vminmaxpd $0, %xmm1, %xmm0, %xmm0 1393; AVX10_2-NEXT: retq 1394; 1395; X86-LABEL: test_fminimum_vector_zero_first: 1396; X86: # %bb.0: 1397; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1398; X86-NEXT: vminpd %xmm0, %xmm1, %xmm0 1399; X86-NEXT: retl 1400 %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> <double 0., double 0.>, <2 x double> %x) 1401 ret <2 x double> %r 1402} 1403 1404define <2 x double> @test_fminimum_vector_signed_zero(<2 x double> %x) { 1405; SSE2-LABEL: test_fminimum_vector_signed_zero: 1406; SSE2: # %bb.0: 1407; SSE2-NEXT: movapd %xmm0, %xmm1 1408; SSE2-NEXT: minpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 1409; SSE2-NEXT: movapd %xmm0, %xmm2 1410; SSE2-NEXT: cmpunordpd %xmm0, %xmm2 1411; SSE2-NEXT: andpd %xmm2, %xmm0 1412; SSE2-NEXT: andnpd %xmm1, %xmm2 1413; SSE2-NEXT: orpd %xmm2, %xmm0 1414; SSE2-NEXT: retq 1415; 1416; AVX-LABEL: test_fminimum_vector_signed_zero: 1417; AVX: # %bb.0: 1418; AVX-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 1419; AVX-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 1420; AVX-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 1421; AVX-NEXT: retq 1422; 1423; AVX10_2-LABEL: test_fminimum_vector_signed_zero: 1424; AVX10_2: # %bb.0: 1425; AVX10_2-NEXT: vminmaxpd $0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 1426; AVX10_2-NEXT: retq 1427; 1428; X86-LABEL: test_fminimum_vector_signed_zero: 1429; X86: # %bb.0: 1430; X86-NEXT: vcmpunordpd %xmm0, %xmm0, %xmm1 1431; X86-NEXT: vminpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm2 1432; X86-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0 1433; X86-NEXT: retl 1434 %r = call <2 x double> @llvm.minimum.v2f64(<2 x double> %x, <2 x double> <double -0., double -0.>) 1435 ret <2 x double> %r 1436} 1437 1438define <4 x float> @test_fmaximum_vector_signed_zero_first(<4 x float> %x) { 1439; SSE2-LABEL: test_fmaximum_vector_signed_zero_first: 1440; SSE2: # %bb.0: 1441; SSE2-NEXT: movaps {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1442; SSE2-NEXT: maxps %xmm0, %xmm1 1443; SSE2-NEXT: movaps %xmm1, %xmm0 1444; SSE2-NEXT: retq 1445; 1446; AVX-LABEL: test_fmaximum_vector_signed_zero_first: 1447; AVX: # %bb.0: 1448; AVX-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1449; AVX-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1450; AVX-NEXT: retq 1451; 1452; 
AVX10_2-LABEL: test_fmaximum_vector_signed_zero_first: 1453; AVX10_2: # %bb.0: 1454; AVX10_2-NEXT: vminmaxps $1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 1455; AVX10_2-NEXT: retq 1456; 1457; X86-LABEL: test_fmaximum_vector_signed_zero_first: 1458; X86: # %bb.0: 1459; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1460; X86-NEXT: vmaxps %xmm0, %xmm1, %xmm0 1461; X86-NEXT: retl 1462 %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> <float -0., float -0., float -0., float -0.>, <4 x float> %x) 1463 ret <4 x float> %r 1464} 1465 1466define <4 x float> @test_fmaximum_vector_zero(<4 x float> %x) { 1467; SSE2-LABEL: test_fmaximum_vector_zero: 1468; SSE2: # %bb.0: 1469; SSE2-NEXT: xorps %xmm1, %xmm1 1470; SSE2-NEXT: movaps %xmm0, %xmm2 1471; SSE2-NEXT: maxps %xmm1, %xmm2 1472; SSE2-NEXT: movaps %xmm0, %xmm1 1473; SSE2-NEXT: cmpunordps %xmm0, %xmm1 1474; SSE2-NEXT: andps %xmm1, %xmm0 1475; SSE2-NEXT: andnps %xmm2, %xmm1 1476; SSE2-NEXT: orps %xmm1, %xmm0 1477; SSE2-NEXT: retq 1478; 1479; AVX-LABEL: test_fmaximum_vector_zero: 1480; AVX: # %bb.0: 1481; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 1482; AVX-NEXT: vmaxps %xmm1, %xmm0, %xmm1 1483; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 1484; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1485; AVX-NEXT: retq 1486; 1487; AVX10_2-LABEL: test_fmaximum_vector_zero: 1488; AVX10_2: # %bb.0: 1489; AVX10_2-NEXT: vxorpd %xmm1, %xmm1, %xmm1 1490; AVX10_2-NEXT: vminmaxps $1, %xmm1, %xmm0, %xmm0 1491; AVX10_2-NEXT: retq 1492; 1493; X86-LABEL: test_fmaximum_vector_zero: 1494; X86: # %bb.0: 1495; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 1496; X86-NEXT: vmaxps %xmm1, %xmm0, %xmm1 1497; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 1498; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1499; X86-NEXT: retl 1500 %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> <float 0., float 0., float 0., float 0.>) 1501 ret <4 x float> %r 1502} 1503 1504; PR77805: Check that signed zeroes are handled correctly in this case (FIXME) 1505define <4 x float> @test_fmaximum_v4f32_splat(<4 x float> %x, float %y) { 1506; SSE2-LABEL: test_fmaximum_v4f32_splat: 1507; SSE2: # %bb.0: 1508; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] 1509; SSE2-NEXT: pxor %xmm2, %xmm2 1510; SSE2-NEXT: pcmpgtd %xmm0, %xmm2 1511; SSE2-NEXT: movdqa %xmm2, %xmm3 1512; SSE2-NEXT: pandn %xmm0, %xmm3 1513; SSE2-NEXT: movaps %xmm1, %xmm4 1514; SSE2-NEXT: andps %xmm2, %xmm4 1515; SSE2-NEXT: orps %xmm3, %xmm4 1516; SSE2-NEXT: pand %xmm2, %xmm0 1517; SSE2-NEXT: andnps %xmm1, %xmm2 1518; SSE2-NEXT: por %xmm2, %xmm0 1519; SSE2-NEXT: movdqa %xmm0, %xmm1 1520; SSE2-NEXT: maxps %xmm4, %xmm1 1521; SSE2-NEXT: movdqa %xmm0, %xmm2 1522; SSE2-NEXT: cmpunordps %xmm0, %xmm2 1523; SSE2-NEXT: andps %xmm2, %xmm0 1524; SSE2-NEXT: andnps %xmm1, %xmm2 1525; SSE2-NEXT: orps %xmm2, %xmm0 1526; SSE2-NEXT: retq 1527; 1528; AVX1-LABEL: test_fmaximum_v4f32_splat: 1529; AVX1: # %bb.0: 1530; AVX1-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0] 1531; AVX1-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 1532; AVX1-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 1533; AVX1-NEXT: vmaxps %xmm2, %xmm0, %xmm1 1534; AVX1-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 1535; AVX1-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1536; AVX1-NEXT: retq 1537; 1538; AVX512-LABEL: test_fmaximum_v4f32_splat: 1539; AVX512: # %bb.0: 1540; AVX512-NEXT: vbroadcastss %xmm1, %xmm1 1541; AVX512-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 1542; AVX512-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 1543; AVX512-NEXT: vmaxps %xmm2, %xmm0, %xmm1 1544; 
AVX512-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 1545; AVX512-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1546; AVX512-NEXT: retq 1547; 1548; AVX10_2-LABEL: test_fmaximum_v4f32_splat: 1549; AVX10_2: # %bb.0: 1550; AVX10_2-NEXT: vbroadcastss %xmm1, %xmm1 1551; AVX10_2-NEXT: vminmaxps $1, %xmm1, %xmm0, %xmm0 1552; AVX10_2-NEXT: retq 1553; 1554; X86-LABEL: test_fmaximum_v4f32_splat: 1555; X86: # %bb.0: 1556; X86-NEXT: vbroadcastss {{[0-9]+}}(%esp), %xmm1 1557; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2 1558; X86-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0 1559; X86-NEXT: vmaxps %xmm2, %xmm0, %xmm1 1560; X86-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2 1561; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0 1562; X86-NEXT: retl 1563 %splatinsert = insertelement <4 x float> poison, float %y, i64 0 1564 %vec = shufflevector <4 x float> %splatinsert, <4 x float> poison, <4 x i32> zeroinitializer 1565 %r = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %vec) readnone 1566 ret <4 x float> %r 1567} 1568 1569define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind { 1570; SSE2-LABEL: test_fmaximum_v4f16: 1571; SSE2: # %bb.0: 1572; SSE2-NEXT: subq $104, %rsp 1573; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1574; SSE2-NEXT: psrld $16, %xmm0 1575; SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1576; SSE2-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill 1577; SSE2-NEXT: movdqa %xmm1, %xmm0 1578; SSE2-NEXT: psrld $16, %xmm0 1579; SSE2-NEXT: callq __extendhfsf2@PLT 1580; SSE2-NEXT: movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill 1581; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1582; SSE2-NEXT: callq __extendhfsf2@PLT 1583; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload 1584; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero 1585; SSE2-NEXT: movdqa %xmm0, %xmm1 1586; SSE2-NEXT: movd %xmm0, %eax 1587; SSE2-NEXT: testl %eax, %eax 1588; SSE2-NEXT: movdqa %xmm0, %xmm2 1589; SSE2-NEXT: js .LBB33_2 1590; SSE2-NEXT: # %bb.1: 1591; SSE2-NEXT: movdqa %xmm4, %xmm2 1592; SSE2-NEXT: .LBB33_2: 1593; SSE2-NEXT: movdqa %xmm2, %xmm0 1594; SSE2-NEXT: cmpunordss %xmm2, %xmm0 1595; SSE2-NEXT: movaps %xmm0, %xmm3 1596; SSE2-NEXT: andps %xmm2, %xmm3 1597; SSE2-NEXT: js .LBB33_4 1598; SSE2-NEXT: # %bb.3: 1599; SSE2-NEXT: movdqa %xmm1, %xmm4 1600; SSE2-NEXT: .LBB33_4: 1601; SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload 1602; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] 1603; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1604; SSE2-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload 1605; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1,1,1] 1606; SSE2-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1607; SSE2-NEXT: maxss %xmm4, %xmm2 1608; SSE2-NEXT: andnps %xmm2, %xmm0 1609; SSE2-NEXT: orps %xmm3, %xmm0 1610; SSE2-NEXT: callq __truncsfhf2@PLT 1611; SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 1612; SSE2-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload 1613; SSE2-NEXT: callq __extendhfsf2@PLT 1614; SSE2-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 1615; SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload 1616; SSE2-NEXT: callq __extendhfsf2@PLT 1617; SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload 1618; SSE2-NEXT: # xmm4 = mem[0],zero,zero,zero 1619; SSE2-NEXT: movdqa %xmm0, %xmm1 1620; SSE2-NEXT: movd %xmm0, %eax 1621; SSE2-NEXT: testl %eax, %eax 1622; SSE2-NEXT: movdqa %xmm0, %xmm2 1623; SSE2-NEXT: js 
define <4 x half> @test_fmaximum_v4f16(<4 x half> %x, <4 x half> %y) nounwind {
; SSE2-LABEL: test_fmaximum_v4f16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    subq $104, %rsp
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    js .LBB33_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movdqa %xmm4, %xmm2
; SSE2-NEXT:  .LBB33_2:
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    andps %xmm2, %xmm3
; SSE2-NEXT:    js .LBB33_4
; SSE2-NEXT:  # %bb.3:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:  .LBB33_4:
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE2-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    maxss %xmm4, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm0
; SSE2-NEXT:    orps %xmm3, %xmm0
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    js .LBB33_6
; SSE2-NEXT:  # %bb.5:
; SSE2-NEXT:    movdqa %xmm4, %xmm2
; SSE2-NEXT:  .LBB33_6:
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    andps %xmm2, %xmm3
; SSE2-NEXT:    js .LBB33_8
; SSE2-NEXT:  # %bb.7:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:  .LBB33_8:
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    psrlq $48, %xmm1
; SSE2-NEXT:    movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT:    psrlq $48, %xmm1
; SSE2-NEXT:    movdqa %xmm1, (%rsp) # 16-byte Spill
; SSE2-NEXT:    maxss %xmm4, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm0
; SSE2-NEXT:    orps %xmm3, %xmm0
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    js .LBB33_10
; SSE2-NEXT:  # %bb.9:
; SSE2-NEXT:    movdqa %xmm4, %xmm2
; SSE2-NEXT:  .LBB33_10:
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    cmpunordss %xmm2, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm3
; SSE2-NEXT:    andps %xmm2, %xmm3
; SSE2-NEXT:    js .LBB33_12
; SSE2-NEXT:  # %bb.11:
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:  .LBB33_12:
; SSE2-NEXT:    maxss %xmm4, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm1
; SSE2-NEXT:    orps %xmm3, %xmm1
; SSE2-NEXT:    movaps %xmm1, %xmm0
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movss %xmm0, (%rsp) # 4-byte Spill
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    callq __extendhfsf2@PLT
; SSE2-NEXT:    movd (%rsp), %xmm4 # 4-byte Folded Reload
; SSE2-NEXT:    # xmm4 = mem[0],zero,zero,zero
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    testl %eax, %eax
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    js .LBB33_14
; SSE2-NEXT:  # %bb.13:
; SSE2-NEXT:    movdqa %xmm4, %xmm2
; SSE2-NEXT:  .LBB33_14:
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm3
; SSE2-NEXT:    andps %xmm2, %xmm3
; SSE2-NEXT:    js .LBB33_16
; SSE2-NEXT:  # %bb.15:
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:  .LBB33_16:
; SSE2-NEXT:    maxss %xmm4, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm0
; SSE2-NEXT:    orps %xmm3, %xmm0
; SSE2-NEXT:    callq __truncsfhf2@PLT
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    addq $104, %rsp
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximum_v4f16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    subq $120, %rsp
; AVX1-NEXT:    vmovaps %xmm0, %xmm2
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm0[1,1,3,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps %xmm1, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB33_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-NEXT:    jmp .LBB33_3
; AVX1-NEXT:  .LBB33_1:
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:  .LBB33_3:
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm1
; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps (%rsp), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB33_4
; AVX1-NEXT:  # %bb.5:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
; AVX1-NEXT:    jmp .LBB33_6
; AVX1-NEXT:  .LBB33_4:
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:  .LBB33_6:
; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm1
; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB33_7
; AVX1-NEXT:  # %bb.8:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa (%rsp), %xmm2 # 16-byte Reload
; AVX1-NEXT:    jmp .LBB33_9
; AVX1-NEXT:  .LBB33_7:
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:  .LBB33_9:
; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm1
; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; AVX1-NEXT:    callq __extendhfsf2@PLT
; AVX1-NEXT:    vmovd %xmm0, %eax
; AVX1-NEXT:    testl %eax, %eax
; AVX1-NEXT:    js .LBB33_10
; AVX1-NEXT:  # %bb.11:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX1-NEXT:    jmp .LBB33_12
; AVX1-NEXT:  .LBB33_10:
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vmovdqa %xmm0, %xmm2
; AVX1-NEXT:  .LBB33_12:
; AVX1-NEXT:    vmaxss %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    vcmpunordss %xmm2, %xmm2, %xmm1
; AVX1-NEXT:    vblendvps %xmm1, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    callq __truncsfhf2@PLT
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT:    addq $120, %rsp
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximum_v4f16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    pushq %r15
; AVX512-NEXT:    pushq %r14
; AVX512-NEXT:    pushq %r13
; AVX512-NEXT:    pushq %r12
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[3,3,3,3]
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    xorl %eax, %eax
; AVX512-NEXT:    vucomiss %xmm2, %xmm3
; AVX512-NEXT:    movl $65535, %ecx # imm = 0xFFFF
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmovpl %ecx, %edx
; AVX512-NEXT:    movl $0, %edi
; AVX512-NEXT:    cmoval %ecx, %edi
; AVX512-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm0[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    vucomiss %xmm2, %xmm3
; AVX512-NEXT:    movl $0, %esi
; AVX512-NEXT:    cmovpl %ecx, %esi
; AVX512-NEXT:    movl $0, %r9d
; AVX512-NEXT:    cmoval %ecx, %r9d
; AVX512-NEXT:    vshufpd {{.*#+}} xmm2 = xmm1[1,0]
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vshufpd {{.*#+}} xmm3 = xmm0[1,0]
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    vucomiss %xmm2, %xmm3
; AVX512-NEXT:    movl $0, %r8d
; AVX512-NEXT:    cmovpl %ecx, %r8d
; AVX512-NEXT:    movl $0, %r11d
; AVX512-NEXT:    cmoval %ecx, %r11d
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[3,3,3,3,4,5,6,7]
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[3,3,3,3,4,5,6,7]
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    vucomiss %xmm2, %xmm3
; AVX512-NEXT:    movl $0, %r10d
; AVX512-NEXT:    cmovpl %ecx, %r10d
; AVX512-NEXT:    movl $0, %ebp
; AVX512-NEXT:    cmoval %ecx, %ebp
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    vucomiss %xmm2, %xmm3
; AVX512-NEXT:    movl $0, %ebx
; AVX512-NEXT:    cmovpl %ecx, %ebx
; AVX512-NEXT:    movl $0, %r14d
; AVX512-NEXT:    cmoval %ecx, %r14d
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm1[1,1,1,1,4,5,6,7]
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm2
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm0[1,1,1,1,4,5,6,7]
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    vucomiss %xmm2, %xmm3
; AVX512-NEXT:    movl $0, %r15d
; AVX512-NEXT:    cmovpl %ecx, %r15d
; AVX512-NEXT:    movl $0, %r12d
; AVX512-NEXT:    cmoval %ecx, %r12d
; AVX512-NEXT:    vcvtph2ps %xmm1, %xmm2
; AVX512-NEXT:    vcvtph2ps %xmm0, %xmm3
; AVX512-NEXT:    vucomiss %xmm2, %xmm3
; AVX512-NEXT:    movl $0, %r13d
; AVX512-NEXT:    cmoval %ecx, %r13d
; AVX512-NEXT:    vmovd %r13d, %xmm2
; AVX512-NEXT:    vpinsrw $1, %r12d, %xmm2, %xmm2
; AVX512-NEXT:    vpinsrw $2, %r14d, %xmm2, %xmm2
; AVX512-NEXT:    vpinsrw $3, %ebp, %xmm2, %xmm2
; AVX512-NEXT:    vpinsrw $4, %r11d, %xmm2, %xmm2
; AVX512-NEXT:    vpinsrw $5, %r9d, %xmm2, %xmm2
; AVX512-NEXT:    vpinsrw $6, %edi, %xmm2, %xmm2
; AVX512-NEXT:    movl $0, %edi
; AVX512-NEXT:    cmovpl %ecx, %edi
; AVX512-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT:    vcvtph2ps %xmm4, %xmm4
; AVX512-NEXT:    vucomiss %xmm3, %xmm4
; AVX512-NEXT:    movl $0, %r9d
; AVX512-NEXT:    cmoval %ecx, %r9d
; AVX512-NEXT:    vpinsrw $7, %r9d, %xmm2, %xmm2
; AVX512-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vmovd %edi, %xmm3
; AVX512-NEXT:    vpinsrw $1, %r15d, %xmm3, %xmm3
; AVX512-NEXT:    vpinsrw $2, %ebx, %xmm3, %xmm3
; AVX512-NEXT:    vpinsrw $3, %r10d, %xmm3, %xmm3
; AVX512-NEXT:    vpinsrw $4, %r8d, %xmm3, %xmm3
; AVX512-NEXT:    vpinsrw $5, %esi, %xmm3, %xmm3
; AVX512-NEXT:    vpinsrw $6, %edx, %xmm3, %xmm3
; AVX512-NEXT:    movl $0, %edx
; AVX512-NEXT:    cmovpl %ecx, %edx
; AVX512-NEXT:    vpinsrw $7, %edx, %xmm3, %xmm3
; AVX512-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
; AVX512-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm3 = xmm2[1,1,1,1,4,5,6,7]
; AVX512-NEXT:    vcvtph2ps %xmm3, %xmm3
; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512-NEXT:    vucomiss %xmm4, %xmm3
; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
; AVX512-NEXT:    cmovnel %eax, %edx
; AVX512-NEXT:    cmovpl %eax, %edx
; AVX512-NEXT:    vcvtph2ps %xmm2, %xmm3
; AVX512-NEXT:    vucomiss %xmm4, %xmm3
; AVX512-NEXT:    movl $65535, %esi # imm = 0xFFFF
; AVX512-NEXT:    cmovnel %eax, %esi
; AVX512-NEXT:    cmovpl %eax, %esi
; AVX512-NEXT:    vmovd %esi, %xmm3
; AVX512-NEXT:    vpinsrw $1, %edx, %xmm3, %xmm3
; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,1,1]
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    vucomiss %xmm4, %xmm5
; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
; AVX512-NEXT:    cmovnel %eax, %edx
; AVX512-NEXT:    cmovpl %eax, %edx
; AVX512-NEXT:    vpinsrw $2, %edx, %xmm3, %xmm3
; AVX512-NEXT:    vpshuflw {{.*#+}} xmm5 = xmm2[3,3,3,3,4,5,6,7]
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    vucomiss %xmm4, %xmm5
; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
; AVX512-NEXT:    cmovnel %eax, %edx
; AVX512-NEXT:    cmovpl %eax, %edx
; AVX512-NEXT:    vpinsrw $3, %edx, %xmm3, %xmm3
; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[2,3,2,3]
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    vucomiss %xmm4, %xmm5
; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
; AVX512-NEXT:    cmovnel %eax, %edx
; AVX512-NEXT:    cmovpl %eax, %edx
; AVX512-NEXT:    vpinsrw $4, %edx, %xmm3, %xmm3
; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    vucomiss %xmm4, %xmm5
; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
; AVX512-NEXT:    cmovnel %eax, %edx
; AVX512-NEXT:    cmovpl %eax, %edx
; AVX512-NEXT:    vpinsrw $5, %edx, %xmm3, %xmm3
; AVX512-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[3,3,3,3]
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    vucomiss %xmm4, %xmm5
; AVX512-NEXT:    movl $65535, %edx # imm = 0xFFFF
; AVX512-NEXT:    cmovnel %eax, %edx
; AVX512-NEXT:    cmovpl %eax, %edx
; AVX512-NEXT:    vpinsrw $6, %edx, %xmm3, %xmm3
; AVX512-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm2[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512-NEXT:    vcvtph2ps %xmm5, %xmm5
; AVX512-NEXT:    vucomiss %xmm4, %xmm5
; AVX512-NEXT:    cmovnel %eax, %ecx
; AVX512-NEXT:    cmovpl %eax, %ecx
; AVX512-NEXT:    vpinsrw $7, %ecx, %xmm3, %xmm3
; AVX512-NEXT:    vpxor %xmm4, %xmm4, %xmm4
; AVX512-NEXT:    vpcmpeqw %xmm4, %xmm0, %xmm5
; AVX512-NEXT:    vpblendvb %xmm5, %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    vpcmpeqw %xmm4, %xmm1, %xmm4
; AVX512-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpblendvb %xmm3, %xmm0, %xmm2, %xmm0
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    popq %r12
; AVX512-NEXT:    popq %r13
; AVX512-NEXT:    popq %r14
; AVX512-NEXT:    popq %r15
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_v4f16:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxph $1, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_v4f16:
; X86:       # %bb.0:
; X86-NEXT:    subl $164, %esp
; X86-NEXT:    vmovdqa %xmm0, %xmm2
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vpsrlq $48, %xmm0, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm2[1,1,3,3]
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovshdup {{.*#+}} xmm0 = xmm1[1,1,3,3]
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vpsrlq $48, %xmm1, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vpsrld $16, %xmm2, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
; X86-NEXT:    vmovdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vpextrw $0, %xmm1, (%esp)
; X86-NEXT:    calll __extendhfsf2
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
; X86-NEXT:    calll __extendhfsf2
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
; X86-NEXT:    calll __extendhfsf2
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd %xmm2, %eax
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB33_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovdqa %xmm2, %xmm1
; X86-NEXT:    jmp .LBB33_3
; X86-NEXT:  .LBB33_1:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm2, %xmm0
; X86-NEXT:  .LBB33_3:
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __extendhfsf2
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd %xmm1, %eax
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB33_4
; X86-NEXT:  # %bb.5:
; X86-NEXT:    vmovdqa %xmm1, %xmm2
; X86-NEXT:    jmp .LBB33_6
; X86-NEXT:  .LBB33_4:
; X86-NEXT:    vmovdqa %xmm0, %xmm2
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:  .LBB33_6:
; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __truncsfhf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    calll __truncsfhf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
; X86-NEXT:    calll __extendhfsf2
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
; X86-NEXT:    calll __extendhfsf2
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
; X86-NEXT:    calll __extendhfsf2
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vpextrw $0, %xmm0, (%esp)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd %xmm1, %eax
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB33_7
; X86-NEXT:  # %bb.8:
; X86-NEXT:    vmovdqa %xmm1, %xmm2
; X86-NEXT:    jmp .LBB33_9
; X86-NEXT:  .LBB33_7:
; X86-NEXT:    vmovdqa %xmm0, %xmm2
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:  .LBB33_9:
; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __extendhfsf2
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-NEXT:    vmovd %xmm1, %eax
; X86-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    testl %eax, %eax
; X86-NEXT:    js .LBB33_10
; X86-NEXT:  # %bb.11:
; X86-NEXT:    vmovdqa %xmm1, %xmm2
; X86-NEXT:    jmp .LBB33_12
; X86-NEXT:  .LBB33_10:
; X86-NEXT:    vmovdqa %xmm0, %xmm2
; X86-NEXT:    vmovdqa %xmm1, %xmm0
; X86-NEXT:  .LBB33_12:
; X86-NEXT:    vmaxss %xmm2, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __truncsfhf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovd %xmm0, (%esp)
; X86-NEXT:    calll __truncsfhf2
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; X86-NEXT:    addl $164, %esp
; X86-NEXT:    retl
  %r = call <4 x half> @llvm.maximum.v4f16(<4 x half> %x, <4 x half> %y)
  ret <4 x half> %r
}
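
;
; bfloat needs no extension libcall: each bf16 element is widened to f32 by
; shifting its 16-bit payload into the high half of a GPR (shll $16) and
; moving it back into an XMM register, and the results are narrowed with the
; __truncsfbf2 libcall on every target below except AVX10.2's vminmaxbf16.
;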
define <4 x bfloat> @test_fmaximum_v4bf16(<4 x bfloat> %x, <4 x bfloat> %y) {
; SSE2-LABEL: test_fmaximum_v4bf16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    .cfi_def_cfa_offset 16
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    .cfi_def_cfa_offset 24
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    .cfi_def_cfa_offset 32
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    .cfi_def_cfa_offset 40
; SSE2-NEXT:    subq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 96
; SSE2-NEXT:    .cfi_offset %rbx, -40
; SSE2-NEXT:    .cfi_offset %r14, -32
; SSE2-NEXT:    .cfi_offset %r15, -24
; SSE2-NEXT:    .cfi_offset %rbp, -16
; SSE2-NEXT:    movdqa %xmm1, %xmm4
; SSE2-NEXT:    movdqa %xmm0, %xmm5
; SSE2-NEXT:    pextrw $0, %xmm1, %r14d
; SSE2-NEXT:    pextrw $0, %xmm0, %r15d
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    pextrw $0, %xmm0, %eax
; SSE2-NEXT:    movdqa %xmm5, %xmm0
; SSE2-NEXT:    psrld $16, %xmm0
; SSE2-NEXT:    pextrw $0, %xmm0, %ecx
; SSE2-NEXT:    shll $16, %ecx
; SSE2-NEXT:    movd %ecx, %xmm3
; SSE2-NEXT:    shll $16, %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    testl %ecx, %ecx
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    js .LBB34_2
; SSE2-NEXT:  # %bb.1:
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:  .LBB34_2:
; SSE2-NEXT:    movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,1,1,1]
; SSE2-NEXT:    movdqa %xmm5, (%rsp) # 16-byte Spill
; SSE2-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,1,1,1]
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    cmpunordss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm6
; SSE2-NEXT:    andps %xmm1, %xmm6
; SSE2-NEXT:    js .LBB34_4
; SSE2-NEXT:  # %bb.3:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:  .LBB34_4:
; SSE2-NEXT:    pextrw $0, %xmm4, %ebp
; SSE2-NEXT:    pextrw $0, %xmm5, %ebx
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm6, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    shll $16, %r15d
; SSE2-NEXT:    movd %r15d, %xmm3
; SSE2-NEXT:    shll $16, %r14d
; SSE2-NEXT:    movd %r14d, %xmm2
; SSE2-NEXT:    testl %r15d, %r15d
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    js .LBB34_6
; SSE2-NEXT:  # %bb.5:
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:  .LBB34_6:
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload
; SSE2-NEXT:    psrlq $48, %xmm5
; SSE2-NEXT:    movdqa (%rsp), %xmm6 # 16-byte Reload
; SSE2-NEXT:    psrlq $48, %xmm6
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    cmpunordss %xmm1, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    andps %xmm1, %xmm4
; SSE2-NEXT:    js .LBB34_8
; SSE2-NEXT:  # %bb.7:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:  .LBB34_8:
; SSE2-NEXT:    pextrw $0, %xmm5, %r15d
; SSE2-NEXT:    pextrw $0, %xmm6, %r14d
; SSE2-NEXT:    maxss %xmm2, %xmm1
; SSE2-NEXT:    andnps %xmm1, %xmm0
; SSE2-NEXT:    orps %xmm4, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE2-NEXT:    shll $16, %ebx
; SSE2-NEXT:    movd %ebx, %xmm1
; SSE2-NEXT:    shll $16, %ebp
; SSE2-NEXT:    movd %ebp, %xmm3
; SSE2-NEXT:    testl %ebx, %ebx
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    js .LBB34_10
; SSE2-NEXT:  # %bb.9:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:  .LBB34_10:
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    andps %xmm2, %xmm4
; SSE2-NEXT:    js .LBB34_12
; SSE2-NEXT:  # %bb.11:
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:  .LBB34_12:
; SSE2-NEXT:    maxss %xmm3, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm0
; SSE2-NEXT:    orps %xmm4, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE2-NEXT:    shll $16, %r14d
; SSE2-NEXT:    movd %r14d, %xmm1
; SSE2-NEXT:    shll $16, %r15d
; SSE2-NEXT:    movd %r15d, %xmm3
; SSE2-NEXT:    testl %r14d, %r14d
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    js .LBB34_14
; SSE2-NEXT:  # %bb.13:
; SSE2-NEXT:    movdqa %xmm3, %xmm2
; SSE2-NEXT:  .LBB34_14:
; SSE2-NEXT:    movdqa %xmm2, %xmm0
; SSE2-NEXT:    cmpunordss %xmm2, %xmm0
; SSE2-NEXT:    movaps %xmm0, %xmm4
; SSE2-NEXT:    andps %xmm2, %xmm4
; SSE2-NEXT:    js .LBB34_16
; SSE2-NEXT:  # %bb.15:
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:  .LBB34_16:
; SSE2-NEXT:    maxss %xmm3, %xmm2
; SSE2-NEXT:    andnps %xmm2, %xmm0
; SSE2-NEXT:    orps %xmm4, %xmm0
; SSE2-NEXT:    callq __truncsfbf2@PLT
; SSE2-NEXT:    movdqa (%rsp), %xmm1 # 16-byte Reload
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE2-NEXT:    punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; SSE2-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    addq $56, %rsp
; SSE2-NEXT:    .cfi_def_cfa_offset 40
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    .cfi_def_cfa_offset 32
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    .cfi_def_cfa_offset 24
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    .cfi_def_cfa_offset 16
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    .cfi_def_cfa_offset 8
; SSE2-NEXT:    retq
;
; AVX1-LABEL: test_fmaximum_v4bf16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbp
; AVX1-NEXT:    .cfi_def_cfa_offset 16
; AVX1-NEXT:    pushq %r15
; AVX1-NEXT:    .cfi_def_cfa_offset 24
; AVX1-NEXT:    pushq %r14
; AVX1-NEXT:    .cfi_def_cfa_offset 32
; AVX1-NEXT:    pushq %r13
; AVX1-NEXT:    .cfi_def_cfa_offset 40
; AVX1-NEXT:    pushq %r12
; AVX1-NEXT:    .cfi_def_cfa_offset 48
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    .cfi_def_cfa_offset 56
; AVX1-NEXT:    subq $56, %rsp
; AVX1-NEXT:    .cfi_def_cfa_offset 112
; AVX1-NEXT:    .cfi_offset %rbx, -56
; AVX1-NEXT:    .cfi_offset %r12, -48
; AVX1-NEXT:    .cfi_offset %r13, -40
; AVX1-NEXT:    .cfi_offset %r14, -32
; AVX1-NEXT:    .cfi_offset %r15, -24
; AVX1-NEXT:    .cfi_offset %rbp, -16
; AVX1-NEXT:    vpsrlq $48, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $48, %xmm1, %xmm3
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpextrw $0, %xmm4, %ebx
; AVX1-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpextrw $0, %xmm4, %ebp
; AVX1-NEXT:    vpextrw $0, %xmm0, %r12d
; AVX1-NEXT:    vpextrw $0, %xmm1, %r13d
; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX1-NEXT:    vpextrw $0, %xmm0, %eax
; AVX1-NEXT:    vpsrld $16, %xmm1, %xmm0
; AVX1-NEXT:    vpextrw $0, %xmm0, %ecx
; AVX1-NEXT:    shll $16, %ecx
; AVX1-NEXT:    vmovd %ecx, %xmm0
; AVX1-NEXT:    shll $16, %eax
; AVX1-NEXT:    vmovd %eax, %xmm4
; AVX1-NEXT:    js .LBB34_1
; AVX1-NEXT:  # %bb.2:
; AVX1-NEXT:    vmovdqa %xmm4, %xmm1
; AVX1-NEXT:    jmp .LBB34_3
; AVX1-NEXT:  .LBB34_1:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm4, %xmm0
; AVX1-NEXT:  .LBB34_3:
; AVX1-NEXT:    vpextrw $0, %xmm2, %r14d
; AVX1-NEXT:    vpextrw $0, %xmm3, %r15d
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    callq __truncsfbf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    shll $16, %r13d
; AVX1-NEXT:    vmovd %r13d, %xmm0
; AVX1-NEXT:    shll $16, %r12d
; AVX1-NEXT:    vmovd %r12d, %xmm2
; AVX1-NEXT:    js .LBB34_4
; AVX1-NEXT:  # %bb.5:
; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
; AVX1-NEXT:    jmp .LBB34_6
; AVX1-NEXT:  .LBB34_4:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:  .LBB34_6:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    callq __truncsfbf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    shll $16, %ebp
; AVX1-NEXT:    vmovd %ebp, %xmm0
; AVX1-NEXT:    shll $16, %ebx
; AVX1-NEXT:    vmovd %ebx, %xmm2
; AVX1-NEXT:    js .LBB34_7
; AVX1-NEXT:  # %bb.8:
; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
; AVX1-NEXT:    jmp .LBB34_9
; AVX1-NEXT:  .LBB34_7:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:  .LBB34_9:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    callq __truncsfbf2@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    shll $16, %r15d
; AVX1-NEXT:    vmovd %r15d, %xmm0
; AVX1-NEXT:    shll $16, %r14d
; AVX1-NEXT:    vmovd %r14d, %xmm2
; AVX1-NEXT:    js .LBB34_10
; AVX1-NEXT:  # %bb.11:
; AVX1-NEXT:    vmovdqa %xmm2, %xmm1
; AVX1-NEXT:    jmp .LBB34_12
; AVX1-NEXT:  .LBB34_10:
; AVX1-NEXT:    vmovdqa %xmm0, %xmm1
; AVX1-NEXT:    vmovdqa %xmm2, %xmm0
; AVX1-NEXT:  .LBB34_12:
; AVX1-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX1-NEXT:    callq __truncsfbf2@PLT
; AVX1-NEXT:    vmovdqa (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; AVX1-NEXT:    vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; AVX1-NEXT:    addq $56, %rsp
; AVX1-NEXT:    .cfi_def_cfa_offset 56
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    .cfi_def_cfa_offset 48
; AVX1-NEXT:    popq %r12
; AVX1-NEXT:    .cfi_def_cfa_offset 40
; AVX1-NEXT:    popq %r13
; AVX1-NEXT:    .cfi_def_cfa_offset 32
; AVX1-NEXT:    popq %r14
; AVX1-NEXT:    .cfi_def_cfa_offset 24
; AVX1-NEXT:    popq %r15
; AVX1-NEXT:    .cfi_def_cfa_offset 16
; AVX1-NEXT:    popq %rbp
; AVX1-NEXT:    .cfi_def_cfa_offset 8
; AVX1-NEXT:    retq
;
; AVX512-LABEL: test_fmaximum_v4bf16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    pushq %rbp
; AVX512-NEXT:    .cfi_def_cfa_offset 16
; AVX512-NEXT:    pushq %r15
; AVX512-NEXT:    .cfi_def_cfa_offset 24
; AVX512-NEXT:    pushq %r14
; AVX512-NEXT:    .cfi_def_cfa_offset 32
; AVX512-NEXT:    pushq %r13
; AVX512-NEXT:    .cfi_def_cfa_offset 40
; AVX512-NEXT:    pushq %r12
; AVX512-NEXT:    .cfi_def_cfa_offset 48
; AVX512-NEXT:    pushq %rbx
; AVX512-NEXT:    .cfi_def_cfa_offset 56
; AVX512-NEXT:    pushq %rax
; AVX512-NEXT:    .cfi_def_cfa_offset 64
; AVX512-NEXT:    .cfi_offset %rbx, -56
; AVX512-NEXT:    .cfi_offset %r12, -48
; AVX512-NEXT:    .cfi_offset %r13, -40
; AVX512-NEXT:    .cfi_offset %r14, -32
; AVX512-NEXT:    .cfi_offset %r15, -24
; AVX512-NEXT:    .cfi_offset %rbp, -16
; AVX512-NEXT:    vmovq %xmm1, %r13
; AVX512-NEXT:    movq %r13, %rbx
; AVX512-NEXT:    shrq $32, %rbx
; AVX512-NEXT:    vmovq %xmm0, %rbp
; AVX512-NEXT:    movq %rbp, %r14
; AVX512-NEXT:    shrq $32, %r14
; AVX512-NEXT:    movq %r13, %r15
; AVX512-NEXT:    shrq $48, %r15
; AVX512-NEXT:    movq %rbp, %r12
; AVX512-NEXT:    shrq $48, %r12
; AVX512-NEXT:    movl %ebp, %eax
; AVX512-NEXT:    andl $-65536, %eax # imm = 0xFFFF0000
; AVX512-NEXT:    sets %cl
; AVX512-NEXT:    kmovw %ecx, %k1
; AVX512-NEXT:    movl %r13d, %ecx
; AVX512-NEXT:    andl $-65536, %ecx # imm = 0xFFFF0000
; AVX512-NEXT:    vmovd %ecx, %xmm1
; AVX512-NEXT:    vmovd %eax, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    callq __truncsfbf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    shll $16, %ebp
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    shll $16, %r13d
; AVX512-NEXT:    vmovd %r13d, %xmm1
; AVX512-NEXT:    vmovd %ebp, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    callq __truncsfbf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsp)
; AVX512-NEXT:    shll $16, %r12d
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    shll $16, %r15d
; AVX512-NEXT:    vmovd %r15d, %xmm1
; AVX512-NEXT:    vmovd %r12d, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    callq __truncsfbf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    shll $16, %r14d
; AVX512-NEXT:    sets %al
; AVX512-NEXT:    kmovw %eax, %k1
; AVX512-NEXT:    shll $16, %ebx
; AVX512-NEXT:    vmovd %ebx, %xmm1
; AVX512-NEXT:    vmovd %r14d, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, %xmm2
; AVX512-NEXT:    vmovss %xmm1, %xmm2, %xmm2 {%k1}
; AVX512-NEXT:    vmovss %xmm0, %xmm1, %xmm1 {%k1}
; AVX512-NEXT:    vmaxss %xmm2, %xmm1, %xmm0
; AVX512-NEXT:    vcmpunordss %xmm1, %xmm1, %k1
; AVX512-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1}
; AVX512-NEXT:    callq __truncsfbf2@PLT
; AVX512-NEXT:    vpextrw $0, %xmm0, {{[0-9]+}}(%rsp)
; AVX512-NEXT:    vmovaps (%rsp), %xmm0
; AVX512-NEXT:    addq $8, %rsp
; AVX512-NEXT:    .cfi_def_cfa_offset 56
; AVX512-NEXT:    popq %rbx
; AVX512-NEXT:    .cfi_def_cfa_offset 48
; AVX512-NEXT:    popq %r12
; AVX512-NEXT:    .cfi_def_cfa_offset 40
; AVX512-NEXT:    popq %r13
; AVX512-NEXT:    .cfi_def_cfa_offset 32
; AVX512-NEXT:    popq %r14
; AVX512-NEXT:    .cfi_def_cfa_offset 24
; AVX512-NEXT:    popq %r15
; AVX512-NEXT:    .cfi_def_cfa_offset 16
; AVX512-NEXT:    popq %rbp
; AVX512-NEXT:    .cfi_def_cfa_offset 8
; AVX512-NEXT:    retq
;
; AVX10_2-LABEL: test_fmaximum_v4bf16:
; AVX10_2:       # %bb.0:
; AVX10_2-NEXT:    vminmaxbf16 $1, %xmm1, %xmm0, %xmm0
; AVX10_2-NEXT:    retq
;
; X86-LABEL: test_fmaximum_v4bf16:
; X86:       # %bb.0:
; X86-NEXT:    pushl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    pushl %ebx
; X86-NEXT:    .cfi_def_cfa_offset 12
; X86-NEXT:    pushl %edi
; X86-NEXT:    .cfi_def_cfa_offset 16
; X86-NEXT:    pushl %esi
; X86-NEXT:    .cfi_def_cfa_offset 20
; X86-NEXT:    subl $68, %esp
; X86-NEXT:    .cfi_def_cfa_offset 88
; X86-NEXT:    .cfi_offset %esi, -20
; X86-NEXT:    .cfi_offset %edi, -16
; X86-NEXT:    .cfi_offset %ebx, -12
; X86-NEXT:    .cfi_offset %ebp, -8
; X86-NEXT:    vpsrlq $48, %xmm0, %xmm2
; X86-NEXT:    vpsrlq $48, %xmm1, %xmm3
; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3]
; X86-NEXT:    vpextrw $0, %xmm4, %esi
; X86-NEXT:    vmovshdup {{.*#+}} xmm4 = xmm1[1,1,3,3]
; X86-NEXT:    vpextrw $0, %xmm4, %ebx
; X86-NEXT:    vpextrw $0, %xmm0, %eax
; X86-NEXT:    vpextrw $0, %xmm1, %ecx
; X86-NEXT:    vpsrld $16, %xmm0, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, %edx
; X86-NEXT:    vpsrld $16, %xmm1, %xmm0
; X86-NEXT:    vpextrw $0, %xmm0, %edi
; X86-NEXT:    shll $16, %edi
; X86-NEXT:    vmovd %edi, %xmm0
; X86-NEXT:    shll $16, %edx
; X86-NEXT:    vmovd %edx, %xmm4
; X86-NEXT:    js .LBB34_1
; X86-NEXT:  # %bb.2:
; X86-NEXT:    vmovdqa %xmm4, %xmm1
; X86-NEXT:    jmp .LBB34_3
; X86-NEXT:  .LBB34_1:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm4, %xmm0
; X86-NEXT:  .LBB34_3:
; X86-NEXT:    vpextrw $0, %xmm2, %edi
; X86-NEXT:    vpextrw $0, %xmm3, %ebp
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    shll $16, %ecx
; X86-NEXT:    vmovd %ecx, %xmm0
; X86-NEXT:    shll $16, %eax
; X86-NEXT:    vmovd %eax, %xmm2
; X86-NEXT:    js .LBB34_4
; X86-NEXT:  # %bb.5:
; X86-NEXT:    vmovdqa %xmm2, %xmm1
; X86-NEXT:    jmp .LBB34_6
; X86-NEXT:  .LBB34_4:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm2, %xmm0
; X86-NEXT:  .LBB34_6:
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __truncsfbf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    shll $16, %ebx
; X86-NEXT:    vmovd %ebx, %xmm0
; X86-NEXT:    shll $16, %esi
; X86-NEXT:    vmovd %esi, %xmm2
; X86-NEXT:    js .LBB34_7
; X86-NEXT:  # %bb.8:
; X86-NEXT:    vmovdqa %xmm2, %xmm1
; X86-NEXT:    jmp .LBB34_9
; X86-NEXT:  .LBB34_7:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm2, %xmm0
; X86-NEXT:  .LBB34_9:
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __truncsfbf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    shll $16, %ebp
; X86-NEXT:    vmovd %ebp, %xmm0
; X86-NEXT:    shll $16, %edi
; X86-NEXT:    vmovd %edi, %xmm2
; X86-NEXT:    js .LBB34_10
; X86-NEXT:  # %bb.11:
; X86-NEXT:    vmovdqa %xmm2, %xmm1
; X86-NEXT:    jmp .LBB34_12
; X86-NEXT:  .LBB34_10:
; X86-NEXT:    vmovdqa %xmm0, %xmm1
; X86-NEXT:    vmovdqa %xmm2, %xmm0
; X86-NEXT:  .LBB34_12:
; X86-NEXT:    vmaxss %xmm1, %xmm0, %xmm1
; X86-NEXT:    vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    calll __truncsfbf2
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovd %xmm0, (%esp)
; X86-NEXT:    calll __truncsfbf2
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; X86-NEXT:    vmovdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
; X86-NEXT:    vpunpcklwd {{[-0-9]+}}(%e{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; X86-NEXT:    # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero
; X86-NEXT:    addl $68, %esp
; X86-NEXT:    .cfi_def_cfa_offset 20
; X86-NEXT:    popl %esi
; X86-NEXT:    .cfi_def_cfa_offset 16
; X86-NEXT:    popl %edi
; X86-NEXT:    .cfi_def_cfa_offset 12
; X86-NEXT:    popl %ebx
; X86-NEXT:    .cfi_def_cfa_offset 8
; X86-NEXT:    popl %ebp
; X86-NEXT:    .cfi_def_cfa_offset 4
; X86-NEXT:    retl
  %r = call <4 x bfloat> @llvm.maximum.v4bf16(<4 x bfloat> %x, <4 x bfloat> %y)
  ret <4 x bfloat> %r
}
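
;
; Note on the expansions above: llvm.maximum.* has IEEE-754 maximum semantics,
; so a NaN in either operand propagates and +0.0 compares greater than -0.0.
; Plain maxss/maxps instead return their source operand when the inputs are
; unordered or both zero, which is why the lowering swaps the operands based
; on a sign-bit test and then patches NaN results with a cmpunordss blend.
; Ignoring the signed-zero case (which that operand swap handles), the same
; semantics can be written in plain IR roughly as follows; this is an
; illustrative sketch only, not part of the checked output:
;   %gt  = fcmp ogt float %x, %y
;   %max = select i1 %gt, float %x, float %y
;   %nan = fcmp uno float %x, %x
;   %res = select i1 %nan, float %x, float %max
;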