; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X64
; RUN: llc < %s -mtriple=i686-- -mattr=avx2,fma | FileCheck %s --check-prefixes=CHECK,X86

; Each test applies a vector FP op and extracts only lane 0, checking that
; codegen narrows the operation to a scalar instruction where profitable.
; Do not edit the CHECK lines by hand; regenerate with the script above.

define float @fneg_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fneg_v4f32:
; X64: # %bb.0:
; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fneg_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fneg <4 x float> %x
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fneg_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fneg_v4f64:
; X64: # %bb.0:
; X64-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; X64-NEXT: # xmm1 = mem[0,0]
; X64-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fneg_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovddup {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
; X86-NEXT: # xmm1 = mem[0,0]
; X86-NEXT: vxorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fneg <4 x double> %x
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fadd_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fadd_v4f32:
; X64: # %bb.0:
; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fadd_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fadd <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fadd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fadd_v4f64:
; X64: # %bb.0:
; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fadd_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fadd <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fsub_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fsub_v4f32:
; X64: # %bb.0:
; X64-NEXT: vsubss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fsub_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vsubss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fsub <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fsub_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fsub_v4f64:
; X64: # %bb.0:
; X64-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fsub_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vsubsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fsub <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fmul_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmul_v4f32:
; X64: # %bb.0:
; X64-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fmul_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmulss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fmul <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fmul_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmul_v4f64:
; X64: # %bb.0:
; X64-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fmul_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmulsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fmul <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fdiv_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fdiv_v4f32:
; X64: # %bb.0:
; X64-NEXT: vdivss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fdiv_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vdivss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = fdiv <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fdiv_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fdiv_v4f64:
; X64: # %bb.0:
; X64-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fdiv_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vdivsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = fdiv <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @frem_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: frem_v4f32:
; X64: # %bb.0:
; X64-NEXT: jmp fmodf@PLT # TAILCALL
;
; X86-LABEL: frem_v4f32:
; X86: # %bb.0:
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovss %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll fmodf
; X86-NEXT: addl $8, %esp
; X86-NEXT: retl
  %v = frem <4 x float> %x, %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @frem_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: frem_v4f64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: # kill: def $xmm1 killed $xmm1 killed $ymm1
; X64-NEXT: vzeroupper
; X64-NEXT: jmp fmod@PLT # TAILCALL
;
; X86-LABEL: frem_v4f64:
; X86: # %bb.0:
; X86-NEXT: subl $16, %esp
; X86-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; X86-NEXT: vmovups %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll fmod
; X86-NEXT: addl $16, %esp
; X86-NEXT: retl
  %v = frem <4 x double> %x, %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define i1 @fcmp_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; CHECK-LABEL: fcmp_v4f32:
; CHECK: # %bb.0:
; CHECK-NEXT: vucomiss %xmm1, %xmm0
; CHECK-NEXT: seta %al
; CHECK-NEXT: ret{{[l|q]}}
  %v = fcmp ogt <4 x float> %x, %y
  %r = extractelement <4 x i1> %v, i32 0
  ret i1 %r
}

define i1 @fcmp_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; CHECK-LABEL: fcmp_v4f64:
; CHECK: # %bb.0:
; CHECK-NEXT: vucomisd %xmm0, %xmm1
; CHECK-NEXT: setb %al
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: ret{{[l|q]}}
  %v = fcmp ugt <4 x double> %x, %y
  %r = extractelement <4 x i1> %v, i32 0
  ret i1 %r
}

; If we do the fcmp transform late, make sure we have the right types.
; https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=13700

define void @extsetcc(<4 x float> %x) {
; X64-LABEL: extsetcc:
; X64: # %bb.0:
; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-NEXT: vucomiss %xmm1, %xmm0
; X64-NEXT: setb (%rax)
; X64-NEXT: retq
;
; X86-LABEL: extsetcc:
; X86: # %bb.0:
; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-NEXT: vucomiss %xmm1, %xmm0
; X86-NEXT: setb (%eax)
; X86-NEXT: retl
  %cmp = fcmp ult <4 x float> %x, zeroinitializer
  %sext = sext <4 x i1> %cmp to <4 x i32>
  %e = extractelement <4 x i1> %cmp, i1 0
  store i1 %e, ptr undef
  ret void
}

; This used to crash by creating a setcc with an i64 condition on a 32-bit target.
define <3 x double> @extvselectsetcc_crash(<2 x double> %x) {
; X64-LABEL: extvselectsetcc_crash:
; X64: # %bb.0:
; X64-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; X64-NEXT: vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
; X64-NEXT: vandpd %xmm2, %xmm1, %xmm1
; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
; X64-NEXT: retq
;
; X86-LABEL: extvselectsetcc_crash:
; X86: # %bb.0:
; X86-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
; X86-NEXT: vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
; X86-NEXT: vandpd %xmm2, %xmm1, %xmm1
; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; X86-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
; X86-NEXT: retl
  %cmp = fcmp oeq <2 x double> %x, <double 5.0, double 5.0>
  %s = select <2 x i1> %cmp, <2 x double> <double 1.0, double undef>, <2 x double> <double 0.0, double undef>
  %r = shufflevector <2 x double> %s, <2 x double> %x, <3 x i32> <i32 0, i32 2, i32 3>
  ret <3 x double> %r
}

define float @select_fcmp_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z, <4 x float> %w) nounwind {
; X64-LABEL: select_fcmp_v4f32:
; X64: # %bb.0:
; X64-NEXT: vcmpneq_oqss %xmm1, %xmm0, %xmm0
; X64-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: select_fcmp_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $16, %esp
; X86-NEXT: vmovaps 8(%ebp), %xmm3
; X86-NEXT: vcmpneq_oqss %xmm1, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm2, %xmm3, %xmm0
; X86-NEXT: vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: flds {{[0-9]+}}(%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: retl
  %c = fcmp one <4 x float> %x, %y
  %s = select <4 x i1> %c, <4 x float> %z, <4 x float> %w
  %r = extractelement <4 x float> %s, i32 0
  ret float %r
}

define double @select_fcmp_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z, <4 x double> %w) nounwind {
; X64-LABEL: select_fcmp_v4f64:
; X64: # %bb.0:
; X64-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm0
; X64-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: select_fcmp_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-32, %esp
; X86-NEXT: subl $32, %esp
; X86-NEXT: vcmpnltsd %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; X86-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; X86-NEXT: vmovlpd %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: fldl {{[0-9]+}}(%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %c = fcmp ule <4 x double> %x, %y
  %s = select <4 x i1> %c, <4 x double> %z, <4 x double> %w
  %r = extractelement <4 x double> %s, i32 0
  ret double %r
}

define float @fsqrt_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fsqrt_v4f32:
; X64: # %bb.0:
; X64-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fsqrt_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fsqrt_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fsqrt_v4f64:
; X64: # %bb.0:
; X64-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fsqrt_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fsin_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fsin_v4f32:
; X64: # %bb.0:
; X64-NEXT: jmp sinf@PLT # TAILCALL
;
; X86-LABEL: fsin_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: calll sinf
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.sin.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fsin_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fsin_v4f64:
; X64: # %bb.0:
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: jmp sin@PLT # TAILCALL
;
; X86-LABEL: fsin_v4f64:
; X86: # %bb.0:
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: vzeroupper
; X86-NEXT: calll sin
; X86-NEXT: addl $8, %esp
; X86-NEXT: retl
  %v = call <4 x double> @llvm.sin.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fma_v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z) nounwind {
; X64-LABEL: fma_v4f32:
; X64: # %bb.0:
; X64-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: retq
;
; X86-LABEL: fma_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.fma.v4f32(<4 x float> %x, <4 x float> %y, <4 x float> %z)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fma_v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z) nounwind {
; X64-LABEL: fma_v4f64:
; X64: # %bb.0:
; X64-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm2
; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fma_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + xmm2
; X86-NEXT: vmovsd %xmm1, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %z)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fabs_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: fabs_v4f32:
; X64: # %bb.0:
; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; X64-NEXT: vandps %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fabs_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; X86-NEXT: vandps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.fabs.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fabs_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: fabs_v4f64:
; X64: # %bb.0:
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fabs_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.fabs.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fmaxnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmaxnum_v4f32:
; X64: # %bb.0:
; X64-NEXT: vmaxss %xmm0, %xmm1, %xmm2
; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fmaxnum_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmaxss %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fmaxnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmaxnum_v4f64:
; X64: # %bb.0:
; X64-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fmaxnum_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmaxsd %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fminnum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fminnum_v4f32:
; X64: # %bb.0:
; X64-NEXT: vminss %xmm0, %xmm1, %xmm2
; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; X64-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fminnum_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vminss %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.minnum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fminnum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fminnum_v4f64:
; X64: # %bb.0:
; X64-NEXT: vminsd %xmm0, %xmm1, %xmm2
; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; X64-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fminnum_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vminsd %xmm0, %xmm1, %xmm2
; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm0
; X86-NEXT: vblendvpd %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.minnum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fmaximum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fmaximum_v4f32:
; X64: # %bb.0:
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: testl %eax, %eax
; X64-NEXT: js .LBB30_1
; X64-NEXT: # %bb.2:
; X64-NEXT: vmovdqa %xmm0, %xmm2
; X64-NEXT: jmp .LBB30_3
; X64-NEXT: .LBB30_1:
; X64-NEXT: vmovdqa %xmm1, %xmm2
; X64-NEXT: vmovdqa %xmm0, %xmm1
; X64-NEXT: .LBB30_3:
; X64-NEXT: vmaxss %xmm2, %xmm1, %xmm0
; X64-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
; X64-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fmaximum_v4f32:
; X86: # %bb.0:
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB30_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovdqa %xmm0, %xmm2
; X86-NEXT: jmp .LBB30_3
; X86-NEXT: .LBB30_1:
; X86-NEXT: vmovdqa %xmm1, %xmm2
; X86-NEXT: vmovdqa %xmm0, %xmm1
; X86-NEXT: .LBB30_3:
; X86-NEXT: pushl %eax
; X86-NEXT: vmaxss %xmm2, %xmm1, %xmm0
; X86-NEXT: vcmpunordss %xmm1, %xmm1, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.maximum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fmaximum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fmaximum_v4f64:
; X64: # %bb.0:
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: testq %rax, %rax
; X64-NEXT: js .LBB31_1
; X64-NEXT: # %bb.2:
; X64-NEXT: vmovdqa %xmm0, %xmm2
; X64-NEXT: jmp .LBB31_3
; X64-NEXT: .LBB31_1:
; X64-NEXT: vmovdqa %xmm1, %xmm2
; X64-NEXT: vmovdqa %xmm0, %xmm1
; X64-NEXT: .LBB31_3:
; X64-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
; X64-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
; X64-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fmaximum_v4f64:
; X86: # %bb.0:
; X86-NEXT: vextractps $1, %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB31_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovapd %xmm0, %xmm2
; X86-NEXT: jmp .LBB31_3
; X86-NEXT: .LBB31_1:
; X86-NEXT: vmovapd %xmm1, %xmm2
; X86-NEXT: vmovapd %xmm0, %xmm1
; X86-NEXT: .LBB31_3:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmaxsd %xmm2, %xmm1, %xmm0
; X86-NEXT: vcmpunordsd %xmm1, %xmm1, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.maximum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @fminimum_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: fminimum_v4f32:
; X64: # %bb.0:
; X64-NEXT: vmovd %xmm0, %eax
; X64-NEXT: testl %eax, %eax
; X64-NEXT: js .LBB32_1
; X64-NEXT: # %bb.2:
; X64-NEXT: vmovdqa %xmm1, %xmm2
; X64-NEXT: jmp .LBB32_3
; X64-NEXT: .LBB32_1:
; X64-NEXT: vmovdqa %xmm0, %xmm2
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: .LBB32_3:
; X64-NEXT: vminss %xmm2, %xmm0, %xmm1
; X64-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X64-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: fminimum_v4f32:
; X86: # %bb.0:
; X86-NEXT: vmovd %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB32_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovdqa %xmm1, %xmm2
; X86-NEXT: jmp .LBB32_3
; X86-NEXT: .LBB32_1:
; X86-NEXT: vmovdqa %xmm0, %xmm2
; X86-NEXT: vmovdqa %xmm1, %xmm0
; X86-NEXT: .LBB32_3:
; X86-NEXT: pushl %eax
; X86-NEXT: vminss %xmm2, %xmm0, %xmm1
; X86-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.minimum.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @fminimum_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: fminimum_v4f64:
; X64: # %bb.0:
; X64-NEXT: vmovq %xmm0, %rax
; X64-NEXT: testq %rax, %rax
; X64-NEXT: js .LBB33_1
; X64-NEXT: # %bb.2:
; X64-NEXT: vmovdqa %xmm1, %xmm2
; X64-NEXT: jmp .LBB33_3
; X64-NEXT: .LBB33_1:
; X64-NEXT: vmovdqa %xmm0, %xmm2
; X64-NEXT: vmovdqa %xmm1, %xmm0
; X64-NEXT: .LBB33_3:
; X64-NEXT: vminsd %xmm2, %xmm0, %xmm1
; X64-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
; X64-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: fminimum_v4f64:
; X86: # %bb.0:
; X86-NEXT: vextractps $1, %xmm0, %eax
; X86-NEXT: testl %eax, %eax
; X86-NEXT: js .LBB33_1
; X86-NEXT: # %bb.2:
; X86-NEXT: vmovapd %xmm1, %xmm2
; X86-NEXT: jmp .LBB33_3
; X86-NEXT: .LBB33_1:
; X86-NEXT: vmovapd %xmm0, %xmm2
; X86-NEXT: vmovapd %xmm1, %xmm0
; X86-NEXT: .LBB33_3:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vminsd %xmm2, %xmm0, %xmm1
; X86-NEXT: vcmpunordsd %xmm0, %xmm0, %xmm2
; X86-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; X86-NEXT: vmovlpd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.minimum.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @maxps_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: maxps_v4f32:
; X64: # %bb.0:
; X64-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: maxps_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vmaxss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %cmp = fcmp ogt <4 x float> %x, %y
  %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @maxpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: maxpd_v4f64:
; X64: # %bb.0:
; X64-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: maxpd_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vmaxsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %cmp = fcmp ogt <4 x double> %x, %y
  %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @minps_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: minps_v4f32:
; X64: # %bb.0:
; X64-NEXT: vminss %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: minps_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vminss %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %cmp = fcmp olt <4 x float> %x, %y
  %v = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @minpd_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: minpd_v4f64:
; X64: # %bb.0:
; X64-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: minpd_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vminsd %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %cmp = fcmp olt <4 x double> %x, %y
  %v = select <4 x i1> %cmp, <4 x double> %x, <4 x double> %y
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @copysign_v4f32(<4 x float> %x, <4 x float> %y) nounwind {
; X64-LABEL: copysign_v4f32:
; X64: # %bb.0:
; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X64-NEXT: vandps %xmm2, %xmm1, %xmm1
; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; X64-NEXT: vandps %xmm2, %xmm0, %xmm0
; X64-NEXT: vorps %xmm1, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: copysign_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; X86-NEXT: vandps %xmm2, %xmm1, %xmm1
; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [NaN,NaN,NaN,NaN]
; X86-NEXT: vandps %xmm2, %xmm0, %xmm0
; X86-NEXT: vorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.copysign.v4f32(<4 x float> %x, <4 x float> %y)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @copysign_v4f64(<4 x double> %x, <4 x double> %y) nounwind {
; X64-LABEL: copysign_v4f64:
; X64: # %bb.0:
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-NEXT: vorps %xmm1, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: copysign_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT: vorps %xmm1, %xmm0, %xmm0
; X86-NEXT: vmovlps %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.copysign.v4f64(<4 x double> %x, <4 x double> %y)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @floor_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: floor_v4f32:
; X64: # %bb.0:
; X64-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: floor_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.floor.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @floor_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: floor_v4f64:
; X64: # %bb.0:
; X64-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: floor_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.floor.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @ceil_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: ceil_v4f32:
; X64: # %bb.0:
; X64-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: ceil_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @ceil_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: ceil_v4f64:
; X64: # %bb.0:
; X64-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: ceil_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @trunc_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: trunc_v4f32:
; X64: # %bb.0:
; X64-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: trunc_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @trunc_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: trunc_v4f64:
; X64: # %bb.0:
; X64-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: trunc_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @rint_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: rint_v4f32:
; X64: # %bb.0:
; X64-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: rint_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.rint.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double @rint_v4f64(<4 x double> %x) nounwind {
; X64-LABEL: rint_v4f64:
; X64: # %bb.0:
; X64-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
; X64-NEXT: vzeroupper
; X64-NEXT: retq
;
; X86-LABEL: rint_v4f64:
; X86: # %bb.0:
; X86-NEXT: pushl %ebp
; X86-NEXT: movl %esp, %ebp
; X86-NEXT: andl $-8, %esp
; X86-NEXT: subl $8, %esp
; X86-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovsd %xmm0, (%esp)
; X86-NEXT: fldl (%esp)
; X86-NEXT: movl %ebp, %esp
; X86-NEXT: popl %ebp
; X86-NEXT: vzeroupper
; X86-NEXT: retl
  %v = call <4 x double> @llvm.rint.v4f64(<4 x double> %x)
  %r = extractelement <4 x double> %v, i32 0
  ret double %r
}

define float @nearbyint_v4f32(<4 x float> %x) nounwind {
; X64-LABEL: nearbyint_v4f32:
; X64: # %bb.0:
; X64-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; X64-NEXT: retq
;
; X86-LABEL: nearbyint_v4f32:
; X86: # %bb.0:
; X86-NEXT: pushl %eax
; X86-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
; X86-NEXT: vmovss %xmm0, (%esp)
; X86-NEXT: flds (%esp)
; X86-NEXT: popl %eax
; X86-NEXT: retl
  %v = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %x)
  %r = extractelement <4 x float> %v, i32 0
  ret float %r
}

define double
@nearbyint_v4f64(<4 x double> %x) nounwind { 1203; X64-LABEL: nearbyint_v4f64: 1204; X64: # %bb.0: 1205; X64-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0 1206; X64-NEXT: vzeroupper 1207; X64-NEXT: retq 1208; 1209; X86-LABEL: nearbyint_v4f64: 1210; X86: # %bb.0: 1211; X86-NEXT: pushl %ebp 1212; X86-NEXT: movl %esp, %ebp 1213; X86-NEXT: andl $-8, %esp 1214; X86-NEXT: subl $8, %esp 1215; X86-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0 1216; X86-NEXT: vmovsd %xmm0, (%esp) 1217; X86-NEXT: fldl (%esp) 1218; X86-NEXT: movl %ebp, %esp 1219; X86-NEXT: popl %ebp 1220; X86-NEXT: vzeroupper 1221; X86-NEXT: retl 1222 %v = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %x) 1223 %r = extractelement <4 x double> %v, i32 0 1224 ret double %r 1225} 1226 1227define float @round_v4f32(<4 x float> %x) nounwind { 1228; X64-LABEL: round_v4f32: 1229; X64: # %bb.0: 1230; X64-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1231; X64-NEXT: vandps %xmm1, %xmm0, %xmm1 1232; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] 1233; X64-NEXT: vorps %xmm2, %xmm1, %xmm1 1234; X64-NEXT: vaddss %xmm1, %xmm0, %xmm0 1235; X64-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 1236; X64-NEXT: retq 1237; 1238; X86-LABEL: round_v4f32: 1239; X86: # %bb.0: 1240; X86-NEXT: pushl %eax 1241; X86-NEXT: vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0] 1242; X86-NEXT: vandps %xmm1, %xmm0, %xmm1 1243; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [4.9999997E-1,4.9999997E-1,4.9999997E-1,4.9999997E-1] 1244; X86-NEXT: vorps %xmm2, %xmm1, %xmm1 1245; X86-NEXT: vaddss %xmm1, %xmm0, %xmm0 1246; X86-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 1247; X86-NEXT: vmovss %xmm0, (%esp) 1248; X86-NEXT: flds (%esp) 1249; X86-NEXT: popl %eax 1250; X86-NEXT: retl 1251 %v = call <4 x float> @llvm.round.v4f32(<4 x float> %x) 1252 %r = extractelement <4 x float> %v, i32 0 1253 ret float %r 1254} 1255 1256define double @round_v4f64(<4 x double> %x) nounwind { 1257; X64-LABEL: 
round_v4f64: 1258; X64: # %bb.0: 1259; X64-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 1260; X64-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1] 1261; X64-NEXT: # xmm2 = mem[0,0] 1262; X64-NEXT: vorpd %xmm2, %xmm1, %xmm1 1263; X64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1264; X64-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 1265; X64-NEXT: vzeroupper 1266; X64-NEXT: retq 1267; 1268; X86-LABEL: round_v4f64: 1269; X86: # %bb.0: 1270; X86-NEXT: pushl %ebp 1271; X86-NEXT: movl %esp, %ebp 1272; X86-NEXT: andl $-8, %esp 1273; X86-NEXT: subl $8, %esp 1274; X86-NEXT: vandpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 1275; X86-NEXT: vmovddup {{.*#+}} xmm2 = [4.9999999999999994E-1,4.9999999999999994E-1] 1276; X86-NEXT: # xmm2 = mem[0,0] 1277; X86-NEXT: vorpd %xmm2, %xmm1, %xmm1 1278; X86-NEXT: vaddsd %xmm1, %xmm0, %xmm0 1279; X86-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 1280; X86-NEXT: vmovsd %xmm0, (%esp) 1281; X86-NEXT: fldl (%esp) 1282; X86-NEXT: movl %ebp, %esp 1283; X86-NEXT: popl %ebp 1284; X86-NEXT: vzeroupper 1285; X86-NEXT: retl 1286 %v = call <4 x double> @llvm.round.v4f64(<4 x double> %x) 1287 %r = extractelement <4 x double> %v, i32 0 1288 ret double %r 1289} 1290 1291define float @rcp_v4f32(<4 x float> %x) nounwind { 1292; X64-LABEL: rcp_v4f32: 1293; X64: # %bb.0: 1294; X64-NEXT: vrcpss %xmm0, %xmm0, %xmm0 1295; X64-NEXT: retq 1296; 1297; X86-LABEL: rcp_v4f32: 1298; X86: # %bb.0: 1299; X86-NEXT: pushl %eax 1300; X86-NEXT: vrcpss %xmm0, %xmm0, %xmm0 1301; X86-NEXT: vmovss %xmm0, (%esp) 1302; X86-NEXT: flds (%esp) 1303; X86-NEXT: popl %eax 1304; X86-NEXT: retl 1305 %v = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %x) 1306 %r = extractelement <4 x float> %v, i32 0 1307 ret float %r 1308} 1309 1310define float @rcp_v8f32(<8 x float> %x) nounwind { 1311; X64-LABEL: rcp_v8f32: 1312; X64: # %bb.0: 1313; X64-NEXT: vrcpss %xmm0, %xmm0, %xmm0 1314; X64-NEXT: vzeroupper 1315; X64-NEXT: retq 1316; 1317; X86-LABEL: rcp_v8f32: 1318; X86: # 
%bb.0: 1319; X86-NEXT: pushl %eax 1320; X86-NEXT: vrcpss %xmm0, %xmm0, %xmm0 1321; X86-NEXT: vmovss %xmm0, (%esp) 1322; X86-NEXT: flds (%esp) 1323; X86-NEXT: popl %eax 1324; X86-NEXT: vzeroupper 1325; X86-NEXT: retl 1326 %v = call <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float> %x) 1327 %r = extractelement <8 x float> %v, i32 0 1328 ret float %r 1329} 1330 1331define float @rsqrt_v4f32(<4 x float> %x) nounwind { 1332; X64-LABEL: rsqrt_v4f32: 1333; X64: # %bb.0: 1334; X64-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 1335; X64-NEXT: retq 1336; 1337; X86-LABEL: rsqrt_v4f32: 1338; X86: # %bb.0: 1339; X86-NEXT: pushl %eax 1340; X86-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 1341; X86-NEXT: vmovss %xmm0, (%esp) 1342; X86-NEXT: flds (%esp) 1343; X86-NEXT: popl %eax 1344; X86-NEXT: retl 1345 %v = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %x) 1346 %r = extractelement <4 x float> %v, i32 0 1347 ret float %r 1348} 1349 1350define float @rsqrt_v8f32(<8 x float> %x) nounwind { 1351; X64-LABEL: rsqrt_v8f32: 1352; X64: # %bb.0: 1353; X64-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 1354; X64-NEXT: vzeroupper 1355; X64-NEXT: retq 1356; 1357; X86-LABEL: rsqrt_v8f32: 1358; X86: # %bb.0: 1359; X86-NEXT: pushl %eax 1360; X86-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 1361; X86-NEXT: vmovss %xmm0, (%esp) 1362; X86-NEXT: flds (%esp) 1363; X86-NEXT: popl %eax 1364; X86-NEXT: vzeroupper 1365; X86-NEXT: retl 1366 %v = call <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float> %x) 1367 %r = extractelement <8 x float> %v, i32 0 1368 ret float %r 1369} 1370 1371declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) 1372declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) 1373declare <4 x float> @llvm.sin.v4f32(<4 x float>) 1374declare <4 x double> @llvm.sin.v4f64(<4 x double>) 1375declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) 1376declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) 1377declare <4 x float> @llvm.fabs.v4f32(<4 x float>) 1378declare <4 x double> 
@llvm.fabs.v4f64(<4 x double>) 1379declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) 1380declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) 1381declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) 1382declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) 1383declare <4 x float> @llvm.maximum.v4f32(<4 x float>, <4 x float>) 1384declare <4 x double> @llvm.maximum.v4f64(<4 x double>, <4 x double>) 1385declare <4 x float> @llvm.minimum.v4f32(<4 x float>, <4 x float>) 1386declare <4 x double> @llvm.minimum.v4f64(<4 x double>, <4 x double>) 1387declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) 1388declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) 1389declare <4 x float> @llvm.floor.v4f32(<4 x float>) 1390declare <4 x double> @llvm.floor.v4f64(<4 x double>) 1391declare <4 x float> @llvm.ceil.v4f32(<4 x float>) 1392declare <4 x double> @llvm.ceil.v4f64(<4 x double>) 1393declare <4 x float> @llvm.trunc.v4f32(<4 x float>) 1394declare <4 x double> @llvm.trunc.v4f64(<4 x double>) 1395declare <4 x float> @llvm.rint.v4f32(<4 x float>) 1396declare <4 x double> @llvm.rint.v4f64(<4 x double>) 1397declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) 1398declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) 1399declare <4 x float> @llvm.round.v4f32(<4 x float>) 1400declare <4 x double> @llvm.round.v4f64(<4 x double>) 1401 1402declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) 1403declare <8 x float> @llvm.x86.avx.rcp.ps.256(<8 x float>) 1404declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) 1405declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) 1406