; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=X86
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512

define void @vp_fadd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fadd_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fadd_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_fadd_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fadd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fadd.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fsub_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fsub_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fsub_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    subps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_fsub_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vsubps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fsub.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fsub.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fmul_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fmul_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fmul_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_fmul_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fmul.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fmul.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fdiv_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fdiv_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vdivps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fdiv_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    divps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_fdiv_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vdivps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fdiv.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fdiv.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_frem_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_frem_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $80, %esp
; X86-NEXT:    vmovups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    vextractps $2, %xmm1, {{[0-9]+}}(%esp)
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    calll fmodf
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, (%esp)
; X86-NEXT:    calll fmodf
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    calll fmodf
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, (%esp)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    calll fmodf
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X86-NEXT:    vmovaps %xmm0, (%esi)
; X86-NEXT:    addl $80, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; SSE-LABEL: vp_frem_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    pushq %rbx
; SSE-NEXT:    subq $64, %rsp
; SSE-NEXT:    movq %rdi, %rbx
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    callq fmodf@PLT
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    callq fmodf@PLT
; SSE-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    callq fmodf@PLT
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT:    callq fmodf@PLT
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE-NEXT:    movaps %xmm1, (%rbx)
; SSE-NEXT:    addq $64, %rsp
; SSE-NEXT:    popq %rbx
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_frem_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    subq $48, %rsp
; AVX-NEXT:    movq %rdi, %rbx
; AVX-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX-NEXT:    callq fmodf@PLT
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = mem[1,1,3,3]
; AVX-NEXT:    callq fmodf@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[1,0]
; AVX-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = mem[1,0]
; AVX-NEXT:    callq fmodf@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX-NEXT:    # xmm1 = mem[3,3,3,3]
; AVX-NEXT:    callq fmodf@PLT
; AVX-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-NEXT:    vmovaps %xmm0, (%rbx)
; AVX-NEXT:    addq $48, %rsp
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.frem.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.frem.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fabs_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fabs_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fabs_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_fabs_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_fabs_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN]
; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_fabs_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fabs.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fabs.v4f32(<4 x float>, <4 x i1>, i32)

define void @vp_sqrt_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_sqrt_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vsqrtps %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_sqrt_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    sqrtps %xmm0, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX-LABEL: vp_sqrt_v4f32:
; AVX:       # %bb.0:
; AVX-NEXT:    vsqrtps %xmm0, %xmm0
; AVX-NEXT:    vmovaps %xmm0, (%rdi)
; AVX-NEXT:    retq
  %res = call <4 x float> @llvm.vp.sqrt.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.sqrt.v4f32(<4 x float>, <4 x i1>, i32)

define void @vp_fneg_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i32 %vp) nounwind {
; X86-LABEL: vp_fneg_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fneg_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_fneg_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_fneg_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
; AVX2-NEXT:    vxorps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_fneg_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpxord {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fneg.v4f32(<4 x float> %a0, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 %vp)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fneg.v4f32(<4 x float>, <4 x i1>, i32)

define void @vp_fma_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind {
; X86-LABEL: vp_fma_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    pushl %esi
; X86-NEXT:    subl $84, %esp
; X86-NEXT:    vmovupd %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    vmovups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
; X86-NEXT:    vextractps $2, %xmm0, (%esp)
; X86-NEXT:    vshufpd {{.*#+}} xmm0 = xmm1[1,0]
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    calll fmaf
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $1, %xmm0, (%esp)
; X86-NEXT:    vmovshdup {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X86-NEXT:    # xmm0 = mem[1,1,3,3]
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    calll fmaf
; X86-NEXT:    fstpt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Spill
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vmovss %xmm0, (%esp)
; X86-NEXT:    calll fmaf
; X86-NEXT:    vmovups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
; X86-NEXT:    vextractps $3, %xmm0, (%esp)
; X86-NEXT:    vpermilps $255, {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; X86-NEXT:    # xmm0 = mem[3,3,3,3]
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    fldt {{[-0-9]+}}(%e{{[sb]}}p) # 10-byte Folded Reload
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    calll fmaf
; X86-NEXT:    fstps {{[0-9]+}}(%esp)
; X86-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; X86-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; X86-NEXT:    vmovaps %xmm0, (%esi)
; X86-NEXT:    addl $84, %esp
; X86-NEXT:    popl %esi
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fma_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    pushq %rbx
; SSE-NEXT:    subq $64, %rsp
; SSE-NEXT:    movq %rdi, %rbx
; SSE-NEXT:    movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    callq fmaf@PLT
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movhlps {{.*#+}} xmm0 = xmm0[1,1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    callq fmaf@PLT
; SSE-NEXT:    unpcklps (%rsp), %xmm0 # 16-byte Folded Reload
; SSE-NEXT:    # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; SSE-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    callq fmaf@PLT
; SSE-NEXT:    movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,1,1,1]
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    callq fmaf@PLT
; SSE-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; SSE-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE-NEXT:    unpcklpd (%rsp), %xmm1 # 16-byte Folded Reload
; SSE-NEXT:    # xmm1 = xmm1[0],mem[0]
; SSE-NEXT:    movaps %xmm1, (%rbx)
; SSE-NEXT:    addq $64, %rsp
; SSE-NEXT:    popq %rbx
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_fma_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    pushq %rbx
; AVX1-NEXT:    subq $48, %rsp
; AVX1-NEXT:    movq %rdi, %rbx
; AVX1-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-NEXT:    vmovaps %xmm1, %xmm2
; AVX1-NEXT:    callq fmaf@PLT
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX1-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = mem[1,1,3,3]
; AVX1-NEXT:    vmovaps %xmm1, %xmm2
; AVX1-NEXT:    callq fmaf@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[1,0]
; AVX1-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = mem[1,0]
; AVX1-NEXT:    vmovapd %xmm1, %xmm2
; AVX1-NEXT:    callq fmaf@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX1-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX1-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX1-NEXT:    # xmm1 = mem[3,3,3,3]
; AVX1-NEXT:    vmovaps %xmm1, %xmm2
; AVX1-NEXT:    callq fmaf@PLT
; AVX1-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX1-NEXT:    vmovaps %xmm0, (%rbx)
; AVX1-NEXT:    addq $48, %rsp
; AVX1-NEXT:    popq %rbx
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_fma_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    pushq %rbx
; AVX2-NEXT:    subq $48, %rsp
; AVX2-NEXT:    movq %rdi, %rbx
; AVX2-NEXT:    vmovaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT:    vmovaps %xmm1, %xmm2
; AVX2-NEXT:    callq fmaf@PLT
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,1,3,3]
; AVX2-NEXT:    vmovshdup {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[1,1,3,3]
; AVX2-NEXT:    vmovaps %xmm1, %xmm2
; AVX2-NEXT:    callq fmaf@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[1,0]
; AVX2-NEXT:    vpermilpd $1, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[1,0]
; AVX2-NEXT:    vmovapd %xmm1, %xmm2
; AVX2-NEXT:    callq fmaf@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
; AVX2-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm0 = mem[3,3,3,3]
; AVX2-NEXT:    vpermilps $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT:    # xmm1 = mem[3,3,3,3]
; AVX2-NEXT:    vmovaps %xmm1, %xmm2
; AVX2-NEXT:    callq fmaf@PLT
; AVX2-NEXT:    vmovaps (%rsp), %xmm1 # 16-byte Reload
; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX2-NEXT:    vmovaps %xmm0, (%rbx)
; AVX2-NEXT:    addq $48, %rsp
; AVX2-NEXT:    popq %rbx
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_fma_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fma.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 4)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fma.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32)

define void @vp_fmuladd_v4f32(<4 x float> %a0, <4 x float> %a1, ptr %out, i4 %a5) nounwind {
; X86-LABEL: vp_fmuladd_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
; X86-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; X86-NEXT:    vmovaps %xmm0, (%eax)
; X86-NEXT:    retl
;
; SSE-LABEL: vp_fmuladd_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    mulps %xmm1, %xmm0
; SSE-NEXT:    addps %xmm1, %xmm0
; SSE-NEXT:    movaps %xmm0, (%rdi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: vp_fmuladd_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vmovaps %xmm0, (%rdi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vp_fmuladd_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vaddps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vmovaps %xmm0, (%rdi)
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vp_fmuladd_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vfmadd213ps {{.*#+}} xmm0 = (xmm1 * xmm0) + xmm1
; AVX512-NEXT:    vmovaps %xmm0, (%rdi)
; AVX512-NEXT:    retq
  %res = call <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float> %a0, <4 x float> %a1, <4 x float> %a1, <4 x i1> <i1 -1, i1 -1, i1 -1, i1 -1>, i32 4)
  store <4 x float> %res, ptr %out
  ret void
}
declare <4 x float> @llvm.vp.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>, <4 x i1>, i32)

declare <4 x float> @llvm.vp.maxnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
define <4 x float> @vfmax_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmax_vv_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; X86-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfmax_vv_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    maxps %xmm0, %xmm2
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfmax_vv_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfmax_vv_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfmax_vv_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxps %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordps %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovaps %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x float> @llvm.vp.maxnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}

declare <8 x float> @llvm.vp.maxnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32)
define <8 x float> @vfmax_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmax_vv_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    vmaxps %ymm0, %ymm1, %ymm2
; X86-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; X86-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfmax_vv_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    maxps %xmm0, %xmm4
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm2
; SSE-NEXT:    andnps %xmm4, %xmm0
; SSE-NEXT:    orps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    maxps %xmm1, %xmm2
; SSE-NEXT:    cmpunordps %xmm1, %xmm1
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    andnps %xmm2, %xmm1
; SSE-NEXT:    orps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfmax_vv_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmaxps %ymm0, %ymm1, %ymm2
; AVX1-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfmax_vv_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmaxps %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfmax_vv_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmaxps %ymm0, %ymm1, %ymm2
; AVX512-NEXT:    vcmpunordps %ymm0, %ymm0, %k1
; AVX512-NEXT:    vmovaps %ymm1, %ymm2 {%k1}
; AVX512-NEXT:    vmovaps %ymm2, %ymm0
; AVX512-NEXT:    retq
  %v = call <8 x float> @llvm.vp.maxnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl)
  ret <8 x float> %v
}

declare <4 x float> @llvm.vp.minnum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32)
define <4 x float> @vfmin_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmin_vv_v4f32:
; X86:       # %bb.0:
; X86-NEXT:    vminps %xmm0, %xmm1, %xmm2
; X86-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; X86-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfmin_vv_v4f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm2
; SSE-NEXT:    minps %xmm0, %xmm2
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm1
; SSE-NEXT:    andnps %xmm2, %xmm0
; SSE-NEXT:    orps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfmin_vv_v4f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vminps %xmm0, %xmm1, %xmm2
; AVX1-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfmin_vv_v4f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vminps %xmm0, %xmm1, %xmm2
; AVX2-NEXT:    vcmpunordps %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfmin_vv_v4f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vminps %xmm0, %xmm1, %xmm2
; AVX512-NEXT:    vcmpunordps %xmm0, %xmm0, %k1
; AVX512-NEXT:    vmovaps %xmm1, %xmm2 {%k1}
; AVX512-NEXT:    vmovaps %xmm2, %xmm0
; AVX512-NEXT:    retq
  %v = call <4 x float> @llvm.vp.minnum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl)
  ret <4 x float> %v
}

declare <8 x float> @llvm.vp.minnum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32)
define <8 x float> @vfmin_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) {
; X86-LABEL: vfmin_vv_v8f32:
; X86:       # %bb.0:
; X86-NEXT:    vminps %ymm0, %ymm1, %ymm2
; X86-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; X86-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; X86-NEXT:    retl
;
; SSE-LABEL: vfmin_vv_v8f32:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm2, %xmm4
; SSE-NEXT:    minps %xmm0, %xmm4
; SSE-NEXT:    cmpunordps %xmm0, %xmm0
; SSE-NEXT:    andps %xmm0, %xmm2
; SSE-NEXT:    andnps %xmm4, %xmm0
; SSE-NEXT:    orps %xmm2, %xmm0
; SSE-NEXT:    movaps %xmm3, %xmm2
; SSE-NEXT:    minps %xmm1, %xmm2
; SSE-NEXT:    cmpunordps %xmm1, %xmm1
; SSE-NEXT:    andps %xmm1, %xmm3
; SSE-NEXT:    andnps %xmm2, %xmm1
; SSE-NEXT:    orps %xmm3, %xmm1
; SSE-NEXT:    retq
;
; AVX1-LABEL: vfmin_vv_v8f32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vminps %ymm0, %ymm1, %ymm2
; AVX1-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX1-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vfmin_vv_v8f32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vminps %ymm0, %ymm1, %ymm2
; AVX2-NEXT:    vcmpunordps %ymm0, %ymm0, %ymm0
; AVX2-NEXT:    vblendvps %ymm0, %ymm1, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: vfmin_vv_v8f32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vminps %ymm0, %ymm1, %ymm2
; AVX512-NEXT:    vcmpunordps %ymm0, %ymm0, %k1
; AVX512-NEXT:    vmovaps %ymm1, %ymm2 {%k1}
; AVX512-NEXT:    vmovaps %ymm2, %ymm0
; AVX512-NEXT:    retq
  %v = call <8 x float> @llvm.vp.minnum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl)
  ret <8 x float> %v
}