; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=CHECK,SSE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2


define <2 x double> @test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; CHECK-LABEL: test_x86_sse41_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 0)
  ret <2 x double> %1
}

define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; CHECK-LABEL: test_x86_sse41_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 0)
  ret <4 x float> %1
}

define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
; CHECK-LABEL: test_x86_sse41_pblend_w:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 0)
  ret <8 x i16> %1
}

define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
; SSE-LABEL: test2_x86_sse41_blend_pd:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_x86_sse41_blend_pd:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 -1)
  ret <2 x double> %1
}

define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
; SSE-LABEL: test2_x86_sse41_blend_ps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_x86_sse41_blend_ps:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 -1)
  ret <4 x float> %1
}

define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
; SSE-LABEL: test2_x86_sse41_pblend_w:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: test2_x86_sse41_pblend_w:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 -1)
  ret <8 x i16> %1
}

define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) {
; CHECK-LABEL: test3_x86_sse41_blend_pd:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a0, i32 7)
  ret <2 x double> %1
}

define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) {
; CHECK-LABEL: test3_x86_sse41_blend_ps:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a0, i32 7)
  ret <4 x float> %1
}

define <8 x i16> @test3_x86_sse41_pblend_w(<8 x i16> %a0) {
; CHECK-LABEL: test3_x86_sse41_pblend_w:
; CHECK:       # %bb.0:
; CHECK-NEXT:    retq
  %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a0, i32 7)
  ret <8 x i16> %1
}

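; Only lane 0 of each blendv result below is demanded (either directly or via
; a splat of lane 0), so the splat shuffles feeding the inputs and mask should
; be removed by demanded-elements simplification.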
define double @demandedelts_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: demandedelts_blendvpd:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movapd %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedelts_blendvpd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <2 x double> %a0, <2 x double> undef, <2 x i32> zeroinitializer
  %2 = shufflevector <2 x double> %a1, <2 x double> undef, <2 x i32> zeroinitializer
  %3 = shufflevector <2 x double> %a2, <2 x double> undef, <2 x i32> zeroinitializer
  %4 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %1, <2 x double> %2, <2 x double> %3)
  %5 = extractelement <2 x double> %4, i32 0
  ret double %5
}

define float @demandedelts_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: demandedelts_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedelts_blendvps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = shufflevector <4 x float> %a0, <4 x float> undef, <4 x i32> zeroinitializer
  %2 = shufflevector <4 x float> %a1, <4 x float> undef, <4 x i32> zeroinitializer
  %3 = shufflevector <4 x float> %a2, <4 x float> undef, <4 x i32> zeroinitializer
  %4 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %1, <4 x float> %2, <4 x float> %3)
  %5 = extractelement <4 x float> %4, i32 0
  ret float %5
}

define <16 x i8> @demandedelts_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; SSE-LABEL: demandedelts_pblendvb:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movdqa %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT:    pxor %xmm0, %xmm0
; SSE-NEXT:    pshufb %xmm0, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: demandedelts_pblendvb:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: demandedelts_pblendvb:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
; AVX2-NEXT:    retq
  %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <16 x i32> zeroinitializer
  %2 = shufflevector <16 x i8> %a1, <16 x i8> undef, <16 x i32> zeroinitializer
  %3 = shufflevector <16 x i8> %a2, <16 x i8> undef, <16 x i32> zeroinitializer
  %4 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %1, <16 x i8> %2, <16 x i8> %3)
  %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> zeroinitializer
  ret <16 x i8> %5
}

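; blendvps/blendvpd only consume the sign bit of each mask element; the tests
; below exercise demanded-bits simplification when the mask is produced by an
; integer-to-float conversion or by scalar bit manipulation.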
define <4 x float> @demandedbits_sitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
; SSE-LABEL: demandedbits_sitofp_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    cvtdq2ps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedbits_sitofp_blendvps:
; AVX:       # %bb.0:
; AVX-NEXT:    vcvtdq2ps %xmm2, %xmm2
; AVX-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %cvt = sitofp <4 x i32> %a2 to <4 x float>
  %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
  ret <4 x float> %sel
}

define <4 x float> @demandedbits_uitofp_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x i32> %a2) {
; SSE-LABEL: demandedbits_uitofp_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movdqa {{.*#+}} xmm0 = [1258291200,1258291200,1258291200,1258291200]
; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1],xmm2[2],xmm0[3],xmm2[4],xmm0[5],xmm2[6],xmm0[7]
; SSE-NEXT:    psrld $16, %xmm2
; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
; SSE-NEXT:    subps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE-NEXT:    addps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movaps %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: demandedbits_uitofp_blendvps:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
; AVX1-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],mem[1],xmm2[2],mem[3],xmm2[4],mem[5],xmm2[6],mem[7]
; AVX1-NEXT:    vsubps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: demandedbits_uitofp_blendvps:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [1258291200,1258291200,1258291200,1258291200]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm3 = xmm2[0],xmm3[1],xmm2[2],xmm3[3],xmm2[4],xmm3[5],xmm2[6],xmm3[7]
; AVX2-NEXT:    vpsrld $16, %xmm2, %xmm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [1392508928,1392508928,1392508928,1392508928]
; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1],xmm2[2],xmm4[3],xmm2[4],xmm4[5],xmm2[6],xmm4[7]
; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm4 = [5.49764202E+11,5.49764202E+11,5.49764202E+11,5.49764202E+11]
; AVX2-NEXT:    vsubps %xmm4, %xmm2, %xmm2
; AVX2-NEXT:    vaddps %xmm2, %xmm3, %xmm2
; AVX2-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
  %cvt = uitofp <4 x i32> %a2 to <4 x float>
  %sel = tail call noundef <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %cvt)
  ret <4 x float> %sel
}

define <2 x i64> @demandedbits_blendvpd(i64 %a0, i64 %a2, <2 x double> %a3) {
; SSE-LABEL: demandedbits_blendvpd:
; SSE:       # %bb.0:
; SSE-NEXT:    movq %rdi, %rax
; SSE-NEXT:    orq $1, %rax
; SSE-NEXT:    orq $4, %rdi
; SSE-NEXT:    movq %rax, %xmm1
; SSE-NEXT:    movq %rdi, %xmm2
; SSE-NEXT:    movq {{.*#+}} xmm1 = xmm1[0],zero
; SSE-NEXT:    movq {{.*#+}} xmm2 = xmm2[0],zero
; SSE-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
; SSE-NEXT:    psrlq $11, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: demandedbits_blendvpd:
; AVX:       # %bb.0:
; AVX-NEXT:    movq %rdi, %rax
; AVX-NEXT:    orq $1, %rax
; AVX-NEXT:    orq $4, %rdi
; AVX-NEXT:    vmovq %rax, %xmm1
; AVX-NEXT:    vmovq %rdi, %xmm2
; AVX-NEXT:    vmovq {{.*#+}} xmm1 = xmm1[0],zero
; AVX-NEXT:    vmovq {{.*#+}} xmm2 = xmm2[0],zero
; AVX-NEXT:    vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
; AVX-NEXT:    vpsrlq $11, %xmm0, %xmm0
; AVX-NEXT:    retq
  %1 = or i64 %a0, 1
  %2 = or i64 %a0, 4
  %3 = bitcast i64 %1 to double
  %4 = bitcast i64 %2 to double
  %5 = insertelement <2 x double> zeroinitializer, double %3, i32 0
  %6 = insertelement <2 x double> zeroinitializer, double %4, i32 0
  %7 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %5, <2 x double> %6, <2 x double> %a3)
  %8 = bitcast <2 x double> %7 to <2 x i64>
  %9 = lshr <2 x i64> %8, <i64 11, i64 11>
  ret <2 x i64> %9
}

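; A blendv with an inverted (NOT'd) mask is equivalent to the same blendv with
; its two data operands swapped, since only the mask sign bits matter.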
define <16 x i8> @xor_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
; SSE-LABEL: xor_pblendvb:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movdqa %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_pblendvb:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = xor <16 x i8> %a2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %2 = tail call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %1)
  ret <16 x i8> %2
}

define <4 x float> @xor_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; SSE-LABEL: xor_blendvps:
; SSE:       # %bb.0:
; SSE-NEXT:    movaps %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvps %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movaps %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_blendvps:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = bitcast <4 x float> %a2 to <4 x i32>
  %2 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <4 x i32> %2 to <4 x float>
  %4 = tail call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %3)
  ret <4 x float> %4
}

define <2 x double> @xor_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
; SSE-LABEL: xor_blendvpd:
; SSE:       # %bb.0:
; SSE-NEXT:    movapd %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
; SSE-NEXT:    movapd %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: xor_blendvpd:
; AVX:       # %bb.0:
; AVX-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
; AVX-NEXT:    retq
  %1 = bitcast <2 x double> %a2 to <4 x i32>
  %2 = xor <4 x i32> %1, <i32 -1, i32 -1, i32 -1, i32 -1>
  %3 = bitcast <4 x i32> %2 to <2 x double>
  %4 = tail call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %3)
  ret <2 x double> %4
}

define <16 x i8> @PR47404(<16 x i8> %0, <16 x i8> %1, <16 x i8> %2) {
; SSE-LABEL: PR47404:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm3
; SSE-NEXT:    movaps %xmm2, %xmm0
; SSE-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
; SSE-NEXT:    movdqa %xmm3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: PR47404:
; AVX:       # %bb.0:
; AVX-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %4 = icmp sgt <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = select <16 x i1> %4, <16 x i8> %0, <16 x i8> %1
  ret <16 x i8> %5
}

declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32)
declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32)
declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32)

declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)