; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512BWNOVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512BWVL

;
; General cases - packing of vector comparison to legal vector result types
;

define <16 x i8> @vselect_packss_v16i16(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss_v16i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqw %xmm3, %xmm1
; SSE2-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE2-NEXT:    packsswb %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm0, %xmm4
; SSE2-NEXT:    pandn %xmm5, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vselect_packss_v16i16:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pcmpeqw %xmm3, %xmm1
; SSE42-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE42-NEXT:    packsswb %xmm1, %xmm0
; SSE42-NEXT:    pblendvb %xmm0, %xmm4, %xmm5
; SSE42-NEXT:    movdqa %xmm5, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: vselect_packss_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_packss_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vselect_packss_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vselect_packss_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm3 ^ (xmm0 & (xmm2 ^ xmm3))
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BWNOVL-LABEL: vselect_packss_v16i16:
; AVX512BWNOVL:       # %bb.0:
; AVX512BWNOVL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512BWNOVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWNOVL-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512BWNOVL-NEXT:    vzeroupper
; AVX512BWNOVL-NEXT:    retq
;
; AVX512BWVL-LABEL: vselect_packss_v16i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0
; AVX512BWVL-NEXT:    vpmovm2b %k0, %xmm0
; AVX512BWVL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm3 ^ (xmm0 & (xmm2 ^ xmm3))
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = icmp eq <16 x i16> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

define <16 x i8> @vselect_packss_v16i32(<16 x i32> %a0, <16 x i32> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss_v16i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd %xmm7, %xmm3
; SSE2-NEXT:    pcmpeqd %xmm6, %xmm2
; SSE2-NEXT:    packssdw %xmm3, %xmm2
; SSE2-NEXT:    pcmpeqd %xmm5, %xmm1
; SSE2-NEXT:    pcmpeqd %xmm4, %xmm0
; SSE2-NEXT:    packssdw %xmm1, %xmm0
; SSE2-NEXT:    packsswb %xmm2, %xmm0
; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pandn {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vselect_packss_v16i32:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE42-NEXT:    pcmpeqd %xmm7, %xmm3
; SSE42-NEXT:    pcmpeqd %xmm6, %xmm2
; SSE42-NEXT:    packssdw %xmm3, %xmm2
; SSE42-NEXT:    pcmpeqd %xmm5, %xmm1
; SSE42-NEXT:    pcmpeqd %xmm4, %xmm0
; SSE42-NEXT:    packssdw %xmm1, %xmm0
; SSE42-NEXT:    packsswb %xmm2, %xmm0
; SSE42-NEXT:    pblendvb %xmm0, {{[0-9]+}}(%rsp), %xmm8
; SSE42-NEXT:    movdqa %xmm8, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: vselect_packss_v16i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT:    vpcmpeqd %xmm6, %xmm7, %xmm6
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_packss_v16i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vselect_packss_v16i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vselect_packss_v16i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqd %zmm1, %zmm0, %k1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm3 ^ (xmm0 & (xmm2 ^ xmm3))
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BWNOVL-LABEL: vselect_packss_v16i32:
; AVX512BWNOVL:       # %bb.0:
; AVX512BWNOVL-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BWNOVL-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BWNOVL-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512BWNOVL-NEXT:    vzeroupper
; AVX512BWNOVL-NEXT:    retq
;
; AVX512BWVL-LABEL: vselect_packss_v16i32:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
; AVX512BWVL-NEXT:    vpmovm2b %k0, %xmm0
; AVX512BWVL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm3 ^ (xmm0 & (xmm2 ^ xmm3))
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = icmp eq <16 x i32> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

define <16 x i8> @vselect_packss_v16i64(<16 x i64> %a0, <16 x i64> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss_v16i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[1,0,3,2]
; SSE2-NEXT:    pand %xmm7, %xmm8
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm6
; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[1,0,3,2]
; SSE2-NEXT:    pand %xmm6, %xmm7
; SSE2-NEXT:    packssdw %xmm8, %xmm7
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[1,0,3,2]
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[1,0,3,2]
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    packssdw %xmm6, %xmm5
; SSE2-NEXT:    packssdw %xmm7, %xmm5
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm3
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[1,0,3,2]
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,0,3,2]
; SSE2-NEXT:    pand %xmm2, %xmm3
; SSE2-NEXT:    packssdw %xmm4, %xmm3
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
; SSE2-NEXT:    pand %xmm1, %xmm2
; SSE2-NEXT:    pcmpeqd {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    packssdw %xmm2, %xmm0
; SSE2-NEXT:    packssdw %xmm3, %xmm0
; SSE2-NEXT:    packsswb %xmm5, %xmm0
; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT:    pand %xmm0, %xmm1
; SSE2-NEXT:    pandn {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vselect_packss_v16i64:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm7
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm6
; SSE42-NEXT:    packssdw %xmm7, %xmm6
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm5
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm4
; SSE42-NEXT:    packssdw %xmm5, %xmm4
; SSE42-NEXT:    packssdw %xmm6, %xmm4
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm3
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm2
; SSE42-NEXT:    packssdw %xmm3, %xmm2
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm1
; SSE42-NEXT:    pcmpeqq {{[0-9]+}}(%rsp), %xmm0
; SSE42-NEXT:    packssdw %xmm1, %xmm0
; SSE42-NEXT:    packssdw %xmm2, %xmm0
; SSE42-NEXT:    packsswb %xmm4, %xmm0
; SSE42-NEXT:    pblendvb %xmm0, {{[0-9]+}}(%rsp), %xmm8
; SSE42-NEXT:    movdqa %xmm8, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: vselect_packss_v16i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm8
; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm9
; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm10
; AVX1-NEXT:    vpcmpeqq %xmm9, %xmm10, %xmm9
; AVX1-NEXT:    vpcmpeqq %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpackssdw %xmm9, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm9
; AVX1-NEXT:    vpcmpeqq %xmm7, %xmm9, %xmm7
; AVX1-NEXT:    vpcmpeqq %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpackssdw %xmm7, %xmm2, %xmm2
; AVX1-NEXT:    vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpcmpeqq %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vpackssdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpcmpeqq %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vpcmpeqq %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm0, {{[0-9]+}}(%rsp), %xmm8, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_packss_v16i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqq %ymm7, %ymm3, %ymm3
; AVX2-NEXT:    vpcmpeqq %ymm6, %ymm2, %ymm2
; AVX2-NEXT:    vpackssdw %ymm3, %ymm2, %ymm2
; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
; AVX2-NEXT:    vpcmpeqq %ymm5, %ymm1, %ymm1
; AVX2-NEXT:    vpcmpeqq %ymm4, %ymm0, %ymm0
; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX2-NEXT:    vpackssdw %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
; AVX2-NEXT:    vpandn {{[0-9]+}}(%rsp), %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vselect_packss_v16i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; AVX512F-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
; AVX512F-NEXT:    kunpckbw %k0, %k1, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vselect_packss_v16i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; AVX512VL-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
; AVX512VL-NEXT:    kunpckbw %k0, %k1, %k1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm5 ^ (xmm0 & (xmm4 ^ xmm5))
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BWNOVL-LABEL: vselect_packss_v16i64:
; AVX512BWNOVL:       # %bb.0:
; AVX512BWNOVL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; AVX512BWNOVL-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
; AVX512BWNOVL-NEXT:    kunpckbw %k0, %k1, %k0
; AVX512BWNOVL-NEXT:    vpmovm2b %k0, %zmm0
; AVX512BWNOVL-NEXT:    vpblendvb %xmm0, %xmm4, %xmm5, %xmm0
; AVX512BWNOVL-NEXT:    vzeroupper
; AVX512BWNOVL-NEXT:    retq
;
; AVX512BWVL-LABEL: vselect_packss_v16i64:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; AVX512BWVL-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
; AVX512BWVL-NEXT:    kunpckbw %k0, %k1, %k0
; AVX512BWVL-NEXT:    vpmovm2b %k0, %xmm0
; AVX512BWVL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm5 ^ (xmm0 & (xmm4 ^ xmm5))
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = icmp eq <16 x i64> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i8>
  %3 = and <16 x i8> %2, %a2
  %4 = xor <16 x i8> %2, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %5 = and <16 x i8> %4, %a3
  %6 = or <16 x i8> %3, %5
  ret <16 x i8> %6
}

;
; PACKSS case
;

define <16 x i8> @vselect_packss(<16 x i16> %a0, <16 x i16> %a1, <16 x i8> %a2, <16 x i8> %a3) {
; SSE2-LABEL: vselect_packss:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pcmpeqw %xmm3, %xmm1
; SSE2-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE2-NEXT:    packsswb %xmm1, %xmm0
; SSE2-NEXT:    pand %xmm0, %xmm4
; SSE2-NEXT:    pandn %xmm5, %xmm0
; SSE2-NEXT:    por %xmm4, %xmm0
; SSE2-NEXT:    retq
;
; SSE42-LABEL: vselect_packss:
; SSE42:       # %bb.0:
; SSE42-NEXT:    pcmpeqw %xmm3, %xmm1
; SSE42-NEXT:    pcmpeqw %xmm2, %xmm0
; SSE42-NEXT:    packsswb %xmm1, %xmm0
; SSE42-NEXT:    pblendvb %xmm0, %xmm4, %xmm5
; SSE42-NEXT:    movdqa %xmm5, %xmm0
; SSE42-NEXT:    retq
;
; AVX1-LABEL: vselect_packss:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpcmpeqw %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpcmpeqw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpacksswb %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT:    vzeroupper
; AVX1-NEXT:    retq
;
; AVX2-LABEL: vselect_packss:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: vselect_packss:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512F-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: vselect_packss:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512VL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm3 ^ (xmm0 & (xmm2 ^ xmm3))
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BWNOVL-LABEL: vselect_packss:
; AVX512BWNOVL:       # %bb.0:
; AVX512BWNOVL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512BWNOVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWNOVL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX512BWNOVL-NEXT:    vpblendvb %xmm0, %xmm2, %xmm3, %xmm0
; AVX512BWNOVL-NEXT:    vzeroupper
; AVX512BWNOVL-NEXT:    retq
;
; AVX512BWVL-LABEL: vselect_packss:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpcmpeqw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX512BWVL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm3 ^ (xmm0 & (xmm2 ^ xmm3))
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %1 = icmp eq <16 x i16> %a0, %a1
  %2 = sext <16 x i1> %1 to <16 x i16>
  %3 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %4 = shufflevector <16 x i16> %2, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  %5 = tail call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %3, <8 x i16> %4)
  %6 = and <16 x i8> %5, %a2
  %7 = xor <16 x i8> %5, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
  %8 = and <16 x i8> %7, %a3
  %9 = or <16 x i8> %6, %8
  ret <16 x i8> %9
}
declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)