; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-PERLANE

; Codegen tests for shufflevector on <N x i1> vectors under several AVX-512
; feature sets. Function names encode the shuffle mask ("u" = undef index).
; The check lines are autogenerated: after editing any IR below, regenerate
; them with utils/update_llc_test_checks.py instead of editing them by hand.

; Swap the two lanes of a <2 x i1> argument.
define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
  ret <2 x i1> %b
}

; Two-source shuffle: lane 1 of %a followed by lane 0 of the constant <1, 0>.
define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
; AVX512F-LABEL: shuf2i1_1_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0]
; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf2i1_1_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [18446744073709551615,0]
; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf2i1_1_2:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0]
; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
  ret <2 x i1> %b
}


; Reverse the four lanes of a <4 x i1> argument (mask <3,2,1,0>; the trailing
; "10" in the name is the last two indices written without a separator).
define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
; AVX512F-LABEL: shuf4i1_3_2_10:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf4i1_3_2_10:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i1> %b
}

; Shuffle of two <8 x i64> compare results. Every mask index is below 8, so
; only %a2 (a == a1) contributes lanes; %b2 is an operand but is never selected.
define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermq %zmm2, %zmm1, %zmm2
; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermq %zmm2, %zmm1, %zmm2
; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermq %zmm2, %zmm1, %zmm2
; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <8 x i64> %a, %a1
  %b2 = icmp eq <8 x i64> %b, %b1
  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  ret <8 x i1> %c
}

; Two-source shuffle of <16 x i32> compare results: mask indices 22 and 21
; select lanes of %b2, all other indices select lanes of %a2.
define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
; AVX512VL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
; VL_BW_DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %a2 = icmp eq <16 x i32> %a, %a1
  %b2 = icmp eq <16 x i32> %b, %b1
  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <16 x i1> %c
}

; Single-source shuffle of a <32 x i1> argument; the 16-entry index pattern is
; repeated twice and every index (including 22 and 21) refers to a lane of %a.
define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
; VL_BW_DQ-NEXT:    retq
  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  ret <32 x i1> %b
}

; The same 32-lane shuffle applied to (%a == 0) on <32 x i16>; the shuffled
; mask then selects between %c and %d.
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm0 & (zmm1 ^ zmm2))
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
; AVX512VL-NEXT:    vpmovsxwd %ymm4, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm0 & (zmm1 ^ zmm2))
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i16> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}

; As the icmp_v32i16 test, but the compare and the select operate on <32 x i8>.
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm3
; AVX512VL-NEXT:    vptestmd %zmm3, %zmm3, %k1
; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp = icmp eq <32 x i8> %a, zeroinitializer
  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}

; "split" variant: the 32-lane mask is the concatenation of two <16 x i1>
; compares (%a == 0, %b == 0) before it is shuffled and used to select <32 x i16>.
define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm0 & (zmm2 ^ zmm3))
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm0 & (zmm2 ^ zmm3))
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
  ret <32 x i16> %sel
}

; As the icmp_v32i16_split test, but the final select operates on <32 x i8>.
define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm2 ^ ymm3))
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
; VL_BW_DQ-NEXT:    vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
; VL_BW_DQ-NEXT:    retq
  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
  ret <32 x i8> %sel
}

; i8 bitcast to <8 x i1>; lane 2 is copied into result lanes 1, 4 and 6 and
; every other result lane is undef. This is the only complete test in this part
; of the file whose output differs between the FAST-ALL and FAST-PERLANE runs.
define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    kmovw %edi, %k1
; AVX512VL-FAST-ALL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2]
; AVX512VL-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-FAST-ALL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-FAST-ALL-NEXT:    vzeroupper
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    kmovw %edi, %k1
; AVX512VL-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
; AVX512VL-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpslld $31, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vptestmd %ymm1, %ymm1, %k1
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-FAST-PERLANE-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-FAST-PERLANE-NEXT:    vzeroupper
; AVX512VL-FAST-PERLANE-NEXT:    retq
;
; VL_BW_DQ-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ-FAST-ALL:       # %bb.0:
; VL_BW_DQ-FAST-ALL-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
; VL_BW_DQ-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-FAST-ALL-NEXT:    vzeroupper
; VL_BW_DQ-FAST-ALL-NEXT:    retq
;
; VL_BW_DQ-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
; VL_BW_DQ-FAST-PERLANE:       # %bb.0:
; VL_BW_DQ-FAST-PERLANE-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpbroadcastd %xmm0, %ymm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2w %k0, %xmm0
; VL_BW_DQ-FAST-PERLANE-NEXT:    vzeroupper
; VL_BW_DQ-FAST-PERLANE-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
  ret <8 x i1> %c
}

; Shuffle against an all-zero second operand: indices 10 and 9 pick zero lanes,
; indices 2 and 3 pick lanes of %b; the result is bitcast back to i8.
define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [8,2,10,0,3,0,2,0]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; Lanes 0, 1, 4 and 5 of %b move into the low four result lanes; the upper four
; result lanes are undef.
define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,4,5,6,7]
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; Result lane 0 takes index 9 (a zero lane of the zeroinitializer operand); the
; remaining lanes are a permutation of %b.
define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; VL_BW_DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; Here the FIRST operand is zeroinitializer, so only indices 9 and 10 (lanes 1
; and 2 of %b, landing in result lanes 0 and 3) can be non-zero.
define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
  %d = bitcast <8 x i1> %c to i8
  ret i8 %d
}

; First operand is the constant <1,1,0,0,1,1,0,0>; only index 9 (lane 1 of %b)
; reads the variable input.
; NOTE(review): the name says "10" for the fourth index but the mask below uses
; 0 there (<9,6,1,0,3,7,7,1>). Either the name or the mask is stale; renaming
; means regenerating the check labels, so it is left untouched here.
define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    kmovw %edi, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [1,14,9,8,11,15,15,9]
; AVX512F-NEXT:    vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    kmovw %edi, %k1
; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
; AVX512VL-NEXT:    kmovw %k0, %eax
; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
; VL_BW_DQ:       # %bb.0:
; VL_BW_DQ-NEXT:    kmovd %edi, %k0
; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
; VL_BW_DQ-NEXT:    kmovd %k0, %eax
; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
; VL_BW_DQ-NEXT:    vzeroupper
; VL_BW_DQ-NEXT:    retq
  %b = bitcast i8 %a to <8 x i1>
  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
  %c1 = bitcast <8 x i1> %c to i8
  ret i8 %c1
}

define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
; AVX512F-NEXT:    kmovw %k0, %eax
; AVX512F-NEXT:    # kill: def $al killed
$al killed $eax 773; AVX512F-NEXT: vzeroupper 774; AVX512F-NEXT: retq 775; 776; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: 777; AVX512VL: # %bb.0: 778; AVX512VL-NEXT: vpmovsxwd %xmm0, %ymm0 779; AVX512VL-NEXT: vpslld $31, %ymm0, %ymm0 780; AVX512VL-NEXT: vptestmd %ymm0, %ymm0, %k1 781; AVX512VL-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 782; AVX512VL-NEXT: vmovdqa32 %ymm0, %ymm1 {%k1} {z} 783; AVX512VL-NEXT: vpmovsxbd {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7] 784; AVX512VL-NEXT: vpermi2d %ymm1, %ymm0, %ymm2 785; AVX512VL-NEXT: vptestmd %ymm2, %ymm2, %k0 786; AVX512VL-NEXT: kmovw %k0, %eax 787; AVX512VL-NEXT: # kill: def $al killed $al killed $eax 788; AVX512VL-NEXT: vzeroupper 789; AVX512VL-NEXT: retq 790; 791; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones: 792; VL_BW_DQ: # %bb.0: 793; VL_BW_DQ-NEXT: vpsllw $15, %xmm0, %xmm0 794; VL_BW_DQ-NEXT: vpmovw2m %xmm0, %k0 795; VL_BW_DQ-NEXT: vpmovm2d %k0, %ymm0 796; VL_BW_DQ-NEXT: vpmovsxbd {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7] 797; VL_BW_DQ-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 798; VL_BW_DQ-NEXT: vpermt2d %ymm0, %ymm1, %ymm2 799; VL_BW_DQ-NEXT: vpmovd2m %ymm2, %k0 800; VL_BW_DQ-NEXT: kmovd %k0, %eax 801; VL_BW_DQ-NEXT: # kill: def $al killed $al killed $eax 802; VL_BW_DQ-NEXT: vzeroupper 803; VL_BW_DQ-NEXT: retq 804 %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0> 805 %c1 = bitcast <8 x i1>%c to i8 806 ret i8 %c1 807} 808 809define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) { 810; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: 811; AVX512F: # %bb.0: 812; AVX512F-NEXT: kmovw %edi, %k1 813; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 814; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 815; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 816; AVX512F-NEXT: kmovw %k0, %eax 817; AVX512F-NEXT: # kill: def $ax killed $ax killed $eax 818; AVX512F-NEXT: vzeroupper 819; AVX512F-NEXT: retq 820; 821; AVX512VL-LABEL: 
shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: 822; AVX512VL: # %bb.0: 823; AVX512VL-NEXT: kmovw %edi, %k1 824; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 825; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 826; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 827; AVX512VL-NEXT: kmovw %k0, %eax 828; AVX512VL-NEXT: # kill: def $ax killed $ax killed $eax 829; AVX512VL-NEXT: vzeroupper 830; AVX512VL-NEXT: retq 831; 832; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0: 833; VL_BW_DQ: # %bb.0: 834; VL_BW_DQ-NEXT: kmovd %edi, %k0 835; VL_BW_DQ-NEXT: vpmovm2d %k0, %zmm0 836; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 837; VL_BW_DQ-NEXT: vpmovd2m %zmm0, %k0 838; VL_BW_DQ-NEXT: kmovd %k0, %eax 839; VL_BW_DQ-NEXT: # kill: def $ax killed $ax killed $eax 840; VL_BW_DQ-NEXT: vzeroupper 841; VL_BW_DQ-NEXT: retq 842 %b = bitcast i16 %a to <16 x i1> 843 %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer 844 %d = bitcast <16 x i1> %c to i16 845 ret i16 %d 846} 847 848define i64 @shuf64i1_zero(i64 %a) { 849; AVX512F-LABEL: shuf64i1_zero: 850; AVX512F: # %bb.0: 851; AVX512F-NEXT: kmovw %edi, %k1 852; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 853; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 854; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k0 855; AVX512F-NEXT: kmovw %k0, %eax 856; AVX512F-NEXT: kmovw %k0, %ecx 857; AVX512F-NEXT: shll $16, %ecx 858; AVX512F-NEXT: orl %eax, %ecx 859; AVX512F-NEXT: movq %rcx, %rax 860; AVX512F-NEXT: shlq $32, %rax 861; AVX512F-NEXT: orq %rcx, %rax 862; AVX512F-NEXT: vzeroupper 863; AVX512F-NEXT: retq 864; 865; AVX512VL-LABEL: shuf64i1_zero: 866; AVX512VL: # %bb.0: 867; AVX512VL-NEXT: kmovw %edi, %k1 868; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 869; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 870; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k0 871; AVX512VL-NEXT: kmovw %k0, %eax 872; AVX512VL-NEXT: kmovw %k0, %ecx 873; AVX512VL-NEXT: shll $16, %ecx 874; AVX512VL-NEXT: orl %eax, %ecx 875; AVX512VL-NEXT: movq %rcx, %rax 
876; AVX512VL-NEXT: shlq $32, %rax 877; AVX512VL-NEXT: orq %rcx, %rax 878; AVX512VL-NEXT: vzeroupper 879; AVX512VL-NEXT: retq 880; 881; VL_BW_DQ-LABEL: shuf64i1_zero: 882; VL_BW_DQ: # %bb.0: 883; VL_BW_DQ-NEXT: kmovq %rdi, %k0 884; VL_BW_DQ-NEXT: vpmovm2b %k0, %zmm0 885; VL_BW_DQ-NEXT: vpbroadcastb %xmm0, %zmm0 886; VL_BW_DQ-NEXT: vpmovb2m %zmm0, %k0 887; VL_BW_DQ-NEXT: kmovq %k0, %rax 888; VL_BW_DQ-NEXT: vzeroupper 889; VL_BW_DQ-NEXT: retq 890 %b = bitcast i64 %a to <64 x i1> 891 %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer 892 %d = bitcast <64 x i1> %c to i64 893 ret i64 %d 894} 895 896define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) { 897; AVX512F-LABEL: PR52500: 898; AVX512F: # %bb.0: 899; AVX512F-NEXT: vpmovsxbd %xmm0, %zmm0 900; AVX512F-NEXT: vpslld $31, %zmm0, %zmm0 901; AVX512F-NEXT: vptestmd %zmm0, %zmm0, %k1 902; AVX512F-NEXT: vmovd %edi, %xmm0 903; AVX512F-NEXT: movl $789, %eax # imm = 0x315 904; AVX512F-NEXT: vmovd %eax, %xmm1 905; AVX512F-NEXT: vpmulld %xmm1, %xmm0, %xmm0 906; AVX512F-NEXT: vpbroadcastd %xmm0, %zmm0 907; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1} 908; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 909; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 910; AVX512F-NEXT: vzeroupper 911; AVX512F-NEXT: retq 912; 913; AVX512VL-LABEL: PR52500: 914; AVX512VL: # %bb.0: 915; AVX512VL-NEXT: vpmovsxbd %xmm0, %zmm0 916; AVX512VL-NEXT: vpslld $31, %zmm0, %zmm0 917; AVX512VL-NEXT: vptestmd %zmm0, %zmm0, %k1 918; AVX512VL-NEXT: vmovd %edi, %xmm0 919; AVX512VL-NEXT: movl $789, %eax # imm = 0x315 920; AVX512VL-NEXT: vmovd %eax, %xmm1 921; AVX512VL-NEXT: vpmulld %xmm1, %xmm0, %xmm0 922; AVX512VL-NEXT: vpbroadcastd %xmm0, %zmm0 923; AVX512VL-NEXT: vptestnmd %zmm0, %zmm0, %k1 {%k1} 924; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 925; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 926; AVX512VL-NEXT: vzeroupper 927; AVX512VL-NEXT: retq 928; 929; VL_BW_DQ-LABEL: PR52500: 930; VL_BW_DQ: # %bb.0: 931; VL_BW_DQ-NEXT: vpsllw 
$7, %xmm0, %xmm0 932; VL_BW_DQ-NEXT: vpmovb2m %xmm0, %k1 933; VL_BW_DQ-NEXT: vmovd %edi, %xmm0 934; VL_BW_DQ-NEXT: movl $789, %eax # imm = 0x315 935; VL_BW_DQ-NEXT: vmovd %eax, %xmm1 936; VL_BW_DQ-NEXT: vpmulld %xmm1, %xmm0, %xmm0 937; VL_BW_DQ-NEXT: vpbroadcastd %xmm0, %zmm0 938; VL_BW_DQ-NEXT: vptestnmd %zmm0, %zmm0, %k0 {%k1} 939; VL_BW_DQ-NEXT: vpmovm2b %k0, %xmm0 940; VL_BW_DQ-NEXT: vzeroupper 941; VL_BW_DQ-NEXT: retq 942 %insrt = insertelement <16 x i32> undef, i32 %in, i32 0 943 %mul = mul <16 x i32> %insrt, <i32 789, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison> 944 %eq = icmp eq <16 x i32> %mul, zeroinitializer 945 %cmp1 = shufflevector <16 x i1> %eq, <16 x i1> poison, <16 x i32> zeroinitializer 946 %and = and <16 x i1> %cmp1, %msk 947 ret <16 x i1> %and 948} 949