xref: /llvm-project/llvm/test/CodeGen/X86/vector-shuffle-v1.ll (revision 70bd80dc51b62453210f6203c31ea826dd0675c2)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-ALL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=AVX512VL --check-prefix=AVX512VL-FAST-PERLANE
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-ALL
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512dq,+fast-variable-perlane-shuffle | FileCheck %s --check-prefix=VL_BW_DQ --check-prefix=VL_BW_DQ-FAST-PERLANE
7
; Swaps the two lanes of a <2 x i1> vector: the mask <1, 0> reads only from
; %a, so the undef second operand is never selected.
8define <2 x i1> @shuf2i1_1_0(<2 x i1> %a) {
9; AVX512F-LABEL: shuf2i1_1_0:
10; AVX512F:       # %bb.0:
11; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
12; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
13; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
14; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
15; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
16; AVX512F-NEXT:    vzeroupper
17; AVX512F-NEXT:    retq
18;
19; AVX512VL-LABEL: shuf2i1_1_0:
20; AVX512VL:       # %bb.0:
21; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
22; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
23; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
24; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
25; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
26; AVX512VL-NEXT:    retq
27;
28; VL_BW_DQ-LABEL: shuf2i1_1_0:
29; VL_BW_DQ:       # %bb.0:
30; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
31; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
32; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
33; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
34; VL_BW_DQ-NEXT:    retq
35  %b = shufflevector <2 x i1> %a, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
36  ret <2 x i1> %b
37}
38
; Two-source <2 x i1> shuffle: result lane 0 is lane 1 of %a, and mask index 2
; selects lane 0 of the constant second operand <i1 1, i1 0>, so result lane 1
; is always true.
39define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
40; AVX512F-LABEL: shuf2i1_1_2:
41; AVX512F:       # %bb.0:
42; AVX512F-NEXT:    vpsllq $63, %xmm0, %xmm0
43; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
44; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
45; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0]
46; AVX512F-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
47; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
48; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
49; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
50; AVX512F-NEXT:    vzeroupper
51; AVX512F-NEXT:    retq
52;
53; AVX512VL-LABEL: shuf2i1_1_2:
54; AVX512VL:       # %bb.0:
55; AVX512VL-NEXT:    vpsllq $63, %xmm0, %xmm0
56; AVX512VL-NEXT:    vptestmq %xmm0, %xmm0, %k1
57; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
58; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm1 {%k1} {z}
59; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [18446744073709551615,0]
60; AVX512VL-NEXT:    vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7]
61; AVX512VL-NEXT:    vptestmq %xmm1, %xmm1, %k1
62; AVX512VL-NEXT:    vmovdqa64 %xmm0, %xmm0 {%k1} {z}
63; AVX512VL-NEXT:    retq
64;
65; VL_BW_DQ-LABEL: shuf2i1_1_2:
66; VL_BW_DQ:       # %bb.0:
67; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
68; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
69; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
70; VL_BW_DQ-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [18446744073709551615,0]
71; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
72; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
73; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
74; VL_BW_DQ-NEXT:    retq
75  %b = shufflevector <2 x i1> %a, <2 x i1> <i1 1, i1 0>, <2 x i32> <i32 1, i32 2>
76  ret <2 x i1> %b
77}
78
79
; Reverses the four lanes of a <4 x i1> vector (mask <3, 2, 1, 0>); the undef
; second operand is never selected.
80define <4 x i1> @shuf4i1_3_2_10(<4 x i1> %a) {
81; AVX512F-LABEL: shuf4i1_3_2_10:
82; AVX512F:       # %bb.0:
83; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
84; AVX512F-NEXT:    vpslld $31, %xmm0, %xmm0
85; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
86; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
87; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
88; AVX512F-NEXT:    vzeroupper
89; AVX512F-NEXT:    retq
90;
91; AVX512VL-LABEL: shuf4i1_3_2_10:
92; AVX512VL:       # %bb.0:
93; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
94; AVX512VL-NEXT:    vpslld $31, %xmm0, %xmm0
95; AVX512VL-NEXT:    vptestmd %xmm0, %xmm0, %k1
96; AVX512VL-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
97; AVX512VL-NEXT:    vmovdqa32 %xmm0, %xmm0 {%k1} {z}
98; AVX512VL-NEXT:    retq
99;
100; VL_BW_DQ-LABEL: shuf4i1_3_2_10:
101; VL_BW_DQ:       # %bb.0:
102; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,2,1,0]
103; VL_BW_DQ-NEXT:    vpslld $31, %xmm0, %xmm0
104; VL_BW_DQ-NEXT:    vpmovd2m %xmm0, %k0
105; VL_BW_DQ-NEXT:    vpmovm2d %k0, %xmm0
106; VL_BW_DQ-NEXT:    retq
107  %b = shufflevector <4 x i1> %a, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
108  ret <4 x i1> %b
109}
110
; Shuffles the <8 x i1> result of comparing %a with %a1 for equality. Every
; mask index is below 8, so no lane of %b2 (the compare of %b with %b1) is
; selected even though it is passed as the second shuffle operand.
111define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %a1, <8 x i64> %b1) {
112; AVX512F-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
113; AVX512F:       # %bb.0:
114; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
115; AVX512F-NEXT:    vpermq %zmm2, %zmm1, %zmm2
116; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm0
117; AVX512F-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
118; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
119; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
120; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
121; AVX512F-NEXT:    vzeroupper
122; AVX512F-NEXT:    retq
123;
124; AVX512VL-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
125; AVX512VL:       # %bb.0:
126; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
127; AVX512VL-NEXT:    vpermq %zmm2, %zmm1, %zmm2
128; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm0
129; AVX512VL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k1
130; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
131; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
132; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
133; AVX512VL-NEXT:    vzeroupper
134; AVX512VL-NEXT:    retq
135;
136; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
137; VL_BW_DQ:       # %bb.0:
138; VL_BW_DQ-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [3,6,1,0,3,7,7,0]
139; VL_BW_DQ-NEXT:    vpermq %zmm2, %zmm1, %zmm2
140; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
141; VL_BW_DQ-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
142; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
143; VL_BW_DQ-NEXT:    vzeroupper
144; VL_BW_DQ-NEXT:    retq
145  %a2 = icmp eq <8 x i64> %a, %a1
146  %b2 = icmp eq <8 x i64> %b, %b1
147  %c = shufflevector <8 x i1> %a2, <8 x i1> %b2, <8 x i32> <i32 3, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
148  ret <8 x i1> %c
149}
150
; Two-source shuffle of two <16 x i1> equality-compare results. Mask indices
; 22 and 21 select lanes 6 and 5 of %b2; every other index selects a lane of
; %a2.
151define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <16 x i32> %b, <16 x i32> %a1, <16 x i32> %b1) {
152; AVX512F-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
153; AVX512F:       # %bb.0:
154; AVX512F-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
155; AVX512F-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
156; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
157; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
158; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
159; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
160; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
161; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
162; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
163; AVX512F-NEXT:    vzeroupper
164; AVX512F-NEXT:    retq
165;
166; AVX512VL-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
167; AVX512VL:       # %bb.0:
168; AVX512VL-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
169; AVX512VL-NEXT:    vpcmpeqd %zmm3, %zmm1, %k2
170; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
171; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
172; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
173; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
174; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
175; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
176; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
177; AVX512VL-NEXT:    vzeroupper
178; AVX512VL-NEXT:    retq
179;
180; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
181; VL_BW_DQ:       # %bb.0:
182; VL_BW_DQ-NEXT:    vpcmpeqd %zmm2, %zmm0, %k0
183; VL_BW_DQ-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
184; VL_BW_DQ-NEXT:    vpmovm2d %k1, %zmm0
185; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm1
186; VL_BW_DQ-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
187; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
188; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
189; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
190; VL_BW_DQ-NEXT:    vzeroupper
191; VL_BW_DQ-NEXT:    retq
192  %a2 = icmp eq <16 x i32> %a, %a1
193  %b2 = icmp eq <16 x i32> %b, %b1
194  %c = shufflevector <16 x i1> %a2, <16 x i1> %b2, <16 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
195  ret <16 x i1> %c
196}
197
; Single-source shuffle of a <32 x i1> argument. The 32-entry mask is one
; 16-entry pattern written twice; every index is below 32, so the undef second
; operand is never selected.
198define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<32 x i1> %a) {
199; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
200; AVX512F:       # %bb.0:
201; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm1
202; AVX512F-NEXT:    vpslld $31, %zmm1, %zmm1
203; AVX512F-NEXT:    vptestmd %zmm1, %zmm1, %k1
204; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
205; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
206; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
207; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
208; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
209; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
210; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
211; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
212; AVX512F-NEXT:    vptestmd %zmm2, %zmm2, %k1
213; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
214; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
215; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
216; AVX512F-NEXT:    retq
217;
218; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
219; AVX512VL:       # %bb.0:
220; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm1
221; AVX512VL-NEXT:    vpslld $31, %zmm1, %zmm1
222; AVX512VL-NEXT:    vptestmd %zmm1, %zmm1, %k1
223; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
224; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
225; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
226; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
227; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
228; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
229; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
230; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
231; AVX512VL-NEXT:    vptestmd %zmm2, %zmm2, %k1
232; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
233; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
234; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
235; AVX512VL-NEXT:    retq
236;
237; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
238; VL_BW_DQ:       # %bb.0:
239; VL_BW_DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
240; VL_BW_DQ-NEXT:    vpmovb2m %ymm0, %k0
241; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
242; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
243; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
244; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
245; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k0
246; VL_BW_DQ-NEXT:    vpmovm2b %k0, %ymm0
247; VL_BW_DQ-NEXT:    retq
248  %b = shufflevector <32 x i1> %a, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
249  ret <32 x i1> %b
250}
251
; Compares a <32 x i16> vector against zero, permutes the resulting <32 x i1>
; mask with a 32-entry shuffle (one 16-entry pattern written twice, single
; source), and uses the permuted mask to select per lane between %c and %d.
252define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16(<32 x i16> %a, <32 x i16> %c, <32 x i16> %d) {
253; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
254; AVX512F:       # %bb.0:
255; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
256; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
257; AVX512F-NEXT:    vpmovsxwd %ymm4, %zmm4
258; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
259; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
260; AVX512F-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
261; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
262; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
263; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
264; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
265; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
266; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
267; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
268; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
269; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
270; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
271; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm0 & (zmm1 ^ zmm2))
272; AVX512F-NEXT:    retq
273;
274; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
275; AVX512VL:       # %bb.0:
276; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
277; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm4
278; AVX512VL-NEXT:    vpmovsxwd %ymm4, %zmm4
279; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
280; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
281; AVX512VL-NEXT:    vpcmpeqw %ymm3, %ymm0, %ymm0
282; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
283; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
284; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
285; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
286; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
287; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
288; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
289; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
290; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
291; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
292; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm2 ^ (zmm0 & (zmm1 ^ zmm2))
293; AVX512VL-NEXT:    retq
294;
295; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16:
296; VL_BW_DQ:       # %bb.0:
297; VL_BW_DQ-NEXT:    vptestnmw %zmm0, %zmm0, %k0
298; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
299; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
300; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
301; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
302; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
303; VL_BW_DQ-NEXT:    vpblendmw %zmm1, %zmm2, %zmm0 {%k1}
304; VL_BW_DQ-NEXT:    retq
305  %cmp = icmp eq <32 x i16> %a, zeroinitializer
306  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
307  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
308  ret <32 x i16> %sel
309}
310
; Compares a <32 x i8> vector against zero, permutes the resulting <32 x i1>
; mask with a 32-entry shuffle (one 16-entry pattern written twice, single
; source), and uses the permuted mask to select per lane between %c and %d.
311define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8(<32 x i8> %a, <32 x i8> %c, <32 x i8> %d) {
312; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
313; AVX512F:       # %bb.0:
314; AVX512F-NEXT:    vpxor %xmm3, %xmm3, %xmm3
315; AVX512F-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
316; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm3
317; AVX512F-NEXT:    vptestmd %zmm3, %zmm3, %k1
318; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
319; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
320; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k2
321; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
322; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
323; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
324; AVX512F-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
325; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
326; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
327; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
328; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
329; AVX512F-NEXT:    vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
330; AVX512F-NEXT:    retq
331;
332; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
333; AVX512VL:       # %bb.0:
334; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
335; AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm0, %ymm0
336; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm3
337; AVX512VL-NEXT:    vptestmd %zmm3, %zmm3, %k1
338; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
339; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
340; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k2
341; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
342; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm3 {%k1} {z} = -1
343; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
344; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm3, %zmm4
345; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
346; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
347; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
348; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
349; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm0 & (ymm1 ^ ymm2))
350; AVX512VL-NEXT:    retq
351;
352; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8:
353; VL_BW_DQ:       # %bb.0:
354; VL_BW_DQ-NEXT:    vptestnmb %ymm0, %ymm0, %k0
355; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
356; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
357; VL_BW_DQ-NEXT:    # zmm3 = mem[0,1,2,3,0,1,2,3]
358; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm3, %zmm0
359; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
360; VL_BW_DQ-NEXT:    vpblendmb %ymm1, %ymm2, %ymm0 {%k1}
361; VL_BW_DQ-NEXT:    retq
362  %cmp = icmp eq <32 x i8> %a, zeroinitializer
363  %shuf = shufflevector <32 x i1> %cmp, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
364  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
365  ret <32 x i8> %sel
366}
367
; Builds a <32 x i1> mask by concatenating two <16 x i32> compare-with-zero
; results (%cmp1 in lanes 0-15, %cmp2 in lanes 16-31, via the identity mask
; 0..31), permutes it with a 32-entry shuffle (one 16-entry pattern written
; twice), and uses the permuted mask to select per lane between the
; <32 x i16> values %c and %d.
368define <32 x i16> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split(<16 x i32> %a, <16 x i32> %b, <32 x i16> %c, <32 x i16> %d) {
369; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
370; AVX512F:       # %bb.0:
371; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
372; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
373; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
374; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
375; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
376; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
377; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
378; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
379; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
380; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
381; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm0 & (zmm2 ^ zmm3))
382; AVX512F-NEXT:    retq
383;
384; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
385; AVX512VL:       # %bb.0:
386; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
387; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
388; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
389; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
390; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
391; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
392; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
393; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
394; AVX512VL-NEXT:    vpmovdw %zmm0, %ymm0
395; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm0, %zmm0
396; AVX512VL-NEXT:    vpternlogq {{.*#+}} zmm0 = zmm3 ^ (zmm0 & (zmm2 ^ zmm3))
397; AVX512VL-NEXT:    retq
398;
399; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i16_split:
400; VL_BW_DQ:       # %bb.0:
401; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
402; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
403; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
404; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
405; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
406; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
407; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
408; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
409; VL_BW_DQ-NEXT:    vpblendmw %zmm2, %zmm3, %zmm0 {%k1}
410; VL_BW_DQ-NEXT:    retq
411  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
412  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
413  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
414  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
415  %sel = select <32 x i1> %shuf, <32 x i16> %c, <32 x i16> %d
416  ret <32 x i16> %sel
417}
418
; Builds a <32 x i1> mask by concatenating two <16 x i32> compare-with-zero
; results (%cmp1 in lanes 0-15, %cmp2 in lanes 16-31, via the identity mask
; 0..31), permutes it with a 32-entry shuffle (one 16-entry pattern written
; twice), and uses the permuted mask to select per lane between the
; <32 x i8> values %c and %d.
419define <32 x i8> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split(<16 x i32> %a, <16 x i32> %b, <32 x i8> %c, <32 x i8> %d) {
420; AVX512F-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
421; AVX512F:       # %bb.0:
422; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1
423; AVX512F-NEXT:    vptestnmd %zmm1, %zmm1, %k2
424; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
425; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
426; AVX512F-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
427; AVX512F-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
428; AVX512F-NEXT:    vptestmd %zmm4, %zmm4, %k1
429; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
430; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
431; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
432; AVX512F-NEXT:    vpblendvb %ymm0, %ymm2, %ymm3, %ymm0
433; AVX512F-NEXT:    retq
434;
435; AVX512VL-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
436; AVX512VL:       # %bb.0:
437; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1
438; AVX512VL-NEXT:    vptestnmd %zmm1, %zmm1, %k2
439; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1
440; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1
441; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} zmm4 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
442; AVX512VL-NEXT:    vpermi2d %zmm0, %zmm1, %zmm4
443; AVX512VL-NEXT:    vptestmd %zmm4, %zmm4, %k1
444; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
445; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
446; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm0, %ymm0
447; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm0 = ymm3 ^ (ymm0 & (ymm2 ^ ymm3))
448; AVX512VL-NEXT:    retq
449;
450; VL_BW_DQ-LABEL: shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_icmp_v32i8_split:
451; VL_BW_DQ:       # %bb.0:
452; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0
453; VL_BW_DQ-NEXT:    vptestnmd %zmm1, %zmm1, %k1
454; VL_BW_DQ-NEXT:    kunpckwd %k0, %k1, %k0
455; VL_BW_DQ-NEXT:    vpmovm2w %k0, %zmm0
456; VL_BW_DQ-NEXT:    vbroadcasti64x4 {{.*#+}} zmm1 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0,3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0]
457; VL_BW_DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3]
458; VL_BW_DQ-NEXT:    vpermw %zmm0, %zmm1, %zmm0
459; VL_BW_DQ-NEXT:    vpmovw2m %zmm0, %k1
460; VL_BW_DQ-NEXT:    vpblendmb %ymm2, %ymm3, %ymm0 {%k1}
461; VL_BW_DQ-NEXT:    retq
462  %cmp1 = icmp eq <16 x i32> %a, zeroinitializer
463  %cmp2 = icmp eq <16 x i32> %b, zeroinitializer
464  %concat = shufflevector <16 x i1> %cmp1, <16 x i1> %cmp2, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
465  %shuf = shufflevector <32 x i1> %concat, <32 x i1> undef, <32 x i32> <i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0, i32 3, i32 6, i32 22, i32 12, i32 3, i32 7, i32 7, i32 0, i32 3, i32 6, i32 1, i32 13, i32 3, i32 21, i32 7, i32 0>
466  %sel = select <32 x i1> %shuf, <32 x i8> %c, <32 x i8> %d
467  ret <32 x i8> %sel
468}
469
; Bitcasts an i8 to <8 x i1> and shuffles it so that result lanes 1, 4 and 6
; all read source lane 2; every other result lane is undef in the mask.
; Unlike the functions above, the expected code here is listed separately for
; the FAST-ALL and FAST-PERLANE run configurations.
470define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
471; AVX512F-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
472; AVX512F:       # %bb.0:
473; AVX512F-NEXT:    kmovw %edi, %k1
474; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
475; AVX512F-NEXT:    vextracti128 $1, %ymm0, %xmm0
476; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
477; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
478; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
479; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
480; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
481; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
482; AVX512F-NEXT:    vzeroupper
483; AVX512F-NEXT:    retq
484;
485; AVX512VL-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
486; AVX512VL-FAST-ALL:       # %bb.0:
487; AVX512VL-FAST-ALL-NEXT:    kmovw %edi, %k1
488; AVX512VL-FAST-ALL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
489; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
490; AVX512VL-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [2,2,2,2,2,2,2,2]
491; AVX512VL-FAST-ALL-NEXT:    vpermd %ymm1, %ymm2, %ymm1
492; AVX512VL-FAST-ALL-NEXT:    vpslld $31, %ymm1, %ymm1
493; AVX512VL-FAST-ALL-NEXT:    vptestmd %ymm1, %ymm1, %k1
494; AVX512VL-FAST-ALL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
495; AVX512VL-FAST-ALL-NEXT:    vpmovdw %ymm0, %xmm0
496; AVX512VL-FAST-ALL-NEXT:    vzeroupper
497; AVX512VL-FAST-ALL-NEXT:    retq
498;
499; AVX512VL-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
500; AVX512VL-FAST-PERLANE:       # %bb.0:
501; AVX512VL-FAST-PERLANE-NEXT:    kmovw %edi, %k1
502; AVX512VL-FAST-PERLANE-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
503; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
504; AVX512VL-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
505; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastd %xmm1, %ymm1
506; AVX512VL-FAST-PERLANE-NEXT:    vpslld $31, %ymm1, %ymm1
507; AVX512VL-FAST-PERLANE-NEXT:    vptestmd %ymm1, %ymm1, %k1
508; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
509; AVX512VL-FAST-PERLANE-NEXT:    vpmovdw %ymm0, %xmm0
510; AVX512VL-FAST-PERLANE-NEXT:    vzeroupper
511; AVX512VL-FAST-PERLANE-NEXT:    retq
512;
513; VL_BW_DQ-FAST-ALL-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
514; VL_BW_DQ-FAST-ALL:       # %bb.0:
515; VL_BW_DQ-FAST-ALL-NEXT:    kmovd %edi, %k0
516; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2d %k0, %ymm0
517; VL_BW_DQ-FAST-ALL-NEXT:    vpbroadcastd {{.*#+}} ymm1 = [2,2,2,2,2,2,2,2]
518; VL_BW_DQ-FAST-ALL-NEXT:    vpermd %ymm0, %ymm1, %ymm0
519; VL_BW_DQ-FAST-ALL-NEXT:    vpmovd2m %ymm0, %k0
520; VL_BW_DQ-FAST-ALL-NEXT:    vpmovm2w %k0, %xmm0
521; VL_BW_DQ-FAST-ALL-NEXT:    vzeroupper
522; VL_BW_DQ-FAST-ALL-NEXT:    retq
523;
524; VL_BW_DQ-FAST-PERLANE-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
525; VL_BW_DQ-FAST-PERLANE:       # %bb.0:
526; VL_BW_DQ-FAST-PERLANE-NEXT:    kmovd %edi, %k0
527; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2d %k0, %ymm0
528; VL_BW_DQ-FAST-PERLANE-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3]
529; VL_BW_DQ-FAST-PERLANE-NEXT:    vpbroadcastd %xmm0, %ymm0
530; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovd2m %ymm0, %k0
531; VL_BW_DQ-FAST-PERLANE-NEXT:    vpmovm2w %k0, %xmm0
532; VL_BW_DQ-FAST-PERLANE-NEXT:    vzeroupper
533; VL_BW_DQ-FAST-PERLANE-NEXT:    retq
534  %b = bitcast i8 %a to <8 x i1>
535  %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
536  ret <8 x i1> %c
537}
538
; Bitcasts an i8 to <8 x i1> and shuffles it against an all-zero second
; operand. Mask indices 10 and 9 select lanes of that zero vector, so result
; lanes 0 and 2 are false; result lanes 3, 5 and 7 are undef in the mask.
; The shuffled vector is bitcast back to i8 and returned.
539define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
540; AVX512F-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
541; AVX512F:       # %bb.0:
542; AVX512F-NEXT:    kmovw %edi, %k1
543; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
544; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
545; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [8,2,10,0,3,0,2,0]
546; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
547; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
548; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
549; AVX512F-NEXT:    kmovw %k0, %eax
550; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
551; AVX512F-NEXT:    vzeroupper
552; AVX512F-NEXT:    retq
553;
554; AVX512VL-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
555; AVX512VL:       # %bb.0:
556; AVX512VL-NEXT:    kmovw %edi, %k1
557; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
558; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
559; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
560; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
561; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
562; AVX512VL-NEXT:    vpslld $31, %ymm2, %ymm0
563; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
564; AVX512VL-NEXT:    kmovw %k0, %eax
565; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
566; AVX512VL-NEXT:    vzeroupper
567; AVX512VL-NEXT:    retq
568;
569; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
570; VL_BW_DQ:       # %bb.0:
571; VL_BW_DQ-NEXT:    kmovd %edi, %k0
572; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
573; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
574; VL_BW_DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,2,10,3,3,2,2,3]
575; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
576; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
577; VL_BW_DQ-NEXT:    kmovd %k0, %eax
578; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
579; VL_BW_DQ-NEXT:    vzeroupper
580; VL_BW_DQ-NEXT:    retq
581  %b = bitcast i8 %a to <8 x i1>
582  %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
583  %d = bitcast <8 x i1> %c to i8
584  ret i8 %d
585}
586
; Bitcasts an i8 to <8 x i1>, moves source lanes 0, 1, 4 and 5 into result
; lanes 0-3 (result lanes 4-7 are undef in the mask), and bitcasts the
; shuffled vector back to i8.
587define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
588; AVX512F-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
589; AVX512F:       # %bb.0:
590; AVX512F-NEXT:    kmovw %edi, %k1
591; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
592; AVX512F-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,4,5,6,7]
593; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
594; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
595; AVX512F-NEXT:    kmovw %k0, %eax
596; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
597; AVX512F-NEXT:    vzeroupper
598; AVX512F-NEXT:    retq
599;
600; AVX512VL-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
601; AVX512VL:       # %bb.0:
602; AVX512VL-NEXT:    kmovw %edi, %k1
603; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
604; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
605; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
606; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
607; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
608; AVX512VL-NEXT:    kmovw %k0, %eax
609; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
610; AVX512VL-NEXT:    vzeroupper
611; AVX512VL-NEXT:    retq
612;
613; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
614; VL_BW_DQ:       # %bb.0:
615; VL_BW_DQ-NEXT:    kmovd %edi, %k0
616; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
617; VL_BW_DQ-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
618; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
619; VL_BW_DQ-NEXT:    kmovd %k0, %eax
620; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
621; VL_BW_DQ-NEXT:    vzeroupper
622; VL_BW_DQ-NEXT:    retq
623  %b = bitcast i8 %a to <8 x i1>
624  %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
625  %d = bitcast <8 x i1> %c to i8
626  ret i8 %d
627}
628
; Two-operand i1-vector shuffle where the second operand is all zeros.
; The i8 argument is reinterpreted as an 8-lane i1 vector and shuffled with
; the mask <9,6,1,0,3,7,7,0>. Index 9 refers to the zeroinitializer operand,
; so result lane 0 is a constant zero; every other lane comes from %b.
; In all three check groups the expected code materialises a zero vector and
; performs a two-source variable permute (vpermi2q / vpermi2d). The expected
; index constant starts with 8 rather than 9; both indices address a lane of
; the all-zero operand.
; The check lines below are autogenerated (see the note at the top of the
; file); regenerate them instead of editing by hand.
629define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
630; AVX512F-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
631; AVX512F:       # %bb.0:
632; AVX512F-NEXT:    kmovw %edi, %k1
633; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
634; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
635; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
636; AVX512F-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
637; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
638; AVX512F-NEXT:    kmovw %k0, %eax
639; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
640; AVX512F-NEXT:    vzeroupper
641; AVX512F-NEXT:    retq
642;
643; AVX512VL-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
644; AVX512VL:       # %bb.0:
645; AVX512VL-NEXT:    kmovw %edi, %k1
646; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
647; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
648; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
649; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
650; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
651; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
652; AVX512VL-NEXT:    kmovw %k0, %eax
653; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
654; AVX512VL-NEXT:    vzeroupper
655; AVX512VL-NEXT:    retq
656;
657; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
658; VL_BW_DQ:       # %bb.0:
659; VL_BW_DQ-NEXT:    kmovd %edi, %k0
660; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
661; VL_BW_DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
662; VL_BW_DQ-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [8,6,1,0,3,7,7,0]
663; VL_BW_DQ-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
664; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
665; VL_BW_DQ-NEXT:    kmovd %k0, %eax
666; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
667; VL_BW_DQ-NEXT:    vzeroupper
668; VL_BW_DQ-NEXT:    retq
  ; View the scalar argument as eight i1 lanes.
669  %b = bitcast i8 %a to <8 x i1>
  ; Indices 0-7 select from %b, indices 8-15 select from the zero vector.
670  %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  ; Pack the shuffled lanes back into a scalar for the return.
671  %d = bitcast <8 x i1>%c to i8
672  ret i8 %d
673}
674
; Two-operand i1-vector shuffle where the FIRST operand is all zeros.
; The i8 argument is reinterpreted as an 8-lane i1 vector and used as the
; second shuffle operand. With the mask <9,6,1,10,3,7,7,0> only indices 9 and
; 10 reach %b (its lanes 1 and 2, placed in result lanes 0 and 3); every other
; index selects a lane of the zeroinitializer operand.
; Expected code: the AVX512F run uses vpermt2q into a zeroed register, while
; the other two check groups use a single vpshufb whose remaining bytes are
; zeroed.
; The check lines below are autogenerated (see the note at the top of the
; file); regenerate them instead of editing by hand.
675define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
676; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
677; AVX512F:       # %bb.0:
678; AVX512F-NEXT:    kmovw %edi, %k1
679; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
680; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
681; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
682; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
683; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
684; AVX512F-NEXT:    kmovw %k0, %eax
685; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
686; AVX512F-NEXT:    vzeroupper
687; AVX512F-NEXT:    retq
688;
689; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
690; AVX512VL:       # %bb.0:
691; AVX512VL-NEXT:    kmovw %edi, %k1
692; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
693; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
694; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
695; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
696; AVX512VL-NEXT:    kmovw %k0, %eax
697; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
698; AVX512VL-NEXT:    vzeroupper
699; AVX512VL-NEXT:    retq
700;
701; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
702; VL_BW_DQ:       # %bb.0:
703; VL_BW_DQ-NEXT:    kmovd %edi, %k0
704; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
705; VL_BW_DQ-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[4,5,6,7],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
706; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
707; VL_BW_DQ-NEXT:    kmovd %k0, %eax
708; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
709; VL_BW_DQ-NEXT:    vzeroupper
710; VL_BW_DQ-NEXT:    retq
  ; View the scalar argument as eight i1 lanes.
711  %b = bitcast i8 %a to <8 x i1>
  ; Operand order is swapped relative to the previous test: indices 0-7 select
  ; zero lanes, indices 8-15 select from %b.
712  %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
  ; Pack the shuffled lanes back into a scalar for the return.
713  %d = bitcast <8 x i1>%c to i8
714  ret i8 %d
715}
716
; Two-operand i1-vector shuffle whose first operand is a non-uniform constant.
; The first operand is the constant <1,1,0,0,1,1,0,0>; the second is the i8
; argument reinterpreted as an 8-lane i1 vector. With the mask
; <9,6,1,0,3,7,7,1> only index 9 reaches %b (its lane 1, placed in result
; lane 0); all other result lanes are picked from the constant.
; Expected code: the AVX512F run uses vpermi2q with a constant-pool operand,
; while the other two check groups splat lane 1 with vpshufd and blend lanes
; 1-7 from a memory constant with vpblendd.
; NOTE(review): the function name spells "10" in the fourth position (and has
; a double underscore), but the mask below uses index 0 there. The name is
; kept because it is part of the LABEL checks.
; The check lines below are autogenerated (see the note at the top of the
; file); regenerate them instead of editing by hand.
717define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
718; AVX512F-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
719; AVX512F:       # %bb.0:
720; AVX512F-NEXT:    kmovw %edi, %k1
721; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
722; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [1,14,9,8,11,15,15,9]
723; AVX512F-NEXT:    vpermi2q {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
724; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k0
725; AVX512F-NEXT:    kmovw %k0, %eax
726; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
727; AVX512F-NEXT:    vzeroupper
728; AVX512F-NEXT:    retq
729;
730; AVX512VL-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
731; AVX512VL:       # %bb.0:
732; AVX512VL-NEXT:    kmovw %edi, %k1
733; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
734; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
735; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
736; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
737; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k0
738; AVX512VL-NEXT:    kmovw %k0, %eax
739; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
740; AVX512VL-NEXT:    vzeroupper
741; AVX512VL-NEXT:    retq
742;
743; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
744; VL_BW_DQ:       # %bb.0:
745; VL_BW_DQ-NEXT:    kmovd %edi, %k0
746; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
747; VL_BW_DQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
748; VL_BW_DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3,4,5,6,7]
749; VL_BW_DQ-NEXT:    vpmovd2m %ymm0, %k0
750; VL_BW_DQ-NEXT:    kmovd %k0, %eax
751; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
752; VL_BW_DQ-NEXT:    vzeroupper
753; VL_BW_DQ-NEXT:    retq
  ; View the scalar argument as eight i1 lanes.
754  %b = bitcast i8 %a to <8 x i1>
  ; Indices 0-7 select from the constant, indices 8-15 select from %b.
755  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
  ; Pack the shuffled lanes back into a scalar for the return.
756  %c1 = bitcast <8 x i1>%c to i8
757  ret i8 %c1
758}
759
; Two-operand i1-vector shuffle whose first operand is all ones.
; Unlike the neighbouring tests, the input arrives as an <8 x i1> vector
; argument rather than an i8 scalar. With the mask <9,6,1,0,3,7,7,0> only
; index 9 reaches %a (its lane 1, placed in result lane 0); all other result
; lanes come from the all-ones constant. The result is returned as i8.
; Expected code: the argument is first converted to a mask, then a two-source
; permute with the index vector [9,1,2,3,4,5,6,7] combines it with an all-ones
; register.
; NOTE(review): the function name spells "10" in the fourth position, but the
; mask below uses index 0 there. The name is kept because it is part of the
; LABEL checks.
; The check lines below are autogenerated (see the note at the top of the
; file); regenerate them instead of editing by hand.
760define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
761; AVX512F-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
762; AVX512F:       # %bb.0:
763; AVX512F-NEXT:    vpmovsxwq %xmm0, %zmm0
764; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
765; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
766; AVX512F-NEXT:    vpternlogq {{.*#+}} zmm0 {%k1} {z} = -1
767; AVX512F-NEXT:    vpmovsxbq {{.*#+}} zmm1 = [9,1,2,3,4,5,6,7]
768; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
769; AVX512F-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
770; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k0
771; AVX512F-NEXT:    kmovw %k0, %eax
772; AVX512F-NEXT:    # kill: def $al killed $al killed $eax
773; AVX512F-NEXT:    vzeroupper
774; AVX512F-NEXT:    retq
775;
776; AVX512VL-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
777; AVX512VL:       # %bb.0:
778; AVX512VL-NEXT:    vpmovsxwd %xmm0, %ymm0
779; AVX512VL-NEXT:    vpslld $31, %ymm0, %ymm0
780; AVX512VL-NEXT:    vptestmd %ymm0, %ymm0, %k1
781; AVX512VL-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
782; AVX512VL-NEXT:    vmovdqa32 %ymm0, %ymm1 {%k1} {z}
783; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} ymm2 = [9,1,2,3,4,5,6,7]
784; AVX512VL-NEXT:    vpermi2d %ymm1, %ymm0, %ymm2
785; AVX512VL-NEXT:    vptestmd %ymm2, %ymm2, %k0
786; AVX512VL-NEXT:    kmovw %k0, %eax
787; AVX512VL-NEXT:    # kill: def $al killed $al killed $eax
788; AVX512VL-NEXT:    vzeroupper
789; AVX512VL-NEXT:    retq
790;
791; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
792; VL_BW_DQ:       # %bb.0:
793; VL_BW_DQ-NEXT:    vpsllw $15, %xmm0, %xmm0
794; VL_BW_DQ-NEXT:    vpmovw2m %xmm0, %k0
795; VL_BW_DQ-NEXT:    vpmovm2d %k0, %ymm0
796; VL_BW_DQ-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [9,1,2,3,4,5,6,7]
797; VL_BW_DQ-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
798; VL_BW_DQ-NEXT:    vpermt2d %ymm0, %ymm1, %ymm2
799; VL_BW_DQ-NEXT:    vpmovd2m %ymm2, %k0
800; VL_BW_DQ-NEXT:    kmovd %k0, %eax
801; VL_BW_DQ-NEXT:    # kill: def $al killed $al killed $eax
802; VL_BW_DQ-NEXT:    vzeroupper
803; VL_BW_DQ-NEXT:    retq
  ; Indices 0-7 select from the all-ones constant, indices 8-15 select from %a.
804  %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
  ; Pack the shuffled lanes into a scalar for the return.
805  %c1 = bitcast <8 x i1>%c to i8
806  ret i8 %c1
807}
808
; Splat of lane 0 across a 16-lane i1 vector.
; The i16 argument is reinterpreted as a 16-lane i1 vector, an all-zero
; shuffle mask replicates lane 0 into every lane, and the result is
; reinterpreted as i16 for the return value.
; Expected code in all three check groups: expand the mask into a zmm
; register, vpbroadcastd the low element, and convert back to a mask.
; The check lines below are autogenerated (see the note at the top of the
; file); regenerate them instead of editing by hand.
809define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
810; AVX512F-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
811; AVX512F:       # %bb.0:
812; AVX512F-NEXT:    kmovw %edi, %k1
813; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
814; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
815; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
816; AVX512F-NEXT:    kmovw %k0, %eax
817; AVX512F-NEXT:    # kill: def $ax killed $ax killed $eax
818; AVX512F-NEXT:    vzeroupper
819; AVX512F-NEXT:    retq
820;
821; AVX512VL-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
822; AVX512VL:       # %bb.0:
823; AVX512VL-NEXT:    kmovw %edi, %k1
824; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
825; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
826; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
827; AVX512VL-NEXT:    kmovw %k0, %eax
828; AVX512VL-NEXT:    # kill: def $ax killed $ax killed $eax
829; AVX512VL-NEXT:    vzeroupper
830; AVX512VL-NEXT:    retq
831;
832; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
833; VL_BW_DQ:       # %bb.0:
834; VL_BW_DQ-NEXT:    kmovd %edi, %k0
835; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
836; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
837; VL_BW_DQ-NEXT:    vpmovd2m %zmm0, %k0
838; VL_BW_DQ-NEXT:    kmovd %k0, %eax
839; VL_BW_DQ-NEXT:    # kill: def $ax killed $ax killed $eax
840; VL_BW_DQ-NEXT:    vzeroupper
841; VL_BW_DQ-NEXT:    retq
  ; View the scalar argument as sixteen i1 lanes.
842  %b = bitcast i16 %a to <16 x i1>
  ; A zeroinitializer mask selects lane 0 of %b for every result lane.
843  %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
  ; Pack the splatted lanes back into a scalar for the return.
844  %d = bitcast <16 x i1> %c to i16
845  ret i16 %d
846}
847
; Splat of lane 0 across a 64-lane i1 vector.
; The i64 argument is reinterpreted as a 64-lane i1 vector, an all-zero
; shuffle mask replicates lane 0 into every lane, and the result is
; reinterpreted as i64 for the return value.
; Expected code: the AVX512F and AVX512VL check groups only use 16-bit mask
; moves (kmovw); they splat within one zmm of dwords and then replicate the
; resulting 16-bit mask to 64 bits with scalar shifts and ors. The VL_BW_DQ
; check group handles the whole 64-bit mask directly with kmovq, vpmovm2b,
; vpbroadcastb and vpmovb2m.
; The check lines below are autogenerated (see the note at the top of the
; file); regenerate them instead of editing by hand.
848define i64 @shuf64i1_zero(i64 %a) {
849; AVX512F-LABEL: shuf64i1_zero:
850; AVX512F:       # %bb.0:
851; AVX512F-NEXT:    kmovw %edi, %k1
852; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
853; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
854; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
855; AVX512F-NEXT:    kmovw %k0, %eax
856; AVX512F-NEXT:    kmovw %k0, %ecx
857; AVX512F-NEXT:    shll $16, %ecx
858; AVX512F-NEXT:    orl %eax, %ecx
859; AVX512F-NEXT:    movq %rcx, %rax
860; AVX512F-NEXT:    shlq $32, %rax
861; AVX512F-NEXT:    orq %rcx, %rax
862; AVX512F-NEXT:    vzeroupper
863; AVX512F-NEXT:    retq
864;
865; AVX512VL-LABEL: shuf64i1_zero:
866; AVX512VL:       # %bb.0:
867; AVX512VL-NEXT:    kmovw %edi, %k1
868; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
869; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
870; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k0
871; AVX512VL-NEXT:    kmovw %k0, %eax
872; AVX512VL-NEXT:    kmovw %k0, %ecx
873; AVX512VL-NEXT:    shll $16, %ecx
874; AVX512VL-NEXT:    orl %eax, %ecx
875; AVX512VL-NEXT:    movq %rcx, %rax
876; AVX512VL-NEXT:    shlq $32, %rax
877; AVX512VL-NEXT:    orq %rcx, %rax
878; AVX512VL-NEXT:    vzeroupper
879; AVX512VL-NEXT:    retq
880;
881; VL_BW_DQ-LABEL: shuf64i1_zero:
882; VL_BW_DQ:       # %bb.0:
883; VL_BW_DQ-NEXT:    kmovq %rdi, %k0
884; VL_BW_DQ-NEXT:    vpmovm2b %k0, %zmm0
885; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
886; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
887; VL_BW_DQ-NEXT:    kmovq %k0, %rax
888; VL_BW_DQ-NEXT:    vzeroupper
889; VL_BW_DQ-NEXT:    retq
  ; View the scalar argument as sixty-four i1 lanes.
890  %b = bitcast i64 %a to <64 x i1>
  ; A zeroinitializer mask selects lane 0 of %b for every result lane.
891  %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
  ; Pack the splatted lanes back into a scalar for the return.
892  %d = bitcast <64 x i1> %c to i64
893  ret i64 %d
894}
895
; Splat of an i1 compare result combined with an incoming mask.
; %in is inserted into lane 0 of an otherwise undef <16 x i32>, multiplied by
; a constant vector that holds 789 in lane 0 and poison elsewhere, and compared
; against zero. Lane 0 of that compare is splatted to all 16 lanes and ANDed
; with %msk, which is the return value.
; Expected code: the multiply is done on xmm registers only, the product is
; broadcast to a zmm, and the compare against zero is a vptestnmd issued under
; the mask register derived from %msk.
; NOTE(review): the name suggests this is a regression test for bug report
; PR52500 - confirm against the bug tracker before changing the IR.
; The check lines below are autogenerated (see the note at the top of the
; file); regenerate them instead of editing by hand.
896define <16 x i1> @PR52500(<16 x i1> %msk, i32 %in) {
897; AVX512F-LABEL: PR52500:
898; AVX512F:       # %bb.0:
899; AVX512F-NEXT:    vpmovsxbd %xmm0, %zmm0
900; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
901; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
902; AVX512F-NEXT:    vmovd %edi, %xmm0
903; AVX512F-NEXT:    movl $789, %eax # imm = 0x315
904; AVX512F-NEXT:    vmovd %eax, %xmm1
905; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
906; AVX512F-NEXT:    vpbroadcastd %xmm0, %zmm0
907; AVX512F-NEXT:    vptestnmd %zmm0, %zmm0, %k1 {%k1}
908; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
909; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
910; AVX512F-NEXT:    vzeroupper
911; AVX512F-NEXT:    retq
912;
913; AVX512VL-LABEL: PR52500:
914; AVX512VL:       # %bb.0:
915; AVX512VL-NEXT:    vpmovsxbd %xmm0, %zmm0
916; AVX512VL-NEXT:    vpslld $31, %zmm0, %zmm0
917; AVX512VL-NEXT:    vptestmd %zmm0, %zmm0, %k1
918; AVX512VL-NEXT:    vmovd %edi, %xmm0
919; AVX512VL-NEXT:    movl $789, %eax # imm = 0x315
920; AVX512VL-NEXT:    vmovd %eax, %xmm1
921; AVX512VL-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
922; AVX512VL-NEXT:    vpbroadcastd %xmm0, %zmm0
923; AVX512VL-NEXT:    vptestnmd %zmm0, %zmm0, %k1 {%k1}
924; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
925; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
926; AVX512VL-NEXT:    vzeroupper
927; AVX512VL-NEXT:    retq
928;
929; VL_BW_DQ-LABEL: PR52500:
930; VL_BW_DQ:       # %bb.0:
931; VL_BW_DQ-NEXT:    vpsllw $7, %xmm0, %xmm0
932; VL_BW_DQ-NEXT:    vpmovb2m %xmm0, %k1
933; VL_BW_DQ-NEXT:    vmovd %edi, %xmm0
934; VL_BW_DQ-NEXT:    movl $789, %eax # imm = 0x315
935; VL_BW_DQ-NEXT:    vmovd %eax, %xmm1
936; VL_BW_DQ-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
937; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
938; VL_BW_DQ-NEXT:    vptestnmd %zmm0, %zmm0, %k0 {%k1}
939; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
940; VL_BW_DQ-NEXT:    vzeroupper
941; VL_BW_DQ-NEXT:    retq
  ; Only lane 0 of the vector carries a defined value (%in).
942  %insrt = insertelement <16 x i32> undef, i32 %in, i32 0
  ; Only lane 0 of the multiplier is defined (789); the rest is poison.
943  %mul = mul <16 x i32> %insrt, <i32 789, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
944  %eq = icmp eq <16 x i32> %mul, zeroinitializer
  ; Replicate the lane-0 compare result into all 16 lanes.
945  %cmp1 = shufflevector <16 x i1> %eq, <16 x i1> poison, <16 x i32> zeroinitializer
  ; Combine the splatted compare with the caller-provided mask.
946  %and = and <16 x i1> %cmp1, %msk
947  ret <16 x i1> %and
948}
949