; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-ALL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FAST-PERLANE
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VBMI

; PR31551
; Pairs of shufflevector/trunc functions that are functionally equivalent.
; Ideally, the shuffles should be lowered to code of the same quality as the truncates.

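; Even-index bytes of <64 x i8> are the low bytes of each i16, so this shuffle
; is equivalent to the <32 x i16> -> <32 x i8> truncate in trunc_v32i16_to_v32i8.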
define void @shuffle_v64i8_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: shuffle_v64i8_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-ALL-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30]
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u,16,18,20,22,24,26,28,30,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT:    vpmovsxbq {{.*#+}} ymm2 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT:    vpermi2q %ymm1, %ymm0, %ymm2
; AVX512VL-FAST-ALL-NEXT:    vmovdqa %ymm2, (%rsi)
; AVX512VL-FAST-ALL-NEXT:    vzeroupper
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa 32(%rdi), %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14]
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT:    vmovdqa %ymm0, (%rsi)
; AVX512VL-FAST-PERLANE-NEXT:    vzeroupper
; AVX512VL-FAST-PERLANE-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: shuffle_v64i8_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v32i16_to_v32i8(ptr %L, ptr %S) nounwind {
; AVX512F-LABEL: trunc_v32i16_to_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512F-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512F-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
; AVX512VL-NEXT:    vpmovdb %zmm1, 16(%rsi)
; AVX512VL-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_v32i16_to_v32i8:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, (%rsi)
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <32 x i16>
  %strided.vec = trunc <32 x i16> %bc to <32 x i8>
  store <32 x i8> %strided.vec, ptr %S
  ret void
}

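; Even-index i16 elements are the low halves of each i32, so this shuffle is
; equivalent to the <16 x i32> -> <16 x i16> truncate in trunc_v16i32_to_v16i16.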
define void @shuffle_v32i16_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i16>
  store <16 x i16> %strided.vec, ptr %S
  ret void
}

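; Even-index i32 elements are the low halves of each i64, so this shuffle is
; equivalent to the <8 x i64> -> <8 x i32> truncate in trunc_v8i64_to_v8i32.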
define void @shuffle_v16i32_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v16i32_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, ptr %L
  %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i32(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <16 x i32>, ptr %L
  %bc = bitcast <16 x i32> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i32>
  store <8 x i32> %strided.vec, ptr %S
  ret void
}

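; Stride-4 byte extraction selects byte 0 of each i32, so this shuffle is
; equivalent to the <16 x i32> -> <16 x i8> truncate in trunc_v16i32_to_v16i8.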
define void @shuffle_v64i8_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v16i32_to_v16i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v16i32_to_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <16 x i32>
  %strided.vec = trunc <16 x i32> %bc to <16 x i8>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

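; Stride-4 i16 extraction selects word 0 of each i64, so this shuffle is
; equivalent to the <8 x i64> -> <8 x i16> truncate in trunc_v8i64_to_v8i16.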
define void @shuffle_v32i16_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v32i16_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i16(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <32 x i16>, ptr %L
  %bc = bitcast <32 x i16> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i16>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

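; Stride-8 byte extraction selects byte 0 of each i64, so this shuffle is
; equivalent to the <8 x i64> -> <8 x i8> truncate in trunc_v8i64_to_v8i8.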
define void @shuffle_v64i8_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: shuffle_v64i8_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

define void @trunc_v8i64_to_v8i8(ptr %L, ptr %S) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %vec = load <64 x i8>, ptr %L
  %bc = bitcast <64 x i8> %vec to <8 x i64>
  %strided.vec = trunc <8 x i64> %bc to <8 x i8>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

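; Offset-1, stride-4 byte extraction selects byte 1 of each i32, so this can
; lower to a logical right shift by 8 followed by an i32 -> i8 truncate.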
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61(<64 x i8> %x) {
; AVX512-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_61:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrld $8, %zmm0, %zmm0
; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 61>
  ret <16 x i8> %res
}

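; Same as above except that the final index is 62 rather than 61, which breaks
; the shift+truncate pattern and requires a genuine shuffle lowering.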
define <16 x i8> @trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62(<64 x i8> %x) {
; AVX512F-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512F-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
; AVX512VL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,16,21]
; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vpermt2d %zmm2, %zmm1, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [0,4,8,13]
; AVX512BWVL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm2 = ymm2[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,30,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u,17,21,25,29,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vpermt2d %ymm2, %ymm1, %ymm0
; AVX512BWVL-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v64i8_01_05_09_13_17_21_25_29_33_37_41_45_49_53_57_62:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,5,9,13,17,21,25,29,33,37,41,45,49,53,57,62]
; AVX512VBMI-NEXT:    vpermb %zmm0, %zmm1, %zmm0
; AVX512VBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI-NEXT:    vzeroupper
; AVX512VBMI-NEXT:    retq
  %res = shufflevector <64 x i8> %x, <64 x i8> %x, <16 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 33, i32 37, i32 41, i32 45, i32 49, i32 53, i32 57, i32 62>
  ret <16 x i8> %res
}

; PR111611
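; Odd-index bytes are the high bytes of each i16, so this lowers to a logical
; right shift by 8 followed by a <32 x i16> -> <32 x i8> truncate.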
define <32 x i8> @trunc_shuffle_v32i16_v32i8_ofs1(<32 x i16> %a0) {
; AVX512F-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512F-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512F-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512F-NEXT:    retq
;
; AVX512VL-FAST-ALL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VL-FAST-ALL:       # %bb.0:
; AVX512VL-FAST-ALL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31]
; AVX512VL-FAST-ALL-NEXT:    vpshufb {{.*#+}} ymm2 = ymm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u,17,19,21,23,25,27,29,31,u,u,u,u,u,u,u,u]
; AVX512VL-FAST-ALL-NEXT:    vpmovsxbq {{.*#+}} ymm0 = [0,2,5,7]
; AVX512VL-FAST-ALL-NEXT:    vpermi2q %ymm1, %ymm2, %ymm0
; AVX512VL-FAST-ALL-NEXT:    retq
;
; AVX512VL-FAST-PERLANE-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VL-FAST-PERLANE:       # %bb.0:
; AVX512VL-FAST-PERLANE-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
; AVX512VL-FAST-PERLANE-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
; AVX512VL-FAST-PERLANE-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
; AVX512VL-FAST-PERLANE-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
; AVX512VL-FAST-PERLANE-NEXT:    retq
;
; AVX512BW-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: trunc_shuffle_v32i16_v32i8_ofs1:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512VBMI-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VBMI-NEXT:    retq
  %bc = bitcast <32 x i16> %a0 to <64 x i8>
  %res = shufflevector <64 x i8> %bc, <64 x i8> poison, <32 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31, i32 33, i32 35, i32 37, i32 39, i32 41, i32 43, i32 45, i32 47, i32 49, i32 51, i32 53, i32 55, i32 57, i32 59, i32 61, i32 63>
  ret <32 x i8> %res
}

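; Strided extraction of i16 elements 0, 8, 16 and 24, then uitofp to double.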
define <4 x double> @PR34175(ptr %p) {
; AVX512F-LABEL: PR34175:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: PR34175:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqu (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqu 32(%rdi), %xmm1
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: PR34175:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BW-NEXT:    vpermw (%rdi), %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BW-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: PR34175:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512BWVL-NEXT:    vpermw (%rdi), %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512BWVL-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; AVX512VBMI-LABEL: PR34175:
; AVX512VBMI:       # %bb.0:
; AVX512VBMI-NEXT:    vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0]
; AVX512VBMI-NEXT:    vpermw (%rdi), %zmm0, %zmm0
; AVX512VBMI-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
; AVX512VBMI-NEXT:    vcvtdq2pd %xmm0, %ymm0
; AVX512VBMI-NEXT:    retq
  %v = load <32 x i16>, ptr %p, align 2
  %shuf = shufflevector <32 x i16> %v, <32 x i16> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
  %tofp = uitofp <4 x i16> %shuf to <4 x double>
  ret <4 x double> %tofp
}

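; The zero-fill shuffle widens the truncated <8 x i8> to <16 x i8>; vpmovqb to
; an xmm destination already zeroes the upper bytes, so one instruction suffices.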
define <16 x i8> @trunc_v8i64_to_v8i8_return_v16i8(<8 x i64> %vec) nounwind {
; AVX512-LABEL: trunc_v8i64_to_v8i8_return_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
  %truncated = trunc <8 x i64> %vec to <8 x i8>
  %result = shufflevector <8 x i8> %truncated, <8 x i8> zeroinitializer, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
  ret <16 x i8> %result
}