; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

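; These tests extract every Nth element, starting at a non-zero offset, from a
; 256-bit vector loaded from memory, and store the narrowed result.

; Stride 2, offset 1: keep the odd bytes of the <32 x i8> load.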
define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512F-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512F-NEXT:    vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX512VL-NEXT:    vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
  store <16 x i8> %strided.vec, ptr %S
  ret void
}

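; Stride 2, offset 1: keep the odd i16 elements of the <16 x i16> load.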
define void @shuffle_v16i16_to_v8i16_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vpsrld $16, %xmm1, %xmm1
; AVX-NEXT:    vpsrld $16, %xmm0, %xmm0
; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i16> %strided.vec, ptr %S
  ret void
}

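; Stride 2, offset 1: keep the odd i32 elements of the <8 x i32> load.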
define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovaps (%rdi), %xmm0
; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX-NEXT:    vmovaps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i32_to_v4i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovaps (%rdi), %xmm0
; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],mem[1,3]
; AVX512-NEXT:    vmovaps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i32>, ptr %L
  %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i32> %strided.vec, ptr %S
  ret void
}

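; Stride 4, offset 1: bytes 1,5,9,... of the <32 x i8> load.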
define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $8, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

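; Stride 4, offset 2: bytes 2,6,10,... of the <32 x i8> load.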
define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

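; Stride 4, offset 3: bytes 3,7,11,... of the <32 x i8> load.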
define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind {
; AVX-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX-NEXT:    vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $24, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrld $24, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrld $24, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

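; Stride 4, offset 1: i16 elements 1,5,9,13 of the <16 x i16> load.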
define void @shuffle_v16i16_to_v4i16_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [2,3,10,11,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

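; Stride 4, offset 2: i16 elements 2,6,10,14 of the <16 x i16> load.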
define void @shuffle_v16i16_to_v4i16_2(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [4,5,12,13,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

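; Stride 4, offset 3: i16 elements 3,7,11,15 of the <16 x i16> load.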
define void @shuffle_v16i16_to_v4i16_3(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX1-NEXT:    vmovq %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm1 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; AVX2-SLOW-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-FAST-NEXT:    vmovdqa {{.*#+}} xmm2 = [6,7,14,15,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FAST-NEXT:    vmovq %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <16 x i16>, ptr %L
  %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

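; Stride 8, offset 1: bytes 1,9,17,25 of the <32 x i8> load.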
define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [1,9,1,9,1,9,1,9,1,9,1,9,1,9,1,9]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $8, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $8, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 1, i32 9, i32 17, i32 25>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

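; Stride 8, offset 2: bytes 2,10,18,26 of the <32 x i8> load.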
define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [2,10,2,10,2,10,2,10,2,10,2,10,2,10,2,10]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $16, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $16, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 2, i32 10, i32 18, i32 26>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

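; Stride 8, offset 3: bytes 3,11,19,27 of the <32 x i8> load.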
define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [3,11,3,11,3,11,3,11,3,11,3,11,3,11,3,11]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $24, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $24, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $24, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $24, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

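; Stride 8, offset 4: bytes 4,12,20,28 of the <32 x i8> load.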
define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [4,12,4,12,4,12,4,12,4,12,4,12,4,12,4,12]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $32, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_4:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $32, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 4, i32 12, i32 20, i32 28>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

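; Stride 8, offset 5: bytes 5,13,21,29 of the <32 x i8> load.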
define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [5,13,5,13,5,13,5,13,5,13,5,13,5,13,5,13]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $40, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $40, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $40, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_5:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $40, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 5, i32 13, i32 21, i32 29>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

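; Stride 8, offset 6: bytes 6,14,22,30 of the <32 x i8> load.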
define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [6,14,6,14,6,14,6,14,6,14,6,14,6,14,6,14]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $48, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_6:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $48, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 6, i32 14, i32 22, i32 30>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

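; Stride 8, offset 7: bytes 7,15,23,31 of the <32 x i8> load.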
define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind {
; AVX1-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovdqa (%rdi), %xmm0
; AVX1-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX1-NEXT:    vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-NEXT:    vmovdqa 16(%rdi), %xmm1
; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm2 = [7,15,7,15,7,15,7,15,7,15,7,15,7,15,7,15]
; AVX2-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512F-NEXT:    vpsrlq $56, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $56, (%rdi), %ymm0
; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
; AVX512BW-NEXT:    vpsrlq $56, %ymm0, %ymm0
; AVX512BW-NEXT:    vpmovqb %zmm0, %xmm0
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8_7:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlq $56, (%rdi), %ymm0
; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
  %vec = load <32 x i8>, ptr %L
  %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 7, i32 15, i32 23, i32 31>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}