; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefixes=SSE,SSE42
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2-SLOW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX,AVX2-FAST
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL

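; These tests cover strided extracts with a nonzero starting offset from
; 128-bit vectors: every 2nd, 4th, or 8th element is taken, starting at
; element 1 or later, and the narrowed vector is stored out.

; Extract the odd-indexed bytes of a <16 x i8> load (stride 2, offset 1) and
; store them as <8 x i8>. SSE2 lowers this with a word shift plus pack;
; SSE4.2 and later use a single pshufb.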
define void @shuffle_v16i8_to_v8i8_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v8i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v8i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
  store <8 x i8> %strided.vec, ptr %S
  ret void
}

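; Extract the odd-indexed words of an <8 x i16> load (stride 2, offset 1).
; Without a byte shuffle, SSE2 chains pshuflw/pshufhw/pshufd; SSE4.2 and later
; use a single pshufb.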
define void @shuffle_v8i16_to_v4i16_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    movq %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v8i16_to_v4i16_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movq %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovq %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v8i16_to_v4i16_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovq %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
  store <4 x i16> %strided.vec, ptr %S
  ret void
}

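; Extract the odd-indexed dwords of a <4 x i32> load (stride 2, offset 1); a
; single dword shuffle folded with the memory operand suffices on all targets.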
define void @shuffle_v4i32_to_v2i32_1(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v4i32_to_v2i32_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[1,3,2,3]
; SSE-NEXT:    movq %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX-NEXT:    vmovlps %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v4i32_to_v2i32_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpermilps {{.*#+}} xmm0 = mem[1,3,2,3]
; AVX512-NEXT:    vmovlps %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <4 x i32>, ptr %L
  %strided.vec = shufflevector <4 x i32> %vec, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
  store <2 x i32> %strided.vec, ptr %S
  ret void
}

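; Take every 4th byte starting at offset 1 (elements 1,5,9,13) and store them
; as <4 x i8>. SSE2 unpacks against zero and reshuffles before packing back
; down; SSE4.2 and later need only one pshufb.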
define void @shuffle_v16i8_to_v4i8_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,5,9,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

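; The same stride-4 byte extract at offset 2 (elements 2,6,10,14). Here SSE2
; masks the shuffled words with pand instead of unpacking against zero.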
define void @shuffle_v16i8_to_v4i8_2(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = mem[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7]
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,6,10,14,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

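; The same stride-4 byte extract at offset 3 (elements 3,7,11,15).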
define void @shuffle_v16i8_to_v4i8_3(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v4i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    movd %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vmovd %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v4i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,7,11,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vmovd %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  store <4 x i8> %strided.vec, ptr %S
  ret void
}

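; Stride-4 word extract at offset 1 (elements 1 and 5 of an <8 x i16>). This
; is where the AVX checks split: AVX2-FAST and the AVX512VL/BW variants use
; vpshufb, while AVX1, AVX2-SLOW, and AVX512F keep fixed word/dword shuffles.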
define void @shuffle_v8i16_to_v2i16_1(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_1:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_1:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 1, i32 5>
  store <2 x i16> %strided.vec, ptr %S
  ret void
}

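; Stride-4 word extract at offset 2 (elements 2 and 6).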
define void @shuffle_v8i16_to_v2i16_2(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_2:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_2:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 2, i32 6>
  store <2 x i16> %strided.vec, ptr %S
  ret void
}

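; Stride-4 word extract at offset 3 (elements 3 and 7).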
define void @shuffle_v8i16_to_v2i16_3(ptr %L, ptr %S) nounwind {
; SSE-LABEL: shuffle_v8i16_to_v2i16_3:
; SSE:       # %bb.0:
; SSE-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE-NEXT:    movd %xmm0, (%rsi)
; SSE-NEXT:    retq
;
; AVX1-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX1-NEXT:    vmovd %xmm0, (%rsi)
; AVX1-NEXT:    retq
;
; AVX2-SLOW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-SLOW:       # %bb.0:
; AVX2-SLOW-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX2-SLOW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX2-SLOW-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-SLOW-NEXT:    retq
;
; AVX2-FAST-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX2-FAST:       # %bb.0:
; AVX2-FAST-NEXT:    vmovdqa (%rdi), %xmm0
; AVX2-FAST-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT:    vmovd %xmm0, (%rsi)
; AVX2-FAST-NEXT:    retq
;
; AVX512F-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512VL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BW-NEXT:    retq
;
; AVX512BWVL-LABEL: shuffle_v8i16_to_v2i16_3:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512BWVL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512BWVL-NEXT:    vmovd %xmm0, (%rsi)
; AVX512BWVL-NEXT:    retq
  %vec = load <8 x i16>, ptr %L
  %strided.vec = shufflevector <8 x i16> %vec, <8 x i16> undef, <2 x i32> <i32 3, i32 7>
  store <2 x i16> %strided.vec, ptr %S
  ret void
}

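; Stride-8 byte extract at offset 1 (elements 1 and 9), stored as <2 x i8>.
; SSE2 bounces through a GPR (movd + movw); SSE4.2 and later store the pair
; directly with pextrw $0.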
define void @shuffle_v16i8_to_v2i8_1(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_1:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_1:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 1, i32 9>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

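; Stride-8 byte extract at offset 2 (elements 2 and 10).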
define void @shuffle_v16i8_to_v2i8_2(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = mem[0,2,2,3]
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_2:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_2:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[2,10,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 2, i32 10>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

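; Stride-8 byte extract at offset 3 (elements 3 and 11).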
define void @shuffle_v16i8_to_v2i8_3(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,3,3,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_3:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_3:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[3,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 3, i32 11>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

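; Stride-8 byte extract at offset 4 (elements 4 and 12).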
define void @shuffle_v16i8_to_v2i8_4(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[2,0,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_4:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_4:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,12,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 4, i32 12>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

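; Stride-8 byte extract at offset 5 (elements 5 and 13).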
define void @shuffle_v16i8_to_v2i8_5(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_5:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_5:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[5,13,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 5, i32 13>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

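; Stride-8 byte extract at offset 6 (elements 6 and 14).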
define void @shuffle_v16i8_to_v2i8_6(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = mem[3,1,2,3]
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_6:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_6:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[6,14,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 6, i32 14>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}

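; Stride-8 byte extract at offset 7 (elements 7 and 15).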
define void @shuffle_v16i8_to_v2i8_7(ptr %L, ptr %S) nounwind {
; SSE2-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa (%rdi), %xmm0
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[3,3,3,3]
; SSE2-NEXT:    packuswb %xmm0, %xmm0
; SSE2-NEXT:    movd %xmm0, %eax
; SSE2-NEXT:    movw %ax, (%rsi)
; SSE2-NEXT:    retq
;
; SSE42-LABEL: shuffle_v16i8_to_v2i8_7:
; SSE42:       # %bb.0:
; SSE42-NEXT:    movdqa (%rdi), %xmm0
; SSE42-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; SSE42-NEXT:    pextrw $0, %xmm0, (%rsi)
; SSE42-NEXT:    retq
;
; AVX-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX:       # %bb.0:
; AVX-NEXT:    vmovdqa (%rdi), %xmm0
; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX-NEXT:    retq
;
; AVX512-LABEL: shuffle_v16i8_to_v2i8_7:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
; AVX512-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[7,15,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX512-NEXT:    vpextrw $0, %xmm0, (%rsi)
; AVX512-NEXT:    retq
  %vec = load <16 x i8>, ptr %L
  %strided.vec = shufflevector <16 x i8> %vec, <16 x i8> undef, <2 x i32> <i32 7, i32 15>
  store <2 x i8> %strided.vec, ptr %S
  ret void
}