; xref: /llvm-project/llvm/test/CodeGen/X86/vector-shuffle-variable-128.ll (revision d96529af3c362c53ef2e8c883a9e571fb3626927)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX

;
; Unary shuffle indices from registers
;

; Rebuild a <2 x double> from two runtime-selected lanes of %x (i64 indices).
; The checked lowering spills %x to the stack and reloads the lanes via
; indexed addressing after masking each index with "andl $1".
define <2 x double> @var_shuffle_v2f64_v2f64_xx_i64(<2 x double> %x, i64 %i0, i64 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    andl $1, %esi
; SSE-NEXT:    andl $1, %edi
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; SSE-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2f64_v2f64_xx_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    andl $1, %esi
; AVX-NEXT:    andl $1, %edi
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1]
; AVX-NEXT:    retq
  %x0 = extractelement <2 x double> %x, i64 %i0
  %x1 = extractelement <2 x double> %x, i64 %i1
  %r0 = insertelement <2 x double> undef, double %x0, i32 0
  %r1 = insertelement <2 x double>   %r0, double %x1, i32 1
  ret <2 x double> %r1
}
; Rebuild a <2 x i64> from two runtime-selected lanes of %x (i32 indices).
; The 32-bit indices are implicitly zero-extended (see the "kill" notes) and
; masked with "andl $1" before indexing the stack spill of %x.
define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1) nounwind {
; SSE-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
; SSE:       # %bb.0:
; SSE-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE-NEXT:    andl $1, %edi
; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE-NEXT:    andl $1, %esi
; SSE-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
; SSE-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
; SSE-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2i64_v2i64_xx_i64:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    andl $1, %edi
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    andl $1, %esi
; AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX-NEXT:    retq
  %x0 = extractelement <2 x i64> %x, i32 %i0
  %x1 = extractelement <2 x i64> %x, i32 %i1
  %r0 = insertelement <2 x i64> undef, i64 %x0, i32 0
  %r1 = insertelement <2 x i64>   %r0, i64 %x1, i32 1
  ret <2 x i64> %r1
}
; Rebuild a <4 x float> from four runtime-selected lanes of %x. Indices are
; masked with "andl $3"; pre-SSE4.1 targets assemble the result with
; unpcklps/movlhps, while SSE4.1/AVX use insertps from the stack spill.
define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE2-NEXT:    andl $3, %edi
; SSE2-NEXT:    andl $3, %esi
; SSE2-NEXT:    andl $3, %edx
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    andl $3, %ecx
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
; SSSE3-NEXT:    andl $3, %edi
; SSSE3-NEXT:    andl $3, %esi
; SSSE3-NEXT:    andl $3, %edx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    andl $3, %ecx
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE41-NEXT:    andl $3, %edi
; SSE41-NEXT:    andl $3, %esi
; SSE41-NEXT:    andl $3, %edx
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    andl $3, %ecx
; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_xxxx_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    andl $3, %edi
; AVX-NEXT:    andl $3, %esi
; AVX-NEXT:    andl $3, %edx
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    andl $3, %ecx
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; AVX-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 %i0
  %x1 = extractelement <4 x float> %x, i32 %i1
  %x2 = extractelement <4 x float> %x, i32 %i2
  %x3 = extractelement <4 x float> %x, i32 %i3
  %r0 = insertelement <4 x float> undef, float %x0, i32 0
  %r1 = insertelement <4 x float>   %r0, float %x1, i32 1
  %r2 = insertelement <4 x float>   %r1, float %x2, i32 2
  %r3 = insertelement <4 x float>   %r2, float %x3, i32 3
  ret <4 x float> %r3
}
; Rebuild a <4 x i32> from four runtime-selected lanes of %x. Indices are
; masked with "andl $3"; SSE2/SSSE3 use the float unpack sequence, while
; SSE4.1/AVX build the result with pinsrd from the stack spill.
define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE2-NEXT:    andl $3, %edi
; SSE2-NEXT:    andl $3, %esi
; SSE2-NEXT:    andl $3, %edx
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    andl $3, %ecx
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
; SSSE3-NEXT:    andl $3, %edi
; SSSE3-NEXT:    andl $3, %esi
; SSSE3-NEXT:    andl $3, %edx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    andl $3, %ecx
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE41-NEXT:    andl $3, %edi
; SSE41-NEXT:    andl $3, %esi
; SSE41-NEXT:    andl $3, %edx
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    andl $3, %ecx
; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT:    pinsrd $1, -24(%rsp,%rsi,4), %xmm0
; SSE41-NEXT:    pinsrd $2, -24(%rsp,%rdx,4), %xmm0
; SSE41-NEXT:    pinsrd $3, -24(%rsp,%rcx,4), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4i32_v4i32_xxxx_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    andl $3, %edi
; AVX-NEXT:    andl $3, %esi
; AVX-NEXT:    andl $3, %edx
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    andl $3, %ecx
; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vpinsrd $1, -24(%rsp,%rsi,4), %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
; AVX-NEXT:    vpinsrd $3, -24(%rsp,%rcx,4), %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0 = extractelement <4 x i32> %x, i32 %i0
  %x1 = extractelement <4 x i32> %x, i32 %i1
  %x2 = extractelement <4 x i32> %x, i32 %i2
  %x3 = extractelement <4 x i32> %x, i32 %i3
  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
  %r1 = insertelement <4 x i32>   %r0, i32 %x1, i32 1
  %r2 = insertelement <4 x i32>   %r1, i32 %x2, i32 2
  %r3 = insertelement <4 x i32>   %r2, i32 %x3, i32 3
  ret <4 x i32> %r3
}
; Rebuild an <8 x i16> from eight runtime-selected lanes of %x. The first six
; indices arrive in registers; %i6/%i7 are loaded from the stack. All indices
; are masked with "andl $7". SSE2/SSSE3 merge via punpck* trees; SSE4.1/AVX
; use pinsrw from the stack spill of %x.
define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
; SSE2-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    # kill: def $r9d killed $r9d def $r9
; SSE2-NEXT:    # kill: def $r8d killed $r8d def $r8
; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE2-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; SSE2-NEXT:    andl $7, %eax
; SSE2-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; SSE2-NEXT:    andl $7, %r10d
; SSE2-NEXT:    andl $7, %edi
; SSE2-NEXT:    andl $7, %esi
; SSE2-NEXT:    andl $7, %edx
; SSE2-NEXT:    andl $7, %ecx
; SSE2-NEXT:    andl $7, %r8d
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    andl $7, %r9d
; SSE2-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
; SSE2-NEXT:    movd %r10d, %xmm0
; SSE2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movzwl -24(%rsp,%r9,2), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movzwl -24(%rsp,%r8,2), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movzwl -24(%rsp,%rdx,2), %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    # kill: def $r9d killed $r9d def $r9
; SSSE3-NEXT:    # kill: def $r8d killed $r8d def $r8
; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
; SSSE3-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; SSSE3-NEXT:    andl $7, %eax
; SSSE3-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; SSSE3-NEXT:    andl $7, %r10d
; SSSE3-NEXT:    andl $7, %edi
; SSSE3-NEXT:    andl $7, %esi
; SSSE3-NEXT:    andl $7, %edx
; SSSE3-NEXT:    andl $7, %ecx
; SSSE3-NEXT:    andl $7, %r8d
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    andl $7, %r9d
; SSSE3-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
; SSSE3-NEXT:    movd %r10d, %xmm0
; SSSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    movzwl -24(%rsp,%r8,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm3
; SSSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    # kill: def $r9d killed $r9d def $r9
; SSE41-NEXT:    # kill: def $r8d killed $r8d def $r8
; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE41-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; SSE41-NEXT:    andl $7, %eax
; SSE41-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; SSE41-NEXT:    andl $7, %r10d
; SSE41-NEXT:    andl $7, %edi
; SSE41-NEXT:    andl $7, %esi
; SSE41-NEXT:    andl $7, %edx
; SSE41-NEXT:    andl $7, %ecx
; SSE41-NEXT:    andl $7, %r8d
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    andl $7, %r9d
; SSE41-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
; SSE41-NEXT:    movd %edi, %xmm0
; SSE41-NEXT:    pinsrw $1, -24(%rsp,%rsi,2), %xmm0
; SSE41-NEXT:    pinsrw $2, -24(%rsp,%rdx,2), %xmm0
; SSE41-NEXT:    pinsrw $3, -24(%rsp,%rcx,2), %xmm0
; SSE41-NEXT:    pinsrw $4, -24(%rsp,%r8,2), %xmm0
; SSE41-NEXT:    pinsrw $5, -24(%rsp,%r9,2), %xmm0
; SSE41-NEXT:    pinsrw $6, -24(%rsp,%r10,2), %xmm0
; SSE41-NEXT:    pinsrw $7, -24(%rsp,%rax,2), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $r9d killed $r9d def $r9
; AVX-NEXT:    # kill: def $r8d killed $r8d def $r8
; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
; AVX-NEXT:    andl $7, %eax
; AVX-NEXT:    movzwl {{[0-9]+}}(%rsp), %r10d
; AVX-NEXT:    andl $7, %r10d
; AVX-NEXT:    andl $7, %edi
; AVX-NEXT:    andl $7, %esi
; AVX-NEXT:    andl $7, %edx
; AVX-NEXT:    andl $7, %ecx
; AVX-NEXT:    andl $7, %r8d
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    andl $7, %r9d
; AVX-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
; AVX-NEXT:    vmovd %edi, %xmm0
; AVX-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $2, -24(%rsp,%rdx,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $6, -24(%rsp,%r10,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
; AVX-NEXT:    retq
  %x0 = extractelement <8 x i16> %x, i16 %i0
  %x1 = extractelement <8 x i16> %x, i16 %i1
  %x2 = extractelement <8 x i16> %x, i16 %i2
  %x3 = extractelement <8 x i16> %x, i16 %i3
  %x4 = extractelement <8 x i16> %x, i16 %i4
  %x5 = extractelement <8 x i16> %x, i16 %i5
  %x6 = extractelement <8 x i16> %x, i16 %i6
  %x7 = extractelement <8 x i16> %x, i16 %i7
  %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
  %r1 = insertelement <8 x i16>   %r0, i16 %x1, i32 1
  %r2 = insertelement <8 x i16>   %r1, i16 %x2, i32 2
  %r3 = insertelement <8 x i16>   %r2, i16 %x3, i32 3
  %r4 = insertelement <8 x i16>   %r3, i16 %x4, i32 4
  %r5 = insertelement <8 x i16>   %r4, i16 %x5, i32 5
  %r6 = insertelement <8 x i16>   %r5, i16 %x6, i32 6
  %r7 = insertelement <8 x i16>   %r6, i16 %x7, i32 7
  ret <8 x i16> %r7
}
405define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %i0, i8 %i1, i8 %i2, i8 %i3, i8 %i4, i8 %i5, i8 %i6, i8 %i7, i8 %i8, i8 %i9, i8 %i10, i8 %i11, i8 %i12, i8 %i13, i8 %i14, i8 %i15) nounwind {
406; SSE2-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
407; SSE2:       # %bb.0:
408; SSE2-NEXT:    # kill: def $r9d killed $r9d def $r9
409; SSE2-NEXT:    # kill: def $r8d killed $r8d def $r8
410; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
411; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
412; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
413; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
414; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
415; SSE2-NEXT:    andl $15, %eax
416; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
417; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
418; SSE2-NEXT:    movd %eax, %xmm1
419; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
420; SSE2-NEXT:    andl $15, %eax
421; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
422; SSE2-NEXT:    movd %eax, %xmm2
423; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
424; SSE2-NEXT:    andl $15, %eax
425; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
426; SSE2-NEXT:    movd %eax, %xmm4
427; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
428; SSE2-NEXT:    andl $15, %eax
429; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
430; SSE2-NEXT:    movd %eax, %xmm3
431; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
432; SSE2-NEXT:    andl $15, %eax
433; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
434; SSE2-NEXT:    movd %eax, %xmm6
435; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
436; SSE2-NEXT:    andl $15, %eax
437; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
438; SSE2-NEXT:    movd %eax, %xmm7
439; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
440; SSE2-NEXT:    andl $15, %eax
441; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
442; SSE2-NEXT:    movd %eax, %xmm8
443; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
444; SSE2-NEXT:    andl $15, %eax
445; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
446; SSE2-NEXT:    movd %eax, %xmm5
447; SSE2-NEXT:    andl $15, %ecx
448; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %eax
449; SSE2-NEXT:    movd %eax, %xmm9
450; SSE2-NEXT:    andl $15, %edx
451; SSE2-NEXT:    movzbl -24(%rsp,%rdx), %eax
452; SSE2-NEXT:    movd %eax, %xmm10
453; SSE2-NEXT:    andl $15, %esi
454; SSE2-NEXT:    movzbl -24(%rsp,%rsi), %eax
455; SSE2-NEXT:    movd %eax, %xmm11
456; SSE2-NEXT:    andl $15, %edi
457; SSE2-NEXT:    movzbl -24(%rsp,%rdi), %eax
458; SSE2-NEXT:    movd %eax, %xmm0
459; SSE2-NEXT:    andl $15, %r9d
460; SSE2-NEXT:    movzbl -24(%rsp,%r9), %eax
461; SSE2-NEXT:    movd %eax, %xmm12
462; SSE2-NEXT:    andl $15, %r8d
463; SSE2-NEXT:    movzbl -24(%rsp,%r8), %eax
464; SSE2-NEXT:    movd %eax, %xmm13
465; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
466; SSE2-NEXT:    andl $15, %eax
467; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
468; SSE2-NEXT:    movd %eax, %xmm14
469; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
470; SSE2-NEXT:    andl $15, %eax
471; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
472; SSE2-NEXT:    movd %eax, %xmm15
473; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
474; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
475; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
476; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
477; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
478; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
479; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
480; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
481; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
482; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
483; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
484; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
485; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
486; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
487; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
488; SSE2-NEXT:    retq
489;
490; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
491; SSSE3:       # %bb.0:
492; SSSE3-NEXT:    # kill: def $r9d killed $r9d def $r9
493; SSSE3-NEXT:    # kill: def $r8d killed $r8d def $r8
494; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
495; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
496; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
497; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
498; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
499; SSSE3-NEXT:    andl $15, %eax
500; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
501; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
502; SSSE3-NEXT:    movd %eax, %xmm1
503; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
504; SSSE3-NEXT:    andl $15, %eax
505; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
506; SSSE3-NEXT:    movd %eax, %xmm2
507; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
508; SSSE3-NEXT:    andl $15, %eax
509; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
510; SSSE3-NEXT:    movd %eax, %xmm4
511; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
512; SSSE3-NEXT:    andl $15, %eax
513; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
514; SSSE3-NEXT:    movd %eax, %xmm3
515; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
516; SSSE3-NEXT:    andl $15, %eax
517; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
518; SSSE3-NEXT:    movd %eax, %xmm6
519; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
520; SSSE3-NEXT:    andl $15, %eax
521; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
522; SSSE3-NEXT:    movd %eax, %xmm7
523; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
524; SSSE3-NEXT:    andl $15, %eax
525; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
526; SSSE3-NEXT:    movd %eax, %xmm8
527; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
528; SSSE3-NEXT:    andl $15, %eax
529; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
530; SSSE3-NEXT:    movd %eax, %xmm5
531; SSSE3-NEXT:    andl $15, %ecx
532; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %eax
533; SSSE3-NEXT:    movd %eax, %xmm9
534; SSSE3-NEXT:    andl $15, %edx
535; SSSE3-NEXT:    movzbl -24(%rsp,%rdx), %eax
536; SSSE3-NEXT:    movd %eax, %xmm10
537; SSSE3-NEXT:    andl $15, %esi
538; SSSE3-NEXT:    movzbl -24(%rsp,%rsi), %eax
539; SSSE3-NEXT:    movd %eax, %xmm11
540; SSSE3-NEXT:    andl $15, %edi
541; SSSE3-NEXT:    movzbl -24(%rsp,%rdi), %eax
542; SSSE3-NEXT:    movd %eax, %xmm0
543; SSSE3-NEXT:    andl $15, %r9d
544; SSSE3-NEXT:    movzbl -24(%rsp,%r9), %eax
545; SSSE3-NEXT:    movd %eax, %xmm12
546; SSSE3-NEXT:    andl $15, %r8d
547; SSSE3-NEXT:    movzbl -24(%rsp,%r8), %eax
548; SSSE3-NEXT:    movd %eax, %xmm13
549; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
550; SSSE3-NEXT:    andl $15, %eax
551; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
552; SSSE3-NEXT:    movd %eax, %xmm14
553; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
554; SSSE3-NEXT:    andl $15, %eax
555; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
556; SSSE3-NEXT:    movd %eax, %xmm15
557; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
558; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
559; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
560; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
561; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
562; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
563; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
564; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
565; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3],xmm0[4],xmm11[4],xmm0[5],xmm11[5],xmm0[6],xmm11[6],xmm0[7],xmm11[7]
566; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3]
567; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
568; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
569; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3]
570; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1]
571; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
572; SSSE3-NEXT:    retq
573;
574; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
575; SSE41:       # %bb.0:
576; SSE41-NEXT:    # kill: def $r9d killed $r9d def $r9
577; SSE41-NEXT:    # kill: def $r8d killed $r8d def $r8
578; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
579; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
580; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
581; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
582; SSE41-NEXT:    andl $15, %edi
583; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
584; SSE41-NEXT:    movzbl -24(%rsp,%rdi), %eax
585; SSE41-NEXT:    movd %eax, %xmm0
586; SSE41-NEXT:    andl $15, %esi
587; SSE41-NEXT:    pinsrb $1, -24(%rsp,%rsi), %xmm0
588; SSE41-NEXT:    andl $15, %edx
589; SSE41-NEXT:    pinsrb $2, -24(%rsp,%rdx), %xmm0
590; SSE41-NEXT:    andl $15, %ecx
591; SSE41-NEXT:    pinsrb $3, -24(%rsp,%rcx), %xmm0
592; SSE41-NEXT:    andl $15, %r8d
593; SSE41-NEXT:    pinsrb $4, -24(%rsp,%r8), %xmm0
594; SSE41-NEXT:    andl $15, %r9d
595; SSE41-NEXT:    pinsrb $5, -24(%rsp,%r9), %xmm0
596; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
597; SSE41-NEXT:    andl $15, %eax
598; SSE41-NEXT:    pinsrb $6, -24(%rsp,%rax), %xmm0
599; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
600; SSE41-NEXT:    andl $15, %eax
601; SSE41-NEXT:    pinsrb $7, -24(%rsp,%rax), %xmm0
602; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
603; SSE41-NEXT:    andl $15, %eax
604; SSE41-NEXT:    pinsrb $8, -24(%rsp,%rax), %xmm0
605; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
606; SSE41-NEXT:    andl $15, %eax
607; SSE41-NEXT:    pinsrb $9, -24(%rsp,%rax), %xmm0
608; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
609; SSE41-NEXT:    andl $15, %eax
610; SSE41-NEXT:    pinsrb $10, -24(%rsp,%rax), %xmm0
611; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
612; SSE41-NEXT:    andl $15, %eax
613; SSE41-NEXT:    pinsrb $11, -24(%rsp,%rax), %xmm0
614; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
615; SSE41-NEXT:    andl $15, %eax
616; SSE41-NEXT:    pinsrb $12, -24(%rsp,%rax), %xmm0
617; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
618; SSE41-NEXT:    andl $15, %eax
619; SSE41-NEXT:    pinsrb $13, -24(%rsp,%rax), %xmm0
620; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
621; SSE41-NEXT:    andl $15, %eax
622; SSE41-NEXT:    pinsrb $14, -24(%rsp,%rax), %xmm0
623; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
624; SSE41-NEXT:    andl $15, %eax
625; SSE41-NEXT:    pinsrb $15, -24(%rsp,%rax), %xmm0
626; SSE41-NEXT:    retq
627;
628; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
629; AVX:       # %bb.0:
630; AVX-NEXT:    # kill: def $r9d killed $r9d def $r9
631; AVX-NEXT:    # kill: def $r8d killed $r8d def $r8
632; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
633; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
634; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
635; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
636; AVX-NEXT:    andl $15, %edi
637; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
638; AVX-NEXT:    movzbl -24(%rsp,%rdi), %eax
639; AVX-NEXT:    vmovd %eax, %xmm0
640; AVX-NEXT:    andl $15, %esi
641; AVX-NEXT:    vpinsrb $1, -24(%rsp,%rsi), %xmm0, %xmm0
642; AVX-NEXT:    andl $15, %edx
643; AVX-NEXT:    vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0
644; AVX-NEXT:    andl $15, %ecx
645; AVX-NEXT:    vpinsrb $3, -24(%rsp,%rcx), %xmm0, %xmm0
646; AVX-NEXT:    andl $15, %r8d
647; AVX-NEXT:    vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0
648; AVX-NEXT:    andl $15, %r9d
649; AVX-NEXT:    vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0
650; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
651; AVX-NEXT:    andl $15, %eax
652; AVX-NEXT:    vpinsrb $6, -24(%rsp,%rax), %xmm0, %xmm0
653; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
654; AVX-NEXT:    andl $15, %eax
655; AVX-NEXT:    vpinsrb $7, -24(%rsp,%rax), %xmm0, %xmm0
656; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
657; AVX-NEXT:    andl $15, %eax
658; AVX-NEXT:    vpinsrb $8, -24(%rsp,%rax), %xmm0, %xmm0
659; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
660; AVX-NEXT:    andl $15, %eax
661; AVX-NEXT:    vpinsrb $9, -24(%rsp,%rax), %xmm0, %xmm0
662; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
663; AVX-NEXT:    andl $15, %eax
664; AVX-NEXT:    vpinsrb $10, -24(%rsp,%rax), %xmm0, %xmm0
665; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
666; AVX-NEXT:    andl $15, %eax
667; AVX-NEXT:    vpinsrb $11, -24(%rsp,%rax), %xmm0, %xmm0
668; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
669; AVX-NEXT:    andl $15, %eax
670; AVX-NEXT:    vpinsrb $12, -24(%rsp,%rax), %xmm0, %xmm0
671; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
672; AVX-NEXT:    andl $15, %eax
673; AVX-NEXT:    vpinsrb $13, -24(%rsp,%rax), %xmm0, %xmm0
674; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
675; AVX-NEXT:    andl $15, %eax
676; AVX-NEXT:    vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
677; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
678; AVX-NEXT:    andl $15, %eax
679; AVX-NEXT:    vpinsrb $15, -24(%rsp,%rax), %xmm0, %xmm0
680; AVX-NEXT:    retq
681  %x0  = extractelement <16 x i8> %x, i8 %i0
682  %x1  = extractelement <16 x i8> %x, i8 %i1
683  %x2  = extractelement <16 x i8> %x, i8 %i2
684  %x3  = extractelement <16 x i8> %x, i8 %i3
685  %x4  = extractelement <16 x i8> %x, i8 %i4
686  %x5  = extractelement <16 x i8> %x, i8 %i5
687  %x6  = extractelement <16 x i8> %x, i8 %i6
688  %x7  = extractelement <16 x i8> %x, i8 %i7
689  %x8  = extractelement <16 x i8> %x, i8 %i8
690  %x9  = extractelement <16 x i8> %x, i8 %i9
691  %x10 = extractelement <16 x i8> %x, i8 %i10
692  %x11 = extractelement <16 x i8> %x, i8 %i11
693  %x12 = extractelement <16 x i8> %x, i8 %i12
694  %x13 = extractelement <16 x i8> %x, i8 %i13
695  %x14 = extractelement <16 x i8> %x, i8 %i14
696  %x15 = extractelement <16 x i8> %x, i8 %i15
697  %r0  = insertelement <16 x i8> undef, i8 %x0 , i32 0
698  %r1  = insertelement <16 x i8>  %r0 , i8 %x1 , i32 1
699  %r2  = insertelement <16 x i8>  %r1 , i8 %x2 , i32 2
700  %r3  = insertelement <16 x i8>  %r2 , i8 %x3 , i32 3
701  %r4  = insertelement <16 x i8>  %r3 , i8 %x4 , i32 4
702  %r5  = insertelement <16 x i8>  %r4 , i8 %x5 , i32 5
703  %r6  = insertelement <16 x i8>  %r5 , i8 %x6 , i32 6
704  %r7  = insertelement <16 x i8>  %r6 , i8 %x7 , i32 7
705  %r8  = insertelement <16 x i8>  %r7 , i8 %x8 , i32 8
706  %r9  = insertelement <16 x i8>  %r8 , i8 %x9 , i32 9
707  %r10 = insertelement <16 x i8>  %r9 , i8 %x10, i32 10
708  %r11 = insertelement <16 x i8>  %r10, i8 %x11, i32 11
709  %r12 = insertelement <16 x i8>  %r11, i8 %x12, i32 12
710  %r13 = insertelement <16 x i8>  %r12, i8 %x13, i32 13
711  %r14 = insertelement <16 x i8>  %r13, i8 %x14, i32 14
712  %r15 = insertelement <16 x i8>  %r14, i8 %x15, i32 15
713  ret <16 x i8> %r15
714}
715
716;
717; Unary shuffle indices from memory
718;
719
720define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, ptr %i) nounwind {
721; SSE2-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
722; SSE2:       # %bb.0:
723; SSE2-NEXT:    movl (%rdi), %eax
724; SSE2-NEXT:    movl 4(%rdi), %ecx
725; SSE2-NEXT:    andl $3, %eax
726; SSE2-NEXT:    andl $3, %ecx
727; SSE2-NEXT:    movl 8(%rdi), %edx
728; SSE2-NEXT:    andl $3, %edx
729; SSE2-NEXT:    movl 12(%rdi), %esi
730; SSE2-NEXT:    andl $3, %esi
731; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
732; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
733; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
734; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
735; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
736; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
737; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
738; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
739; SSE2-NEXT:    retq
740;
741; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
742; SSSE3:       # %bb.0:
743; SSSE3-NEXT:    movl (%rdi), %eax
744; SSSE3-NEXT:    movl 4(%rdi), %ecx
745; SSSE3-NEXT:    andl $3, %eax
746; SSSE3-NEXT:    andl $3, %ecx
747; SSSE3-NEXT:    movl 8(%rdi), %edx
748; SSSE3-NEXT:    andl $3, %edx
749; SSSE3-NEXT:    movl 12(%rdi), %esi
750; SSSE3-NEXT:    andl $3, %esi
751; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
752; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
753; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
754; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
755; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
756; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
757; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
758; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
759; SSSE3-NEXT:    retq
760;
761; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
762; SSE41:       # %bb.0:
763; SSE41-NEXT:    movl (%rdi), %eax
764; SSE41-NEXT:    movl 4(%rdi), %ecx
765; SSE41-NEXT:    andl $3, %eax
766; SSE41-NEXT:    andl $3, %ecx
767; SSE41-NEXT:    movl 8(%rdi), %edx
768; SSE41-NEXT:    andl $3, %edx
769; SSE41-NEXT:    movl 12(%rdi), %esi
770; SSE41-NEXT:    andl $3, %esi
771; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
772; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
773; SSE41-NEXT:    pinsrd $1, -24(%rsp,%rcx,4), %xmm0
774; SSE41-NEXT:    pinsrd $2, -24(%rsp,%rdx,4), %xmm0
775; SSE41-NEXT:    pinsrd $3, -24(%rsp,%rsi,4), %xmm0
776; SSE41-NEXT:    retq
777;
778; AVX-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
779; AVX:       # %bb.0:
780; AVX-NEXT:    movl (%rdi), %eax
781; AVX-NEXT:    movl 4(%rdi), %ecx
782; AVX-NEXT:    andl $3, %eax
783; AVX-NEXT:    andl $3, %ecx
784; AVX-NEXT:    movl 8(%rdi), %edx
785; AVX-NEXT:    andl $3, %edx
786; AVX-NEXT:    movl 12(%rdi), %esi
787; AVX-NEXT:    andl $3, %esi
788; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
789; AVX-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
790; AVX-NEXT:    vpinsrd $1, -24(%rsp,%rcx,4), %xmm0, %xmm0
791; AVX-NEXT:    vpinsrd $2, -24(%rsp,%rdx,4), %xmm0, %xmm0
792; AVX-NEXT:    vpinsrd $3, -24(%rsp,%rsi,4), %xmm0, %xmm0
793; AVX-NEXT:    retq
794  %p1  = getelementptr inbounds i32, ptr %i, i64 1
795  %p2  = getelementptr inbounds i32, ptr %i, i64 2
796  %p3  = getelementptr inbounds i32, ptr %i, i64 3
797  %i0  = load i32, ptr %i, align 4
798  %i1  = load i32, ptr %p1, align 4
799  %i2  = load i32, ptr %p2, align 4
800  %i3  = load i32, ptr %p3, align 4
801  %x0 = extractelement <4 x i32> %x, i32 %i0
802  %x1 = extractelement <4 x i32> %x, i32 %i1
803  %x2 = extractelement <4 x i32> %x, i32 %i2
804  %x3 = extractelement <4 x i32> %x, i32 %i3
805  %r0 = insertelement <4 x i32> undef, i32 %x0, i32 0
806  %r1 = insertelement <4 x i32>   %r0, i32 %x1, i32 1
807  %r2 = insertelement <4 x i32>   %r1, i32 %x2, i32 2
808  %r3 = insertelement <4 x i32>   %r2, i32 %x3, i32 3
809  ret <4 x i32> %r3
810}
811
; Unary shuffle with sixteen i8 indices loaded from memory: result lane k is
; %x[*(%i + k)]. Each index is masked with "andl $15" before indexing the
; stack spill of %x. Holding all sixteen indices live at once exhausts the
; volatile GPRs, so every schedule below pushes the callee-saved registers
; (rbx, rbp, r12-r15) and still spills one index to the stack. SSE2/SSSE3
; assemble the result from sixteen movd's via a punpcklbw/punpcklwd/
; punpckldq/punpcklqdq reduction tree; SSE41/AVX use movd + fifteen pinsrb's.
; NOTE(review): the i8 loads below claim "align 4" even at odd byte offsets
; (%p1, %p3, ...); harmless for single-byte loads, but the claims cannot all
; hold simultaneously -- presumably copy-pasted from the i32 test above.
; CHECK lines are autogenerated by utils/update_llc_test_checks.py.
define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, ptr %i) nounwind {
; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pushq %rbp
; SSE2-NEXT:    pushq %r15
; SSE2-NEXT:    pushq %r14
; SSE2-NEXT:    pushq %r13
; SSE2-NEXT:    pushq %r12
; SSE2-NEXT:    pushq %rbx
; SSE2-NEXT:    movzbl (%rdi), %eax
; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE2-NEXT:    movzbl 1(%rdi), %ecx
; SSE2-NEXT:    movzbl 2(%rdi), %edx
; SSE2-NEXT:    movzbl 3(%rdi), %esi
; SSE2-NEXT:    movzbl 4(%rdi), %r8d
; SSE2-NEXT:    movzbl 5(%rdi), %r9d
; SSE2-NEXT:    movzbl 6(%rdi), %r10d
; SSE2-NEXT:    movzbl 7(%rdi), %r11d
; SSE2-NEXT:    movzbl 8(%rdi), %ebx
; SSE2-NEXT:    movzbl 9(%rdi), %r14d
; SSE2-NEXT:    movzbl 10(%rdi), %r15d
; SSE2-NEXT:    movzbl 11(%rdi), %r12d
; SSE2-NEXT:    movzbl 12(%rdi), %r13d
; SSE2-NEXT:    movzbl 13(%rdi), %ebp
; SSE2-NEXT:    movzbl 14(%rdi), %eax
; SSE2-NEXT:    movzbl 15(%rdi), %edi
; SSE2-NEXT:    andl $15, %edi
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    movzbl -24(%rsp,%rdi), %edi
; SSE2-NEXT:    movd %edi, %xmm1
; SSE2-NEXT:    andl $15, %eax
; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    andl $15, %ebp
; SSE2-NEXT:    movzbl -24(%rsp,%rbp), %eax
; SSE2-NEXT:    movd %eax, %xmm4
; SSE2-NEXT:    andl $15, %r13d
; SSE2-NEXT:    movzbl -24(%rsp,%r13), %eax
; SSE2-NEXT:    movd %eax, %xmm3
; SSE2-NEXT:    andl $15, %r12d
; SSE2-NEXT:    movzbl -24(%rsp,%r12), %eax
; SSE2-NEXT:    movd %eax, %xmm6
; SSE2-NEXT:    andl $15, %r15d
; SSE2-NEXT:    movzbl -24(%rsp,%r15), %eax
; SSE2-NEXT:    movd %eax, %xmm7
; SSE2-NEXT:    andl $15, %r14d
; SSE2-NEXT:    movzbl -24(%rsp,%r14), %eax
; SSE2-NEXT:    movd %eax, %xmm8
; SSE2-NEXT:    andl $15, %ebx
; SSE2-NEXT:    movzbl -24(%rsp,%rbx), %eax
; SSE2-NEXT:    movd %eax, %xmm5
; SSE2-NEXT:    andl $15, %r11d
; SSE2-NEXT:    movzbl -24(%rsp,%r11), %eax
; SSE2-NEXT:    movd %eax, %xmm9
; SSE2-NEXT:    andl $15, %r10d
; SSE2-NEXT:    movzbl -24(%rsp,%r10), %eax
; SSE2-NEXT:    movd %eax, %xmm10
; SSE2-NEXT:    andl $15, %r9d
; SSE2-NEXT:    movzbl -24(%rsp,%r9), %eax
; SSE2-NEXT:    movd %eax, %xmm12
; SSE2-NEXT:    andl $15, %r8d
; SSE2-NEXT:    movzbl -24(%rsp,%r8), %eax
; SSE2-NEXT:    movd %eax, %xmm11
; SSE2-NEXT:    andl $15, %esi
; SSE2-NEXT:    movzbl -24(%rsp,%rsi), %eax
; SSE2-NEXT:    movd %eax, %xmm13
; SSE2-NEXT:    andl $15, %edx
; SSE2-NEXT:    movzbl -24(%rsp,%rdx), %eax
; SSE2-NEXT:    movd %eax, %xmm14
; SSE2-NEXT:    andl $15, %ecx
; SSE2-NEXT:    movzbl -24(%rsp,%rcx), %eax
; SSE2-NEXT:    movd %eax, %xmm15
; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSE2-NEXT:    andl $15, %eax
; SSE2-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE2-NEXT:    popq %rbx
; SSE2-NEXT:    popq %r12
; SSE2-NEXT:    popq %r13
; SSE2-NEXT:    popq %r14
; SSE2-NEXT:    popq %r15
; SSE2-NEXT:    popq %rbp
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pushq %rbp
; SSSE3-NEXT:    pushq %r15
; SSSE3-NEXT:    pushq %r14
; SSSE3-NEXT:    pushq %r13
; SSSE3-NEXT:    pushq %r12
; SSSE3-NEXT:    pushq %rbx
; SSSE3-NEXT:    movzbl (%rdi), %eax
; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSSE3-NEXT:    movzbl 1(%rdi), %ecx
; SSSE3-NEXT:    movzbl 2(%rdi), %edx
; SSSE3-NEXT:    movzbl 3(%rdi), %esi
; SSSE3-NEXT:    movzbl 4(%rdi), %r8d
; SSSE3-NEXT:    movzbl 5(%rdi), %r9d
; SSSE3-NEXT:    movzbl 6(%rdi), %r10d
; SSSE3-NEXT:    movzbl 7(%rdi), %r11d
; SSSE3-NEXT:    movzbl 8(%rdi), %ebx
; SSSE3-NEXT:    movzbl 9(%rdi), %r14d
; SSSE3-NEXT:    movzbl 10(%rdi), %r15d
; SSSE3-NEXT:    movzbl 11(%rdi), %r12d
; SSSE3-NEXT:    movzbl 12(%rdi), %r13d
; SSSE3-NEXT:    movzbl 13(%rdi), %ebp
; SSSE3-NEXT:    movzbl 14(%rdi), %eax
; SSSE3-NEXT:    movzbl 15(%rdi), %edi
; SSSE3-NEXT:    andl $15, %edi
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    movzbl -24(%rsp,%rdi), %edi
; SSSE3-NEXT:    movd %edi, %xmm1
; SSSE3-NEXT:    andl $15, %eax
; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    andl $15, %ebp
; SSSE3-NEXT:    movzbl -24(%rsp,%rbp), %eax
; SSSE3-NEXT:    movd %eax, %xmm4
; SSSE3-NEXT:    andl $15, %r13d
; SSSE3-NEXT:    movzbl -24(%rsp,%r13), %eax
; SSSE3-NEXT:    movd %eax, %xmm3
; SSSE3-NEXT:    andl $15, %r12d
; SSSE3-NEXT:    movzbl -24(%rsp,%r12), %eax
; SSSE3-NEXT:    movd %eax, %xmm6
; SSSE3-NEXT:    andl $15, %r15d
; SSSE3-NEXT:    movzbl -24(%rsp,%r15), %eax
; SSSE3-NEXT:    movd %eax, %xmm7
; SSSE3-NEXT:    andl $15, %r14d
; SSSE3-NEXT:    movzbl -24(%rsp,%r14), %eax
; SSSE3-NEXT:    movd %eax, %xmm8
; SSSE3-NEXT:    andl $15, %ebx
; SSSE3-NEXT:    movzbl -24(%rsp,%rbx), %eax
; SSSE3-NEXT:    movd %eax, %xmm5
; SSSE3-NEXT:    andl $15, %r11d
; SSSE3-NEXT:    movzbl -24(%rsp,%r11), %eax
; SSSE3-NEXT:    movd %eax, %xmm9
; SSSE3-NEXT:    andl $15, %r10d
; SSSE3-NEXT:    movzbl -24(%rsp,%r10), %eax
; SSSE3-NEXT:    movd %eax, %xmm10
; SSSE3-NEXT:    andl $15, %r9d
; SSSE3-NEXT:    movzbl -24(%rsp,%r9), %eax
; SSSE3-NEXT:    movd %eax, %xmm12
; SSSE3-NEXT:    andl $15, %r8d
; SSSE3-NEXT:    movzbl -24(%rsp,%r8), %eax
; SSSE3-NEXT:    movd %eax, %xmm11
; SSSE3-NEXT:    andl $15, %esi
; SSSE3-NEXT:    movzbl -24(%rsp,%rsi), %eax
; SSSE3-NEXT:    movd %eax, %xmm13
; SSSE3-NEXT:    andl $15, %edx
; SSSE3-NEXT:    movzbl -24(%rsp,%rdx), %eax
; SSSE3-NEXT:    movd %eax, %xmm14
; SSSE3-NEXT:    andl $15, %ecx
; SSSE3-NEXT:    movzbl -24(%rsp,%rcx), %eax
; SSSE3-NEXT:    movd %eax, %xmm15
; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
; SSSE3-NEXT:    andl $15, %eax
; SSSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSSE3-NEXT:    popq %rbx
; SSSE3-NEXT:    popq %r12
; SSSE3-NEXT:    popq %r13
; SSSE3-NEXT:    popq %r14
; SSSE3-NEXT:    popq %r15
; SSSE3-NEXT:    popq %rbp
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pushq %rbp
; SSE41-NEXT:    pushq %r15
; SSE41-NEXT:    pushq %r14
; SSE41-NEXT:    pushq %r13
; SSE41-NEXT:    pushq %r12
; SSE41-NEXT:    pushq %rbx
; SSE41-NEXT:    movzbl (%rdi), %ecx
; SSE41-NEXT:    andl $15, %ecx
; SSE41-NEXT:    movzbl 1(%rdi), %eax
; SSE41-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE41-NEXT:    movzbl 2(%rdi), %edx
; SSE41-NEXT:    movzbl 3(%rdi), %esi
; SSE41-NEXT:    movzbl 4(%rdi), %r8d
; SSE41-NEXT:    movzbl 5(%rdi), %r9d
; SSE41-NEXT:    movzbl 6(%rdi), %r10d
; SSE41-NEXT:    movzbl 7(%rdi), %r11d
; SSE41-NEXT:    movzbl 8(%rdi), %ebx
; SSE41-NEXT:    movzbl 9(%rdi), %r14d
; SSE41-NEXT:    movzbl 10(%rdi), %r15d
; SSE41-NEXT:    movzbl 11(%rdi), %r12d
; SSE41-NEXT:    movzbl 12(%rdi), %r13d
; SSE41-NEXT:    movzbl 13(%rdi), %ebp
; SSE41-NEXT:    movzbl 14(%rdi), %eax
; SSE41-NEXT:    movzbl 15(%rdi), %edi
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movzbl -24(%rsp,%rcx), %ecx
; SSE41-NEXT:    movd %ecx, %xmm0
; SSE41-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; SSE41-NEXT:    andl $15, %ecx
; SSE41-NEXT:    pinsrb $1, -24(%rsp,%rcx), %xmm0
; SSE41-NEXT:    andl $15, %edx
; SSE41-NEXT:    pinsrb $2, -24(%rsp,%rdx), %xmm0
; SSE41-NEXT:    andl $15, %esi
; SSE41-NEXT:    pinsrb $3, -24(%rsp,%rsi), %xmm0
; SSE41-NEXT:    andl $15, %r8d
; SSE41-NEXT:    pinsrb $4, -24(%rsp,%r8), %xmm0
; SSE41-NEXT:    andl $15, %r9d
; SSE41-NEXT:    pinsrb $5, -24(%rsp,%r9), %xmm0
; SSE41-NEXT:    andl $15, %r10d
; SSE41-NEXT:    pinsrb $6, -24(%rsp,%r10), %xmm0
; SSE41-NEXT:    andl $15, %r11d
; SSE41-NEXT:    pinsrb $7, -24(%rsp,%r11), %xmm0
; SSE41-NEXT:    andl $15, %ebx
; SSE41-NEXT:    pinsrb $8, -24(%rsp,%rbx), %xmm0
; SSE41-NEXT:    andl $15, %r14d
; SSE41-NEXT:    pinsrb $9, -24(%rsp,%r14), %xmm0
; SSE41-NEXT:    andl $15, %r15d
; SSE41-NEXT:    pinsrb $10, -24(%rsp,%r15), %xmm0
; SSE41-NEXT:    andl $15, %r12d
; SSE41-NEXT:    pinsrb $11, -24(%rsp,%r12), %xmm0
; SSE41-NEXT:    andl $15, %r13d
; SSE41-NEXT:    pinsrb $12, -24(%rsp,%r13), %xmm0
; SSE41-NEXT:    andl $15, %ebp
; SSE41-NEXT:    pinsrb $13, -24(%rsp,%rbp), %xmm0
; SSE41-NEXT:    andl $15, %eax
; SSE41-NEXT:    pinsrb $14, -24(%rsp,%rax), %xmm0
; SSE41-NEXT:    andl $15, %edi
; SSE41-NEXT:    pinsrb $15, -24(%rsp,%rdi), %xmm0
; SSE41-NEXT:    popq %rbx
; SSE41-NEXT:    popq %r12
; SSE41-NEXT:    popq %r13
; SSE41-NEXT:    popq %r14
; SSE41-NEXT:    popq %r15
; SSE41-NEXT:    popq %rbp
; SSE41-NEXT:    retq
;
; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
; AVX:       # %bb.0:
; AVX-NEXT:    pushq %rbp
; AVX-NEXT:    pushq %r15
; AVX-NEXT:    pushq %r14
; AVX-NEXT:    pushq %r13
; AVX-NEXT:    pushq %r12
; AVX-NEXT:    pushq %rbx
; AVX-NEXT:    movzbl (%rdi), %ecx
; AVX-NEXT:    andl $15, %ecx
; AVX-NEXT:    movzbl 1(%rdi), %eax
; AVX-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX-NEXT:    movzbl 2(%rdi), %edx
; AVX-NEXT:    movzbl 3(%rdi), %esi
; AVX-NEXT:    movzbl 4(%rdi), %r8d
; AVX-NEXT:    movzbl 5(%rdi), %r9d
; AVX-NEXT:    movzbl 6(%rdi), %r10d
; AVX-NEXT:    movzbl 7(%rdi), %r11d
; AVX-NEXT:    movzbl 8(%rdi), %ebx
; AVX-NEXT:    movzbl 9(%rdi), %r14d
; AVX-NEXT:    movzbl 10(%rdi), %r15d
; AVX-NEXT:    movzbl 11(%rdi), %r12d
; AVX-NEXT:    movzbl 12(%rdi), %r13d
; AVX-NEXT:    movzbl 13(%rdi), %ebp
; AVX-NEXT:    movzbl 14(%rdi), %eax
; AVX-NEXT:    movzbl 15(%rdi), %edi
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    movzbl -24(%rsp,%rcx), %ecx
; AVX-NEXT:    vmovd %ecx, %xmm0
; AVX-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; AVX-NEXT:    andl $15, %ecx
; AVX-NEXT:    vpinsrb $1, -24(%rsp,%rcx), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %edx
; AVX-NEXT:    vpinsrb $2, -24(%rsp,%rdx), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %esi
; AVX-NEXT:    vpinsrb $3, -24(%rsp,%rsi), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %r8d
; AVX-NEXT:    vpinsrb $4, -24(%rsp,%r8), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %r9d
; AVX-NEXT:    vpinsrb $5, -24(%rsp,%r9), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %r10d
; AVX-NEXT:    vpinsrb $6, -24(%rsp,%r10), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %r11d
; AVX-NEXT:    vpinsrb $7, -24(%rsp,%r11), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %ebx
; AVX-NEXT:    vpinsrb $8, -24(%rsp,%rbx), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %r14d
; AVX-NEXT:    vpinsrb $9, -24(%rsp,%r14), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %r15d
; AVX-NEXT:    vpinsrb $10, -24(%rsp,%r15), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %r12d
; AVX-NEXT:    vpinsrb $11, -24(%rsp,%r12), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %r13d
; AVX-NEXT:    vpinsrb $12, -24(%rsp,%r13), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %ebp
; AVX-NEXT:    vpinsrb $13, -24(%rsp,%rbp), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %eax
; AVX-NEXT:    vpinsrb $14, -24(%rsp,%rax), %xmm0, %xmm0
; AVX-NEXT:    andl $15, %edi
; AVX-NEXT:    vpinsrb $15, -24(%rsp,%rdi), %xmm0, %xmm0
; AVX-NEXT:    popq %rbx
; AVX-NEXT:    popq %r12
; AVX-NEXT:    popq %r13
; AVX-NEXT:    popq %r14
; AVX-NEXT:    popq %r15
; AVX-NEXT:    popq %rbp
; AVX-NEXT:    retq
  ; Load the sixteen runtime indices from consecutive i8 slots of %i.
  %p1  = getelementptr inbounds i8, ptr %i, i64 1
  %p2  = getelementptr inbounds i8, ptr %i, i64 2
  %p3  = getelementptr inbounds i8, ptr %i, i64 3
  %p4  = getelementptr inbounds i8, ptr %i, i64 4
  %p5  = getelementptr inbounds i8, ptr %i, i64 5
  %p6  = getelementptr inbounds i8, ptr %i, i64 6
  %p7  = getelementptr inbounds i8, ptr %i, i64 7
  %p8  = getelementptr inbounds i8, ptr %i, i64 8
  %p9  = getelementptr inbounds i8, ptr %i, i64 9
  %p10 = getelementptr inbounds i8, ptr %i, i64 10
  %p11 = getelementptr inbounds i8, ptr %i, i64 11
  %p12 = getelementptr inbounds i8, ptr %i, i64 12
  %p13 = getelementptr inbounds i8, ptr %i, i64 13
  %p14 = getelementptr inbounds i8, ptr %i, i64 14
  %p15 = getelementptr inbounds i8, ptr %i, i64 15
  %i0  = load i8, ptr %i , align 4
  %i1  = load i8, ptr %p1 , align 4
  %i2  = load i8, ptr %p2 , align 4
  %i3  = load i8, ptr %p3 , align 4
  %i4  = load i8, ptr %p4 , align 4
  %i5  = load i8, ptr %p5 , align 4
  %i6  = load i8, ptr %p6 , align 4
  %i7  = load i8, ptr %p7 , align 4
  %i8  = load i8, ptr %p8 , align 4
  %i9  = load i8, ptr %p9 , align 4
  %i10 = load i8, ptr %p10, align 4
  %i11 = load i8, ptr %p11, align 4
  %i12 = load i8, ptr %p12, align 4
  %i13 = load i8, ptr %p13, align 4
  %i14 = load i8, ptr %p14, align 4
  %i15 = load i8, ptr %p15, align 4
  ; Variable-index extracts from %x, then rebuild the vector lane by lane.
  %x0  = extractelement <16 x i8> %x, i8 %i0
  %x1  = extractelement <16 x i8> %x, i8 %i1
  %x2  = extractelement <16 x i8> %x, i8 %i2
  %x3  = extractelement <16 x i8> %x, i8 %i3
  %x4  = extractelement <16 x i8> %x, i8 %i4
  %x5  = extractelement <16 x i8> %x, i8 %i5
  %x6  = extractelement <16 x i8> %x, i8 %i6
  %x7  = extractelement <16 x i8> %x, i8 %i7
  %x8  = extractelement <16 x i8> %x, i8 %i8
  %x9  = extractelement <16 x i8> %x, i8 %i9
  %x10 = extractelement <16 x i8> %x, i8 %i10
  %x11 = extractelement <16 x i8> %x, i8 %i11
  %x12 = extractelement <16 x i8> %x, i8 %i12
  %x13 = extractelement <16 x i8> %x, i8 %i13
  %x14 = extractelement <16 x i8> %x, i8 %i14
  %x15 = extractelement <16 x i8> %x, i8 %i15
  %r0  = insertelement <16 x i8> undef, i8 %x0 , i32 0
  %r1  = insertelement <16 x i8>  %r0 , i8 %x1 , i32 1
  %r2  = insertelement <16 x i8>  %r1 , i8 %x2 , i32 2
  %r3  = insertelement <16 x i8>  %r2 , i8 %x3 , i32 3
  %r4  = insertelement <16 x i8>  %r3 , i8 %x4 , i32 4
  %r5  = insertelement <16 x i8>  %r4 , i8 %x5 , i32 5
  %r6  = insertelement <16 x i8>  %r5 , i8 %x6 , i32 6
  %r7  = insertelement <16 x i8>  %r6 , i8 %x7 , i32 7
  %r8  = insertelement <16 x i8>  %r7 , i8 %x8 , i32 8
  %r9  = insertelement <16 x i8>  %r8 , i8 %x9 , i32 9
  %r10 = insertelement <16 x i8>  %r9 , i8 %x10, i32 10
  %r11 = insertelement <16 x i8>  %r10, i8 %x11, i32 11
  %r12 = insertelement <16 x i8>  %r11, i8 %x12, i32 12
  %r13 = insertelement <16 x i8>  %r12, i8 %x13, i32 13
  %r14 = insertelement <16 x i8>  %r13, i8 %x14, i32 14
  %r15 = insertelement <16 x i8>  %r14, i8 %x15, i32 15
  ret <16 x i8> %r15
}
1209
1210;
1211; Binary shuffle indices from registers
1212;
1213
; Binary variable shuffle: result lanes are { x[%i0], 0.0, y[%i2], x[%i3] }.
; Both %x and %y are spilled to separate stack slots (-24 and -40 offsets in
; the asm) and the live lanes are reloaded at variable, masked (andl $3)
; offsets. %x1 / %i1 is dead -- lane 1 is overwritten with the constant 0.0
; -- which is why %esi is never masked or read in the generated code.
; CHECK lines are autogenerated by utils/update_llc_test_checks.py.
define <4 x float> @var_shuffle_v4f32_v4f32_x0yx_i32(<4 x float> %x, <4 x float> %y, i32 %i0, i32 %i1, i32 %i2, i32 %i3) nounwind {
; SSE2-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE2-NEXT:    andl $3, %edi
; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    andl $3, %edx
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    andl $3, %ecx
; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSE2-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
; SSSE3-NEXT:    andl $3, %edi
; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    andl $3, %edx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    andl $3, %ecx
; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE41-NEXT:    andl $3, %edi
; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    andl $3, %edx
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    andl $3, %ecx
; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; SSE41-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; SSE41-NEXT:    insertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero
; SSE41-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4f32_v4f32_x0yx_i32:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    andl $3, %edi
; AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    andl $3, %edx
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    andl $3, %ecx
; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],zero,zero
; AVX-NEXT:    vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT:    retq
  %x0 = extractelement <4 x float> %x, i32 %i0
  ; %x1 is intentionally dead: lane 1 of the result is the constant 0.0.
  %x1 = extractelement <4 x float> %x, i32 %i1
  %y2 = extractelement <4 x float> %y, i32 %i2
  %x3 = extractelement <4 x float> %x, i32 %i3
  %r0 = insertelement <4 x float> undef, float %x0, i32 0
  %r1 = insertelement <4 x float>   %r0, float 0.0, i32 1
  %r2 = insertelement <4 x float>   %r1, float %y2, i32 2
  %r3 = insertelement <4 x float>   %r2, float %x3, i32 3
  ret <4 x float> %r3
}
1290
; Binary shuffle with variable (run-time) i16 indices: result lanes 0/2/4 are
; taken from %x, lanes 1/3/5 from %y, and lanes 6/7 are constant zero.  Each
; index register is masked with `andl $7` before use, matching the IR-level
; guarantee that extractelement indices are in range modulo 8.
1291define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %y, i16 %i0, i16 %i1, i16 %i2, i16 %i3, i16 %i4, i16 %i5, i16 %i6, i16 %i7) nounwind {
; SSE2-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    # kill: def $r9d killed $r9d def $r9
; SSE2-NEXT:    # kill: def $r8d killed $r8d def $r8
; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE2-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE2-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE2-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE2-NEXT:    andl $7, %edi
; SSE2-NEXT:    andl $7, %esi
; SSE2-NEXT:    andl $7, %edx
; SSE2-NEXT:    andl $7, %ecx
; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    andl $7, %r8d
; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE2-NEXT:    andl $7, %r9d
; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    movzwl -40(%rsp,%rdx,2), %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
; SSE2-NEXT:    movd %eax, %xmm0
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    movzwl -24(%rsp,%r9,2), %eax
; SSE2-NEXT:    movd %eax, %xmm1
; SSE2-NEXT:    movzwl -40(%rsp,%r8,2), %eax
; SSE2-NEXT:    movd %eax, %xmm2
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE2-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    # kill: def $r9d killed $r9d def $r9
; SSSE3-NEXT:    # kill: def $r8d killed $r8d def $r8
; SSSE3-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSSE3-NEXT:    # kill: def $edx killed $edx def $rdx
; SSSE3-NEXT:    # kill: def $esi killed $esi def $rsi
; SSSE3-NEXT:    # kill: def $edi killed $edi def $rdi
; SSSE3-NEXT:    andl $7, %edi
; SSSE3-NEXT:    andl $7, %esi
; SSSE3-NEXT:    andl $7, %edx
; SSSE3-NEXT:    andl $7, %ecx
; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    andl $7, %r8d
; SSSE3-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSSE3-NEXT:    andl $7, %r9d
; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    movzwl -40(%rsp,%rdx,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm0
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm1
; SSSE3-NEXT:    movzwl -40(%rsp,%r8,2), %eax
; SSSE3-NEXT:    movd %eax, %xmm2
; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSSE3-NEXT:    pxor %xmm1, %xmm1
; SSSE3-NEXT:    movss {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3]
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    # kill: def $r9d killed $r9d def $r9
; SSE41-NEXT:    # kill: def $r8d killed $r8d def $r8
; SSE41-NEXT:    # kill: def $ecx killed $ecx def $rcx
; SSE41-NEXT:    # kill: def $edx killed $edx def $rdx
; SSE41-NEXT:    # kill: def $esi killed $esi def $rsi
; SSE41-NEXT:    # kill: def $edi killed $edi def $rdi
; SSE41-NEXT:    andl $7, %edi
; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
; SSE41-NEXT:    andl $7, %esi
; SSE41-NEXT:    andl $7, %edx
; SSE41-NEXT:    andl $7, %ecx
; SSE41-NEXT:    andl $7, %r8d
; SSE41-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
; SSE41-NEXT:    andl $7, %r9d
; SSE41-NEXT:    movd %eax, %xmm0
; SSE41-NEXT:    pinsrw $1, -24(%rsp,%rsi,2), %xmm0
; SSE41-NEXT:    pinsrw $2, -40(%rsp,%rdx,2), %xmm0
; SSE41-NEXT:    pinsrw $3, -24(%rsp,%rcx,2), %xmm0
; SSE41-NEXT:    pinsrw $4, -40(%rsp,%r8,2), %xmm0
; SSE41-NEXT:    pinsrw $5, -24(%rsp,%r9,2), %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
; AVX:       # %bb.0:
; AVX-NEXT:    # kill: def $r9d killed $r9d def $r9
; AVX-NEXT:    # kill: def $r8d killed $r8d def $r8
; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
; AVX-NEXT:    # kill: def $edx killed $edx def $rdx
; AVX-NEXT:    # kill: def $esi killed $esi def $rsi
; AVX-NEXT:    # kill: def $edi killed $edi def $rdi
; AVX-NEXT:    andl $7, %edi
; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
; AVX-NEXT:    andl $7, %esi
; AVX-NEXT:    andl $7, %edx
; AVX-NEXT:    andl $7, %ecx
; AVX-NEXT:    andl $7, %r8d
; AVX-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
; AVX-NEXT:    andl $7, %r9d
; AVX-NEXT:    vmovd %eax, %xmm0
; AVX-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $2, -40(%rsp,%rdx,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $4, -40(%rsp,%r8,2), %xmm0, %xmm0
; AVX-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
; AVX-NEXT:    retq
; The codegen spills %x and %y to adjacent stack slots (the loads at
; -40(%rsp,...) index into the %x spill, those at -24(%rsp,...) into the %y
; spill; the exact store offsets are regex-matched above) and gathers the six
; live lanes with scalar word loads.  On SSE4.1/AVX, lanes 6 and 7 end up zero
; without explicit work because movd/vmovd into xmm0 zeroes the upper lanes
; and pinsrw only fills lanes 1-5.
; NOTE(review): %x6 and %x7 below are extracted but never inserted into the
; result (lanes 6/7 are written with constant 0), so they are dead values --
; presumably kept so all eight index operands remain exercised.
1415  %x0 = extractelement <8 x i16> %x, i16 %i0
1416  %y1 = extractelement <8 x i16> %y, i16 %i1
1417  %x2 = extractelement <8 x i16> %x, i16 %i2
1418  %y3 = extractelement <8 x i16> %y, i16 %i3
1419  %x4 = extractelement <8 x i16> %x, i16 %i4
1420  %y5 = extractelement <8 x i16> %y, i16 %i5
1421  %x6 = extractelement <8 x i16> %x, i16 %i6
1422  %x7 = extractelement <8 x i16> %x, i16 %i7
1423  %r0 = insertelement <8 x i16> undef, i16 %x0, i32 0
1424  %r1 = insertelement <8 x i16>   %r0, i16 %y1, i32 1
1425  %r2 = insertelement <8 x i16>   %r1, i16 %x2, i32 2
1426  %r3 = insertelement <8 x i16>   %r2, i16 %y3, i32 3
1427  %r4 = insertelement <8 x i16>   %r3, i16 %x4, i32 4
1428  %r5 = insertelement <8 x i16>   %r4, i16 %y5, i32 5
1429  %r6 = insertelement <8 x i16>   %r5, i16   0, i32 6
1430  %r7 = insertelement <8 x i16>   %r6, i16   0, i32 7
1431  ret <8 x i16> %r7
1432}
1433