; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --no_x86_scrub_mem_shuffle
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,XOP
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVXNOVLBW,AVX512
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512VL,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+avx512vbmi | FileCheck %s --check-prefixes=AVX,AVX512VL,VLVBMI

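; Fully variable <2 x i64> shuffle: each result lane is loaded from %v at the
; runtime index held in the matching lane of %indices.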
define <2 x i64> @var_shuffle_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT:    movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT:    movsd -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pxor %xmm2, %xmm2
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
; SSE41-NEXT:    movapd %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x i64> %v, i64 %index0
  %v1 = extractelement <2 x i64> %v, i64 %index1
  %ret0 = insertelement <2 x i64> undef, i64 %v0, i32 0
  %ret1 = insertelement <2 x i64> %ret0, i64 %v1, i32 1
  ret <2 x i64> %ret1
}

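; Out-of-range-zeroing variant: indices ugt 3 are forced to -1 and the
; corresponding result lanes are cleared by the final select.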
define <2 x i64> @var_shuffle_zero_v2i64(<2 x i64> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v2i64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE3-NEXT:    pxor %xmm1, %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE3-NEXT:    pand %xmm4, %xmm3
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE3-NEXT:    por %xmm3, %xmm2
; SSE3-NEXT:    por %xmm2, %xmm1
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    pandn %xmm0, %xmm2
; SSE3-NEXT:    movdqa %xmm2, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_zero_v2i64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSSE3-NEXT:    pxor %xmm1, %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSSE3-NEXT:    pand %xmm4, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSSE3-NEXT:    por %xmm3, %xmm2
; SSSE3-NEXT:    por %xmm2, %xmm1
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSSE3-NEXT:    movq -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT:    movq -24(%rsp,%rcx,8), %xmm1 # xmm1 = mem[0],zero
; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-NEXT:    pandn %xmm0, %xmm2
; SSSE3-NEXT:    movdqa %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_zero_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
; SSE41-NEXT:    pxor %xmm1, %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT:    pand %xmm4, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
; SSE41-NEXT:    por %xmm3, %xmm2
; SSE41-NEXT:    por %xmm2, %xmm1
; SSE41-NEXT:    pxor %xmm3, %xmm3
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1]
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm4
; SSE41-NEXT:    pandn %xmm4, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; XOP-LABEL: var_shuffle_zero_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; XOP-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_zero_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_zero_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_zero_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [3,3]
; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
; AVX512-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmovdqa64 %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_zero_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovdqa64 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa64 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT:    retq
  %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
  %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
  %idx0 = extractelement <2 x i64> %or, i64 0
  %idx1 = extractelement <2 x i64> %or, i64 1
  %elt0 = extractelement <2 x i64> %v, i64 %idx0
  %elt1 = extractelement <2 x i64> %v, i64 %idx1
  %vec0 = insertelement <2 x i64> poison, i64 %elt0, i64 0
  %vec1 = insertelement <2 x i64> %vec0, i64 %elt1, i64 1
  %res = select <2 x i1> %cmp, <2 x i64> zeroinitializer, <2 x i64> %vec1
  ret <2 x i64> %res
}

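; Fully variable <4 x i32> shuffle driven by the runtime lanes of %indices.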
define <4 x i32> @var_shuffle_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm2, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm2, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSE3-NEXT:    andl $3, %eax
; SSE3-NEXT:    andl $3, %ecx
; SSE3-NEXT:    andl $3, %edx
; SSE3-NEXT:    andl $3, %esi
; SSE3-NEXT:    movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; SSE3-NEXT:    movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
; SSE3-NEXT:    movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <4 x i32> %indices, i32 0
  %index1 = extractelement <4 x i32> %indices, i32 1
  %index2 = extractelement <4 x i32> %indices, i32 2
  %index3 = extractelement <4 x i32> %indices, i32 3
  %v0 = extractelement <4 x i32> %v, i32 %index0
  %v1 = extractelement <4 x i32> %v, i32 %index1
  %v2 = extractelement <4 x i32> %v, i32 %index2
  %v3 = extractelement <4 x i32> %v, i32 %index3
  %ret0 = insertelement <4 x i32> undef, i32 %v0, i32 0
  %ret1 = insertelement <4 x i32> %ret0, i32 %v1, i32 1
  %ret2 = insertelement <4 x i32> %ret1, i32 %v2, i32 2
  %ret3 = insertelement <4 x i32> %ret2, i32 %v3, i32 3
  ret <4 x i32> %ret3
}

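; <4 x i32> variant that zeroes any lane whose index compares ugt 3.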
define <4 x i32> @var_shuffle_zero_v4i32(<4 x i32> %v, <4 x i32> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v4i32:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm2
; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
; SSE3-NEXT:    pxor %xmm1, %xmm0
; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE3-NEXT:    por %xmm0, %xmm1
; SSE3-NEXT:    movd %xmm1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
; SSE3-NEXT:    movd %xmm3, %ecx
; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
; SSE3-NEXT:    movd %xmm3, %edx
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
; SSE3-NEXT:    movd %xmm1, %esi
; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
; SSE3-NEXT:    andl $3, %eax
; SSE3-NEXT:    andl $3, %ecx
; SSE3-NEXT:    andl $3, %edx
; SSE3-NEXT:    andl $3, %esi
; SSE3-NEXT:    movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
; SSE3-NEXT:    movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; SSE3-NEXT:    pandn %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_zero_v4i32:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
; SSSE3-NEXT:    pxor %xmm1, %xmm2
; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSSE3-NEXT:    por %xmm2, %xmm1
; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036]
; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
; SSSE3-NEXT:    pmuludq %xmm3, %xmm1
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSSE3-NEXT:    pmuludq %xmm3, %xmm4
; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT:    por %xmm2, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_zero_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [4,4,4,4]
; SSE41-NEXT:    pmaxud %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
; SSE41-NEXT:    por %xmm2, %xmm1
; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    por %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; XOP-LABEL: var_shuffle_zero_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomgtud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_zero_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_zero_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
; AVX2-NEXT:    vpmaxud %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: var_shuffle_zero_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
; AVX512-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_zero_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovdqa32 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqa32 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT:    retq
  %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
  %or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
  %idx0 = extractelement <4 x i32> %or, i64 0
  %idx1 = extractelement <4 x i32> %or, i64 1
  %idx2 = extractelement <4 x i32> %or, i64 2
  %idx3 = extractelement <4 x i32> %or, i64 3
  %elt0 = extractelement <4 x i32> %v, i32 %idx0
  %elt1 = extractelement <4 x i32> %v, i32 %idx1
  %elt2 = extractelement <4 x i32> %v, i32 %idx2
  %elt3 = extractelement <4 x i32> %v, i32 %idx3
  %vec0 = insertelement <4 x i32> poison, i32 %elt0, i32 0
  %vec1 = insertelement <4 x i32> %vec0, i32 %elt1, i32 1
  %vec2 = insertelement <4 x i32> %vec1, i32 %elt2, i32 2
  %vec3 = insertelement <4 x i32> %vec2, i32 %elt3, i32 3
  %res = select <4 x i1> %cmp, <4 x i32> zeroinitializer, <4 x i32> %vec3
  ret <4 x i32> %res
}

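; Fully variable <8 x i16> shuffle: pre-AVX512BW/VL targets expand the word
; indices into byte indices for pshufb; AVX512BW+VL selects vpermw directly.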
define <8 x i16> @var_shuffle_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    pextrw $0, %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSE3-NEXT:    pextrw $2, %xmm1, %edx
; SSE3-NEXT:    pextrw $3, %xmm1, %esi
; SSE3-NEXT:    pextrw $4, %xmm1, %edi
; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
; SSE3-NEXT:    pextrw $6, %xmm1, %r9d
; SSE3-NEXT:    pextrw $7, %xmm1, %r10d
; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSE3-NEXT:    andl $7, %eax
; SSE3-NEXT:    andl $7, %ecx
; SSE3-NEXT:    andl $7, %edx
; SSE3-NEXT:    andl $7, %esi
; SSE3-NEXT:    andl $7, %edi
; SSE3-NEXT:    andl $7, %r8d
; SSE3-NEXT:    andl $7, %r9d
; SSE3-NEXT:    andl $7, %r10d
; SSE3-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
; SSE3-NEXT:    movd %r10d, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %r9d
; SSE3-NEXT:    movd %r9d, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %r8d
; SSE3-NEXT:    movd %r8d, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
; SSE3-NEXT:    movd %edi, %xmm2
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
; SSE3-NEXT:    movd %esi, %xmm0
; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
; SSE3-NEXT:    movd %edx, %xmm1
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
; SSE3-NEXT:    movd %ecx, %xmm3
; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
; SSSE3-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
; SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVXNOVLBW-LABEL: var_shuffle_v8i16:
; AVXNOVLBW:       # %bb.0:
; AVXNOVLBW-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
; AVXNOVLBW-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVXNOVLBW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVXNOVLBW-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpermw %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    retq
  %index0 = extractelement <8 x i16> %indices, i32 0
  %index1 = extractelement <8 x i16> %indices, i32 1
  %index2 = extractelement <8 x i16> %indices, i32 2
  %index3 = extractelement <8 x i16> %indices, i32 3
  %index4 = extractelement <8 x i16> %indices, i32 4
  %index5 = extractelement <8 x i16> %indices, i32 5
  %index6 = extractelement <8 x i16> %indices, i32 6
  %index7 = extractelement <8 x i16> %indices, i32 7
  %v0 = extractelement <8 x i16> %v, i16 %index0
  %v1 = extractelement <8 x i16> %v, i16 %index1
  %v2 = extractelement <8 x i16> %v, i16 %index2
  %v3 = extractelement <8 x i16> %v, i16 %index3
  %v4 = extractelement <8 x i16> %v, i16 %index4
  %v5 = extractelement <8 x i16> %v, i16 %index5
  %v6 = extractelement <8 x i16> %v, i16 %index6
  %v7 = extractelement <8 x i16> %v, i16 %index7
  %ret0 = insertelement <8 x i16> undef, i16 %v0, i32 0
  %ret1 = insertelement <8 x i16> %ret0, i16 %v1, i32 1
  %ret2 = insertelement <8 x i16> %ret1, i16 %v2, i32 2
  %ret3 = insertelement <8 x i16> %ret2, i16 %v3, i32 3
  %ret4 = insertelement <8 x i16> %ret3, i16 %v4, i32 4
  %ret5 = insertelement <8 x i16> %ret4, i16 %v5, i32 5
  %ret6 = insertelement <8 x i16> %ret5, i16 %v6, i32 6
  %ret7 = insertelement <8 x i16> %ret6, i16 %v7, i32 7
  ret <8 x i16> %ret7
}

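; <8 x i16> variant that zeroes any lane whose index compares ugt 7.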
define <8 x i16> @var_shuffle_zero_v8i16(<8 x i16> %v, <8 x i16> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v8i16:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movdqa %xmm0, %xmm2
; SSE3-NEXT:    movdqa {{.*#+}} xmm3 = [8,8,8,8,8,8,8,8]
; SSE3-NEXT:    psubusw %xmm1, %xmm3
; SSE3-NEXT:    pxor %xmm0, %xmm0
; SSE3-NEXT:    pcmpeqw %xmm3, %xmm0
; SSE3-NEXT:    por %xmm0, %xmm1
; SSE3-NEXT:    pextrw $0, %xmm1, %eax
; SSE3-NEXT:    pextrw $1, %xmm1, %ecx
; SSE3-NEXT:    pextrw $2, %xmm1, %edx
; SSE3-NEXT:    pextrw $3, %xmm1, %esi
; SSE3-NEXT:    pextrw $4, %xmm1, %edi
; SSE3-NEXT:    pextrw $5, %xmm1, %r8d
; SSE3-NEXT:    pextrw $6, %xmm1, %r9d
; SSE3-NEXT:    pextrw $7, %xmm1, %r10d
; SSE3-NEXT:    movdqa %xmm2, -24(%rsp)
; SSE3-NEXT:    andl $7, %eax
; SSE3-NEXT:    andl $7, %ecx
; SSE3-NEXT:    andl $7, %edx
; SSE3-NEXT:    andl $7, %esi
; SSE3-NEXT:    andl $7, %edi
; SSE3-NEXT:    andl $7, %r8d
; SSE3-NEXT:    andl $7, %r9d
; SSE3-NEXT:    andl $7, %r10d
; SSE3-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
; SSE3-NEXT:    movd %r10d, %xmm1
; SSE3-NEXT:    movzwl -24(%rsp,%r9,2), %r9d
; SSE3-NEXT:    movd %r9d, %xmm2
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT:    movzwl -24(%rsp,%r8,2), %r8d
; SSE3-NEXT:    movd %r8d, %xmm1
; SSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
; SSE3-NEXT:    movd %edi, %xmm3
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
; SSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
; SSE3-NEXT:    movd %esi, %xmm1
; SSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
; SSE3-NEXT:    movd %edx, %xmm2
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
; SSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
; SSE3-NEXT:    movd %ecx, %xmm1
; SSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
; SSE3-NEXT:    pandn %xmm4, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_zero_v8i16:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
; SSSE3-NEXT:    psubusw %xmm1, %xmm2
; SSSE3-NEXT:    pxor %xmm3, %xmm3
; SSSE3-NEXT:    pcmpeqw %xmm2, %xmm3
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
; SSSE3-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSSE3-NEXT:    por %xmm3, %xmm1
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_zero_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8]
; SSE41-NEXT:    pmaxuw %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqw %xmm1, %xmm2
; SSE41-NEXT:    por %xmm2, %xmm1
; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [514,514,514,514,514,514,514,514]
; SSE41-NEXT:    paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE41-NEXT:    por %xmm2, %xmm1
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; XOP-LABEL: var_shuffle_zero_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomgtuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
; XOP-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; XOP-NEXT:    vpor %xmm2, %xmm1, %xmm1
; XOP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_zero_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
; AVX1-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_zero_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [514,514,514,514,514,514,514,514]
; AVX2-NEXT:    vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_zero_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpnleuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovdqu16 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpermw %xmm0, %xmm1, %xmm0
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqu16 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT:    retq
  %cmp = icmp ugt <8 x i16> %indices, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %or = select <8 x i1> %cmp, <8 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>, <8 x i16> %indices
  %idx0 = extractelement <8 x i16> %or, i64 0
  %idx1 = extractelement <8 x i16> %or, i64 1
  %idx2 = extractelement <8 x i16> %or, i64 2
  %idx3 = extractelement <8 x i16> %or, i64 3
  %idx4 = extractelement <8 x i16> %or, i64 4
  %idx5 = extractelement <8 x i16> %or, i64 5
  %idx6 = extractelement <8 x i16> %or, i64 6
  %idx7 = extractelement <8 x i16> %or, i64 7
  %elt0 = extractelement <8 x i16> %v, i16 %idx0
  %elt1 = extractelement <8 x i16> %v, i16 %idx1
  %elt2 = extractelement <8 x i16> %v, i16 %idx2
  %elt3 = extractelement <8 x i16> %v, i16 %idx3
  %elt4 = extractelement <8 x i16> %v, i16 %idx4
  %elt5 = extractelement <8 x i16> %v, i16 %idx5
  %elt6 = extractelement <8 x i16> %v, i16 %idx6
  %elt7 = extractelement <8 x i16> %v, i16 %idx7
  %vec0 = insertelement <8 x i16> poison, i16 %elt0, i64 0
  %vec1 = insertelement <8 x i16> %vec0, i16 %elt1, i64 1
  %vec2 = insertelement <8 x i16> %vec1, i16 %elt2, i64 2
  %vec3 = insertelement <8 x i16> %vec2, i16 %elt3, i64 3
  %vec4 = insertelement <8 x i16> %vec3, i16 %elt4, i64 4
  %vec5 = insertelement <8 x i16> %vec4, i16 %elt5, i64 5
  %vec6 = insertelement <8 x i16> %vec5, i16 %elt6, i64 6
  %vec7 = insertelement <8 x i16> %vec6, i16 %elt7, i64 7
  %res = select <8 x i1> %cmp, <8 x i16> zeroinitializer, <8 x i16> %vec7
  ret <8 x i16> %res
}

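; Fully variable <16 x i8> shuffle: a single pshufb once SSSE3 is available.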
define <16 x i8> @var_shuffle_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm1, -40(%rsp)
; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSE3-NEXT:    movzbl -25(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movzbl -26(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movzbl -27(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    movzbl -28(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzbl -29(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    movzbl -30(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    movzbl -31(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm8
; SSE3-NEXT:    movzbl -32(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    movzbl -33(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movzbl -34(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    movzbl -35(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movzbl -36(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    movzbl -37(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    movzbl -38(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movzbl -39(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    movzbl -40(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm0
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    pshufb %xmm1, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshufb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <16 x i8> %indices, i32 0
  %index1 = extractelement <16 x i8> %indices, i32 1
  %index2 = extractelement <16 x i8> %indices, i32 2
  %index3 = extractelement <16 x i8> %indices, i32 3
  %index4 = extractelement <16 x i8> %indices, i32 4
  %index5 = extractelement <16 x i8> %indices, i32 5
  %index6 = extractelement <16 x i8> %indices, i32 6
  %index7 = extractelement <16 x i8> %indices, i32 7
  %index8 = extractelement <16 x i8> %indices, i32 8
  %index9 = extractelement <16 x i8> %indices, i32 9
  %index10 = extractelement <16 x i8> %indices, i32 10
  %index11 = extractelement <16 x i8> %indices, i32 11
  %index12 = extractelement <16 x i8> %indices, i32 12
  %index13 = extractelement <16 x i8> %indices, i32 13
  %index14 = extractelement <16 x i8> %indices, i32 14
  %index15 = extractelement <16 x i8> %indices, i32 15
  %v0 = extractelement <16 x i8> %v, i8 %index0
  %v1 = extractelement <16 x i8> %v, i8 %index1
  %v2 = extractelement <16 x i8> %v, i8 %index2
  %v3 = extractelement <16 x i8> %v, i8 %index3
  %v4 = extractelement <16 x i8> %v, i8 %index4
  %v5 = extractelement <16 x i8> %v, i8 %index5
  %v6 = extractelement <16 x i8> %v, i8 %index6
  %v7 = extractelement <16 x i8> %v, i8 %index7
  %v8 = extractelement <16 x i8> %v, i8 %index8
  %v9 = extractelement <16 x i8> %v, i8 %index9
  %v10 = extractelement <16 x i8> %v, i8 %index10
  %v11 = extractelement <16 x i8> %v, i8 %index11
  %v12 = extractelement <16 x i8> %v, i8 %index12
  %v13 = extractelement <16 x i8> %v, i8 %index13
  %v14 = extractelement <16 x i8> %v, i8 %index14
  %v15 = extractelement <16 x i8> %v, i8 %index15
  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
  ret <16 x i8> %ret15
}

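; <16 x i8> variant that zeroes any lane whose index compares ugt 15; on
; SSSE3+ the forced -1 indices let pshufb's high-bit zeroing do the masking.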
define <16 x i8> @var_shuffle_zero_v16i8(<16 x i8> %v, <16 x i8> %indices) nounwind {
; SSE3-LABEL: var_shuffle_zero_v16i8:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movaps %xmm0, %xmm2
; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE3-NEXT:    pmaxub %xmm1, %xmm0
; SSE3-NEXT:    pcmpeqb %xmm1, %xmm0
; SSE3-NEXT:    por %xmm0, %xmm1
; SSE3-NEXT:    movdqa %xmm1, -40(%rsp)
; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
; SSE3-NEXT:    movzbl -25(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    movzbl -26(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm2
; SSE3-NEXT:    movzbl -27(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm4
; SSE3-NEXT:    movzbl -28(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm3
; SSE3-NEXT:    movzbl -29(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm6
; SSE3-NEXT:    movzbl -30(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm7
; SSE3-NEXT:    movzbl -31(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm8
; SSE3-NEXT:    movzbl -32(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm5
; SSE3-NEXT:    movzbl -33(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm9
; SSE3-NEXT:    movzbl -34(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm10
; SSE3-NEXT:    movzbl -35(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm12
; SSE3-NEXT:    movzbl -36(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm11
; SSE3-NEXT:    movzbl -37(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm13
; SSE3-NEXT:    movzbl -38(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm14
; SSE3-NEXT:    movzbl -39(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm15
; SSE3-NEXT:    movzbl -40(%rsp), %eax
; SSE3-NEXT:    andl $15, %eax
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
; SSE3-NEXT:    movd %eax, %xmm1
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
; SSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3]
; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1]
; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
; SSE3-NEXT:    pandn %xmm1, %xmm0
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_zero_v16i8:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSSE3-NEXT:    pmaxub %xmm1, %xmm2
; SSSE3-NEXT:    pcmpeqb %xmm1, %xmm2
; SSSE3-NEXT:    por %xmm1, %xmm2
; SSSE3-NEXT:    pshufb %xmm2, %xmm0
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_zero_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE41-NEXT:    pmaxub %xmm1, %xmm2
; SSE41-NEXT:    pcmpeqb %xmm1, %xmm2
; SSE41-NEXT:    por %xmm1, %xmm2
; SSE41-NEXT:    pshufb %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; XOP-LABEL: var_shuffle_zero_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpcomgtub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX1-LABEL: var_shuffle_zero_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX1-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shuffle_zero_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
; AVX2-NEXT:    vpcmpeqb %xmm2, %xmm1, %xmm2
; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512VL-LABEL: var_shuffle_zero_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpcmpnleub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %k1
; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vmovdqu8 %xmm2, %xmm1 {%k1}
; AVX512VL-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
; AVX512VL-NEXT:    retq
  %cmp = icmp ugt <16 x i8> %indices, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
  %or = select <16 x i1> %cmp, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, <16 x i8> %indices
  %idx0 = extractelement <16 x i8> %or, i64 0
  %idx1 = extractelement <16 x i8> %or, i64 1
  %idx2 = extractelement <16 x i8> %or, i64 2
  %idx3 = extractelement <16 x i8> %or, i64 3
  %idx4 = extractelement <16 x i8> %or, i64 4
  %idx5 = extractelement <16 x i8> %or, i64 5
  %idx6 = extractelement <16 x i8> %or, i64 6
  %idx7 = extractelement <16 x i8> %or, i64 7
  %idx8 = extractelement <16 x i8> %or, i64 8
  %idx9 = extractelement <16 x i8> %or, i64 9
  %idxA = extractelement <16 x i8> %or, i64 10
  %idxB = extractelement <16 x i8> %or, i64 11
  %idxC = extractelement <16 x i8> %or, i64 12
  %idxD = extractelement <16 x i8> %or, i64 13
  %idxE = extractelement <16 x i8> %or, i64 14
  %idxF = extractelement <16 x i8> %or, i64 15
  %elt0 = extractelement <16 x i8> %v, i8 %idx0
  %elt1 = extractelement <16 x i8> %v, i8 %idx1
  %elt2 = extractelement <16 x i8> %v, i8 %idx2
  %elt3 = extractelement <16 x i8> %v, i8 %idx3
  %elt4 = extractelement <16 x i8> %v, i8 %idx4
  %elt5 = extractelement <16 x i8> %v, i8 %idx5
  %elt6 = extractelement <16 x i8> %v, i8 %idx6
  %elt7 = extractelement <16 x i8> %v, i8 %idx7
  %elt8 = extractelement <16 x i8> %v, i8 %idx8
  %elt9 = extractelement <16 x i8> %v, i8 %idx9
  %eltA = extractelement <16 x i8> %v, i8 %idxA
  %eltB = extractelement <16 x i8> %v, i8 %idxB
  %eltC = extractelement <16 x i8> %v, i8 %idxC
  %eltD = extractelement <16 x i8> %v, i8 %idxD
  %eltE = extractelement <16 x i8> %v, i8 %idxE
  %eltF = extractelement <16 x i8> %v, i8 %idxF
  %vec0 = insertelement <16 x i8> poison, i8 %elt0, i64 0
  %vec1 = insertelement <16 x i8> %vec0, i8 %elt1, i64 1
  %vec2 = insertelement <16 x i8> %vec1, i8 %elt2, i64 2
  %vec3 = insertelement <16 x i8> %vec2, i8 %elt3, i64 3
  %vec4 = insertelement <16 x i8> %vec3, i8 %elt4, i64 4
  %vec5 = insertelement <16 x i8> %vec4, i8 %elt5, i64 5
  %vec6 = insertelement <16 x i8> %vec5, i8 %elt6, i64 6
  %vec7 = insertelement <16 x i8> %vec6, i8 %elt7, i64 7
  %vec8 = insertelement <16 x i8> %vec7, i8 %elt8, i64 8
  %vec9 = insertelement <16 x i8> %vec8, i8 %elt9, i64 9
  %vecA = insertelement <16 x i8> %vec9, i8 %eltA, i64 10
  %vecB = insertelement <16 x i8> %vecA, i8 %eltB, i64 11
  %vecC = insertelement <16 x i8> %vecB, i8 %eltC, i64 12
  %vecD = insertelement <16 x i8> %vecC, i8 %eltD, i64 13
  %vecE = insertelement <16 x i8> %vecD, i8 %eltE, i64 14
  %vecF = insertelement <16 x i8> %vecE, i8 %eltF, i64 15
  %res = select <16 x i1> %cmp, <16 x i8> zeroinitializer, <16 x i8> %vecF
  ret <16 x i8> %res
}

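; Same as var_shuffle_v2i64, but with double elements.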
define <2 x double> @var_shuffle_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
; SSE3-LABEL: var_shuffle_v2f64:
; SSE3:       # %bb.0:
; SSE3-NEXT:    movq %xmm1, %rax
; SSE3-NEXT:    andl $1, %eax
; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE3-NEXT:    movq %xmm1, %rcx
; SSE3-NEXT:    andl $1, %ecx
; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSE3-NEXT:    retq
;
; SSSE3-LABEL: var_shuffle_v2f64:
; SSSE3:       # %bb.0:
; SSSE3-NEXT:    movq %xmm1, %rax
; SSSE3-NEXT:    andl $1, %eax
; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSSE3-NEXT:    movq %xmm1, %rcx
; SSSE3-NEXT:    andl $1, %ecx
; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
; SSSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
; SSSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
; SSSE3-NEXT:    retq
;
; SSE41-LABEL: var_shuffle_v2f64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    pxor %xmm0, %xmm0
; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
; SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
; SSE41-NEXT:    movapd %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shuffle_v2f64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
  %index0 = extractelement <2 x i64> %indices, i32 0
  %index1 = extractelement <2 x i64> %indices, i32 1
  %v0 = extractelement <2 x double> %v, i64 %index0
  %v1 = extractelement <2 x double> %v, i64 %index1
  %ret0 = insertelement <2 x double> undef, double %v0, i32 0
  %ret1 = insertelement <2 x double> %ret0, double %v1, i32 1
  ret <2 x double> %ret1
}

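; <2 x double> variant that zeroes any lane whose index compares ugt 3.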
1037define <2 x double> @var_shuffle_zero_v2f64(<2 x double> %v, <2 x i64> %indices) nounwind {
1038; SSE3-LABEL: var_shuffle_zero_v2f64:
1039; SSE3:       # %bb.0:
1040; SSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
1041; SSE3-NEXT:    pxor %xmm1, %xmm2
1042; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1043; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1044; SSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1045; SSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1046; SSE3-NEXT:    pand %xmm4, %xmm3
1047; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1048; SSE3-NEXT:    por %xmm3, %xmm2
1049; SSE3-NEXT:    por %xmm2, %xmm1
1050; SSE3-NEXT:    movq %xmm1, %rax
1051; SSE3-NEXT:    andl $1, %eax
1052; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1053; SSE3-NEXT:    movq %xmm1, %rcx
1054; SSE3-NEXT:    andl $1, %ecx
1055; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
1056; SSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
1057; SSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
1058; SSE3-NEXT:    pandn %xmm0, %xmm2
1059; SSE3-NEXT:    movdqa %xmm2, %xmm0
1060; SSE3-NEXT:    retq
1061;
1062; SSSE3-LABEL: var_shuffle_zero_v2f64:
1063; SSSE3:       # %bb.0:
1064; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [9223372039002259456,9223372039002259456]
1065; SSSE3-NEXT:    pxor %xmm1, %xmm2
1066; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
1067; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1068; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm2[0,0,2,2]
1069; SSSE3-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
1070; SSSE3-NEXT:    pand %xmm4, %xmm3
1071; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
1072; SSSE3-NEXT:    por %xmm3, %xmm2
1073; SSSE3-NEXT:    por %xmm2, %xmm1
1074; SSSE3-NEXT:    movq %xmm1, %rax
1075; SSSE3-NEXT:    andl $1, %eax
1076; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
1077; SSSE3-NEXT:    movq %xmm1, %rcx
1078; SSSE3-NEXT:    andl $1, %ecx
1079; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
1080; SSSE3-NEXT:    movsd -24(%rsp,%rax,8), %xmm0 # xmm0 = mem[0],zero
1081; SSSE3-NEXT:    movhps -24(%rsp,%rcx,8), %xmm0 # xmm0 = xmm0[0,1],mem[0,1]
1082; SSSE3-NEXT:    pandn %xmm0, %xmm2
1083; SSSE3-NEXT:    movdqa %xmm2, %xmm0
1084; SSSE3-NEXT:    retq
1085;
1086; SSE41-LABEL: var_shuffle_zero_v2f64:
1087; SSE41:       # %bb.0:
1088; SSE41-NEXT:    movapd %xmm0, %xmm2
1089; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [9223372039002259456,9223372039002259456]
1090; SSE41-NEXT:    pxor %xmm1, %xmm0
1091; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
1092; SSE41-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1093; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[0,0,2,2]
1094; SSE41-NEXT:    pcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
1095; SSE41-NEXT:    pand %xmm3, %xmm4
1096; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
1097; SSE41-NEXT:    por %xmm4, %xmm3
1098; SSE41-NEXT:    por %xmm3, %xmm1
1099; SSE41-NEXT:    pxor %xmm0, %xmm0
1100; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
1101; SSE41-NEXT:    movddup {{.*#+}} xmm1 = xmm2[0,0]
1102; SSE41-NEXT:    unpckhpd {{.*#+}} xmm2 = xmm2[1,1]
1103; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
1104; SSE41-NEXT:    pandn %xmm2, %xmm3
1105; SSE41-NEXT:    movdqa %xmm3, %xmm0
1106; SSE41-NEXT:    retq
1107;
1108; XOP-LABEL: var_shuffle_zero_v2f64:
1109; XOP:       # %bb.0:
1110; XOP-NEXT:    vpcomgtuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1111; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
1112; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
1113; XOP-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1114; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
1115; XOP-NEXT:    retq
1116;
1117; AVX1-LABEL: var_shuffle_zero_v2f64:
1118; AVX1:       # %bb.0:
1119; AVX1-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1120; AVX1-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1121; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
1122; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
1123; AVX1-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1124; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
1125; AVX1-NEXT:    retq
1126;
1127; AVX2-LABEL: var_shuffle_zero_v2f64:
1128; AVX2:       # %bb.0:
1129; AVX2-NEXT:    vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1130; AVX2-NEXT:    vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1131; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
1132; AVX2-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
1133; AVX2-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1134; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
1135; AVX2-NEXT:    retq
1136;
1137; AVX512-LABEL: var_shuffle_zero_v2f64:
1138; AVX512:       # %bb.0:
1139; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1140; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [3,3]
1141; AVX512-NEXT:    vpcmpnleuq %zmm2, %zmm1, %k1
1142; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1143; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm1 {%k1}
1144; AVX512-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
1145; AVX512-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1146; AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1147; AVX512-NEXT:    vmovapd %zmm1, %zmm0 {%k1}
1148; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1149; AVX512-NEXT:    vzeroupper
1150; AVX512-NEXT:    retq
1151;
1152; AVX512VL-LABEL: var_shuffle_zero_v2f64:
1153; AVX512VL:       # %bb.0:
1154; AVX512VL-NEXT:    vpcmpnleuq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %k1
1155; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1156; AVX512VL-NEXT:    vmovdqa64 %xmm2, %xmm1 {%k1}
1157; AVX512VL-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
1158; AVX512VL-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
1159; AVX512VL-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
1160; AVX512VL-NEXT:    vmovapd %xmm1, %xmm0 {%k1}
1161; AVX512VL-NEXT:    retq
1162  %cmp = icmp ugt <2 x i64> %indices, <i64 3, i64 3>
1163  %or = select <2 x i1> %cmp, <2 x i64> <i64 -1, i64 -1>, <2 x i64> %indices
1164  %idx0 = extractelement <2 x i64> %or, i64 0
1165  %idx1 = extractelement <2 x i64> %or, i64 1
1166  %elt0 = extractelement <2 x double> %v, i64 %idx0
1167  %elt1 = extractelement <2 x double> %v, i64 %idx1
1168  %vec0 = insertelement <2 x double> poison, double %elt0, i64 0
1169  %vec1 = insertelement <2 x double> %vec0, double %elt1, i64 1
1170  %res = select <2 x i1> %cmp, <2 x double> zeroinitializer, <2 x double> %vec1
1171  ret <2 x double> %res
1172}
1173
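; Variable shuffle of <4 x float> by per-lane i32 indices. SSE3 must spill the
; source and reload each lane through a masked index; SSSE3/SSE41 scale the
; indices into a pshufb byte mask; AVX lowers the whole pattern to vpermilps.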
1174define <4 x float> @var_shuffle_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
1175; SSE3-LABEL: var_shuffle_v4f32:
1176; SSE3:       # %bb.0:
1177; SSE3-NEXT:    movd %xmm1, %eax
1178; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
1179; SSE3-NEXT:    movd %xmm2, %ecx
1180; SSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
1181; SSE3-NEXT:    movd %xmm2, %edx
1182; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1183; SSE3-NEXT:    movd %xmm1, %esi
1184; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
1185; SSE3-NEXT:    andl $3, %eax
1186; SSE3-NEXT:    andl $3, %ecx
1187; SSE3-NEXT:    andl $3, %edx
1188; SSE3-NEXT:    andl $3, %esi
1189; SSE3-NEXT:    movss -24(%rsp,%rsi,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
1190; SSE3-NEXT:    movss -24(%rsp,%rdx,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
1191; SSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1192; SSE3-NEXT:    movss -24(%rsp,%rax,4), %xmm0 # xmm0 = mem[0],zero,zero,zero
1193; SSE3-NEXT:    movss -24(%rsp,%rcx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
1194; SSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1195; SSE3-NEXT:    movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1196; SSE3-NEXT:    retq
1197;
1198; SSSE3-LABEL: var_shuffle_v4f32:
1199; SSSE3:       # %bb.0:
1200; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [67372036,67372036,67372036,67372036]
1201; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
1202; SSSE3-NEXT:    pmuludq %xmm2, %xmm1
1203; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1204; SSSE3-NEXT:    pmuludq %xmm2, %xmm3
1205; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,2,2,3]
1206; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
1207; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1208; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1209; SSSE3-NEXT:    retq
1210;
1211; SSE41-LABEL: var_shuffle_v4f32:
1212; SSE41:       # %bb.0:
1213; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1214; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1215; SSE41-NEXT:    pshufb %xmm1, %xmm0
1216; SSE41-NEXT:    retq
1217;
1218; AVX-LABEL: var_shuffle_v4f32:
1219; AVX:       # %bb.0:
1220; AVX-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1221; AVX-NEXT:    retq
1222  %index0 = extractelement <4 x i32> %indices, i32 0
1223  %index1 = extractelement <4 x i32> %indices, i32 1
1224  %index2 = extractelement <4 x i32> %indices, i32 2
1225  %index3 = extractelement <4 x i32> %indices, i32 3
1226  %v0 = extractelement <4 x float> %v, i32 %index0
1227  %v1 = extractelement <4 x float> %v, i32 %index1
1228  %v2 = extractelement <4 x float> %v, i32 %index2
1229  %v3 = extractelement <4 x float> %v, i32 %index3
1230  %ret0 = insertelement <4 x float> undef, float %v0, i32 0
1231  %ret1 = insertelement <4 x float> %ret0, float %v1, i32 1
1232  %ret2 = insertelement <4 x float> %ret1, float %v2, i32 2
1233  %ret3 = insertelement <4 x float> %ret2, float %v3, i32 3
1234  ret <4 x float> %ret3
1235}
1236
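; As var_shuffle_v4f32, but indices above 3 must yield zero lanes: the IR
; saturates out-of-range indices to -1 and selects zeroinitializer for those
; lanes, so targets fold a compare-and-mask around the shuffle.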
1237define <4 x float> @var_shuffle_zero_v4f32(<4 x float> %v, <4 x i32> %indices) nounwind {
1238; SSE3-LABEL: var_shuffle_zero_v4f32:
1239; SSE3:       # %bb.0:
1240; SSE3-NEXT:    movaps %xmm0, %xmm2
1241; SSE3-NEXT:    movdqa {{.*#+}} xmm0 = [2147483648,2147483648,2147483648,2147483648]
1242; SSE3-NEXT:    pxor %xmm1, %xmm0
1243; SSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1244; SSE3-NEXT:    por %xmm0, %xmm1
1245; SSE3-NEXT:    movd %xmm1, %eax
1246; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,1,1]
1247; SSE3-NEXT:    movd %xmm3, %ecx
1248; SSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,2,3]
1249; SSE3-NEXT:    movd %xmm3, %edx
1250; SSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,3,3,3]
1251; SSE3-NEXT:    movd %xmm1, %esi
1252; SSE3-NEXT:    movaps %xmm2, -24(%rsp)
1253; SSE3-NEXT:    andl $3, %eax
1254; SSE3-NEXT:    andl $3, %ecx
1255; SSE3-NEXT:    andl $3, %edx
1256; SSE3-NEXT:    andl $3, %esi
1257; SSE3-NEXT:    movd -24(%rsp,%rsi,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
1258; SSE3-NEXT:    movd -24(%rsp,%rdx,4), %xmm2 # xmm2 = mem[0],zero,zero,zero
1259; SSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1260; SSE3-NEXT:    movd -24(%rsp,%rax,4), %xmm1 # xmm1 = mem[0],zero,zero,zero
1261; SSE3-NEXT:    movd -24(%rsp,%rcx,4), %xmm3 # xmm3 = mem[0],zero,zero,zero
1262; SSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1263; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1264; SSE3-NEXT:    pandn %xmm1, %xmm0
1265; SSE3-NEXT:    retq
1266;
1267; SSSE3-LABEL: var_shuffle_zero_v4f32:
1268; SSSE3:       # %bb.0:
1269; SSSE3-NEXT:    movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
1270; SSSE3-NEXT:    pxor %xmm1, %xmm2
1271; SSSE3-NEXT:    pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1272; SSSE3-NEXT:    por %xmm2, %xmm1
1273; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [67372036,67372036,67372036,67372036]
1274; SSSE3-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
1275; SSSE3-NEXT:    pmuludq %xmm3, %xmm1
1276; SSSE3-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1277; SSSE3-NEXT:    pmuludq %xmm3, %xmm4
1278; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3]
1279; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
1280; SSSE3-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1281; SSSE3-NEXT:    por %xmm2, %xmm1
1282; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1283; SSSE3-NEXT:    retq
1284;
1285; SSE41-LABEL: var_shuffle_zero_v4f32:
1286; SSE41:       # %bb.0:
1287; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm2 = [4,4,4,4]
1288; SSE41-NEXT:    pmaxud %xmm1, %xmm2
1289; SSE41-NEXT:    pcmpeqd %xmm1, %xmm2
1290; SSE41-NEXT:    por %xmm2, %xmm1
1291; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1292; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1293; SSE41-NEXT:    por %xmm2, %xmm1
1294; SSE41-NEXT:    pshufb %xmm1, %xmm0
1295; SSE41-NEXT:    retq
1296;
1297; XOP-LABEL: var_shuffle_zero_v4f32:
1298; XOP:       # %bb.0:
1299; XOP-NEXT:    vpcomgtud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1300; XOP-NEXT:    vpor %xmm1, %xmm2, %xmm1
1301; XOP-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1302; XOP-NEXT:    vpandn %xmm0, %xmm2, %xmm0
1303; XOP-NEXT:    retq
1304;
1305; AVX1-LABEL: var_shuffle_zero_v4f32:
1306; AVX1:       # %bb.0:
1307; AVX1-NEXT:    vpmaxud {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2
1308; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
1309; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
1310; AVX1-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1311; AVX1-NEXT:    vpandn %xmm0, %xmm2, %xmm0
1312; AVX1-NEXT:    retq
1313;
1314; AVX2-LABEL: var_shuffle_zero_v4f32:
1315; AVX2:       # %bb.0:
1316; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [4,4,4,4]
1317; AVX2-NEXT:    vpmaxud %xmm2, %xmm1, %xmm2
1318; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm2
1319; AVX2-NEXT:    vpor %xmm1, %xmm2, %xmm1
1320; AVX2-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1321; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
1322; AVX2-NEXT:    retq
1323;
1324; AVX512-LABEL: var_shuffle_zero_v4f32:
1325; AVX512:       # %bb.0:
1326; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1327; AVX512-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %k1
1328; AVX512-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1329; AVX512-NEXT:    vmovdqa32 %zmm2, %zmm1 {%k1}
1330; AVX512-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1331; AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1332; AVX512-NEXT:    vmovaps %zmm1, %zmm0 {%k1}
1333; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1334; AVX512-NEXT:    vzeroupper
1335; AVX512-NEXT:    retq
1336;
1337; AVX512VL-LABEL: var_shuffle_zero_v4f32:
1338; AVX512VL:       # %bb.0:
1339; AVX512VL-NEXT:    vpcmpnleud {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %k1
1340; AVX512VL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1341; AVX512VL-NEXT:    vmovdqa32 %xmm2, %xmm1 {%k1}
1342; AVX512VL-NEXT:    vpermilps %xmm1, %xmm0, %xmm0
1343; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1344; AVX512VL-NEXT:    vmovaps %xmm1, %xmm0 {%k1}
1345; AVX512VL-NEXT:    retq
1346  %cmp = icmp ugt <4 x i32> %indices, <i32 3, i32 3, i32 3, i32 3>
1347  %or = select <4 x i1> %cmp, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32> %indices
1348  %idx0 = extractelement <4 x i32> %or, i64 0
1349  %idx1 = extractelement <4 x i32> %or, i64 1
1350  %idx2 = extractelement <4 x i32> %or, i64 2
1351  %idx3 = extractelement <4 x i32> %or, i64 3
1352  %elt0 = extractelement <4 x float> %v, i32 %idx0
1353  %elt1 = extractelement <4 x float> %v, i32 %idx1
1354  %elt2 = extractelement <4 x float> %v, i32 %idx2
1355  %elt3 = extractelement <4 x float> %v, i32 %idx3
1356  %vec0 = insertelement <4 x float> poison, float %elt0, i64 0
1357  %vec1 = insertelement <4 x float> %vec0, float %elt1, i64 1
1358  %vec2 = insertelement <4 x float> %vec1, float %elt2, i64 2
1359  %vec3 = insertelement <4 x float> %vec2, float %elt3, i64 3
1360  %res = select <4 x i1> %cmp, <4 x float> zeroinitializer, <4 x float> %vec3
1361  ret <4 x float> %res
1362}
1363
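; 16-byte variable shuffle that only consumes the low 16 lanes of a <32 x i8>
; index vector; on SSSE3 and later this folds to a single (v)pshufb.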
1364define <16 x i8> @var_shuffle_v16i8_from_v16i8_v32i8(<16 x i8> %v, <32 x i8> %indices) nounwind {
1365; SSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
1366; SSE3:       # %bb.0:
1367; SSE3-NEXT:    movaps %xmm1, -40(%rsp)
1368; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
1369; SSE3-NEXT:    movzbl -25(%rsp), %eax
1370; SSE3-NEXT:    andl $15, %eax
1371; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1372; SSE3-NEXT:    movd %eax, %xmm1
1373; SSE3-NEXT:    movzbl -26(%rsp), %eax
1374; SSE3-NEXT:    andl $15, %eax
1375; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1376; SSE3-NEXT:    movd %eax, %xmm2
1377; SSE3-NEXT:    movzbl -27(%rsp), %eax
1378; SSE3-NEXT:    andl $15, %eax
1379; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1380; SSE3-NEXT:    movd %eax, %xmm4
1381; SSE3-NEXT:    movzbl -28(%rsp), %eax
1382; SSE3-NEXT:    andl $15, %eax
1383; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1384; SSE3-NEXT:    movd %eax, %xmm3
1385; SSE3-NEXT:    movzbl -29(%rsp), %eax
1386; SSE3-NEXT:    andl $15, %eax
1387; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1388; SSE3-NEXT:    movd %eax, %xmm6
1389; SSE3-NEXT:    movzbl -30(%rsp), %eax
1390; SSE3-NEXT:    andl $15, %eax
1391; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1392; SSE3-NEXT:    movd %eax, %xmm7
1393; SSE3-NEXT:    movzbl -31(%rsp), %eax
1394; SSE3-NEXT:    andl $15, %eax
1395; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1396; SSE3-NEXT:    movd %eax, %xmm8
1397; SSE3-NEXT:    movzbl -32(%rsp), %eax
1398; SSE3-NEXT:    andl $15, %eax
1399; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1400; SSE3-NEXT:    movd %eax, %xmm5
1401; SSE3-NEXT:    movzbl -33(%rsp), %eax
1402; SSE3-NEXT:    andl $15, %eax
1403; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1404; SSE3-NEXT:    movd %eax, %xmm9
1405; SSE3-NEXT:    movzbl -34(%rsp), %eax
1406; SSE3-NEXT:    andl $15, %eax
1407; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1408; SSE3-NEXT:    movd %eax, %xmm10
1409; SSE3-NEXT:    movzbl -35(%rsp), %eax
1410; SSE3-NEXT:    andl $15, %eax
1411; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1412; SSE3-NEXT:    movd %eax, %xmm12
1413; SSE3-NEXT:    movzbl -36(%rsp), %eax
1414; SSE3-NEXT:    andl $15, %eax
1415; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1416; SSE3-NEXT:    movd %eax, %xmm11
1417; SSE3-NEXT:    movzbl -37(%rsp), %eax
1418; SSE3-NEXT:    andl $15, %eax
1419; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1420; SSE3-NEXT:    movd %eax, %xmm13
1421; SSE3-NEXT:    movzbl -38(%rsp), %eax
1422; SSE3-NEXT:    andl $15, %eax
1423; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1424; SSE3-NEXT:    movd %eax, %xmm14
1425; SSE3-NEXT:    movzbl -39(%rsp), %eax
1426; SSE3-NEXT:    andl $15, %eax
1427; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1428; SSE3-NEXT:    movd %eax, %xmm15
1429; SSE3-NEXT:    movzbl -40(%rsp), %eax
1430; SSE3-NEXT:    andl $15, %eax
1431; SSE3-NEXT:    movzbl -24(%rsp,%rax), %eax
1432; SSE3-NEXT:    movd %eax, %xmm0
1433; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1434; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1435; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1436; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
1437; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1438; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
1439; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
1440; SSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
1441; SSE3-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
1442; SSE3-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1443; SSE3-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
1444; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
1445; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
1446; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
1447; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1448; SSE3-NEXT:    retq
1449;
1450; SSSE3-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
1451; SSSE3:       # %bb.0:
1452; SSSE3-NEXT:    pshufb %xmm1, %xmm0
1453; SSSE3-NEXT:    retq
1454;
1455; SSE41-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
1456; SSE41:       # %bb.0:
1457; SSE41-NEXT:    pshufb %xmm1, %xmm0
1458; SSE41-NEXT:    retq
1459;
1460; AVX-LABEL: var_shuffle_v16i8_from_v16i8_v32i8:
1461; AVX:       # %bb.0:
1462; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1463; AVX-NEXT:    vzeroupper
1464; AVX-NEXT:    retq
1465  %index0 = extractelement <32 x i8> %indices, i32 0
1466  %index1 = extractelement <32 x i8> %indices, i32 1
1467  %index2 = extractelement <32 x i8> %indices, i32 2
1468  %index3 = extractelement <32 x i8> %indices, i32 3
1469  %index4 = extractelement <32 x i8> %indices, i32 4
1470  %index5 = extractelement <32 x i8> %indices, i32 5
1471  %index6 = extractelement <32 x i8> %indices, i32 6
1472  %index7 = extractelement <32 x i8> %indices, i32 7
1473  %index8 = extractelement <32 x i8> %indices, i32 8
1474  %index9 = extractelement <32 x i8> %indices, i32 9
1475  %index10 = extractelement <32 x i8> %indices, i32 10
1476  %index11 = extractelement <32 x i8> %indices, i32 11
1477  %index12 = extractelement <32 x i8> %indices, i32 12
1478  %index13 = extractelement <32 x i8> %indices, i32 13
1479  %index14 = extractelement <32 x i8> %indices, i32 14
1480  %index15 = extractelement <32 x i8> %indices, i32 15
1481  %v0 = extractelement <16 x i8> %v, i8 %index0
1482  %v1 = extractelement <16 x i8> %v, i8 %index1
1483  %v2 = extractelement <16 x i8> %v, i8 %index2
1484  %v3 = extractelement <16 x i8> %v, i8 %index3
1485  %v4 = extractelement <16 x i8> %v, i8 %index4
1486  %v5 = extractelement <16 x i8> %v, i8 %index5
1487  %v6 = extractelement <16 x i8> %v, i8 %index6
1488  %v7 = extractelement <16 x i8> %v, i8 %index7
1489  %v8 = extractelement <16 x i8> %v, i8 %index8
1490  %v9 = extractelement <16 x i8> %v, i8 %index9
1491  %v10 = extractelement <16 x i8> %v, i8 %index10
1492  %v11 = extractelement <16 x i8> %v, i8 %index11
1493  %v12 = extractelement <16 x i8> %v, i8 %index12
1494  %v13 = extractelement <16 x i8> %v, i8 %index13
1495  %v14 = extractelement <16 x i8> %v, i8 %index14
1496  %v15 = extractelement <16 x i8> %v, i8 %index15
1497  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
1498  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
1499  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
1500  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
1501  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
1502  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
1503  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
1504  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
1505  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
1506  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
1507  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
1508  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
1509  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
1510  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
1511  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
1512  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
1513  ret <16 x i8> %ret15
1514}
1515
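; Selects 16 bytes out of a <32 x i8> source. SSE targets spill 16 copies of
; the source and gather byte-by-byte; AVX1/AVX2/AVX512F shuffle both halves
; with vpshufb and select per byte via an index compare; AVX512VBMI+VL lowers
; to a single cross-lane vpermb.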
1516define <16 x i8> @var_shuffle_v16i8_from_v32i8_v16i8(<32 x i8> %v, <16 x i8> %indices) nounwind {
1517; SSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
1518; SSE3:       # %bb.0:
1519; SSE3-NEXT:    pushq %rbp
1520; SSE3-NEXT:    pushq %r15
1521; SSE3-NEXT:    pushq %r14
1522; SSE3-NEXT:    pushq %r13
1523; SSE3-NEXT:    pushq %r12
1524; SSE3-NEXT:    pushq %rbx
1525; SSE3-NEXT:    subq $424, %rsp # imm = 0x1A8
1526; SSE3-NEXT:    movaps %xmm2, -128(%rsp)
1527; SSE3-NEXT:    movaps %xmm1, 400(%rsp)
1528; SSE3-NEXT:    movaps %xmm0, 384(%rsp)
1529; SSE3-NEXT:    movzbl -128(%rsp), %eax
1530; SSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1531; SSE3-NEXT:    movaps %xmm1, 368(%rsp)
1532; SSE3-NEXT:    movaps %xmm0, 352(%rsp)
1533; SSE3-NEXT:    movzbl -127(%rsp), %ecx
1534; SSE3-NEXT:    movaps %xmm1, 336(%rsp)
1535; SSE3-NEXT:    movaps %xmm0, 320(%rsp)
1536; SSE3-NEXT:    movzbl -126(%rsp), %edx
1537; SSE3-NEXT:    movaps %xmm1, 304(%rsp)
1538; SSE3-NEXT:    movaps %xmm0, 288(%rsp)
1539; SSE3-NEXT:    movzbl -125(%rsp), %esi
1540; SSE3-NEXT:    movaps %xmm1, 272(%rsp)
1541; SSE3-NEXT:    movaps %xmm0, 256(%rsp)
1542; SSE3-NEXT:    movzbl -124(%rsp), %edi
1543; SSE3-NEXT:    movaps %xmm1, 240(%rsp)
1544; SSE3-NEXT:    movaps %xmm0, 224(%rsp)
1545; SSE3-NEXT:    movzbl -123(%rsp), %r8d
1546; SSE3-NEXT:    movaps %xmm1, 208(%rsp)
1547; SSE3-NEXT:    movaps %xmm0, 192(%rsp)
1548; SSE3-NEXT:    movzbl -122(%rsp), %r9d
1549; SSE3-NEXT:    movaps %xmm1, 176(%rsp)
1550; SSE3-NEXT:    movaps %xmm0, 160(%rsp)
1551; SSE3-NEXT:    movzbl -121(%rsp), %r10d
1552; SSE3-NEXT:    movaps %xmm1, 144(%rsp)
1553; SSE3-NEXT:    movaps %xmm0, 128(%rsp)
1554; SSE3-NEXT:    movzbl -120(%rsp), %r11d
1555; SSE3-NEXT:    movaps %xmm1, 112(%rsp)
1556; SSE3-NEXT:    movaps %xmm0, 96(%rsp)
1557; SSE3-NEXT:    movzbl -119(%rsp), %ebx
1558; SSE3-NEXT:    movaps %xmm1, 80(%rsp)
1559; SSE3-NEXT:    movaps %xmm0, 64(%rsp)
1560; SSE3-NEXT:    movzbl -118(%rsp), %r14d
1561; SSE3-NEXT:    movaps %xmm1, 48(%rsp)
1562; SSE3-NEXT:    movaps %xmm0, 32(%rsp)
1563; SSE3-NEXT:    movzbl -117(%rsp), %r15d
1564; SSE3-NEXT:    movaps %xmm1, 16(%rsp)
1565; SSE3-NEXT:    movaps %xmm0, (%rsp)
1566; SSE3-NEXT:    movzbl -116(%rsp), %r12d
1567; SSE3-NEXT:    movaps %xmm1, -16(%rsp)
1568; SSE3-NEXT:    movaps %xmm0, -32(%rsp)
1569; SSE3-NEXT:    movzbl -115(%rsp), %r13d
1570; SSE3-NEXT:    movaps %xmm1, -48(%rsp)
1571; SSE3-NEXT:    movaps %xmm0, -64(%rsp)
1572; SSE3-NEXT:    movzbl -114(%rsp), %ebp
1573; SSE3-NEXT:    movaps %xmm1, -80(%rsp)
1574; SSE3-NEXT:    movaps %xmm0, -96(%rsp)
1575; SSE3-NEXT:    movzbl -113(%rsp), %eax
1576; SSE3-NEXT:    andl $31, %eax
1577; SSE3-NEXT:    movzbl -96(%rsp,%rax), %eax
1578; SSE3-NEXT:    movd %eax, %xmm1
1579; SSE3-NEXT:    andl $31, %ebp
1580; SSE3-NEXT:    movzbl -64(%rsp,%rbp), %eax
1581; SSE3-NEXT:    movd %eax, %xmm2
1582; SSE3-NEXT:    andl $31, %r13d
1583; SSE3-NEXT:    movzbl -32(%rsp,%r13), %eax
1584; SSE3-NEXT:    movd %eax, %xmm4
1585; SSE3-NEXT:    andl $31, %r12d
1586; SSE3-NEXT:    movzbl (%rsp,%r12), %eax
1587; SSE3-NEXT:    movd %eax, %xmm3
1588; SSE3-NEXT:    andl $31, %r15d
1589; SSE3-NEXT:    movzbl 32(%rsp,%r15), %eax
1590; SSE3-NEXT:    movd %eax, %xmm6
1591; SSE3-NEXT:    andl $31, %r14d
1592; SSE3-NEXT:    movzbl 64(%rsp,%r14), %eax
1593; SSE3-NEXT:    movd %eax, %xmm7
1594; SSE3-NEXT:    andl $31, %ebx
1595; SSE3-NEXT:    movzbl 96(%rsp,%rbx), %eax
1596; SSE3-NEXT:    movd %eax, %xmm8
1597; SSE3-NEXT:    andl $31, %r11d
1598; SSE3-NEXT:    movzbl 128(%rsp,%r11), %eax
1599; SSE3-NEXT:    movd %eax, %xmm5
1600; SSE3-NEXT:    andl $31, %r10d
1601; SSE3-NEXT:    movzbl 160(%rsp,%r10), %eax
1602; SSE3-NEXT:    movd %eax, %xmm9
1603; SSE3-NEXT:    andl $31, %r9d
1604; SSE3-NEXT:    movzbl 192(%rsp,%r9), %eax
1605; SSE3-NEXT:    movd %eax, %xmm10
1606; SSE3-NEXT:    andl $31, %r8d
1607; SSE3-NEXT:    movzbl 224(%rsp,%r8), %eax
1608; SSE3-NEXT:    movd %eax, %xmm12
1609; SSE3-NEXT:    andl $31, %edi
1610; SSE3-NEXT:    movzbl 256(%rsp,%rdi), %eax
1611; SSE3-NEXT:    movd %eax, %xmm11
1612; SSE3-NEXT:    andl $31, %esi
1613; SSE3-NEXT:    movzbl 288(%rsp,%rsi), %eax
1614; SSE3-NEXT:    movd %eax, %xmm13
1615; SSE3-NEXT:    andl $31, %edx
1616; SSE3-NEXT:    movzbl 320(%rsp,%rdx), %eax
1617; SSE3-NEXT:    movd %eax, %xmm14
1618; SSE3-NEXT:    andl $31, %ecx
1619; SSE3-NEXT:    movzbl 352(%rsp,%rcx), %eax
1620; SSE3-NEXT:    movd %eax, %xmm15
1621; SSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1622; SSE3-NEXT:    andl $31, %eax
1623; SSE3-NEXT:    movzbl 384(%rsp,%rax), %eax
1624; SSE3-NEXT:    movd %eax, %xmm0
1625; SSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1626; SSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1627; SSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1628; SSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
1629; SSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1630; SSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
1631; SSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
1632; SSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
1633; SSE3-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
1634; SSE3-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1635; SSE3-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
1636; SSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
1637; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
1638; SSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
1639; SSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1640; SSE3-NEXT:    addq $424, %rsp # imm = 0x1A8
1641; SSE3-NEXT:    popq %rbx
1642; SSE3-NEXT:    popq %r12
1643; SSE3-NEXT:    popq %r13
1644; SSE3-NEXT:    popq %r14
1645; SSE3-NEXT:    popq %r15
1646; SSE3-NEXT:    popq %rbp
1647; SSE3-NEXT:    retq
1648;
1649; SSSE3-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
1650; SSSE3:       # %bb.0:
1651; SSSE3-NEXT:    pushq %rbp
1652; SSSE3-NEXT:    pushq %r15
1653; SSSE3-NEXT:    pushq %r14
1654; SSSE3-NEXT:    pushq %r13
1655; SSSE3-NEXT:    pushq %r12
1656; SSSE3-NEXT:    pushq %rbx
1657; SSSE3-NEXT:    subq $424, %rsp # imm = 0x1A8
1658; SSSE3-NEXT:    movaps %xmm2, -128(%rsp)
1659; SSSE3-NEXT:    movaps %xmm1, 400(%rsp)
1660; SSSE3-NEXT:    movaps %xmm0, 384(%rsp)
1661; SSSE3-NEXT:    movzbl -128(%rsp), %eax
1662; SSSE3-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
1663; SSSE3-NEXT:    movaps %xmm1, 368(%rsp)
1664; SSSE3-NEXT:    movaps %xmm0, 352(%rsp)
1665; SSSE3-NEXT:    movzbl -127(%rsp), %ecx
1666; SSSE3-NEXT:    movaps %xmm1, 336(%rsp)
1667; SSSE3-NEXT:    movaps %xmm0, 320(%rsp)
1668; SSSE3-NEXT:    movzbl -126(%rsp), %edx
1669; SSSE3-NEXT:    movaps %xmm1, 304(%rsp)
1670; SSSE3-NEXT:    movaps %xmm0, 288(%rsp)
1671; SSSE3-NEXT:    movzbl -125(%rsp), %esi
1672; SSSE3-NEXT:    movaps %xmm1, 272(%rsp)
1673; SSSE3-NEXT:    movaps %xmm0, 256(%rsp)
1674; SSSE3-NEXT:    movzbl -124(%rsp), %edi
1675; SSSE3-NEXT:    movaps %xmm1, 240(%rsp)
1676; SSSE3-NEXT:    movaps %xmm0, 224(%rsp)
1677; SSSE3-NEXT:    movzbl -123(%rsp), %r8d
1678; SSSE3-NEXT:    movaps %xmm1, 208(%rsp)
1679; SSSE3-NEXT:    movaps %xmm0, 192(%rsp)
1680; SSSE3-NEXT:    movzbl -122(%rsp), %r9d
1681; SSSE3-NEXT:    movaps %xmm1, 176(%rsp)
1682; SSSE3-NEXT:    movaps %xmm0, 160(%rsp)
1683; SSSE3-NEXT:    movzbl -121(%rsp), %r10d
1684; SSSE3-NEXT:    movaps %xmm1, 144(%rsp)
1685; SSSE3-NEXT:    movaps %xmm0, 128(%rsp)
1686; SSSE3-NEXT:    movzbl -120(%rsp), %r11d
1687; SSSE3-NEXT:    movaps %xmm1, 112(%rsp)
1688; SSSE3-NEXT:    movaps %xmm0, 96(%rsp)
1689; SSSE3-NEXT:    movzbl -119(%rsp), %ebx
1690; SSSE3-NEXT:    movaps %xmm1, 80(%rsp)
1691; SSSE3-NEXT:    movaps %xmm0, 64(%rsp)
1692; SSSE3-NEXT:    movzbl -118(%rsp), %r14d
1693; SSSE3-NEXT:    movaps %xmm1, 48(%rsp)
1694; SSSE3-NEXT:    movaps %xmm0, 32(%rsp)
1695; SSSE3-NEXT:    movzbl -117(%rsp), %r15d
1696; SSSE3-NEXT:    movaps %xmm1, 16(%rsp)
1697; SSSE3-NEXT:    movaps %xmm0, (%rsp)
1698; SSSE3-NEXT:    movzbl -116(%rsp), %r12d
1699; SSSE3-NEXT:    movaps %xmm1, -16(%rsp)
1700; SSSE3-NEXT:    movaps %xmm0, -32(%rsp)
1701; SSSE3-NEXT:    movzbl -115(%rsp), %r13d
1702; SSSE3-NEXT:    movaps %xmm1, -48(%rsp)
1703; SSSE3-NEXT:    movaps %xmm0, -64(%rsp)
1704; SSSE3-NEXT:    movzbl -114(%rsp), %ebp
1705; SSSE3-NEXT:    movaps %xmm1, -80(%rsp)
1706; SSSE3-NEXT:    movaps %xmm0, -96(%rsp)
1707; SSSE3-NEXT:    movzbl -113(%rsp), %eax
1708; SSSE3-NEXT:    andl $31, %eax
1709; SSSE3-NEXT:    movzbl -96(%rsp,%rax), %eax
1710; SSSE3-NEXT:    movd %eax, %xmm1
1711; SSSE3-NEXT:    andl $31, %ebp
1712; SSSE3-NEXT:    movzbl -64(%rsp,%rbp), %eax
1713; SSSE3-NEXT:    movd %eax, %xmm2
1714; SSSE3-NEXT:    andl $31, %r13d
1715; SSSE3-NEXT:    movzbl -32(%rsp,%r13), %eax
1716; SSSE3-NEXT:    movd %eax, %xmm4
1717; SSSE3-NEXT:    andl $31, %r12d
1718; SSSE3-NEXT:    movzbl (%rsp,%r12), %eax
1719; SSSE3-NEXT:    movd %eax, %xmm3
1720; SSSE3-NEXT:    andl $31, %r15d
1721; SSSE3-NEXT:    movzbl 32(%rsp,%r15), %eax
1722; SSSE3-NEXT:    movd %eax, %xmm6
1723; SSSE3-NEXT:    andl $31, %r14d
1724; SSSE3-NEXT:    movzbl 64(%rsp,%r14), %eax
1725; SSSE3-NEXT:    movd %eax, %xmm7
1726; SSSE3-NEXT:    andl $31, %ebx
1727; SSSE3-NEXT:    movzbl 96(%rsp,%rbx), %eax
1728; SSSE3-NEXT:    movd %eax, %xmm8
1729; SSSE3-NEXT:    andl $31, %r11d
1730; SSSE3-NEXT:    movzbl 128(%rsp,%r11), %eax
1731; SSSE3-NEXT:    movd %eax, %xmm5
1732; SSSE3-NEXT:    andl $31, %r10d
1733; SSSE3-NEXT:    movzbl 160(%rsp,%r10), %eax
1734; SSSE3-NEXT:    movd %eax, %xmm9
1735; SSSE3-NEXT:    andl $31, %r9d
1736; SSSE3-NEXT:    movzbl 192(%rsp,%r9), %eax
1737; SSSE3-NEXT:    movd %eax, %xmm10
1738; SSSE3-NEXT:    andl $31, %r8d
1739; SSSE3-NEXT:    movzbl 224(%rsp,%r8), %eax
1740; SSSE3-NEXT:    movd %eax, %xmm12
1741; SSSE3-NEXT:    andl $31, %edi
1742; SSSE3-NEXT:    movzbl 256(%rsp,%rdi), %eax
1743; SSSE3-NEXT:    movd %eax, %xmm11
1744; SSSE3-NEXT:    andl $31, %esi
1745; SSSE3-NEXT:    movzbl 288(%rsp,%rsi), %eax
1746; SSSE3-NEXT:    movd %eax, %xmm13
1747; SSSE3-NEXT:    andl $31, %edx
1748; SSSE3-NEXT:    movzbl 320(%rsp,%rdx), %eax
1749; SSSE3-NEXT:    movd %eax, %xmm14
1750; SSSE3-NEXT:    andl $31, %ecx
1751; SSSE3-NEXT:    movzbl 352(%rsp,%rcx), %eax
1752; SSSE3-NEXT:    movd %eax, %xmm15
1753; SSSE3-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
1754; SSSE3-NEXT:    andl $31, %eax
1755; SSSE3-NEXT:    movzbl 384(%rsp,%rax), %eax
1756; SSSE3-NEXT:    movd %eax, %xmm0
1757; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
1758; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
1759; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
1760; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
1761; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm8[0],xmm5[1],xmm8[1],xmm5[2],xmm8[2],xmm5[3],xmm8[3],xmm5[4],xmm8[4],xmm5[5],xmm8[5],xmm5[6],xmm8[6],xmm5[7],xmm8[7]
1762; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3]
1763; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
1764; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
1765; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
1766; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3]
1767; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
1768; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3],xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
1769; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3]
1770; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1]
1771; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm5[0]
1772; SSSE3-NEXT:    addq $424, %rsp # imm = 0x1A8
1773; SSSE3-NEXT:    popq %rbx
1774; SSSE3-NEXT:    popq %r12
1775; SSSE3-NEXT:    popq %r13
1776; SSSE3-NEXT:    popq %r14
1777; SSSE3-NEXT:    popq %r15
1778; SSSE3-NEXT:    popq %rbp
1779; SSSE3-NEXT:    retq
1780;
1781; SSE41-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
1782; SSE41:       # %bb.0:
1783; SSE41-NEXT:    subq $392, %rsp # imm = 0x188
1784; SSE41-NEXT:    movd %xmm2, %eax
1785; SSE41-NEXT:    movaps %xmm1, 368(%rsp)
1786; SSE41-NEXT:    movaps %xmm0, 352(%rsp)
1787; SSE41-NEXT:    andl $31, %eax
1788; SSE41-NEXT:    movaps %xmm1, 336(%rsp)
1789; SSE41-NEXT:    movaps %xmm0, 320(%rsp)
1790; SSE41-NEXT:    movaps %xmm1, 304(%rsp)
1791; SSE41-NEXT:    movaps %xmm0, 288(%rsp)
1792; SSE41-NEXT:    movaps %xmm1, 272(%rsp)
1793; SSE41-NEXT:    movaps %xmm0, 256(%rsp)
1794; SSE41-NEXT:    movaps %xmm1, 240(%rsp)
1795; SSE41-NEXT:    movaps %xmm0, 224(%rsp)
1796; SSE41-NEXT:    movaps %xmm1, 208(%rsp)
1797; SSE41-NEXT:    movaps %xmm0, 192(%rsp)
1798; SSE41-NEXT:    movaps %xmm1, 176(%rsp)
1799; SSE41-NEXT:    movaps %xmm0, 160(%rsp)
1800; SSE41-NEXT:    movaps %xmm1, 144(%rsp)
1801; SSE41-NEXT:    movaps %xmm0, 128(%rsp)
1802; SSE41-NEXT:    movaps %xmm1, 112(%rsp)
1803; SSE41-NEXT:    movaps %xmm0, 96(%rsp)
1804; SSE41-NEXT:    movaps %xmm1, 80(%rsp)
1805; SSE41-NEXT:    movaps %xmm0, 64(%rsp)
1806; SSE41-NEXT:    movaps %xmm1, 48(%rsp)
1807; SSE41-NEXT:    movaps %xmm0, 32(%rsp)
1808; SSE41-NEXT:    movaps %xmm1, 16(%rsp)
1809; SSE41-NEXT:    movaps %xmm0, (%rsp)
1810; SSE41-NEXT:    movaps %xmm1, -16(%rsp)
1811; SSE41-NEXT:    movaps %xmm0, -32(%rsp)
1812; SSE41-NEXT:    movaps %xmm1, -48(%rsp)
1813; SSE41-NEXT:    movaps %xmm0, -64(%rsp)
1814; SSE41-NEXT:    movaps %xmm1, -80(%rsp)
1815; SSE41-NEXT:    movaps %xmm0, -96(%rsp)
1816; SSE41-NEXT:    movaps %xmm1, -112(%rsp)
1817; SSE41-NEXT:    movaps %xmm0, -128(%rsp)
1818; SSE41-NEXT:    movzbl 352(%rsp,%rax), %eax
1819; SSE41-NEXT:    movd %eax, %xmm0
1820; SSE41-NEXT:    pextrb $1, %xmm2, %eax
1821; SSE41-NEXT:    andl $31, %eax
1822; SSE41-NEXT:    pinsrb $1, 320(%rsp,%rax), %xmm0
1823; SSE41-NEXT:    pextrb $2, %xmm2, %eax
1824; SSE41-NEXT:    andl $31, %eax
1825; SSE41-NEXT:    pinsrb $2, 288(%rsp,%rax), %xmm0
1826; SSE41-NEXT:    pextrb $3, %xmm2, %eax
1827; SSE41-NEXT:    andl $31, %eax
1828; SSE41-NEXT:    pinsrb $3, 256(%rsp,%rax), %xmm0
1829; SSE41-NEXT:    pextrb $4, %xmm2, %eax
1830; SSE41-NEXT:    andl $31, %eax
1831; SSE41-NEXT:    pinsrb $4, 224(%rsp,%rax), %xmm0
1832; SSE41-NEXT:    pextrb $5, %xmm2, %eax
1833; SSE41-NEXT:    andl $31, %eax
1834; SSE41-NEXT:    pinsrb $5, 192(%rsp,%rax), %xmm0
1835; SSE41-NEXT:    pextrb $6, %xmm2, %eax
1836; SSE41-NEXT:    andl $31, %eax
1837; SSE41-NEXT:    pinsrb $6, 160(%rsp,%rax), %xmm0
1838; SSE41-NEXT:    pextrb $7, %xmm2, %eax
1839; SSE41-NEXT:    andl $31, %eax
1840; SSE41-NEXT:    pinsrb $7, 128(%rsp,%rax), %xmm0
1841; SSE41-NEXT:    pextrb $8, %xmm2, %eax
1842; SSE41-NEXT:    andl $31, %eax
1843; SSE41-NEXT:    pinsrb $8, 96(%rsp,%rax), %xmm0
1844; SSE41-NEXT:    pextrb $9, %xmm2, %eax
1845; SSE41-NEXT:    andl $31, %eax
1846; SSE41-NEXT:    pinsrb $9, 64(%rsp,%rax), %xmm0
1847; SSE41-NEXT:    pextrb $10, %xmm2, %eax
1848; SSE41-NEXT:    andl $31, %eax
1849; SSE41-NEXT:    pinsrb $10, 32(%rsp,%rax), %xmm0
1850; SSE41-NEXT:    pextrb $11, %xmm2, %eax
1851; SSE41-NEXT:    andl $31, %eax
1852; SSE41-NEXT:    pinsrb $11, (%rsp,%rax), %xmm0
1853; SSE41-NEXT:    pextrb $12, %xmm2, %eax
1854; SSE41-NEXT:    andl $31, %eax
1855; SSE41-NEXT:    pinsrb $12, -32(%rsp,%rax), %xmm0
1856; SSE41-NEXT:    pextrb $13, %xmm2, %eax
1857; SSE41-NEXT:    andl $31, %eax
1858; SSE41-NEXT:    pinsrb $13, -64(%rsp,%rax), %xmm0
1859; SSE41-NEXT:    pextrb $14, %xmm2, %eax
1860; SSE41-NEXT:    andl $31, %eax
1861; SSE41-NEXT:    pinsrb $14, -96(%rsp,%rax), %xmm0
1862; SSE41-NEXT:    pextrb $15, %xmm2, %eax
1863; SSE41-NEXT:    andl $31, %eax
1864; SSE41-NEXT:    pinsrb $15, -128(%rsp,%rax), %xmm0
1865; SSE41-NEXT:    addq $392, %rsp # imm = 0x188
1866; SSE41-NEXT:    retq
1867;
1868; XOP-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
1869; XOP:       # %bb.0:
1870; XOP-NEXT:    vextractf128 $1, %ymm0, %xmm2
1871; XOP-NEXT:    vpperm %xmm1, %xmm2, %xmm0, %xmm0
1872; XOP-NEXT:    vzeroupper
1873; XOP-NEXT:    retq
1874;
1875; AVX1-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
1876; AVX1:       # %bb.0:
1877; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1878; AVX1-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
1879; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1880; AVX1-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1881; AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1882; AVX1-NEXT:    vzeroupper
1883; AVX1-NEXT:    retq
1884;
1885; AVX2-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
1886; AVX2:       # %bb.0:
1887; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1888; AVX2-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
1889; AVX2-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1890; AVX2-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1891; AVX2-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1892; AVX2-NEXT:    vzeroupper
1893; AVX2-NEXT:    retq
1894;
1895; AVX512-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
1896; AVX512:       # %bb.0:
1897; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm2
1898; AVX512-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
1899; AVX512-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1900; AVX512-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1901; AVX512-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
1902; AVX512-NEXT:    vzeroupper
1903; AVX512-NEXT:    retq
1904;
1905; AVX512VLBW-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
1906; AVX512VLBW:       # %bb.0:
1907; AVX512VLBW-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1908; AVX512VLBW-NEXT:    vextracti128 $1, %ymm0, %xmm2
1909; AVX512VLBW-NEXT:    vpshufb %xmm1, %xmm2, %xmm2
1910; AVX512VLBW-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
1911; AVX512VLBW-NEXT:    vpcmpgtb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %k1
1912; AVX512VLBW-NEXT:    vmovdqu8 %ymm2, %ymm0 {%k1}
1913; AVX512VLBW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1914; AVX512VLBW-NEXT:    vzeroupper
1915; AVX512VLBW-NEXT:    retq
1916;
1917; VLVBMI-LABEL: var_shuffle_v16i8_from_v32i8_v16i8:
1918; VLVBMI:       # %bb.0:
1919; VLVBMI-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
1920; VLVBMI-NEXT:    vpermb %ymm0, %ymm1, %ymm0
1921; VLVBMI-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1922; VLVBMI-NEXT:    vzeroupper
1923; VLVBMI-NEXT:    retq
1924  %index0 = extractelement <16 x i8> %indices, i32 0
1925  %index1 = extractelement <16 x i8> %indices, i32 1
1926  %index2 = extractelement <16 x i8> %indices, i32 2
1927  %index3 = extractelement <16 x i8> %indices, i32 3
1928  %index4 = extractelement <16 x i8> %indices, i32 4
1929  %index5 = extractelement <16 x i8> %indices, i32 5
1930  %index6 = extractelement <16 x i8> %indices, i32 6
1931  %index7 = extractelement <16 x i8> %indices, i32 7
1932  %index8 = extractelement <16 x i8> %indices, i32 8
1933  %index9 = extractelement <16 x i8> %indices, i32 9
1934  %index10 = extractelement <16 x i8> %indices, i32 10
1935  %index11 = extractelement <16 x i8> %indices, i32 11
1936  %index12 = extractelement <16 x i8> %indices, i32 12
1937  %index13 = extractelement <16 x i8> %indices, i32 13
1938  %index14 = extractelement <16 x i8> %indices, i32 14
1939  %index15 = extractelement <16 x i8> %indices, i32 15
1940  %v0 = extractelement <32 x i8> %v, i8 %index0
1941  %v1 = extractelement <32 x i8> %v, i8 %index1
1942  %v2 = extractelement <32 x i8> %v, i8 %index2
1943  %v3 = extractelement <32 x i8> %v, i8 %index3
1944  %v4 = extractelement <32 x i8> %v, i8 %index4
1945  %v5 = extractelement <32 x i8> %v, i8 %index5
1946  %v6 = extractelement <32 x i8> %v, i8 %index6
1947  %v7 = extractelement <32 x i8> %v, i8 %index7
1948  %v8 = extractelement <32 x i8> %v, i8 %index8
1949  %v9 = extractelement <32 x i8> %v, i8 %index9
1950  %v10 = extractelement <32 x i8> %v, i8 %index10
1951  %v11 = extractelement <32 x i8> %v, i8 %index11
1952  %v12 = extractelement <32 x i8> %v, i8 %index12
1953  %v13 = extractelement <32 x i8> %v, i8 %index13
1954  %v14 = extractelement <32 x i8> %v, i8 %index14
1955  %v15 = extractelement <32 x i8> %v, i8 %index15
1956  %ret0 = insertelement <16 x i8> undef, i8 %v0, i32 0
1957  %ret1 = insertelement <16 x i8> %ret0, i8 %v1, i32 1
1958  %ret2 = insertelement <16 x i8> %ret1, i8 %v2, i32 2
1959  %ret3 = insertelement <16 x i8> %ret2, i8 %v3, i32 3
1960  %ret4 = insertelement <16 x i8> %ret3, i8 %v4, i32 4
1961  %ret5 = insertelement <16 x i8> %ret4, i8 %v5, i32 5
1962  %ret6 = insertelement <16 x i8> %ret5, i8 %v6, i32 6
1963  %ret7 = insertelement <16 x i8> %ret6, i8 %v7, i32 7
1964  %ret8 = insertelement <16 x i8> %ret7, i8 %v8, i32 8
1965  %ret9 = insertelement <16 x i8> %ret8, i8 %v9, i32 9
1966  %ret10 = insertelement <16 x i8> %ret9, i8 %v10, i32 10
1967  %ret11 = insertelement <16 x i8> %ret10, i8 %v11, i32 11
1968  %ret12 = insertelement <16 x i8> %ret11, i8 %v12, i32 12
1969  %ret13 = insertelement <16 x i8> %ret12, i8 %v13, i32 13
1970  %ret14 = insertelement <16 x i8> %ret13, i8 %v14, i32 14
1971  %ret15 = insertelement <16 x i8> %ret14, i8 %v15, i32 15
1972  ret <16 x i8> %ret15
1973}
1974
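; Reduced case (loads and stores through undef pointers): i32 indices are
; extracted from a bitcast of the <4 x i64> source itself, masked with 7, and
; used to index the i64 vector, exercising index conversion during lowering.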
1975define void @indices_convert() {
1976; SSE3-LABEL: indices_convert:
1977; SSE3:       # %bb.0: # %bb
1978; SSE3-NEXT:    movaps (%rax), %xmm0
1979; SSE3-NEXT:    movaps %xmm0, -24(%rsp)
1980; SSE3-NEXT:    movaps %xmm0, -40(%rsp)
1981; SSE3-NEXT:    movl (%rax), %eax
1982; SSE3-NEXT:    movaps %xmm0, -56(%rsp)
1983; SSE3-NEXT:    movaps %xmm0, -72(%rsp)
1984; SSE3-NEXT:    andl $3, %eax
1985; SSE3-NEXT:    shll $3, %eax
1986; SSE3-NEXT:    movsd -72(%rsp,%rax), %xmm0 # xmm0 = mem[0],zero
1987; SSE3-NEXT:    movsd -40(%rsp,%rax), %xmm1 # xmm1 = mem[0],zero
1988; SSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
1989; SSE3-NEXT:    movups %xmm1, (%rax)
1990; SSE3-NEXT:    retq
1991;
1992; SSSE3-LABEL: indices_convert:
1993; SSSE3:       # %bb.0: # %bb
1994; SSSE3-NEXT:    movaps (%rax), %xmm0
1995; SSSE3-NEXT:    movaps %xmm0, -24(%rsp)
1996; SSSE3-NEXT:    movaps %xmm0, -40(%rsp)
1997; SSSE3-NEXT:    movl (%rax), %eax
1998; SSSE3-NEXT:    movaps %xmm0, -56(%rsp)
1999; SSSE3-NEXT:    movaps %xmm0, -72(%rsp)
2000; SSSE3-NEXT:    andl $3, %eax
2001; SSSE3-NEXT:    shll $3, %eax
2002; SSSE3-NEXT:    movsd -72(%rsp,%rax), %xmm0 # xmm0 = mem[0],zero
2003; SSSE3-NEXT:    movsd -40(%rsp,%rax), %xmm1 # xmm1 = mem[0],zero
2004; SSSE3-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2005; SSSE3-NEXT:    movups %xmm1, (%rax)
2006; SSSE3-NEXT:    retq
2007;
2008; SSE41-LABEL: indices_convert:
2009; SSE41:       # %bb.0: # %bb
2010; SSE41-NEXT:    movaps (%rax), %xmm0
2011; SSE41-NEXT:    extractps $2, %xmm0, %eax
2012; SSE41-NEXT:    movaps %xmm0, -24(%rsp)
2013; SSE41-NEXT:    movaps %xmm0, -40(%rsp)
2014; SSE41-NEXT:    andl $3, %eax
2015; SSE41-NEXT:    extractps $3, %xmm0, %ecx
2016; SSE41-NEXT:    movaps %xmm0, -56(%rsp)
2017; SSE41-NEXT:    movaps %xmm0, -72(%rsp)
2018; SSE41-NEXT:    andl $3, %ecx
2019; SSE41-NEXT:    movsd -72(%rsp,%rcx,8), %xmm0 # xmm0 = mem[0],zero
2020; SSE41-NEXT:    movsd -40(%rsp,%rax,8), %xmm1 # xmm1 = mem[0],zero
2021; SSE41-NEXT:    movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0]
2022; SSE41-NEXT:    movups %xmm1, (%rax)
2023; SSE41-NEXT:    retq
2024;
2025; XOP-LABEL: indices_convert:
2026; XOP:       # %bb.0: # %bb
2027; XOP-NEXT:    vmovdqa (%rax), %xmm0
2028; XOP-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2029; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2030; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
2031; XOP-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
2032; XOP-NEXT:    vpermil2pd $0, %xmm1, %xmm0, %xmm0, %xmm0
2033; XOP-NEXT:    vmovupd %xmm0, (%rax)
2034; XOP-NEXT:    retq
2035;
2036; AVX1-LABEL: indices_convert:
2037; AVX1:       # %bb.0: # %bb
2038; AVX1-NEXT:    vmovdqa (%rax), %xmm0
2039; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
2040; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2041; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
2042; AVX1-NEXT:    vpaddq %xmm1, %xmm1, %xmm1
2043; AVX1-NEXT:    vpermilpd %xmm1, %xmm0, %xmm0
2044; AVX1-NEXT:    vmovupd %xmm0, (%rax)
2045; AVX1-NEXT:    retq
2046;
2047; AVX2-LABEL: indices_convert:
2048; AVX2:       # %bb.0: # %bb
2049; AVX2-NEXT:    vpbroadcastq (%rax), %xmm0
2050; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [7,7,7,7]
2051; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
2052; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2053; AVX2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
2054; AVX2-NEXT:    vmovapd (%rax), %xmm1
2055; AVX2-NEXT:    vpermilpd %xmm0, %xmm1, %xmm0
2056; AVX2-NEXT:    vmovupd %xmm0, (%rax)
2057; AVX2-NEXT:    retq
2058;
2059; AVX512-LABEL: indices_convert:
2060; AVX512:       # %bb.0: # %bb
2061; AVX512-NEXT:    vmovdqa (%rax), %ymm0
2062; AVX512-NEXT:    vpbroadcastq (%rax), %xmm1
2063; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [7,7,7,7]
2064; AVX512-NEXT:    vpand %xmm2, %xmm1, %xmm1
2065; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
2066; AVX512-NEXT:    vpermq %zmm0, %zmm1, %zmm0
2067; AVX512-NEXT:    vmovdqu %xmm0, (%rax)
2068; AVX512-NEXT:    vzeroupper
2069; AVX512-NEXT:    retq
2070;
2071; AVX512VL-LABEL: indices_convert:
2072; AVX512VL:       # %bb.0: # %bb
2073; AVX512VL-NEXT:    vpbroadcastq (%rax), %xmm0
2074; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
2075; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
2076; AVX512VL-NEXT:    vpermq (%rax), %ymm0, %ymm0
2077; AVX512VL-NEXT:    vmovdqu %xmm0, (%rax)
2078; AVX512VL-NEXT:    vzeroupper
2079; AVX512VL-NEXT:    retq
2080bb:
2081  %0 = load <4 x i64>, ptr undef, align 32
2082  %1 = bitcast <4 x i64> %0 to <8 x i32>
2083  %2 = shufflevector <8 x i32> %1, <8 x i32> undef, <2 x i32> <i32 2, i32 12>
2084  %3 = and <2 x i32> %2, <i32 7, i32 7>
2085  %4 = extractelement <2 x i32> %3, i32 0
2086  %vecext.i8.1 = extractelement <4 x i64> %0, i32 %4
2087  %5 = extractelement <2 x i32> %3, i32 1
2088  %vecext.i8.2 = extractelement <4 x i64> %0, i32 %5
2089  %6 = insertelement <2 x i64> poison, i64 %vecext.i8.1, i32 0
2090  %7 = insertelement <2 x i64> %6, i64 %vecext.i8.2, i32 1
2091  %8 = select <2 x i1> undef, <2 x i64> undef, <2 x i64> %7
2092  store <2 x i64> %8, ptr undef, align 8
2093  ret void
2094}
2095