1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
12;
13; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
15
16;
17; Variable Shifts
18;
19
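; Pre-AVX2 targets have no per-element variable i64 shift: the checks below
; shift the whole vector by each of the two amounts with psrlq and blend the
; halves (movsd on SSE2, pblendw on SSE4.1/AVX1). AVX2/AVX512 use vpsrlvq
; directly, and XOP negates the amounts to reuse its left-shift vpshlq.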
20define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
21; SSE2-LABEL: var_shift_v2i64:
22; SSE2:       # %bb.0:
23; SSE2-NEXT:    movdqa %xmm0, %xmm2
24; SSE2-NEXT:    psrlq %xmm1, %xmm2
25; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
26; SSE2-NEXT:    psrlq %xmm1, %xmm0
27; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
28; SSE2-NEXT:    retq
29;
30; SSE41-LABEL: var_shift_v2i64:
31; SSE41:       # %bb.0:
32; SSE41-NEXT:    movdqa %xmm0, %xmm2
33; SSE41-NEXT:    psrlq %xmm1, %xmm2
34; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
35; SSE41-NEXT:    psrlq %xmm1, %xmm0
36; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
37; SSE41-NEXT:    retq
38;
39; AVX1-LABEL: var_shift_v2i64:
40; AVX1:       # %bb.0:
41; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm2
42; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
43; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
44; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4,5,6,7]
45; AVX1-NEXT:    retq
46;
47; AVX2-LABEL: var_shift_v2i64:
48; AVX2:       # %bb.0:
49; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
50; AVX2-NEXT:    retq
51;
52; XOPAVX1-LABEL: var_shift_v2i64:
53; XOPAVX1:       # %bb.0:
54; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
55; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
56; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
57; XOPAVX1-NEXT:    retq
58;
59; XOPAVX2-LABEL: var_shift_v2i64:
60; XOPAVX2:       # %bb.0:
61; XOPAVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
62; XOPAVX2-NEXT:    retq
63;
64; AVX512-LABEL: var_shift_v2i64:
65; AVX512:       # %bb.0:
66; AVX512-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
67; AVX512-NEXT:    retq
68;
69; AVX512VL-LABEL: var_shift_v2i64:
70; AVX512VL:       # %bb.0:
71; AVX512VL-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
72; AVX512VL-NEXT:    retq
73;
74; X86-SSE-LABEL: var_shift_v2i64:
75; X86-SSE:       # %bb.0:
76; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
77; X86-SSE-NEXT:    psrlq %xmm1, %xmm2
78; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
79; X86-SSE-NEXT:    psrlq %xmm1, %xmm0
80; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
81; X86-SSE-NEXT:    retl
82  %shift = lshr <2 x i64> %a, %b
83  ret <2 x i64> %shift
84}
85
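; SSE2 isolates each of the four shift amounts with pshuflw/pshufd, performs
; four psrld ops and recombines the lanes; SSE4.1/AVX1 do the same with
; pblendw. AVX2/AVX512 lower straight to vpsrlvd and XOPAVX1 to a negated
; vpshld.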
86define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
87; SSE2-LABEL: var_shift_v4i32:
88; SSE2:       # %bb.0:
89; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
90; SSE2-NEXT:    movdqa %xmm0, %xmm3
91; SSE2-NEXT:    psrld %xmm2, %xmm3
92; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
93; SSE2-NEXT:    movdqa %xmm0, %xmm2
94; SSE2-NEXT:    psrld %xmm4, %xmm2
95; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
96; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
97; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
98; SSE2-NEXT:    movdqa %xmm0, %xmm4
99; SSE2-NEXT:    psrld %xmm3, %xmm4
100; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
101; SSE2-NEXT:    psrld %xmm1, %xmm0
102; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
103; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
104; SSE2-NEXT:    movaps %xmm2, %xmm0
105; SSE2-NEXT:    retq
106;
107; SSE41-LABEL: var_shift_v4i32:
108; SSE41:       # %bb.0:
109; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
110; SSE41-NEXT:    movdqa %xmm0, %xmm3
111; SSE41-NEXT:    psrld %xmm2, %xmm3
112; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
113; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
114; SSE41-NEXT:    movdqa %xmm0, %xmm5
115; SSE41-NEXT:    psrld %xmm4, %xmm5
116; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
117; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
118; SSE41-NEXT:    movdqa %xmm0, %xmm3
119; SSE41-NEXT:    psrld %xmm1, %xmm3
120; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
121; SSE41-NEXT:    psrld %xmm1, %xmm0
122; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
123; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
124; SSE41-NEXT:    retq
125;
126; AVX1-LABEL: var_shift_v4i32:
127; AVX1:       # %bb.0:
128; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
129; AVX1-NEXT:    vpsrld %xmm2, %xmm0, %xmm2
130; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
131; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
132; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
133; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
134; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
135; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
136; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
137; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
138; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
139; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
140; AVX1-NEXT:    retq
141;
142; AVX2-LABEL: var_shift_v4i32:
143; AVX2:       # %bb.0:
144; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
145; AVX2-NEXT:    retq
146;
147; XOPAVX1-LABEL: var_shift_v4i32:
148; XOPAVX1:       # %bb.0:
149; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
150; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
151; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
152; XOPAVX1-NEXT:    retq
153;
154; XOPAVX2-LABEL: var_shift_v4i32:
155; XOPAVX2:       # %bb.0:
156; XOPAVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
157; XOPAVX2-NEXT:    retq
158;
159; AVX512-LABEL: var_shift_v4i32:
160; AVX512:       # %bb.0:
161; AVX512-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
162; AVX512-NEXT:    retq
163;
164; AVX512VL-LABEL: var_shift_v4i32:
165; AVX512VL:       # %bb.0:
166; AVX512VL-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
167; AVX512VL-NEXT:    retq
168;
169; X86-SSE-LABEL: var_shift_v4i32:
170; X86-SSE:       # %bb.0:
171; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
172; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
173; X86-SSE-NEXT:    psrld %xmm2, %xmm3
174; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
175; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
176; X86-SSE-NEXT:    psrld %xmm4, %xmm2
177; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
178; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
179; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
180; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
181; X86-SSE-NEXT:    psrld %xmm3, %xmm4
182; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
183; X86-SSE-NEXT:    psrld %xmm1, %xmm0
184; X86-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
185; X86-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
186; X86-SSE-NEXT:    movaps %xmm2, %xmm0
187; X86-SSE-NEXT:    retl
188  %shift = lshr <4 x i32> %a, %b
189  ret <4 x i32> %shift
190}
191
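; Without a variable per-element i16 shift (pre-AVX512BW), the amount bits are
; moved up to the sign bit (psllw $12, then psraw $15 masks or pblendvb) to
; conditionally apply shifts of 8, 4, 2 and 1. AVX2/AVX512DQ widen to i32 for
; vpsrlvd and repack; AVX512BW(VL) uses vpsrlvw.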
192define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
193; SSE2-LABEL: var_shift_v8i16:
194; SSE2:       # %bb.0:
195; SSE2-NEXT:    psllw $12, %xmm1
196; SSE2-NEXT:    movdqa %xmm1, %xmm2
197; SSE2-NEXT:    psraw $15, %xmm2
198; SSE2-NEXT:    movdqa %xmm2, %xmm3
199; SSE2-NEXT:    pandn %xmm0, %xmm3
200; SSE2-NEXT:    psrlw $8, %xmm0
201; SSE2-NEXT:    pand %xmm2, %xmm0
202; SSE2-NEXT:    por %xmm3, %xmm0
203; SSE2-NEXT:    paddw %xmm1, %xmm1
204; SSE2-NEXT:    movdqa %xmm1, %xmm2
205; SSE2-NEXT:    psraw $15, %xmm2
206; SSE2-NEXT:    movdqa %xmm2, %xmm3
207; SSE2-NEXT:    pandn %xmm0, %xmm3
208; SSE2-NEXT:    psrlw $4, %xmm0
209; SSE2-NEXT:    pand %xmm2, %xmm0
210; SSE2-NEXT:    por %xmm3, %xmm0
211; SSE2-NEXT:    paddw %xmm1, %xmm1
212; SSE2-NEXT:    movdqa %xmm1, %xmm2
213; SSE2-NEXT:    psraw $15, %xmm2
214; SSE2-NEXT:    movdqa %xmm2, %xmm3
215; SSE2-NEXT:    pandn %xmm0, %xmm3
216; SSE2-NEXT:    psrlw $2, %xmm0
217; SSE2-NEXT:    pand %xmm2, %xmm0
218; SSE2-NEXT:    por %xmm3, %xmm0
219; SSE2-NEXT:    paddw %xmm1, %xmm1
220; SSE2-NEXT:    psraw $15, %xmm1
221; SSE2-NEXT:    movdqa %xmm1, %xmm2
222; SSE2-NEXT:    pandn %xmm0, %xmm2
223; SSE2-NEXT:    psrlw $1, %xmm0
224; SSE2-NEXT:    pand %xmm1, %xmm0
225; SSE2-NEXT:    por %xmm2, %xmm0
226; SSE2-NEXT:    retq
227;
228; SSE41-LABEL: var_shift_v8i16:
229; SSE41:       # %bb.0:
230; SSE41-NEXT:    movdqa %xmm0, %xmm2
231; SSE41-NEXT:    movdqa %xmm1, %xmm0
232; SSE41-NEXT:    psllw $12, %xmm0
233; SSE41-NEXT:    psllw $4, %xmm1
234; SSE41-NEXT:    por %xmm1, %xmm0
235; SSE41-NEXT:    movdqa %xmm0, %xmm1
236; SSE41-NEXT:    paddw %xmm0, %xmm1
237; SSE41-NEXT:    movdqa %xmm2, %xmm3
238; SSE41-NEXT:    psrlw $8, %xmm3
239; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
240; SSE41-NEXT:    movdqa %xmm2, %xmm3
241; SSE41-NEXT:    psrlw $4, %xmm3
242; SSE41-NEXT:    movdqa %xmm1, %xmm0
243; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
244; SSE41-NEXT:    movdqa %xmm2, %xmm3
245; SSE41-NEXT:    psrlw $2, %xmm3
246; SSE41-NEXT:    paddw %xmm1, %xmm1
247; SSE41-NEXT:    movdqa %xmm1, %xmm0
248; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
249; SSE41-NEXT:    movdqa %xmm2, %xmm3
250; SSE41-NEXT:    psrlw $1, %xmm3
251; SSE41-NEXT:    paddw %xmm1, %xmm1
252; SSE41-NEXT:    movdqa %xmm1, %xmm0
253; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
254; SSE41-NEXT:    movdqa %xmm2, %xmm0
255; SSE41-NEXT:    retq
256;
257; AVX1-LABEL: var_shift_v8i16:
258; AVX1:       # %bb.0:
259; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
260; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
261; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
262; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
263; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm3
264; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
265; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
266; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
267; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
268; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
269; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
270; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
271; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
272; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
273; AVX1-NEXT:    retq
274;
275; AVX2-LABEL: var_shift_v8i16:
276; AVX2:       # %bb.0:
277; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
278; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
279; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
280; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
281; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
282; AVX2-NEXT:    vzeroupper
283; AVX2-NEXT:    retq
284;
285; XOP-LABEL: var_shift_v8i16:
286; XOP:       # %bb.0:
287; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
288; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
289; XOP-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
290; XOP-NEXT:    retq
291;
292; AVX512DQ-LABEL: var_shift_v8i16:
293; AVX512DQ:       # %bb.0:
294; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
295; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
296; AVX512DQ-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
297; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
298; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
299; AVX512DQ-NEXT:    vzeroupper
300; AVX512DQ-NEXT:    retq
301;
302; AVX512BW-LABEL: var_shift_v8i16:
303; AVX512BW:       # %bb.0:
304; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
305; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
306; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
307; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
308; AVX512BW-NEXT:    vzeroupper
309; AVX512BW-NEXT:    retq
310;
311; AVX512DQVL-LABEL: var_shift_v8i16:
312; AVX512DQVL:       # %bb.0:
313; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
314; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
315; AVX512DQVL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
316; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
317; AVX512DQVL-NEXT:    vzeroupper
318; AVX512DQVL-NEXT:    retq
319;
320; AVX512BWVL-LABEL: var_shift_v8i16:
321; AVX512BWVL:       # %bb.0:
322; AVX512BWVL-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
323; AVX512BWVL-NEXT:    retq
324;
325; X86-SSE-LABEL: var_shift_v8i16:
326; X86-SSE:       # %bb.0:
327; X86-SSE-NEXT:    psllw $12, %xmm1
328; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
329; X86-SSE-NEXT:    psraw $15, %xmm2
330; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
331; X86-SSE-NEXT:    pandn %xmm0, %xmm3
332; X86-SSE-NEXT:    psrlw $8, %xmm0
333; X86-SSE-NEXT:    pand %xmm2, %xmm0
334; X86-SSE-NEXT:    por %xmm3, %xmm0
335; X86-SSE-NEXT:    paddw %xmm1, %xmm1
336; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
337; X86-SSE-NEXT:    psraw $15, %xmm2
338; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
339; X86-SSE-NEXT:    pandn %xmm0, %xmm3
340; X86-SSE-NEXT:    psrlw $4, %xmm0
341; X86-SSE-NEXT:    pand %xmm2, %xmm0
342; X86-SSE-NEXT:    por %xmm3, %xmm0
343; X86-SSE-NEXT:    paddw %xmm1, %xmm1
344; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
345; X86-SSE-NEXT:    psraw $15, %xmm2
346; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
347; X86-SSE-NEXT:    pandn %xmm0, %xmm3
348; X86-SSE-NEXT:    psrlw $2, %xmm0
349; X86-SSE-NEXT:    pand %xmm2, %xmm0
350; X86-SSE-NEXT:    por %xmm3, %xmm0
351; X86-SSE-NEXT:    paddw %xmm1, %xmm1
352; X86-SSE-NEXT:    psraw $15, %xmm1
353; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
354; X86-SSE-NEXT:    pandn %xmm0, %xmm2
355; X86-SSE-NEXT:    psrlw $1, %xmm0
356; X86-SSE-NEXT:    pand %xmm1, %xmm0
357; X86-SSE-NEXT:    por %xmm2, %xmm0
358; X86-SSE-NEXT:    retl
359  %shift = lshr <8 x i16> %a, %b
360  ret <8 x i16> %shift
361}
362
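; i8 shifts reuse the i16 shifter: psllw $5 moves each amount bit into the
; byte sign bit and pcmpgtb/pblendvb select between the input and copies
; shifted right by 4, 2 and 1, masking off bits pulled in from the adjacent
; byte. XOP uses vpshlb with negated amounts; AVX512 widens, shifts, truncates.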
363define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
364; SSE2-LABEL: var_shift_v16i8:
365; SSE2:       # %bb.0:
366; SSE2-NEXT:    psllw $5, %xmm1
367; SSE2-NEXT:    pxor %xmm2, %xmm2
368; SSE2-NEXT:    pxor %xmm3, %xmm3
369; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
370; SSE2-NEXT:    movdqa %xmm3, %xmm4
371; SSE2-NEXT:    pandn %xmm0, %xmm4
372; SSE2-NEXT:    psrlw $4, %xmm0
373; SSE2-NEXT:    pand %xmm3, %xmm0
374; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
375; SSE2-NEXT:    por %xmm4, %xmm0
376; SSE2-NEXT:    paddb %xmm1, %xmm1
377; SSE2-NEXT:    pxor %xmm3, %xmm3
378; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
379; SSE2-NEXT:    movdqa %xmm3, %xmm4
380; SSE2-NEXT:    pandn %xmm0, %xmm4
381; SSE2-NEXT:    psrlw $2, %xmm0
382; SSE2-NEXT:    pand %xmm3, %xmm0
383; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
384; SSE2-NEXT:    por %xmm4, %xmm0
385; SSE2-NEXT:    paddb %xmm1, %xmm1
386; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
387; SSE2-NEXT:    movdqa %xmm2, %xmm1
388; SSE2-NEXT:    pandn %xmm0, %xmm1
389; SSE2-NEXT:    psrlw $1, %xmm0
390; SSE2-NEXT:    pand %xmm2, %xmm0
391; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
392; SSE2-NEXT:    por %xmm1, %xmm0
393; SSE2-NEXT:    retq
394;
395; SSE41-LABEL: var_shift_v16i8:
396; SSE41:       # %bb.0:
397; SSE41-NEXT:    movdqa %xmm0, %xmm2
398; SSE41-NEXT:    psllw $5, %xmm1
399; SSE41-NEXT:    movdqa %xmm0, %xmm3
400; SSE41-NEXT:    psrlw $4, %xmm3
401; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
402; SSE41-NEXT:    movdqa %xmm1, %xmm0
403; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
404; SSE41-NEXT:    movdqa %xmm2, %xmm3
405; SSE41-NEXT:    psrlw $2, %xmm3
406; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
407; SSE41-NEXT:    paddb %xmm1, %xmm1
408; SSE41-NEXT:    movdqa %xmm1, %xmm0
409; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
410; SSE41-NEXT:    movdqa %xmm2, %xmm3
411; SSE41-NEXT:    psrlw $1, %xmm3
412; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
413; SSE41-NEXT:    paddb %xmm1, %xmm1
414; SSE41-NEXT:    movdqa %xmm1, %xmm0
415; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
416; SSE41-NEXT:    movdqa %xmm2, %xmm0
417; SSE41-NEXT:    retq
418;
419; AVX-LABEL: var_shift_v16i8:
420; AVX:       # %bb.0:
421; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
422; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
423; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
424; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
425; AVX-NEXT:    vpsrlw $2, %xmm0, %xmm2
426; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
427; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
428; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
429; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm2
430; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
431; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
432; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
433; AVX-NEXT:    retq
434;
435; XOP-LABEL: var_shift_v16i8:
436; XOP:       # %bb.0:
437; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
438; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
439; XOP-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
440; XOP-NEXT:    retq
441;
442; AVX512DQ-LABEL: var_shift_v16i8:
443; AVX512DQ:       # %bb.0:
444; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
445; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
446; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
447; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
448; AVX512DQ-NEXT:    vzeroupper
449; AVX512DQ-NEXT:    retq
450;
451; AVX512BW-LABEL: var_shift_v16i8:
452; AVX512BW:       # %bb.0:
453; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
454; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
455; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
456; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
457; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
458; AVX512BW-NEXT:    vzeroupper
459; AVX512BW-NEXT:    retq
460;
461; AVX512DQVL-LABEL: var_shift_v16i8:
462; AVX512DQVL:       # %bb.0:
463; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
464; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
465; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
466; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
467; AVX512DQVL-NEXT:    vzeroupper
468; AVX512DQVL-NEXT:    retq
469;
470; AVX512BWVL-LABEL: var_shift_v16i8:
471; AVX512BWVL:       # %bb.0:
472; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
473; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
474; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
475; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
476; AVX512BWVL-NEXT:    vzeroupper
477; AVX512BWVL-NEXT:    retq
478;
479; X86-SSE-LABEL: var_shift_v16i8:
480; X86-SSE:       # %bb.0:
481; X86-SSE-NEXT:    psllw $5, %xmm1
482; X86-SSE-NEXT:    pxor %xmm2, %xmm2
483; X86-SSE-NEXT:    pxor %xmm3, %xmm3
484; X86-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
485; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
486; X86-SSE-NEXT:    pandn %xmm0, %xmm4
487; X86-SSE-NEXT:    psrlw $4, %xmm0
488; X86-SSE-NEXT:    pand %xmm3, %xmm0
489; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
490; X86-SSE-NEXT:    por %xmm4, %xmm0
491; X86-SSE-NEXT:    paddb %xmm1, %xmm1
492; X86-SSE-NEXT:    pxor %xmm3, %xmm3
493; X86-SSE-NEXT:    pcmpgtb %xmm1, %xmm3
494; X86-SSE-NEXT:    movdqa %xmm3, %xmm4
495; X86-SSE-NEXT:    pandn %xmm0, %xmm4
496; X86-SSE-NEXT:    psrlw $2, %xmm0
497; X86-SSE-NEXT:    pand %xmm3, %xmm0
498; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
499; X86-SSE-NEXT:    por %xmm4, %xmm0
500; X86-SSE-NEXT:    paddb %xmm1, %xmm1
501; X86-SSE-NEXT:    pcmpgtb %xmm1, %xmm2
502; X86-SSE-NEXT:    movdqa %xmm2, %xmm1
503; X86-SSE-NEXT:    pandn %xmm0, %xmm1
504; X86-SSE-NEXT:    psrlw $1, %xmm0
505; X86-SSE-NEXT:    pand %xmm2, %xmm0
506; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
507; X86-SSE-NEXT:    por %xmm1, %xmm0
508; X86-SSE-NEXT:    retl
509  %shift = lshr <16 x i8> %a, %b
510  ret <16 x i8> %shift
511}
512
513;
514; Uniform Variable Shifts
515;
516
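; psrlq/vpsrlq already take their count from the low 64 bits of the amount
; register, so a splatted i64 amount needs no preprocessing.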
517define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
518; SSE-LABEL: splatvar_shift_v2i64:
519; SSE:       # %bb.0:
520; SSE-NEXT:    psrlq %xmm1, %xmm0
521; SSE-NEXT:    retq
522;
523; AVX-LABEL: splatvar_shift_v2i64:
524; AVX:       # %bb.0:
525; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
526; AVX-NEXT:    retq
527;
528; XOP-LABEL: splatvar_shift_v2i64:
529; XOP:       # %bb.0:
530; XOP-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
531; XOP-NEXT:    retq
532;
533; AVX512-LABEL: splatvar_shift_v2i64:
534; AVX512:       # %bb.0:
535; AVX512-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
536; AVX512-NEXT:    retq
537;
538; AVX512VL-LABEL: splatvar_shift_v2i64:
539; AVX512VL:       # %bb.0:
540; AVX512VL-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
541; AVX512VL-NEXT:    retq
542;
543; X86-SSE-LABEL: splatvar_shift_v2i64:
544; X86-SSE:       # %bb.0:
545; X86-SSE-NEXT:    psrlq %xmm1, %xmm0
546; X86-SSE-NEXT:    retl
547  %splat = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer
548  %shift = lshr <2 x i64> %a, %splat
549  ret <2 x i64> %shift
550}
551
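; The splatted count is zero-extended to 64 bits (movss into a zeroed register
; on SSE2, pmovzxdq elsewhere) and used as the single count for psrld.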
552define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
553; SSE2-LABEL: splatvar_shift_v4i32:
554; SSE2:       # %bb.0:
555; SSE2-NEXT:    xorps %xmm2, %xmm2
556; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
557; SSE2-NEXT:    psrld %xmm2, %xmm0
558; SSE2-NEXT:    retq
559;
560; SSE41-LABEL: splatvar_shift_v4i32:
561; SSE41:       # %bb.0:
562; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
563; SSE41-NEXT:    psrld %xmm1, %xmm0
564; SSE41-NEXT:    retq
565;
566; AVX-LABEL: splatvar_shift_v4i32:
567; AVX:       # %bb.0:
568; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
569; AVX-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
570; AVX-NEXT:    retq
571;
572; XOP-LABEL: splatvar_shift_v4i32:
573; XOP:       # %bb.0:
574; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
575; XOP-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
576; XOP-NEXT:    retq
577;
578; AVX512-LABEL: splatvar_shift_v4i32:
579; AVX512:       # %bb.0:
580; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
581; AVX512-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
582; AVX512-NEXT:    retq
583;
584; AVX512VL-LABEL: splatvar_shift_v4i32:
585; AVX512VL:       # %bb.0:
586; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
587; AVX512VL-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
588; AVX512VL-NEXT:    retq
589;
590; X86-SSE-LABEL: splatvar_shift_v4i32:
591; X86-SSE:       # %bb.0:
592; X86-SSE-NEXT:    xorps %xmm2, %xmm2
593; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
594; X86-SSE-NEXT:    psrld %xmm2, %xmm0
595; X86-SSE-NEXT:    retl
596  %splat = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer
597  %shift = lshr <4 x i32> %a, %splat
598  ret <4 x i32> %shift
599}
600
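; Same pattern for i16: the bottom word is zero-extended (pslldq/psrldq on
; SSE2, pmovzxwq elsewhere) and fed to psrlw.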
601define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
602; SSE2-LABEL: splatvar_shift_v8i16:
603; SSE2:       # %bb.0:
604; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
605; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
606; SSE2-NEXT:    psrlw %xmm1, %xmm0
607; SSE2-NEXT:    retq
608;
609; SSE41-LABEL: splatvar_shift_v8i16:
610; SSE41:       # %bb.0:
611; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
612; SSE41-NEXT:    psrlw %xmm1, %xmm0
613; SSE41-NEXT:    retq
614;
615; AVX-LABEL: splatvar_shift_v8i16:
616; AVX:       # %bb.0:
617; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
618; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
619; AVX-NEXT:    retq
620;
621; XOP-LABEL: splatvar_shift_v8i16:
622; XOP:       # %bb.0:
623; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
624; XOP-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
625; XOP-NEXT:    retq
626;
627; AVX512-LABEL: splatvar_shift_v8i16:
628; AVX512:       # %bb.0:
629; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
630; AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
631; AVX512-NEXT:    retq
632;
633; AVX512VL-LABEL: splatvar_shift_v8i16:
634; AVX512VL:       # %bb.0:
635; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
636; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
637; AVX512VL-NEXT:    retq
638;
639; X86-SSE-LABEL: splatvar_shift_v8i16:
640; X86-SSE:       # %bb.0:
641; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
642; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
643; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
644; X86-SSE-NEXT:    retl
645  %splat = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer
646  %shift = lshr <8 x i16> %a, %splat
647  ret <8 x i16> %shift
648}
649
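; There is no byte shift, so the register is shifted with psrlw and the bits
; that crossed byte boundaries are cleared with a mask built by shifting an
; all-ones vector by the same amount.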
650define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
651; SSE2-LABEL: splatvar_shift_v16i8:
652; SSE2:       # %bb.0:
653; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
654; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
655; SSE2-NEXT:    psrlw %xmm1, %xmm0
656; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
657; SSE2-NEXT:    psrlw %xmm1, %xmm2
658; SSE2-NEXT:    psrlw $8, %xmm2
659; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
660; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
661; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
662; SSE2-NEXT:    pand %xmm1, %xmm0
663; SSE2-NEXT:    retq
664;
665; SSE41-LABEL: splatvar_shift_v16i8:
666; SSE41:       # %bb.0:
667; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
668; SSE41-NEXT:    psrlw %xmm1, %xmm0
669; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
670; SSE41-NEXT:    psrlw %xmm1, %xmm2
671; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
672; SSE41-NEXT:    pand %xmm2, %xmm0
673; SSE41-NEXT:    retq
674;
675; AVX1-LABEL: splatvar_shift_v16i8:
676; AVX1:       # %bb.0:
677; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
678; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
679; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
680; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
681; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
682; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
683; AVX1-NEXT:    retq
684;
685; AVX2-LABEL: splatvar_shift_v16i8:
686; AVX2:       # %bb.0:
687; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
688; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
689; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
690; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
691; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
692; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
693; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
694; AVX2-NEXT:    retq
695;
696; XOPAVX1-LABEL: splatvar_shift_v16i8:
697; XOPAVX1:       # %bb.0:
698; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
699; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
700; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
701; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
702; XOPAVX1-NEXT:    retq
703;
704; XOPAVX2-LABEL: splatvar_shift_v16i8:
705; XOPAVX2:       # %bb.0:
706; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
707; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
708; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
709; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
710; XOPAVX2-NEXT:    retq
711;
712; AVX512DQ-LABEL: splatvar_shift_v16i8:
713; AVX512DQ:       # %bb.0:
714; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
715; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
716; AVX512DQ-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
717; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
718; AVX512DQ-NEXT:    vzeroupper
719; AVX512DQ-NEXT:    retq
720;
721; AVX512BW-LABEL: splatvar_shift_v16i8:
722; AVX512BW:       # %bb.0:
723; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
724; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
725; AVX512BW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
726; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
727; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
728; AVX512BW-NEXT:    vzeroupper
729; AVX512BW-NEXT:    retq
730;
731; AVX512DQVL-LABEL: splatvar_shift_v16i8:
732; AVX512DQVL:       # %bb.0:
733; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
734; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
735; AVX512DQVL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
736; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
737; AVX512DQVL-NEXT:    vzeroupper
738; AVX512DQVL-NEXT:    retq
739;
740; AVX512BWVL-LABEL: splatvar_shift_v16i8:
741; AVX512BWVL:       # %bb.0:
742; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
743; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
744; AVX512BWVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
745; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
746; AVX512BWVL-NEXT:    vzeroupper
747; AVX512BWVL-NEXT:    retq
748;
749; X86-SSE-LABEL: splatvar_shift_v16i8:
750; X86-SSE:       # %bb.0:
751; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
752; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
753; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
754; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
755; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
756; X86-SSE-NEXT:    psrlw $8, %xmm2
757; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
758; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
759; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
760; X86-SSE-NEXT:    pand %xmm1, %xmm0
761; X86-SSE-NEXT:    retl
762  %splat = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer
763  %shift = lshr <16 x i8> %a, %splat
764  ret <16 x i8> %shift
765}
766
767;
768; Uniform Variable Modulo Shifts
769;
770
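; The explicit modulo (and with 63) becomes a single pand on the amount
; register ahead of the plain psrlq lowering.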
771define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
772; SSE-LABEL: splatvar_modulo_shift_v2i64:
773; SSE:       # %bb.0:
774; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
775; SSE-NEXT:    psrlq %xmm1, %xmm0
776; SSE-NEXT:    retq
777;
778; AVX-LABEL: splatvar_modulo_shift_v2i64:
779; AVX:       # %bb.0:
780; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
781; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
782; AVX-NEXT:    retq
783;
784; XOP-LABEL: splatvar_modulo_shift_v2i64:
785; XOP:       # %bb.0:
786; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
787; XOP-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
788; XOP-NEXT:    retq
789;
790; AVX512-LABEL: splatvar_modulo_shift_v2i64:
791; AVX512:       # %bb.0:
792; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
793; AVX512-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
794; AVX512-NEXT:    retq
795;
796; AVX512VL-LABEL: splatvar_modulo_shift_v2i64:
797; AVX512VL:       # %bb.0:
798; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
799; AVX512VL-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
800; AVX512VL-NEXT:    retq
801;
802; X86-SSE-LABEL: splatvar_modulo_shift_v2i64:
803; X86-SSE:       # %bb.0:
804; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
805; X86-SSE-NEXT:    psrlq %xmm1, %xmm0
806; X86-SSE-NEXT:    retl
807  %mod = and <2 x i64> %b, <i64 63, i64 63>
808  %splat = shufflevector <2 x i64> %mod, <2 x i64> poison, <2 x i32> zeroinitializer
809  %shift = lshr <2 x i64> %a, %splat
810  ret <2 x i64> %shift
811}
812
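; Here the masking of the amount is folded into one pand and psrld uses the
; result directly; the separate zero-extension seen in the non-modulo test is
; no longer emitted.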
813define <4 x i32> @splatvar_modulo_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
814; SSE-LABEL: splatvar_modulo_shift_v4i32:
815; SSE:       # %bb.0:
816; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
817; SSE-NEXT:    psrld %xmm1, %xmm0
818; SSE-NEXT:    retq
819;
820; AVX-LABEL: splatvar_modulo_shift_v4i32:
821; AVX:       # %bb.0:
822; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
823; AVX-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
824; AVX-NEXT:    retq
825;
826; XOP-LABEL: splatvar_modulo_shift_v4i32:
827; XOP:       # %bb.0:
828; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
829; XOP-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
830; XOP-NEXT:    retq
831;
832; AVX512-LABEL: splatvar_modulo_shift_v4i32:
833; AVX512:       # %bb.0:
834; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
835; AVX512-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
836; AVX512-NEXT:    retq
837;
838; AVX512VL-LABEL: splatvar_modulo_shift_v4i32:
839; AVX512VL:       # %bb.0:
840; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
841; AVX512VL-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
842; AVX512VL-NEXT:    retq
843;
844; X86-SSE-LABEL: splatvar_modulo_shift_v4i32:
845; X86-SSE:       # %bb.0:
846; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
847; X86-SSE-NEXT:    psrld %xmm1, %xmm0
848; X86-SSE-NEXT:    retl
849  %mod = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
850  %splat = shufflevector <4 x i32> %mod, <4 x i32> poison, <4 x i32> zeroinitializer
851  %shift = lshr <4 x i32> %a, %splat
852  ret <4 x i32> %shift
853}
854
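; Likewise for i16: a single pand of the amount followed by psrlw.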
855define <8 x i16> @splatvar_modulo_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
856; SSE-LABEL: splatvar_modulo_shift_v8i16:
857; SSE:       # %bb.0:
858; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
859; SSE-NEXT:    psrlw %xmm1, %xmm0
860; SSE-NEXT:    retq
861;
862; AVX-LABEL: splatvar_modulo_shift_v8i16:
863; AVX:       # %bb.0:
864; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
865; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
866; AVX-NEXT:    retq
867;
868; XOP-LABEL: splatvar_modulo_shift_v8i16:
869; XOP:       # %bb.0:
870; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
871; XOP-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
872; XOP-NEXT:    retq
873;
874; AVX512-LABEL: splatvar_modulo_shift_v8i16:
875; AVX512:       # %bb.0:
876; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
877; AVX512-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
878; AVX512-NEXT:    retq
879;
880; AVX512VL-LABEL: splatvar_modulo_shift_v8i16:
881; AVX512VL:       # %bb.0:
882; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
883; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
884; AVX512VL-NEXT:    retq
885;
886; X86-SSE-LABEL: splatvar_modulo_shift_v8i16:
887; X86-SSE:       # %bb.0:
888; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
889; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
890; X86-SSE-NEXT:    retl
891  %mod = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
892  %splat = shufflevector <8 x i16> %mod, <8 x i16> poison, <8 x i32> zeroinitializer
893  %shift = lshr <8 x i16> %a, %splat
894  ret <8 x i16> %shift
895}
896
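; i8 keeps the psrlw-plus-cleanup-mask pattern of the non-modulo splat case,
; with the pand of the amount folded in front.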
897define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
898; SSE2-LABEL: splatvar_modulo_shift_v16i8:
899; SSE2:       # %bb.0:
900; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
901; SSE2-NEXT:    psrlw %xmm1, %xmm0
902; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
903; SSE2-NEXT:    psrlw %xmm1, %xmm2
904; SSE2-NEXT:    psrlw $8, %xmm2
905; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
906; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
907; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
908; SSE2-NEXT:    pand %xmm1, %xmm0
909; SSE2-NEXT:    retq
910;
911; SSE41-LABEL: splatvar_modulo_shift_v16i8:
912; SSE41:       # %bb.0:
913; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
914; SSE41-NEXT:    psrlw %xmm1, %xmm0
915; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
916; SSE41-NEXT:    psrlw %xmm1, %xmm2
917; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
918; SSE41-NEXT:    pand %xmm2, %xmm0
919; SSE41-NEXT:    retq
920;
921; AVX1-LABEL: splatvar_modulo_shift_v16i8:
922; AVX1:       # %bb.0:
923; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
924; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
925; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
926; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
927; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
928; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
929; AVX1-NEXT:    retq
930;
931; AVX2-LABEL: splatvar_modulo_shift_v16i8:
932; AVX2:       # %bb.0:
933; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
934; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
935; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
936; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
937; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
938; AVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
939; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
940; AVX2-NEXT:    retq
941;
942; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
943; XOPAVX1:       # %bb.0:
944; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
945; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
946; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
947; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
948; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
949; XOPAVX1-NEXT:    retq
950;
951; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
952; XOPAVX2:       # %bb.0:
953; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
954; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
955; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
956; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
957; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
958; XOPAVX2-NEXT:    retq
959;
960; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
961; AVX512DQ:       # %bb.0:
962; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
963; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
964; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
965; AVX512DQ-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
966; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
967; AVX512DQ-NEXT:    vzeroupper
968; AVX512DQ-NEXT:    retq
969;
970; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
971; AVX512BW:       # %bb.0:
972; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
973; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
974; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
975; AVX512BW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
976; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
977; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
978; AVX512BW-NEXT:    vzeroupper
979; AVX512BW-NEXT:    retq
980;
981; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
982; AVX512DQVL:       # %bb.0:
983; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
984; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
985; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
986; AVX512DQVL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
987; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
988; AVX512DQVL-NEXT:    vzeroupper
989; AVX512DQVL-NEXT:    retq
990;
991; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
992; AVX512BWVL:       # %bb.0:
993; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
994; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
995; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
996; AVX512BWVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
997; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
998; AVX512BWVL-NEXT:    vzeroupper
999; AVX512BWVL-NEXT:    retq
1000;
1001; X86-SSE-LABEL: splatvar_modulo_shift_v16i8:
1002; X86-SSE:       # %bb.0:
1003; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1004; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
1005; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
1006; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1007; X86-SSE-NEXT:    psrlw $8, %xmm2
1008; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1009; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7]
1010; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
1011; X86-SSE-NEXT:    pand %xmm1, %xmm0
1012; X86-SSE-NEXT:    retl
1013  %mod = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
1014  %splat = shufflevector <16 x i8> %mod, <16 x i8> poison, <16 x i32> zeroinitializer
1015  %shift = lshr <16 x i8> %a, %splat
1016  ret <16 x i8> %shift
1017}
1018
1019;
1020; Constant Shifts
1021;
1022
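; Constant but non-uniform i64 amounts become two immediate psrlq ops plus a
; blend on SSE/AVX1; AVX2 and later load the amounts from the constant pool
; for vpsrlvq, and XOPAVX1 uses vpshlq with a constant.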
1023define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
1024; SSE2-LABEL: constant_shift_v2i64:
1025; SSE2:       # %bb.0:
1026; SSE2-NEXT:    movdqa %xmm0, %xmm1
1027; SSE2-NEXT:    psrlq $1, %xmm1
1028; SSE2-NEXT:    psrlq $7, %xmm0
1029; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1030; SSE2-NEXT:    retq
1031;
1032; SSE41-LABEL: constant_shift_v2i64:
1033; SSE41:       # %bb.0:
1034; SSE41-NEXT:    movdqa %xmm0, %xmm1
1035; SSE41-NEXT:    psrlq $7, %xmm1
1036; SSE41-NEXT:    psrlq $1, %xmm0
1037; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1038; SSE41-NEXT:    retq
1039;
1040; AVX1-LABEL: constant_shift_v2i64:
1041; AVX1:       # %bb.0:
1042; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
1043; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
1044; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1045; AVX1-NEXT:    retq
1046;
1047; AVX2-LABEL: constant_shift_v2i64:
1048; AVX2:       # %bb.0:
1049; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1050; AVX2-NEXT:    retq
1051;
1052; XOPAVX1-LABEL: constant_shift_v2i64:
1053; XOPAVX1:       # %bb.0:
1054; XOPAVX1-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1055; XOPAVX1-NEXT:    retq
1056;
1057; XOPAVX2-LABEL: constant_shift_v2i64:
1058; XOPAVX2:       # %bb.0:
1059; XOPAVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1060; XOPAVX2-NEXT:    retq
1061;
1062; AVX512-LABEL: constant_shift_v2i64:
1063; AVX512:       # %bb.0:
1064; AVX512-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1065; AVX512-NEXT:    retq
1066;
1067; AVX512VL-LABEL: constant_shift_v2i64:
1068; AVX512VL:       # %bb.0:
1069; AVX512VL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1070; AVX512VL-NEXT:    retq
1071;
1072; X86-SSE-LABEL: constant_shift_v2i64:
1073; X86-SSE:       # %bb.0:
1074; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1075; X86-SSE-NEXT:    psrlq $1, %xmm1
1076; X86-SSE-NEXT:    psrlq $7, %xmm0
1077; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1078; X86-SSE-NEXT:    retl
1079  %shift = lshr <2 x i64> %a, <i64 1, i64 7>
1080  ret <2 x i64> %shift
1081}
1082
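; Four immediate psrld ops recombined with unpack/shufps (SSE2) or pblendw
; (SSE4.1/AVX1); vpsrlvd with a constant vector on AVX2 and later.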
1083define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
1084; SSE2-LABEL: constant_shift_v4i32:
1085; SSE2:       # %bb.0:
1086; SSE2-NEXT:    movdqa %xmm0, %xmm1
1087; SSE2-NEXT:    psrld $7, %xmm1
1088; SSE2-NEXT:    movdqa %xmm0, %xmm2
1089; SSE2-NEXT:    psrld $6, %xmm2
1090; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1091; SSE2-NEXT:    movdqa %xmm0, %xmm1
1092; SSE2-NEXT:    psrld $5, %xmm1
1093; SSE2-NEXT:    psrld $4, %xmm0
1094; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1095; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
1096; SSE2-NEXT:    retq
1097;
1098; SSE41-LABEL: constant_shift_v4i32:
1099; SSE41:       # %bb.0:
1100; SSE41-NEXT:    movdqa %xmm0, %xmm1
1101; SSE41-NEXT:    psrld $7, %xmm1
1102; SSE41-NEXT:    movdqa %xmm0, %xmm2
1103; SSE41-NEXT:    psrld $5, %xmm2
1104; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1105; SSE41-NEXT:    movdqa %xmm0, %xmm1
1106; SSE41-NEXT:    psrld $6, %xmm1
1107; SSE41-NEXT:    psrld $4, %xmm0
1108; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1109; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1110; SSE41-NEXT:    retq
1111;
1112; AVX1-LABEL: constant_shift_v4i32:
1113; AVX1:       # %bb.0:
1114; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
1115; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
1116; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1117; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
1118; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm0
1119; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1120; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1121; AVX1-NEXT:    retq
1122;
1123; AVX2-LABEL: constant_shift_v4i32:
1124; AVX2:       # %bb.0:
1125; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1126; AVX2-NEXT:    retq
1127;
1128; XOPAVX1-LABEL: constant_shift_v4i32:
1129; XOPAVX1:       # %bb.0:
1130; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1131; XOPAVX1-NEXT:    retq
1132;
1133; XOPAVX2-LABEL: constant_shift_v4i32:
1134; XOPAVX2:       # %bb.0:
1135; XOPAVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1136; XOPAVX2-NEXT:    retq
1137;
1138; AVX512-LABEL: constant_shift_v4i32:
1139; AVX512:       # %bb.0:
1140; AVX512-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1141; AVX512-NEXT:    retq
1142;
1143; AVX512VL-LABEL: constant_shift_v4i32:
1144; AVX512VL:       # %bb.0:
1145; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1146; AVX512VL-NEXT:    retq
1147;
1148; X86-SSE-LABEL: constant_shift_v4i32:
1149; X86-SSE:       # %bb.0:
1150; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1151; X86-SSE-NEXT:    psrld $7, %xmm1
1152; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1153; X86-SSE-NEXT:    psrld $6, %xmm2
1154; X86-SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1155; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1156; X86-SSE-NEXT:    psrld $5, %xmm1
1157; X86-SSE-NEXT:    psrld $4, %xmm0
1158; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1159; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
1160; X86-SSE-NEXT:    retl
1161  %shift = lshr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
1162  ret <4 x i32> %shift
1163}
1164
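; A constant lshr of an i16 by N is an unsigned multiply-high by 2^(16-N), so
; these pairwise amounts lower to a single pmulhuw on SSE/AVX1. AVX2/AVX512DQ
; instead use vpsrlvd on the dword pairs plus a mask; AVX512BW(VL) has vpsrlvw.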
1165define <8 x i16> @constant_shift_v8i16_pairs(<8 x i16> %a) nounwind {
1166; SSE-LABEL: constant_shift_v8i16_pairs:
1167; SSE:       # %bb.0:
1168; SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,32768,8192,8192,16384,16384,4096,4096]
1169; SSE-NEXT:    retq
1170;
1171; AVX1-LABEL: constant_shift_v8i16_pairs:
1172; AVX1:       # %bb.0:
1173; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,32768,8192,8192,16384,16384,4096,4096]
1174; AVX1-NEXT:    retq
1175;
1176; AVX2-LABEL: constant_shift_v8i16_pairs:
1177; AVX2:       # %bb.0:
1178; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1179; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1180; AVX2-NEXT:    retq
1181;
1182; XOP-LABEL: constant_shift_v8i16_pairs:
1183; XOP:       # %bb.0:
1184; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1185; XOP-NEXT:    retq
1186;
1187; AVX512DQ-LABEL: constant_shift_v8i16_pairs:
1188; AVX512DQ:       # %bb.0:
1189; AVX512DQ-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1190; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1191; AVX512DQ-NEXT:    retq
1192;
1193; AVX512BW-LABEL: constant_shift_v8i16_pairs:
1194; AVX512BW:       # %bb.0:
1195; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1196; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [1,1,3,3,2,2,4,4]
1197; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
1198; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1199; AVX512BW-NEXT:    vzeroupper
1200; AVX512BW-NEXT:    retq
1201;
1202; AVX512DQVL-LABEL: constant_shift_v8i16_pairs:
1203; AVX512DQVL:       # %bb.0:
1204; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1205; AVX512DQVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1206; AVX512DQVL-NEXT:    retq
1207;
1208; AVX512BWVL-LABEL: constant_shift_v8i16_pairs:
1209; AVX512BWVL:       # %bb.0:
1210; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1211; AVX512BWVL-NEXT:    retq
1212;
1213; X86-SSE-LABEL: constant_shift_v8i16_pairs:
1214; X86-SSE:       # %bb.0:
1215; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,32768,8192,8192,16384,16384,4096,4096]
1216; X86-SSE-NEXT:    retl
1217  %shift = lshr <8 x i16> %a, <i16 1, i16 1, i16 3, i16 3, i16 2, i16 2, i16 4, i16 4>
1218  ret <8 x i16> %shift
1219}
1220
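; Amounts 0..7: the multiply-high constant for a shift of 0 would be 65536,
; which does not fit in 16 bits, so lane 0 of the original is kept (pandn/por
; on SSE2, pblendw elsewhere) over the pmulhuw result; AVX512BW(VL) uses
; vpsrlvw.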
1221define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
1222; SSE2-LABEL: constant_shift_v8i16:
1223; SSE2:       # %bb.0:
1224; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
1225; SSE2-NEXT:    pandn %xmm0, %xmm1
1226; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1227; SSE2-NEXT:    por %xmm1, %xmm0
1228; SSE2-NEXT:    retq
1229;
1230; SSE41-LABEL: constant_shift_v8i16:
1231; SSE41:       # %bb.0:
1232; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,4096,2048,1024,512]
1233; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
1234; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1235; SSE41-NEXT:    retq
1236;
1237; AVX-LABEL: constant_shift_v8i16:
1238; AVX:       # %bb.0:
1239; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512]
1240; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1241; AVX-NEXT:    retq
1242;
1243; XOP-LABEL: constant_shift_v8i16:
1244; XOP:       # %bb.0:
1245; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1246; XOP-NEXT:    retq
1247;
1248; AVX512DQ-LABEL: constant_shift_v8i16:
1249; AVX512DQ:       # %bb.0:
1250; AVX512DQ-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512]
1251; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1252; AVX512DQ-NEXT:    retq
1253;
1254; AVX512BW-LABEL: constant_shift_v8i16:
1255; AVX512BW:       # %bb.0:
1256; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1257; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
1258; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
1259; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1260; AVX512BW-NEXT:    vzeroupper
1261; AVX512BW-NEXT:    retq
1262;
1263; AVX512DQVL-LABEL: constant_shift_v8i16:
1264; AVX512DQVL:       # %bb.0:
1265; AVX512DQVL-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512]
1266; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1267; AVX512DQVL-NEXT:    retq
1268;
1269; AVX512BWVL-LABEL: constant_shift_v8i16:
1270; AVX512BWVL:       # %bb.0:
1271; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1272; AVX512BWVL-NEXT:    retq
1273;
1274; X86-SSE-LABEL: constant_shift_v8i16:
1275; X86-SSE:       # %bb.0:
1276; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,65535,65535]
1277; X86-SSE-NEXT:    pandn %xmm0, %xmm1
1278; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1279; X86-SSE-NEXT:    por %xmm1, %xmm0
1280; X86-SSE-NEXT:    retl
1281  %shift = lshr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
1282  ret <8 x i16> %shift
1283}
1284
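; Adjacent byte lanes share the same shift amount, so this v16i8 lshr can be
; performed at i16 granularity (pmulhuw / vpsrlvw) followed by a pand that
; clears the bits pulled in from the neighbouring byte of each pair.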
1285define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
1286; SSE-LABEL: constant_shift_v16i8_pairs:
1287; SSE:       # %bb.0:
1288; SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
1289; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1290; SSE-NEXT:    retq
1291;
1292; AVX-LABEL: constant_shift_v16i8_pairs:
1293; AVX:       # %bb.0:
1294; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
1295; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1296; AVX-NEXT:    retq
1297;
1298; XOP-LABEL: constant_shift_v16i8_pairs:
1299; XOP:       # %bb.0:
1300; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1301; XOP-NEXT:    retq
1302;
1303; AVX512DQ-LABEL: constant_shift_v16i8_pairs:
1304; AVX512DQ:       # %bb.0:
1305; AVX512DQ-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
1306; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1307; AVX512DQ-NEXT:    retq
1308;
1309; AVX512BW-LABEL: constant_shift_v16i8_pairs:
1310; AVX512BW:       # %bb.0:
1311; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1312; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [7,2,4,6,1,2,3,4]
1313; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
1314; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1315; AVX512BW-NEXT:    vzeroupper
1316; AVX512BW-NEXT:    retq
1317;
1318; AVX512DQVL-LABEL: constant_shift_v16i8_pairs:
1319; AVX512DQVL:       # %bb.0:
1320; AVX512DQVL-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
1321; AVX512DQVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1322; AVX512DQVL-NEXT:    retq
1323;
1324; AVX512BWVL-LABEL: constant_shift_v16i8_pairs:
1325; AVX512BWVL:       # %bb.0:
1326; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1327; AVX512BWVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1328; AVX512BWVL-NEXT:    retq
1329;
1330; X86-SSE-LABEL: constant_shift_v16i8_pairs:
1331; X86-SSE:       # %bb.0:
1332; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [512,16384,4096,1024,32768,16384,8192,4096]
1333; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1334; X86-SSE-NEXT:    retl
1335  %shift = lshr <16 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
1336  ret <16 x i8> %shift
1337}
1338
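; Each group of four bytes shares the same shift amount, so the lshr can be
; done at i16 (pmulhuw) or i32 (vpsrlvd) granularity, again with a pand to
; clear the cross-byte bits.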
1339define <16 x i8> @constant_shift_v16i8_quads(<16 x i8> %a) nounwind {
1340; SSE-LABEL: constant_shift_v16i8_quads:
1341; SSE:       # %bb.0:
1342; SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [16384,16384,4096,4096,32768,32768,8192,8192]
1343; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1344; SSE-NEXT:    retq
1345;
1346; AVX1-LABEL: constant_shift_v16i8_quads:
1347; AVX1:       # %bb.0:
1348; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [16384,16384,4096,4096,32768,32768,8192,8192]
1349; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1350; AVX1-NEXT:    retq
1351;
1352; AVX2-LABEL: constant_shift_v16i8_quads:
1353; AVX2:       # %bb.0:
1354; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1355; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1356; AVX2-NEXT:    retq
1357;
1358; XOP-LABEL: constant_shift_v16i8_quads:
1359; XOP:       # %bb.0:
1360; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1361; XOP-NEXT:    retq
1362;
1363; AVX512-LABEL: constant_shift_v16i8_quads:
1364; AVX512:       # %bb.0:
1365; AVX512-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1366; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1367; AVX512-NEXT:    retq
1368;
1369; AVX512VL-LABEL: constant_shift_v16i8_quads:
1370; AVX512VL:       # %bb.0:
1371; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1372; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1373; AVX512VL-NEXT:    retq
1374;
1375; X86-SSE-LABEL: constant_shift_v16i8_quads:
1376; X86-SSE:       # %bb.0:
1377; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [16384,16384,4096,4096,32768,32768,8192,8192]
1378; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1379; X86-SSE-NEXT:    retl
1380  %shift = lshr <16 x i8> %a, <i8 2, i8 2, i8 2, i8 2, i8 4, i8 4, i8 4, i8 4, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3>
1381  ret <16 x i8> %shift
1382}
1383
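; Fully general per-byte constant lshr: unpack to i16, multiply by 2^(8-amount)
; (pmullw), take the high byte with psrlw $8, and pack the halves back
; together. AVX512 targets widen to i16/i32, use vpsrlvw/vpsrlvd and truncate;
; XOP shifts the bytes directly with vpshlb.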
1384define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
1385; SSE2-LABEL: constant_shift_v16i8:
1386; SSE2:       # %bb.0:
1387; SSE2-NEXT:    pxor %xmm1, %xmm1
1388; SSE2-NEXT:    movdqa %xmm0, %xmm2
1389; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1390; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [2,4,8,16,32,64,128,256]
1391; SSE2-NEXT:    psrlw $8, %xmm2
1392; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1393; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
1394; SSE2-NEXT:    psrlw $8, %xmm0
1395; SSE2-NEXT:    packuswb %xmm2, %xmm0
1396; SSE2-NEXT:    retq
1397;
1398; SSE41-LABEL: constant_shift_v16i8:
1399; SSE41:       # %bb.0:
1400; SSE41-NEXT:    pxor %xmm2, %xmm2
1401; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1402; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1403; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,4,8,16,32,64,128,256]
1404; SSE41-NEXT:    psrlw $8, %xmm0
1405; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2]
1406; SSE41-NEXT:    psrlw $8, %xmm1
1407; SSE41-NEXT:    packuswb %xmm0, %xmm1
1408; SSE41-NEXT:    movdqa %xmm1, %xmm0
1409; SSE41-NEXT:    retq
1410;
1411; AVX1-LABEL: constant_shift_v16i8:
1412; AVX1:       # %bb.0:
1413; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1414; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1415; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256]
1416; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
1417; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1418; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
1419; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1420; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1421; AVX1-NEXT:    retq
1422;
1423; AVX2-LABEL: constant_shift_v16i8:
1424; AVX2:       # %bb.0:
1425; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1426; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,2,4,8,16,32,64,128,256]
1427; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1428; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1429; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1430; AVX2-NEXT:    vzeroupper
1431; AVX2-NEXT:    retq
1432;
1433; XOP-LABEL: constant_shift_v16i8:
1434; XOP:       # %bb.0:
1435; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1436; XOP-NEXT:    retq
1437;
1438; AVX512DQ-LABEL: constant_shift_v16i8:
1439; AVX512DQ:       # %bb.0:
1440; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1441; AVX512DQ-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1442; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1443; AVX512DQ-NEXT:    vzeroupper
1444; AVX512DQ-NEXT:    retq
1445;
1446; AVX512BW-LABEL: constant_shift_v16i8:
1447; AVX512BW:       # %bb.0:
1448; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
1449; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1450; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
1451; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1452; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1453; AVX512BW-NEXT:    vzeroupper
1454; AVX512BW-NEXT:    retq
1455;
1456; AVX512DQVL-LABEL: constant_shift_v16i8:
1457; AVX512DQVL:       # %bb.0:
1458; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
1459; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1460; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1461; AVX512DQVL-NEXT:    vzeroupper
1462; AVX512DQVL-NEXT:    retq
1463;
1464; AVX512BWVL-LABEL: constant_shift_v16i8:
1465; AVX512BWVL:       # %bb.0:
1466; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
1467; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1468; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1469; AVX512BWVL-NEXT:    vzeroupper
1470; AVX512BWVL-NEXT:    retq
1471;
1472; X86-SSE-LABEL: constant_shift_v16i8:
1473; X86-SSE:       # %bb.0:
1474; X86-SSE-NEXT:    pxor %xmm1, %xmm1
1475; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1476; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1477; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [2,4,8,16,32,64,128,256]
1478; X86-SSE-NEXT:    psrlw $8, %xmm2
1479; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1480; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
1481; X86-SSE-NEXT:    psrlw $8, %xmm0
1482; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
1483; X86-SSE-NEXT:    retl
1484  %shift = lshr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
1485  ret <16 x i8> %shift
1486}
1487
1488;
1489; Uniform Constant Shifts
1490;
1491
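; Splat constant shifts of i64/i32/i16 map directly to the immediate-count
; psrlq/psrld/psrlw instructions; only the v16i8 case below needs extra masking.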
1492define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
1493; SSE-LABEL: splatconstant_shift_v2i64:
1494; SSE:       # %bb.0:
1495; SSE-NEXT:    psrlq $7, %xmm0
1496; SSE-NEXT:    retq
1497;
1498; AVX-LABEL: splatconstant_shift_v2i64:
1499; AVX:       # %bb.0:
1500; AVX-NEXT:    vpsrlq $7, %xmm0, %xmm0
1501; AVX-NEXT:    retq
1502;
1503; XOP-LABEL: splatconstant_shift_v2i64:
1504; XOP:       # %bb.0:
1505; XOP-NEXT:    vpsrlq $7, %xmm0, %xmm0
1506; XOP-NEXT:    retq
1507;
1508; AVX512-LABEL: splatconstant_shift_v2i64:
1509; AVX512:       # %bb.0:
1510; AVX512-NEXT:    vpsrlq $7, %xmm0, %xmm0
1511; AVX512-NEXT:    retq
1512;
1513; AVX512VL-LABEL: splatconstant_shift_v2i64:
1514; AVX512VL:       # %bb.0:
1515; AVX512VL-NEXT:    vpsrlq $7, %xmm0, %xmm0
1516; AVX512VL-NEXT:    retq
1517;
1518; X86-SSE-LABEL: splatconstant_shift_v2i64:
1519; X86-SSE:       # %bb.0:
1520; X86-SSE-NEXT:    psrlq $7, %xmm0
1521; X86-SSE-NEXT:    retl
1522  %shift = lshr <2 x i64> %a, <i64 7, i64 7>
1523  ret <2 x i64> %shift
1524}
1525
1526define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
1527; SSE-LABEL: splatconstant_shift_v4i32:
1528; SSE:       # %bb.0:
1529; SSE-NEXT:    psrld $5, %xmm0
1530; SSE-NEXT:    retq
1531;
1532; AVX-LABEL: splatconstant_shift_v4i32:
1533; AVX:       # %bb.0:
1534; AVX-NEXT:    vpsrld $5, %xmm0, %xmm0
1535; AVX-NEXT:    retq
1536;
1537; XOP-LABEL: splatconstant_shift_v4i32:
1538; XOP:       # %bb.0:
1539; XOP-NEXT:    vpsrld $5, %xmm0, %xmm0
1540; XOP-NEXT:    retq
1541;
1542; AVX512-LABEL: splatconstant_shift_v4i32:
1543; AVX512:       # %bb.0:
1544; AVX512-NEXT:    vpsrld $5, %xmm0, %xmm0
1545; AVX512-NEXT:    retq
1546;
1547; AVX512VL-LABEL: splatconstant_shift_v4i32:
1548; AVX512VL:       # %bb.0:
1549; AVX512VL-NEXT:    vpsrld $5, %xmm0, %xmm0
1550; AVX512VL-NEXT:    retq
1551;
1552; X86-SSE-LABEL: splatconstant_shift_v4i32:
1553; X86-SSE:       # %bb.0:
1554; X86-SSE-NEXT:    psrld $5, %xmm0
1555; X86-SSE-NEXT:    retl
1556  %shift = lshr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
1557  ret <4 x i32> %shift
1558}
1559
1560define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
1561; SSE-LABEL: splatconstant_shift_v8i16:
1562; SSE:       # %bb.0:
1563; SSE-NEXT:    psrlw $3, %xmm0
1564; SSE-NEXT:    retq
1565;
1566; AVX-LABEL: splatconstant_shift_v8i16:
1567; AVX:       # %bb.0:
1568; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
1569; AVX-NEXT:    retq
1570;
1571; XOP-LABEL: splatconstant_shift_v8i16:
1572; XOP:       # %bb.0:
1573; XOP-NEXT:    vpsrlw $3, %xmm0, %xmm0
1574; XOP-NEXT:    retq
1575;
1576; AVX512-LABEL: splatconstant_shift_v8i16:
1577; AVX512:       # %bb.0:
1578; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
1579; AVX512-NEXT:    retq
1580;
1581; AVX512VL-LABEL: splatconstant_shift_v8i16:
1582; AVX512VL:       # %bb.0:
1583; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
1584; AVX512VL-NEXT:    retq
1585;
1586; X86-SSE-LABEL: splatconstant_shift_v8i16:
1587; X86-SSE:       # %bb.0:
1588; X86-SSE-NEXT:    psrlw $3, %xmm0
1589; X86-SSE-NEXT:    retl
1590  %shift = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
1591  ret <8 x i16> %shift
1592}
1593
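; There is no per-byte shift instruction, so the splat v16i8 shift is done as a
; word shift (psrlw $3) plus a pand that clears the bits shifted in from the
; neighbouring byte; XOP can use vpshlb directly.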
1594define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
1595; SSE-LABEL: splatconstant_shift_v16i8:
1596; SSE:       # %bb.0:
1597; SSE-NEXT:    psrlw $3, %xmm0
1598; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1599; SSE-NEXT:    retq
1600;
1601; AVX-LABEL: splatconstant_shift_v16i8:
1602; AVX:       # %bb.0:
1603; AVX-NEXT:    vpsrlw $3, %xmm0, %xmm0
1604; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1605; AVX-NEXT:    retq
1606;
1607; XOP-LABEL: splatconstant_shift_v16i8:
1608; XOP:       # %bb.0:
1609; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1610; XOP-NEXT:    retq
1611;
1612; AVX512-LABEL: splatconstant_shift_v16i8:
1613; AVX512:       # %bb.0:
1614; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
1615; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1616; AVX512-NEXT:    retq
1617;
1618; AVX512VL-LABEL: splatconstant_shift_v16i8:
1619; AVX512VL:       # %bb.0:
1620; AVX512VL-NEXT:    vpsrlw $3, %xmm0, %xmm0
1621; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1622; AVX512VL-NEXT:    retq
1623;
1624; X86-SSE-LABEL: splatconstant_shift_v16i8:
1625; X86-SSE:       # %bb.0:
1626; X86-SSE-NEXT:    psrlw $3, %xmm0
1627; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1628; X86-SSE-NEXT:    retl
1629  %shift = lshr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
1630  ret <16 x i8> %shift
1631}
1632
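; Select between two splatted shift amounts, then shift: targets with a
; variable per-element shift (AVX2, XOP, AVX512) blend the splatted amounts and
; shift once (vpsrlvd / vpshld), while SSE performs both scalar-count psrld
; shifts and selects between the results with the condition mask.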
1633define <4 x i32> @vector_variable_shift_right(<4 x i1> %cond, <4 x i32> %x, <4 x i32> %y, <4 x i32> %z) nounwind {
1634; SSE2-LABEL: vector_variable_shift_right:
1635; SSE2:       # %bb.0:
1636; SSE2-NEXT:    xorps %xmm4, %xmm4
1637; SSE2-NEXT:    xorps %xmm5, %xmm5
1638; SSE2-NEXT:    movss {{.*#+}} xmm5 = xmm2[0],xmm5[1,2,3]
1639; SSE2-NEXT:    movss {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3]
1640; SSE2-NEXT:    pslld $31, %xmm0
1641; SSE2-NEXT:    psrad $31, %xmm0
1642; SSE2-NEXT:    movdqa %xmm3, %xmm1
1643; SSE2-NEXT:    psrld %xmm4, %xmm1
1644; SSE2-NEXT:    psrld %xmm5, %xmm3
1645; SSE2-NEXT:    pand %xmm0, %xmm1
1646; SSE2-NEXT:    pandn %xmm3, %xmm0
1647; SSE2-NEXT:    por %xmm1, %xmm0
1648; SSE2-NEXT:    retq
1649;
1650; SSE41-LABEL: vector_variable_shift_right:
1651; SSE41:       # %bb.0:
1652; SSE41-NEXT:    pslld $31, %xmm0
1653; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1654; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1655; SSE41-NEXT:    movdqa %xmm3, %xmm4
1656; SSE41-NEXT:    psrld %xmm1, %xmm4
1657; SSE41-NEXT:    psrld %xmm2, %xmm3
1658; SSE41-NEXT:    blendvps %xmm0, %xmm4, %xmm3
1659; SSE41-NEXT:    movaps %xmm3, %xmm0
1660; SSE41-NEXT:    retq
1661;
1662; AVX1-LABEL: vector_variable_shift_right:
1663; AVX1:       # %bb.0:
1664; AVX1-NEXT:    vpslld $31, %xmm0, %xmm0
1665; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
1666; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1667; AVX1-NEXT:    vpsrld %xmm1, %xmm3, %xmm1
1668; AVX1-NEXT:    vpsrld %xmm2, %xmm3, %xmm2
1669; AVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
1670; AVX1-NEXT:    retq
1671;
1672; AVX2-LABEL: vector_variable_shift_right:
1673; AVX2:       # %bb.0:
1674; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
1675; AVX2-NEXT:    vbroadcastss %xmm1, %xmm1
1676; AVX2-NEXT:    vbroadcastss %xmm2, %xmm2
1677; AVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
1678; AVX2-NEXT:    vpsrlvd %xmm0, %xmm3, %xmm0
1679; AVX2-NEXT:    retq
1680;
1681; XOPAVX1-LABEL: vector_variable_shift_right:
1682; XOPAVX1:       # %bb.0:
1683; XOPAVX1-NEXT:    vpslld $31, %xmm0, %xmm0
1684; XOPAVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
1685; XOPAVX1-NEXT:    vshufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
1686; XOPAVX1-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
1687; XOPAVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1688; XOPAVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
1689; XOPAVX1-NEXT:    vpshld %xmm0, %xmm3, %xmm0
1690; XOPAVX1-NEXT:    retq
1691;
1692; XOPAVX2-LABEL: vector_variable_shift_right:
1693; XOPAVX2:       # %bb.0:
1694; XOPAVX2-NEXT:    vpslld $31, %xmm0, %xmm0
1695; XOPAVX2-NEXT:    vbroadcastss %xmm1, %xmm1
1696; XOPAVX2-NEXT:    vbroadcastss %xmm2, %xmm2
1697; XOPAVX2-NEXT:    vblendvps %xmm0, %xmm1, %xmm2, %xmm0
1698; XOPAVX2-NEXT:    vpsrlvd %xmm0, %xmm3, %xmm0
1699; XOPAVX2-NEXT:    retq
1700;
1701; AVX512DQ-LABEL: vector_variable_shift_right:
1702; AVX512DQ:       # %bb.0:
1703; AVX512DQ-NEXT:    vpslld $31, %xmm0, %xmm0
1704; AVX512DQ-NEXT:    vpmovd2m %zmm0, %k1
1705; AVX512DQ-NEXT:    vpbroadcastd %xmm1, %xmm0
1706; AVX512DQ-NEXT:    vpbroadcastd %xmm2, %xmm1
1707; AVX512DQ-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
1708; AVX512DQ-NEXT:    vpsrlvd %xmm1, %xmm3, %xmm0
1709; AVX512DQ-NEXT:    vzeroupper
1710; AVX512DQ-NEXT:    retq
1711;
1712; AVX512BW-LABEL: vector_variable_shift_right:
1713; AVX512BW:       # %bb.0:
1714; AVX512BW-NEXT:    vpslld $31, %xmm0, %xmm0
1715; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k1
1716; AVX512BW-NEXT:    vpbroadcastd %xmm1, %xmm0
1717; AVX512BW-NEXT:    vpbroadcastd %xmm2, %xmm1
1718; AVX512BW-NEXT:    vmovdqa32 %zmm0, %zmm1 {%k1}
1719; AVX512BW-NEXT:    vpsrlvd %xmm1, %xmm3, %xmm0
1720; AVX512BW-NEXT:    vzeroupper
1721; AVX512BW-NEXT:    retq
1722;
1723; AVX512DQVL-LABEL: vector_variable_shift_right:
1724; AVX512DQVL:       # %bb.0:
1725; AVX512DQVL-NEXT:    vpslld $31, %xmm0, %xmm0
1726; AVX512DQVL-NEXT:    vpmovd2m %xmm0, %k1
1727; AVX512DQVL-NEXT:    vpbroadcastd %xmm2, %xmm0
1728; AVX512DQVL-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
1729; AVX512DQVL-NEXT:    vpsrlvd %xmm0, %xmm3, %xmm0
1730; AVX512DQVL-NEXT:    retq
1731;
1732; AVX512BWVL-LABEL: vector_variable_shift_right:
1733; AVX512BWVL:       # %bb.0:
1734; AVX512BWVL-NEXT:    vpslld $31, %xmm0, %xmm0
1735; AVX512BWVL-NEXT:    vptestmd %xmm0, %xmm0, %k1
1736; AVX512BWVL-NEXT:    vpbroadcastd %xmm2, %xmm0
1737; AVX512BWVL-NEXT:    vpbroadcastd %xmm1, %xmm0 {%k1}
1738; AVX512BWVL-NEXT:    vpsrlvd %xmm0, %xmm3, %xmm0
1739; AVX512BWVL-NEXT:    retq
1740;
1741; X86-SSE-LABEL: vector_variable_shift_right:
1742; X86-SSE:       # %bb.0:
1743; X86-SSE-NEXT:    pushl %ebp
1744; X86-SSE-NEXT:    movl %esp, %ebp
1745; X86-SSE-NEXT:    andl $-16, %esp
1746; X86-SSE-NEXT:    subl $16, %esp
1747; X86-SSE-NEXT:    xorps %xmm3, %xmm3
1748; X86-SSE-NEXT:    xorps %xmm4, %xmm4
1749; X86-SSE-NEXT:    movss {{.*#+}} xmm4 = xmm2[0],xmm4[1,2,3]
1750; X86-SSE-NEXT:    movss {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3]
1751; X86-SSE-NEXT:    pslld $31, %xmm0
1752; X86-SSE-NEXT:    psrad $31, %xmm0
1753; X86-SSE-NEXT:    movdqa 8(%ebp), %xmm1
1754; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
1755; X86-SSE-NEXT:    psrld %xmm3, %xmm2
1756; X86-SSE-NEXT:    psrld %xmm4, %xmm1
1757; X86-SSE-NEXT:    pand %xmm0, %xmm2
1758; X86-SSE-NEXT:    pandn %xmm1, %xmm0
1759; X86-SSE-NEXT:    por %xmm2, %xmm0
1760; X86-SSE-NEXT:    movl %ebp, %esp
1761; X86-SSE-NEXT:    popl %ebp
1762; X86-SSE-NEXT:    retl
1763  %splat1 = shufflevector <4 x i32> %x, <4 x i32> poison, <4 x i32> zeroinitializer
1764  %splat2 = shufflevector <4 x i32> %y, <4 x i32> poison, <4 x i32> zeroinitializer
1765  %sel = select <4 x i1> %cond, <4 x i32> %splat1, <4 x i32> %splat2
1766  %sh = lshr <4 x i32> %z, %sel
1767  ret <4 x i32> %sh
1768}
1769