; Source: llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll (revision 2a922903bf5d5b0012c1f8f2a5396d44cfff4630)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
;
; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE

;
; Variable Shifts
;

define <2 x i32> @var_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
; Per-element arithmetic shift right of a <2 x i32> by variable amounts;
; the assertions below were autogenerated by update_llc_test_checks.py.
; SSE2-LABEL: var_shift_v2i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm3
; SSE2-NEXT:    psrad %xmm2, %xmm3
; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm2
; SSE2-NEXT:    psrad %xmm4, %xmm2
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm0, %xmm4
; SSE2-NEXT:    psrad %xmm3, %xmm4
; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrad %xmm1, %xmm0
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; SSE2-NEXT:    movaps %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v2i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad %xmm2, %xmm3
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm5
; SSE41-NEXT:    psrad %xmm4, %xmm5
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    psrad %xmm1, %xmm3
; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrad %xmm1, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v2i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v2i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v2i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v2i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: var_shift_v2i32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
; X86-SSE-NEXT:    psrad %xmm2, %xmm3
; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    psrad %xmm4, %xmm2
; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
; X86-SSE-NEXT:    psrad %xmm3, %xmm4
; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
; X86-SSE-NEXT:    psrad %xmm1, %xmm0
; X86-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
; X86-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
; X86-SSE-NEXT:    movaps %xmm2, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i32> %a, %b
  ret <2 x i32> %shift
}
define <4 x i16> @var_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
; Per-element arithmetic shift right of a <4 x i16> by variable amounts;
; the assertions below were autogenerated by update_llc_test_checks.py.
; SSE2-LABEL: var_shift_v4i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $12, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v4i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psllw $12, %xmm0
; SSE41-NEXT:    psllw $4, %xmm1
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $8, %xmm3
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $4, %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $2, %xmm3
; SSE41-NEXT:    paddw %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $1, %xmm3
; SSE41-NEXT:    paddw %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v4i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_shift_v4i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v4i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v4i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v4i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v4i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: var_shift_v4i16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psllw $12, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    psraw $15, %xmm2
; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
; X86-SSE-NEXT:    pandn %xmm0, %xmm3
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    pand %xmm2, %xmm0
; X86-SSE-NEXT:    por %xmm3, %xmm0
; X86-SSE-NEXT:    paddw %xmm1, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    psraw $15, %xmm2
; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
; X86-SSE-NEXT:    pandn %xmm0, %xmm3
; X86-SSE-NEXT:    psraw $4, %xmm0
; X86-SSE-NEXT:    pand %xmm2, %xmm0
; X86-SSE-NEXT:    por %xmm3, %xmm0
; X86-SSE-NEXT:    paddw %xmm1, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    psraw $15, %xmm2
; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
; X86-SSE-NEXT:    pandn %xmm0, %xmm3
; X86-SSE-NEXT:    psraw $2, %xmm0
; X86-SSE-NEXT:    pand %xmm2, %xmm0
; X86-SSE-NEXT:    por %xmm3, %xmm0
; X86-SSE-NEXT:    paddw %xmm1, %xmm1
; X86-SSE-NEXT:    psraw $15, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    pandn %xmm0, %xmm2
; X86-SSE-NEXT:    psraw $1, %xmm0
; X86-SSE-NEXT:    pand %xmm1, %xmm0
; X86-SSE-NEXT:    por %xmm2, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <4 x i16> %a, %b
  ret <4 x i16> %shift
}
define <2 x i16> @var_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
; Per-element arithmetic shift right of a <2 x i16> by variable amounts;
; the assertions below were autogenerated by update_llc_test_checks.py.
; SSE2-LABEL: var_shift_v2i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    psllw $12, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $8, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    psraw $15, %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm3
; SSE2-NEXT:    pandn %xmm0, %xmm3
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm2
; SSE2-NEXT:    pandn %xmm0, %xmm2
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    por %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v2i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    psllw $12, %xmm0
; SSE41-NEXT:    psllw $4, %xmm1
; SSE41-NEXT:    por %xmm1, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    paddw %xmm0, %xmm1
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $8, %xmm3
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $4, %xmm3
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $2, %xmm3
; SSE41-NEXT:    paddw %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psraw $1, %xmm3
; SSE41-NEXT:    paddw %xmm1, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_shift_v2i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm3
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v2i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: var_shift_v2i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v2i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v2i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v2i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v2i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: var_shift_v2i16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psllw $12, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    psraw $15, %xmm2
; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
; X86-SSE-NEXT:    pandn %xmm0, %xmm3
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    pand %xmm2, %xmm0
; X86-SSE-NEXT:    por %xmm3, %xmm0
; X86-SSE-NEXT:    paddw %xmm1, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    psraw $15, %xmm2
; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
; X86-SSE-NEXT:    pandn %xmm0, %xmm3
; X86-SSE-NEXT:    psraw $4, %xmm0
; X86-SSE-NEXT:    pand %xmm2, %xmm0
; X86-SSE-NEXT:    por %xmm3, %xmm0
; X86-SSE-NEXT:    paddw %xmm1, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    psraw $15, %xmm2
; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
; X86-SSE-NEXT:    pandn %xmm0, %xmm3
; X86-SSE-NEXT:    psraw $2, %xmm0
; X86-SSE-NEXT:    pand %xmm2, %xmm0
; X86-SSE-NEXT:    por %xmm3, %xmm0
; X86-SSE-NEXT:    paddw %xmm1, %xmm1
; X86-SSE-NEXT:    psraw $15, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
; X86-SSE-NEXT:    pandn %xmm0, %xmm2
; X86-SSE-NEXT:    psraw $1, %xmm0
; X86-SSE-NEXT:    pand %xmm1, %xmm0
; X86-SSE-NEXT:    por %xmm2, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i16> %a, %b
  ret <2 x i16> %shift
}
define <8 x i8> @var_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
; Per-element arithmetic shift right of an <8 x i8> by variable amounts;
; the assertions below were autogenerated by update_llc_test_checks.py.
; SSE2-LABEL: var_shift_v8i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; SSE2-NEXT:    psllw $5, %xmm1
; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm2, %xmm6
; SSE2-NEXT:    psraw $4, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm6
; SSE2-NEXT:    pandn %xmm2, %xmm6
; SSE2-NEXT:    psraw $2, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    por %xmm6, %xmm2
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm5, %xmm5
; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
; SSE2-NEXT:    movdqa %xmm5, %xmm4
; SSE2-NEXT:    pandn %xmm2, %xmm4
; SSE2-NEXT:    psraw $1, %xmm2
; SSE2-NEXT:    pand %xmm5, %xmm2
; SSE2-NEXT:    por %xmm4, %xmm2
; SSE2-NEXT:    psrlw $8, %xmm2
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $4, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psraw $2, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddw %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    pand %xmm3, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    psrlw $8, %xmm0
; SSE2-NEXT:    packuswb %xmm2, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_shift_v8i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    psllw $5, %xmm1
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $4, %xmm4
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $2, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    psraw $1, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
; SSE41-NEXT:    psrlw $8, %xmm3
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $4, %xmm2
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $2, %xmm2
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm2
; SSE41-NEXT:    psraw $1, %xmm2
; SSE41-NEXT:    paddw %xmm0, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
; SSE41-NEXT:    psrlw $8, %xmm1
; SSE41-NEXT:    packuswb %xmm3, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: var_shift_v8i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: var_shift_v8i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v8i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v8i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v8i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: var_shift_v8i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
; X86-SSE-NEXT:    psllw $5, %xmm1
; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
; X86-SSE-NEXT:    pxor %xmm3, %xmm3
; X86-SSE-NEXT:    pxor %xmm5, %xmm5
; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
; X86-SSE-NEXT:    pandn %xmm2, %xmm6
; X86-SSE-NEXT:    psraw $4, %xmm2
; X86-SSE-NEXT:    pand %xmm5, %xmm2
; X86-SSE-NEXT:    por %xmm6, %xmm2
; X86-SSE-NEXT:    paddw %xmm4, %xmm4
; X86-SSE-NEXT:    pxor %xmm5, %xmm5
; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
; X86-SSE-NEXT:    pandn %xmm2, %xmm6
; X86-SSE-NEXT:    psraw $2, %xmm2
; X86-SSE-NEXT:    pand %xmm5, %xmm2
; X86-SSE-NEXT:    por %xmm6, %xmm2
; X86-SSE-NEXT:    paddw %xmm4, %xmm4
; X86-SSE-NEXT:    pxor %xmm5, %xmm5
; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
; X86-SSE-NEXT:    pandn %xmm2, %xmm4
; X86-SSE-NEXT:    psraw $1, %xmm2
; X86-SSE-NEXT:    pand %xmm5, %xmm2
; X86-SSE-NEXT:    por %xmm4, %xmm2
; X86-SSE-NEXT:    psrlw $8, %xmm2
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    pxor %xmm4, %xmm4
; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
; X86-SSE-NEXT:    pandn %xmm0, %xmm5
; X86-SSE-NEXT:    psraw $4, %xmm0
; X86-SSE-NEXT:    pand %xmm4, %xmm0
; X86-SSE-NEXT:    por %xmm5, %xmm0
; X86-SSE-NEXT:    paddw %xmm1, %xmm1
; X86-SSE-NEXT:    pxor %xmm4, %xmm4
; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
; X86-SSE-NEXT:    pandn %xmm0, %xmm5
; X86-SSE-NEXT:    psraw $2, %xmm0
; X86-SSE-NEXT:    pand %xmm4, %xmm0
; X86-SSE-NEXT:    por %xmm5, %xmm0
; X86-SSE-NEXT:    paddw %xmm1, %xmm1
; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
; X86-SSE-NEXT:    pandn %xmm0, %xmm1
; X86-SSE-NEXT:    psraw $1, %xmm0
; X86-SSE-NEXT:    pand %xmm3, %xmm0
; X86-SSE-NEXT:    por %xmm1, %xmm0
; X86-SSE-NEXT:    psrlw $8, %xmm0
; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <8 x i8> %a, %b
  ret <8 x i8> %shift
}
; Variable per-element arithmetic shift right of <4 x i8>.
; SSE lowerings widen the byte lanes to words (punpckhbw/punpcklbw), emulate the
; variable shift with psraw $4/$2/$1 steps selected by sign-bit masks (pcmpgtw +
; pand/pandn/por on SSE2, pblendvb on SSE4.1/AVX), then narrow back via
; psrlw $8 + packuswb. XOP negates the counts and uses vpshab directly; AVX512
; sign-extends and uses vpsravd/vpsravw.
; NOTE: the CHECK lines are autogenerated by update_llc_test_checks.py —
; regenerate them rather than editing by hand.
696define <4 x i8> @var_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
697; SSE2-LABEL: var_shift_v4i8:
698; SSE2:       # %bb.0:
699; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
700; SSE2-NEXT:    psllw $5, %xmm1
701; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
702; SSE2-NEXT:    pxor %xmm3, %xmm3
703; SSE2-NEXT:    pxor %xmm5, %xmm5
704; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
705; SSE2-NEXT:    movdqa %xmm5, %xmm6
706; SSE2-NEXT:    pandn %xmm2, %xmm6
707; SSE2-NEXT:    psraw $4, %xmm2
708; SSE2-NEXT:    pand %xmm5, %xmm2
709; SSE2-NEXT:    por %xmm6, %xmm2
710; SSE2-NEXT:    paddw %xmm4, %xmm4
711; SSE2-NEXT:    pxor %xmm5, %xmm5
712; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
713; SSE2-NEXT:    movdqa %xmm5, %xmm6
714; SSE2-NEXT:    pandn %xmm2, %xmm6
715; SSE2-NEXT:    psraw $2, %xmm2
716; SSE2-NEXT:    pand %xmm5, %xmm2
717; SSE2-NEXT:    por %xmm6, %xmm2
718; SSE2-NEXT:    paddw %xmm4, %xmm4
719; SSE2-NEXT:    pxor %xmm5, %xmm5
720; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
721; SSE2-NEXT:    movdqa %xmm5, %xmm4
722; SSE2-NEXT:    pandn %xmm2, %xmm4
723; SSE2-NEXT:    psraw $1, %xmm2
724; SSE2-NEXT:    pand %xmm5, %xmm2
725; SSE2-NEXT:    por %xmm4, %xmm2
726; SSE2-NEXT:    psrlw $8, %xmm2
727; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
728; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
729; SSE2-NEXT:    pxor %xmm4, %xmm4
730; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
731; SSE2-NEXT:    movdqa %xmm4, %xmm5
732; SSE2-NEXT:    pandn %xmm0, %xmm5
733; SSE2-NEXT:    psraw $4, %xmm0
734; SSE2-NEXT:    pand %xmm4, %xmm0
735; SSE2-NEXT:    por %xmm5, %xmm0
736; SSE2-NEXT:    paddw %xmm1, %xmm1
737; SSE2-NEXT:    pxor %xmm4, %xmm4
738; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
739; SSE2-NEXT:    movdqa %xmm4, %xmm5
740; SSE2-NEXT:    pandn %xmm0, %xmm5
741; SSE2-NEXT:    psraw $2, %xmm0
742; SSE2-NEXT:    pand %xmm4, %xmm0
743; SSE2-NEXT:    por %xmm5, %xmm0
744; SSE2-NEXT:    paddw %xmm1, %xmm1
745; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
746; SSE2-NEXT:    movdqa %xmm3, %xmm1
747; SSE2-NEXT:    pandn %xmm0, %xmm1
748; SSE2-NEXT:    psraw $1, %xmm0
749; SSE2-NEXT:    pand %xmm3, %xmm0
750; SSE2-NEXT:    por %xmm1, %xmm0
751; SSE2-NEXT:    psrlw $8, %xmm0
752; SSE2-NEXT:    packuswb %xmm2, %xmm0
753; SSE2-NEXT:    retq
754;
755; SSE41-LABEL: var_shift_v4i8:
756; SSE41:       # %bb.0:
757; SSE41-NEXT:    movdqa %xmm0, %xmm2
758; SSE41-NEXT:    psllw $5, %xmm1
759; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
760; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
761; SSE41-NEXT:    movdqa %xmm3, %xmm4
762; SSE41-NEXT:    psraw $4, %xmm4
763; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
764; SSE41-NEXT:    movdqa %xmm3, %xmm4
765; SSE41-NEXT:    psraw $2, %xmm4
766; SSE41-NEXT:    paddw %xmm0, %xmm0
767; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
768; SSE41-NEXT:    movdqa %xmm3, %xmm4
769; SSE41-NEXT:    psraw $1, %xmm4
770; SSE41-NEXT:    paddw %xmm0, %xmm0
771; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
772; SSE41-NEXT:    psrlw $8, %xmm3
773; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
774; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
775; SSE41-NEXT:    movdqa %xmm1, %xmm2
776; SSE41-NEXT:    psraw $4, %xmm2
777; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
778; SSE41-NEXT:    movdqa %xmm1, %xmm2
779; SSE41-NEXT:    psraw $2, %xmm2
780; SSE41-NEXT:    paddw %xmm0, %xmm0
781; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
782; SSE41-NEXT:    movdqa %xmm1, %xmm2
783; SSE41-NEXT:    psraw $1, %xmm2
784; SSE41-NEXT:    paddw %xmm0, %xmm0
785; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
786; SSE41-NEXT:    psrlw $8, %xmm1
787; SSE41-NEXT:    packuswb %xmm3, %xmm1
788; SSE41-NEXT:    movdqa %xmm1, %xmm0
789; SSE41-NEXT:    retq
790;
791; AVX-LABEL: var_shift_v4i8:
792; AVX:       # %bb.0:
793; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
794; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
795; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
796; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
797; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
798; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
799; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
800; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
801; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
802; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
803; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
804; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
805; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
806; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
807; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
808; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
809; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
810; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
811; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
812; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
813; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
814; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
815; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
816; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
817; AVX-NEXT:    retq
818;
819; XOP-LABEL: var_shift_v4i8:
820; XOP:       # %bb.0:
821; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
822; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
823; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
824; XOP-NEXT:    retq
825;
826; AVX512DQ-LABEL: var_shift_v4i8:
827; AVX512DQ:       # %bb.0:
828; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
829; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
830; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
831; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
832; AVX512DQ-NEXT:    vzeroupper
833; AVX512DQ-NEXT:    retq
834;
835; AVX512BW-LABEL: var_shift_v4i8:
836; AVX512BW:       # %bb.0:
837; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
838; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
839; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
840; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
841; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
842; AVX512BW-NEXT:    vzeroupper
843; AVX512BW-NEXT:    retq
844;
845; AVX512DQVL-LABEL: var_shift_v4i8:
846; AVX512DQVL:       # %bb.0:
847; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
848; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
849; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
850; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
851; AVX512DQVL-NEXT:    vzeroupper
852; AVX512DQVL-NEXT:    retq
853;
854; AVX512BWVL-LABEL: var_shift_v4i8:
855; AVX512BWVL:       # %bb.0:
856; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
857; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
858; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
859; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
860; AVX512BWVL-NEXT:    vzeroupper
861; AVX512BWVL-NEXT:    retq
862;
863; X86-SSE-LABEL: var_shift_v4i8:
864; X86-SSE:       # %bb.0:
865; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
866; X86-SSE-NEXT:    psllw $5, %xmm1
867; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
868; X86-SSE-NEXT:    pxor %xmm3, %xmm3
869; X86-SSE-NEXT:    pxor %xmm5, %xmm5
870; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
871; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
872; X86-SSE-NEXT:    pandn %xmm2, %xmm6
873; X86-SSE-NEXT:    psraw $4, %xmm2
874; X86-SSE-NEXT:    pand %xmm5, %xmm2
875; X86-SSE-NEXT:    por %xmm6, %xmm2
876; X86-SSE-NEXT:    paddw %xmm4, %xmm4
877; X86-SSE-NEXT:    pxor %xmm5, %xmm5
878; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
879; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
880; X86-SSE-NEXT:    pandn %xmm2, %xmm6
881; X86-SSE-NEXT:    psraw $2, %xmm2
882; X86-SSE-NEXT:    pand %xmm5, %xmm2
883; X86-SSE-NEXT:    por %xmm6, %xmm2
884; X86-SSE-NEXT:    paddw %xmm4, %xmm4
885; X86-SSE-NEXT:    pxor %xmm5, %xmm5
886; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
887; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
888; X86-SSE-NEXT:    pandn %xmm2, %xmm4
889; X86-SSE-NEXT:    psraw $1, %xmm2
890; X86-SSE-NEXT:    pand %xmm5, %xmm2
891; X86-SSE-NEXT:    por %xmm4, %xmm2
892; X86-SSE-NEXT:    psrlw $8, %xmm2
893; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
894; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
895; X86-SSE-NEXT:    pxor %xmm4, %xmm4
896; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
897; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
898; X86-SSE-NEXT:    pandn %xmm0, %xmm5
899; X86-SSE-NEXT:    psraw $4, %xmm0
900; X86-SSE-NEXT:    pand %xmm4, %xmm0
901; X86-SSE-NEXT:    por %xmm5, %xmm0
902; X86-SSE-NEXT:    paddw %xmm1, %xmm1
903; X86-SSE-NEXT:    pxor %xmm4, %xmm4
904; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
905; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
906; X86-SSE-NEXT:    pandn %xmm0, %xmm5
907; X86-SSE-NEXT:    psraw $2, %xmm0
908; X86-SSE-NEXT:    pand %xmm4, %xmm0
909; X86-SSE-NEXT:    por %xmm5, %xmm0
910; X86-SSE-NEXT:    paddw %xmm1, %xmm1
911; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
912; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
913; X86-SSE-NEXT:    pandn %xmm0, %xmm1
914; X86-SSE-NEXT:    psraw $1, %xmm0
915; X86-SSE-NEXT:    pand %xmm3, %xmm0
916; X86-SSE-NEXT:    por %xmm1, %xmm0
917; X86-SSE-NEXT:    psrlw $8, %xmm0
918; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
919; X86-SSE-NEXT:    retl
920  %shift = ashr <4 x i8> %a, %b
921  ret <4 x i8> %shift
922}
923
; Variable per-element arithmetic shift right of <2 x i8>.
; Lowering is identical to the wider sub-128-bit byte cases: widen to i16 lanes,
; emulate the variable shift with psraw $4/$2/$1 + sign-mask selects (SSE) or
; vpblendvb (AVX), then narrow with psrlw $8 + packuswb; XOP uses vpshab with
; negated counts, AVX512 uses vpsravd/vpsravw on extended lanes.
; NOTE: the CHECK lines are autogenerated by update_llc_test_checks.py —
; regenerate them rather than editing by hand.
924define <2 x i8> @var_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
925; SSE2-LABEL: var_shift_v2i8:
926; SSE2:       # %bb.0:
927; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
928; SSE2-NEXT:    psllw $5, %xmm1
929; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
930; SSE2-NEXT:    pxor %xmm3, %xmm3
931; SSE2-NEXT:    pxor %xmm5, %xmm5
932; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
933; SSE2-NEXT:    movdqa %xmm5, %xmm6
934; SSE2-NEXT:    pandn %xmm2, %xmm6
935; SSE2-NEXT:    psraw $4, %xmm2
936; SSE2-NEXT:    pand %xmm5, %xmm2
937; SSE2-NEXT:    por %xmm6, %xmm2
938; SSE2-NEXT:    paddw %xmm4, %xmm4
939; SSE2-NEXT:    pxor %xmm5, %xmm5
940; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
941; SSE2-NEXT:    movdqa %xmm5, %xmm6
942; SSE2-NEXT:    pandn %xmm2, %xmm6
943; SSE2-NEXT:    psraw $2, %xmm2
944; SSE2-NEXT:    pand %xmm5, %xmm2
945; SSE2-NEXT:    por %xmm6, %xmm2
946; SSE2-NEXT:    paddw %xmm4, %xmm4
947; SSE2-NEXT:    pxor %xmm5, %xmm5
948; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
949; SSE2-NEXT:    movdqa %xmm5, %xmm4
950; SSE2-NEXT:    pandn %xmm2, %xmm4
951; SSE2-NEXT:    psraw $1, %xmm2
952; SSE2-NEXT:    pand %xmm5, %xmm2
953; SSE2-NEXT:    por %xmm4, %xmm2
954; SSE2-NEXT:    psrlw $8, %xmm2
955; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
956; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
957; SSE2-NEXT:    pxor %xmm4, %xmm4
958; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
959; SSE2-NEXT:    movdqa %xmm4, %xmm5
960; SSE2-NEXT:    pandn %xmm0, %xmm5
961; SSE2-NEXT:    psraw $4, %xmm0
962; SSE2-NEXT:    pand %xmm4, %xmm0
963; SSE2-NEXT:    por %xmm5, %xmm0
964; SSE2-NEXT:    paddw %xmm1, %xmm1
965; SSE2-NEXT:    pxor %xmm4, %xmm4
966; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
967; SSE2-NEXT:    movdqa %xmm4, %xmm5
968; SSE2-NEXT:    pandn %xmm0, %xmm5
969; SSE2-NEXT:    psraw $2, %xmm0
970; SSE2-NEXT:    pand %xmm4, %xmm0
971; SSE2-NEXT:    por %xmm5, %xmm0
972; SSE2-NEXT:    paddw %xmm1, %xmm1
973; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
974; SSE2-NEXT:    movdqa %xmm3, %xmm1
975; SSE2-NEXT:    pandn %xmm0, %xmm1
976; SSE2-NEXT:    psraw $1, %xmm0
977; SSE2-NEXT:    pand %xmm3, %xmm0
978; SSE2-NEXT:    por %xmm1, %xmm0
979; SSE2-NEXT:    psrlw $8, %xmm0
980; SSE2-NEXT:    packuswb %xmm2, %xmm0
981; SSE2-NEXT:    retq
982;
983; SSE41-LABEL: var_shift_v2i8:
984; SSE41:       # %bb.0:
985; SSE41-NEXT:    movdqa %xmm0, %xmm2
986; SSE41-NEXT:    psllw $5, %xmm1
987; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
988; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
989; SSE41-NEXT:    movdqa %xmm3, %xmm4
990; SSE41-NEXT:    psraw $4, %xmm4
991; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
992; SSE41-NEXT:    movdqa %xmm3, %xmm4
993; SSE41-NEXT:    psraw $2, %xmm4
994; SSE41-NEXT:    paddw %xmm0, %xmm0
995; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
996; SSE41-NEXT:    movdqa %xmm3, %xmm4
997; SSE41-NEXT:    psraw $1, %xmm4
998; SSE41-NEXT:    paddw %xmm0, %xmm0
999; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
1000; SSE41-NEXT:    psrlw $8, %xmm3
1001; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
1002; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1003; SSE41-NEXT:    movdqa %xmm1, %xmm2
1004; SSE41-NEXT:    psraw $4, %xmm2
1005; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1006; SSE41-NEXT:    movdqa %xmm1, %xmm2
1007; SSE41-NEXT:    psraw $2, %xmm2
1008; SSE41-NEXT:    paddw %xmm0, %xmm0
1009; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1010; SSE41-NEXT:    movdqa %xmm1, %xmm2
1011; SSE41-NEXT:    psraw $1, %xmm2
1012; SSE41-NEXT:    paddw %xmm0, %xmm0
1013; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
1014; SSE41-NEXT:    psrlw $8, %xmm1
1015; SSE41-NEXT:    packuswb %xmm3, %xmm1
1016; SSE41-NEXT:    movdqa %xmm1, %xmm0
1017; SSE41-NEXT:    retq
1018;
1019; AVX-LABEL: var_shift_v2i8:
1020; AVX:       # %bb.0:
1021; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
1022; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1023; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1024; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
1025; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
1026; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
1027; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
1028; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
1029; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
1030; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
1031; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
1032; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
1033; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1034; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1035; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
1036; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1037; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
1038; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
1039; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1040; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
1041; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
1042; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
1043; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1044; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1045; AVX-NEXT:    retq
1046;
1047; XOP-LABEL: var_shift_v2i8:
1048; XOP:       # %bb.0:
1049; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1050; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1051; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1052; XOP-NEXT:    retq
1053;
1054; AVX512DQ-LABEL: var_shift_v2i8:
1055; AVX512DQ:       # %bb.0:
1056; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1057; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1058; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
1059; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1060; AVX512DQ-NEXT:    vzeroupper
1061; AVX512DQ-NEXT:    retq
1062;
1063; AVX512BW-LABEL: var_shift_v2i8:
1064; AVX512BW:       # %bb.0:
1065; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1066; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1067; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1068; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1069; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1070; AVX512BW-NEXT:    vzeroupper
1071; AVX512BW-NEXT:    retq
1072;
1073; AVX512DQVL-LABEL: var_shift_v2i8:
1074; AVX512DQVL:       # %bb.0:
1075; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
1076; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1077; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
1078; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1079; AVX512DQVL-NEXT:    vzeroupper
1080; AVX512DQVL-NEXT:    retq
1081;
1082; AVX512BWVL-LABEL: var_shift_v2i8:
1083; AVX512BWVL:       # %bb.0:
1084; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
1085; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1086; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
1087; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1088; AVX512BWVL-NEXT:    vzeroupper
1089; AVX512BWVL-NEXT:    retq
1090;
1091; X86-SSE-LABEL: var_shift_v2i8:
1092; X86-SSE:       # %bb.0:
1093; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
1094; X86-SSE-NEXT:    psllw $5, %xmm1
1095; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
1096; X86-SSE-NEXT:    pxor %xmm3, %xmm3
1097; X86-SSE-NEXT:    pxor %xmm5, %xmm5
1098; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
1099; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
1100; X86-SSE-NEXT:    pandn %xmm2, %xmm6
1101; X86-SSE-NEXT:    psraw $4, %xmm2
1102; X86-SSE-NEXT:    pand %xmm5, %xmm2
1103; X86-SSE-NEXT:    por %xmm6, %xmm2
1104; X86-SSE-NEXT:    paddw %xmm4, %xmm4
1105; X86-SSE-NEXT:    pxor %xmm5, %xmm5
1106; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
1107; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
1108; X86-SSE-NEXT:    pandn %xmm2, %xmm6
1109; X86-SSE-NEXT:    psraw $2, %xmm2
1110; X86-SSE-NEXT:    pand %xmm5, %xmm2
1111; X86-SSE-NEXT:    por %xmm6, %xmm2
1112; X86-SSE-NEXT:    paddw %xmm4, %xmm4
1113; X86-SSE-NEXT:    pxor %xmm5, %xmm5
1114; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
1115; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
1116; X86-SSE-NEXT:    pandn %xmm2, %xmm4
1117; X86-SSE-NEXT:    psraw $1, %xmm2
1118; X86-SSE-NEXT:    pand %xmm5, %xmm2
1119; X86-SSE-NEXT:    por %xmm4, %xmm2
1120; X86-SSE-NEXT:    psrlw $8, %xmm2
1121; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1122; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1123; X86-SSE-NEXT:    pxor %xmm4, %xmm4
1124; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
1125; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
1126; X86-SSE-NEXT:    pandn %xmm0, %xmm5
1127; X86-SSE-NEXT:    psraw $4, %xmm0
1128; X86-SSE-NEXT:    pand %xmm4, %xmm0
1129; X86-SSE-NEXT:    por %xmm5, %xmm0
1130; X86-SSE-NEXT:    paddw %xmm1, %xmm1
1131; X86-SSE-NEXT:    pxor %xmm4, %xmm4
1132; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
1133; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
1134; X86-SSE-NEXT:    pandn %xmm0, %xmm5
1135; X86-SSE-NEXT:    psraw $2, %xmm0
1136; X86-SSE-NEXT:    pand %xmm4, %xmm0
1137; X86-SSE-NEXT:    por %xmm5, %xmm0
1138; X86-SSE-NEXT:    paddw %xmm1, %xmm1
1139; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
1140; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
1141; X86-SSE-NEXT:    pandn %xmm0, %xmm1
1142; X86-SSE-NEXT:    psraw $1, %xmm0
1143; X86-SSE-NEXT:    pand %xmm3, %xmm0
1144; X86-SSE-NEXT:    por %xmm1, %xmm0
1145; X86-SSE-NEXT:    psrlw $8, %xmm0
1146; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
1147; X86-SSE-NEXT:    retl
1148  %shift = ashr <2 x i8> %a, %b
1149  ret <2 x i8> %shift
1150}
1151
1152;
1153; Uniform Variable Shifts
1154;
1155
; Uniform (splatted) arithmetic shift right of <2 x i32>: all lanes shift by
; %b[0], so codegen isolates element 0 as a scalar count (movss with zeroed
; upper lanes on SSE2, pmovzxdq elsewhere) and uses a single psrad/vpsrad.
; NOTE: the CHECK lines are autogenerated by update_llc_test_checks.py —
; regenerate them rather than editing by hand.
1156define <2 x i32> @splatvar_shift_v2i32(<2 x i32> %a, <2 x i32> %b) nounwind {
1157; SSE2-LABEL: splatvar_shift_v2i32:
1158; SSE2:       # %bb.0:
1159; SSE2-NEXT:    xorps %xmm2, %xmm2
1160; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1161; SSE2-NEXT:    psrad %xmm2, %xmm0
1162; SSE2-NEXT:    retq
1163;
1164; SSE41-LABEL: splatvar_shift_v2i32:
1165; SSE41:       # %bb.0:
1166; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1167; SSE41-NEXT:    psrad %xmm1, %xmm0
1168; SSE41-NEXT:    retq
1169;
1170; AVX-LABEL: splatvar_shift_v2i32:
1171; AVX:       # %bb.0:
1172; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1173; AVX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1174; AVX-NEXT:    retq
1175;
1176; XOP-LABEL: splatvar_shift_v2i32:
1177; XOP:       # %bb.0:
1178; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1179; XOP-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1180; XOP-NEXT:    retq
1181;
1182; AVX512-LABEL: splatvar_shift_v2i32:
1183; AVX512:       # %bb.0:
1184; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1185; AVX512-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1186; AVX512-NEXT:    retq
1187;
1188; AVX512VL-LABEL: splatvar_shift_v2i32:
1189; AVX512VL:       # %bb.0:
1190; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
1191; AVX512VL-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1192; AVX512VL-NEXT:    retq
1193;
1194; X86-SSE-LABEL: splatvar_shift_v2i32:
1195; X86-SSE:       # %bb.0:
1196; X86-SSE-NEXT:    xorps %xmm2, %xmm2
1197; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
1198; X86-SSE-NEXT:    psrad %xmm2, %xmm0
1199; X86-SSE-NEXT:    retl
1200  %splat = shufflevector <2 x i32> %b, <2 x i32> poison, <2 x i32> zeroinitializer
1201  %shift = ashr <2 x i32> %a, %splat
1202  ret <2 x i32> %shift
1203}
1204
; Uniform (splatted) arithmetic shift right of <4 x i16>: the shift count is
; %b[0] broadcast to all lanes, so codegen extracts it as a scalar count
; (pslldq/psrldq byte shuffle on SSE2, pmovzxwq elsewhere) and issues one
; psraw/vpsraw.
; NOTE: the CHECK lines are autogenerated by update_llc_test_checks.py —
; regenerate them rather than editing by hand.
1205define <4 x i16> @splatvar_shift_v4i16(<4 x i16> %a, <4 x i16> %b) nounwind {
1206; SSE2-LABEL: splatvar_shift_v4i16:
1207; SSE2:       # %bb.0:
1208; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1209; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1210; SSE2-NEXT:    psraw %xmm1, %xmm0
1211; SSE2-NEXT:    retq
1212;
1213; SSE41-LABEL: splatvar_shift_v4i16:
1214; SSE41:       # %bb.0:
1215; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1216; SSE41-NEXT:    psraw %xmm1, %xmm0
1217; SSE41-NEXT:    retq
1218;
1219; AVX-LABEL: splatvar_shift_v4i16:
1220; AVX:       # %bb.0:
1221; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1222; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1223; AVX-NEXT:    retq
1224;
1225; XOP-LABEL: splatvar_shift_v4i16:
1226; XOP:       # %bb.0:
1227; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1228; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1229; XOP-NEXT:    retq
1230;
1231; AVX512-LABEL: splatvar_shift_v4i16:
1232; AVX512:       # %bb.0:
1233; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1234; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1235; AVX512-NEXT:    retq
1236;
1237; AVX512VL-LABEL: splatvar_shift_v4i16:
1238; AVX512VL:       # %bb.0:
1239; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1240; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1241; AVX512VL-NEXT:    retq
1242;
1243; X86-SSE-LABEL: splatvar_shift_v4i16:
1244; X86-SSE:       # %bb.0:
1245; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1246; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1247; X86-SSE-NEXT:    psraw %xmm1, %xmm0
1248; X86-SSE-NEXT:    retl
1249  %splat = shufflevector <4 x i16> %b, <4 x i16> poison, <4 x i32> zeroinitializer
1250  %shift = ashr <4 x i16> %a, %splat
1251  ret <4 x i16> %shift
1252}
1253
; Uniform (splatted) arithmetic shift right of <2 x i16>: expected codegen is
; identical to the v4i16 splat case — isolate %b[0] as a scalar count
; (pslldq/psrldq on SSE2, pmovzxwq elsewhere) and issue one psraw/vpsraw.
; NOTE: the CHECK lines are autogenerated by update_llc_test_checks.py —
; regenerate them rather than editing by hand.
1254define <2 x i16> @splatvar_shift_v2i16(<2 x i16> %a, <2 x i16> %b) nounwind {
1255; SSE2-LABEL: splatvar_shift_v2i16:
1256; SSE2:       # %bb.0:
1257; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1258; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1259; SSE2-NEXT:    psraw %xmm1, %xmm0
1260; SSE2-NEXT:    retq
1261;
1262; SSE41-LABEL: splatvar_shift_v2i16:
1263; SSE41:       # %bb.0:
1264; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1265; SSE41-NEXT:    psraw %xmm1, %xmm0
1266; SSE41-NEXT:    retq
1267;
1268; AVX-LABEL: splatvar_shift_v2i16:
1269; AVX:       # %bb.0:
1270; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1271; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1272; AVX-NEXT:    retq
1273;
1274; XOP-LABEL: splatvar_shift_v2i16:
1275; XOP:       # %bb.0:
1276; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1277; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1278; XOP-NEXT:    retq
1279;
1280; AVX512-LABEL: splatvar_shift_v2i16:
1281; AVX512:       # %bb.0:
1282; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1283; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1284; AVX512-NEXT:    retq
1285;
1286; AVX512VL-LABEL: splatvar_shift_v2i16:
1287; AVX512VL:       # %bb.0:
1288; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
1289; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1290; AVX512VL-NEXT:    retq
1291;
1292; X86-SSE-LABEL: splatvar_shift_v2i16:
1293; X86-SSE:       # %bb.0:
1294; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
1295; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1296; X86-SSE-NEXT:    psraw %xmm1, %xmm0
1297; X86-SSE-NEXT:    retl
1298  %splat = shufflevector <2 x i16> %b, <2 x i16> poison, <2 x i32> zeroinitializer
1299  %shift = ashr <2 x i16> %a, %splat
1300  ret <2 x i16> %shift
1301}
1302
1303define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind {
1304; SSE2-LABEL: splatvar_shift_v8i8:
1305; SSE2:       # %bb.0:
1306; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1307; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1308; SSE2-NEXT:    psrlw %xmm1, %xmm0
1309; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
1310; SSE2-NEXT:    psrlw %xmm1, %xmm2
1311; SSE2-NEXT:    psrlw $8, %xmm2
1312; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1313; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1314; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1315; SSE2-NEXT:    pand %xmm2, %xmm0
1316; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1317; SSE2-NEXT:    psrlw %xmm1, %xmm2
1318; SSE2-NEXT:    pxor %xmm2, %xmm0
1319; SSE2-NEXT:    psubb %xmm2, %xmm0
1320; SSE2-NEXT:    retq
1321;
1322; SSE41-LABEL: splatvar_shift_v8i8:
1323; SSE41:       # %bb.0:
1324; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1325; SSE41-NEXT:    psrlw %xmm1, %xmm0
1326; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
1327; SSE41-NEXT:    psrlw %xmm1, %xmm2
1328; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1329; SSE41-NEXT:    pand %xmm2, %xmm0
1330; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1331; SSE41-NEXT:    psrlw %xmm1, %xmm2
1332; SSE41-NEXT:    pxor %xmm2, %xmm0
1333; SSE41-NEXT:    psubb %xmm2, %xmm0
1334; SSE41-NEXT:    retq
1335;
1336; AVX1-LABEL: splatvar_shift_v8i8:
1337; AVX1:       # %bb.0:
1338; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1339; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1340; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1341; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1342; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1343; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1344; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1345; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1346; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1347; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1348; AVX1-NEXT:    retq
1349;
1350; AVX2-LABEL: splatvar_shift_v8i8:
1351; AVX2:       # %bb.0:
1352; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1353; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1354; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1355; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1356; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
1357; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
1358; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
1359; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1360; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1361; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1362; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1363; AVX2-NEXT:    retq
1364;
1365; XOPAVX1-LABEL: splatvar_shift_v8i8:
1366; XOPAVX1:       # %bb.0:
1367; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1368; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1369; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1370; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1371; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1372; XOPAVX1-NEXT:    retq
1373;
1374; XOPAVX2-LABEL: splatvar_shift_v8i8:
1375; XOPAVX2:       # %bb.0:
1376; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
1377; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1378; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1379; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1380; XOPAVX2-NEXT:    retq
1381;
1382; AVX512DQ-LABEL: splatvar_shift_v8i8:
1383; AVX512DQ:       # %bb.0:
1384; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1385; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1386; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1387; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1388; AVX512DQ-NEXT:    vzeroupper
1389; AVX512DQ-NEXT:    retq
1390;
1391; AVX512BW-LABEL: splatvar_shift_v8i8:
1392; AVX512BW:       # %bb.0:
1393; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1394; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1395; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1396; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1397; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1398; AVX512BW-NEXT:    vzeroupper
1399; AVX512BW-NEXT:    retq
1400;
1401; AVX512DQVL-LABEL: splatvar_shift_v8i8:
1402; AVX512DQVL:       # %bb.0:
1403; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1404; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1405; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1406; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1407; AVX512DQVL-NEXT:    vzeroupper
1408; AVX512DQVL-NEXT:    retq
1409;
1410; AVX512BWVL-LABEL: splatvar_shift_v8i8:
1411; AVX512BWVL:       # %bb.0:
1412; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1413; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1414; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1415; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1416; AVX512BWVL-NEXT:    vzeroupper
1417; AVX512BWVL-NEXT:    retq
1418;
1419; X86-SSE-LABEL: splatvar_shift_v8i8:
1420; X86-SSE:       # %bb.0:
1421; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1422; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1423; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
1424; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
1425; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1426; X86-SSE-NEXT:    psrlw $8, %xmm2
1427; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1428; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1429; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1430; X86-SSE-NEXT:    pand %xmm2, %xmm0
1431; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1432; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1433; X86-SSE-NEXT:    pxor %xmm2, %xmm0
1434; X86-SSE-NEXT:    psubb %xmm2, %xmm0
1435; X86-SSE-NEXT:    retl
1436  %splat = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer
1437  %shift = ashr <8 x i8> %a, %splat
1438  ret <8 x i8> %shift
1439}
1440
1441define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind {
; Splat byte 0 of %b across all lanes, then arithmetic-shift-right each i8
; lane of %a by that uniform amount.  Lowering has no native vector i8 ashr:
; targets emit a logical word shift plus a mask-and-sign-fixup (xor/psubb
; against a shifted 0x80 splat), XOP uses vpshab with a negated amount, and
; AVX512 widens to i32/i16 and truncates back.
; NOTE(review): CHECK lines were autogenerated by update_llc_test_checks.py —
; regenerate with that script rather than editing them by hand.
1442; SSE2-LABEL: splatvar_shift_v4i8:
1443; SSE2:       # %bb.0:
1444; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1445; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1446; SSE2-NEXT:    psrlw %xmm1, %xmm0
1447; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
1448; SSE2-NEXT:    psrlw %xmm1, %xmm2
1449; SSE2-NEXT:    psrlw $8, %xmm2
1450; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1451; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1452; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1453; SSE2-NEXT:    pand %xmm2, %xmm0
1454; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1455; SSE2-NEXT:    psrlw %xmm1, %xmm2
1456; SSE2-NEXT:    pxor %xmm2, %xmm0
1457; SSE2-NEXT:    psubb %xmm2, %xmm0
1458; SSE2-NEXT:    retq
1459;
1460; SSE41-LABEL: splatvar_shift_v4i8:
1461; SSE41:       # %bb.0:
1462; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1463; SSE41-NEXT:    psrlw %xmm1, %xmm0
1464; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
1465; SSE41-NEXT:    psrlw %xmm1, %xmm2
1466; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1467; SSE41-NEXT:    pand %xmm2, %xmm0
1468; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1469; SSE41-NEXT:    psrlw %xmm1, %xmm2
1470; SSE41-NEXT:    pxor %xmm2, %xmm0
1471; SSE41-NEXT:    psubb %xmm2, %xmm0
1472; SSE41-NEXT:    retq
1473;
1474; AVX1-LABEL: splatvar_shift_v4i8:
1475; AVX1:       # %bb.0:
1476; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1477; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1478; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1479; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1480; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1481; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1482; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1483; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1484; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1485; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1486; AVX1-NEXT:    retq
1487;
1488; AVX2-LABEL: splatvar_shift_v4i8:
1489; AVX2:       # %bb.0:
1490; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1491; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1492; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1493; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1494; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
1495; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
1496; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
1497; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1498; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1499; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1500; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1501; AVX2-NEXT:    retq
1502;
1503; XOPAVX1-LABEL: splatvar_shift_v4i8:
1504; XOPAVX1:       # %bb.0:
1505; XOPAVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1506; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
1507; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1508; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1509; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1510; XOPAVX1-NEXT:    retq
1511;
1512; XOPAVX2-LABEL: splatvar_shift_v4i8:
1513; XOPAVX2:       # %bb.0:
1514; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
1515; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1516; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1517; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1518; XOPAVX2-NEXT:    retq
1519;
1520; AVX512DQ-LABEL: splatvar_shift_v4i8:
1521; AVX512DQ:       # %bb.0:
1522; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1523; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1524; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1525; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1526; AVX512DQ-NEXT:    vzeroupper
1527; AVX512DQ-NEXT:    retq
1528;
1529; AVX512BW-LABEL: splatvar_shift_v4i8:
1530; AVX512BW:       # %bb.0:
1531; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1532; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1533; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1534; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1535; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1536; AVX512BW-NEXT:    vzeroupper
1537; AVX512BW-NEXT:    retq
1538;
1539; AVX512DQVL-LABEL: splatvar_shift_v4i8:
1540; AVX512DQVL:       # %bb.0:
1541; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1542; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1543; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1544; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1545; AVX512DQVL-NEXT:    vzeroupper
1546; AVX512DQVL-NEXT:    retq
1547;
1548; AVX512BWVL-LABEL: splatvar_shift_v4i8:
1549; AVX512BWVL:       # %bb.0:
1550; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1551; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1552; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1553; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1554; AVX512BWVL-NEXT:    vzeroupper
1555; AVX512BWVL-NEXT:    retq
1556;
1557; X86-SSE-LABEL: splatvar_shift_v4i8:
1558; X86-SSE:       # %bb.0:
1559; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1560; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1561; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
1562; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
1563; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1564; X86-SSE-NEXT:    psrlw $8, %xmm2
1565; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1566; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1567; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1568; X86-SSE-NEXT:    pand %xmm2, %xmm0
1569; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1570; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1571; X86-SSE-NEXT:    pxor %xmm2, %xmm0
1572; X86-SSE-NEXT:    psubb %xmm2, %xmm0
1573; X86-SSE-NEXT:    retl
1574  %splat = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer
1575  %shift = ashr <4 x i8> %a, %splat
1576  ret <4 x i8> %shift
1577}
1578
1579define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind {
; Same uniform-amount i8 ashr pattern as the wider splatvar tests, narrowed
; to two lanes.  XOPAVX1 and XOPAVX2 converge on a single XOP sequence
; (insertq splat + negate + vpshab), hence the merged XOP prefix here.
; NOTE(review): CHECK lines were autogenerated by update_llc_test_checks.py —
; regenerate with that script rather than editing them by hand.
1580; SSE2-LABEL: splatvar_shift_v2i8:
1581; SSE2:       # %bb.0:
1582; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1583; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1584; SSE2-NEXT:    psrlw %xmm1, %xmm0
1585; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
1586; SSE2-NEXT:    psrlw %xmm1, %xmm2
1587; SSE2-NEXT:    psrlw $8, %xmm2
1588; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1589; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1590; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1591; SSE2-NEXT:    pand %xmm2, %xmm0
1592; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1593; SSE2-NEXT:    psrlw %xmm1, %xmm2
1594; SSE2-NEXT:    pxor %xmm2, %xmm0
1595; SSE2-NEXT:    psubb %xmm2, %xmm0
1596; SSE2-NEXT:    retq
1597;
1598; SSE41-LABEL: splatvar_shift_v2i8:
1599; SSE41:       # %bb.0:
1600; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1601; SSE41-NEXT:    psrlw %xmm1, %xmm0
1602; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
1603; SSE41-NEXT:    psrlw %xmm1, %xmm2
1604; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1605; SSE41-NEXT:    pand %xmm2, %xmm0
1606; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1607; SSE41-NEXT:    psrlw %xmm1, %xmm2
1608; SSE41-NEXT:    pxor %xmm2, %xmm0
1609; SSE41-NEXT:    psubb %xmm2, %xmm0
1610; SSE41-NEXT:    retq
1611;
1612; AVX1-LABEL: splatvar_shift_v2i8:
1613; AVX1:       # %bb.0:
1614; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1615; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1616; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1617; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1618; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1619; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1620; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1621; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1622; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1623; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1624; AVX1-NEXT:    retq
1625;
1626; AVX2-LABEL: splatvar_shift_v2i8:
1627; AVX2:       # %bb.0:
1628; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1629; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1630; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1631; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1632; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
1633; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
1634; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
1635; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1636; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1637; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1638; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1639; AVX2-NEXT:    retq
1640;
1641; XOP-LABEL: splatvar_shift_v2i8:
1642; XOP:       # %bb.0:
1643; XOP-NEXT:    insertq {{.*#+}} xmm1 = xmm1[0,0,2,3,4,5,6,7,u,u,u,u,u,u,u,u]
1644; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1645; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1646; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1647; XOP-NEXT:    retq
1648;
1649; AVX512DQ-LABEL: splatvar_shift_v2i8:
1650; AVX512DQ:       # %bb.0:
1651; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1652; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1653; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1654; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1655; AVX512DQ-NEXT:    vzeroupper
1656; AVX512DQ-NEXT:    retq
1657;
1658; AVX512BW-LABEL: splatvar_shift_v2i8:
1659; AVX512BW:       # %bb.0:
1660; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1661; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1662; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1663; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1664; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1665; AVX512BW-NEXT:    vzeroupper
1666; AVX512BW-NEXT:    retq
1667;
1668; AVX512DQVL-LABEL: splatvar_shift_v2i8:
1669; AVX512DQVL:       # %bb.0:
1670; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1671; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1672; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1673; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1674; AVX512DQVL-NEXT:    vzeroupper
1675; AVX512DQVL-NEXT:    retq
1676;
1677; AVX512BWVL-LABEL: splatvar_shift_v2i8:
1678; AVX512BWVL:       # %bb.0:
1679; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1680; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1681; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1682; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1683; AVX512BWVL-NEXT:    vzeroupper
1684; AVX512BWVL-NEXT:    retq
1685;
1686; X86-SSE-LABEL: splatvar_shift_v2i8:
1687; X86-SSE:       # %bb.0:
1688; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
1689; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
1690; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
1691; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
1692; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1693; X86-SSE-NEXT:    psrlw $8, %xmm2
1694; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1695; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1696; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1697; X86-SSE-NEXT:    pand %xmm2, %xmm0
1698; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1699; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1700; X86-SSE-NEXT:    pxor %xmm2, %xmm0
1701; X86-SSE-NEXT:    psubb %xmm2, %xmm0
1702; X86-SSE-NEXT:    retl
1703  %splat = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer
1704  %shift = ashr <2 x i8> %a, %splat
1705  ret <2 x i8> %shift
1706}
1707
1708;
1709; Constant Shifts
1710;
1711
1712define <2 x i32> @constant_shift_v2i32(<2 x i32> %a) nounwind {
; Per-lane constant ashr <4, 5> on <2 x i32>.  Without variable-shift
; hardware (SSE2/SSE41/AVX1) this becomes two immediate psrad ops blended
; back together; AVX2/XOP/AVX512 fold it into one vpsravd/vpshad with a
; constant-pool amount vector.
; NOTE(review): CHECK lines were autogenerated by update_llc_test_checks.py —
; regenerate with that script rather than editing them by hand.
1713; SSE2-LABEL: constant_shift_v2i32:
1714; SSE2:       # %bb.0:
1715; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1716; SSE2-NEXT:    psrad $4, %xmm0
1717; SSE2-NEXT:    psrad $5, %xmm1
1718; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1719; SSE2-NEXT:    retq
1720;
1721; SSE41-LABEL: constant_shift_v2i32:
1722; SSE41:       # %bb.0:
1723; SSE41-NEXT:    movdqa %xmm0, %xmm1
1724; SSE41-NEXT:    psrad $5, %xmm1
1725; SSE41-NEXT:    psrad $4, %xmm0
1726; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1727; SSE41-NEXT:    retq
1728;
1729; AVX1-LABEL: constant_shift_v2i32:
1730; AVX1:       # %bb.0:
1731; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm1
1732; AVX1-NEXT:    vpsrad $4, %xmm0, %xmm0
1733; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
1734; AVX1-NEXT:    retq
1735;
1736; AVX2-LABEL: constant_shift_v2i32:
1737; AVX2:       # %bb.0:
1738; AVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1739; AVX2-NEXT:    retq
1740;
1741; XOPAVX1-LABEL: constant_shift_v2i32:
1742; XOPAVX1:       # %bb.0:
1743; XOPAVX1-NEXT:    vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1744; XOPAVX1-NEXT:    retq
1745;
1746; XOPAVX2-LABEL: constant_shift_v2i32:
1747; XOPAVX2:       # %bb.0:
1748; XOPAVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1749; XOPAVX2-NEXT:    retq
1750;
1751; AVX512-LABEL: constant_shift_v2i32:
1752; AVX512:       # %bb.0:
1753; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1754; AVX512-NEXT:    retq
1755;
1756; AVX512VL-LABEL: constant_shift_v2i32:
1757; AVX512VL:       # %bb.0:
1758; AVX512VL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1759; AVX512VL-NEXT:    retq
1760;
1761; X86-SSE-LABEL: constant_shift_v2i32:
1762; X86-SSE:       # %bb.0:
1763; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
1764; X86-SSE-NEXT:    psrad $4, %xmm0
1765; X86-SSE-NEXT:    psrad $5, %xmm1
1766; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1767; X86-SSE-NEXT:    retl
1768  %shift = ashr <2 x i32> %a, <i32 4, i32 5>
1769  ret <2 x i32> %shift
1770}
1771
1772define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind {
; Per-lane constant ashr <0, 1, 2, 3> on <4 x i16>.  SSE41/AVX lower via a
; pmulhw against reciprocal powers of two plus blends; AVX512BW targets use
; vpsravw directly with the amount vector from the constant pool.
; NOTE(review): CHECK lines were autogenerated by update_llc_test_checks.py —
; regenerate with that script rather than editing them by hand.
1773; SSE2-LABEL: constant_shift_v4i16:
1774; SSE2:       # %bb.0:
1775; SSE2-NEXT:    movdqa %xmm0, %xmm1
1776; SSE2-NEXT:    psraw $2, %xmm1
1777; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1778; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
1779; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535]
1780; SSE2-NEXT:    movaps %xmm1, %xmm2
1781; SSE2-NEXT:    andps %xmm0, %xmm2
1782; SSE2-NEXT:    psraw $1, %xmm1
1783; SSE2-NEXT:    andnps %xmm1, %xmm0
1784; SSE2-NEXT:    orps %xmm2, %xmm0
1785; SSE2-NEXT:    retq
1786;
1787; SSE41-LABEL: constant_shift_v4i16:
1788; SSE41:       # %bb.0:
1789; SSE41-NEXT:    movq {{.*#+}} xmm1 = [0,0,16384,8192,0,0,0,0]
1790; SSE41-NEXT:    pmulhw %xmm0, %xmm1
1791; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1792; SSE41-NEXT:    psraw $1, %xmm0
1793; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
1794; SSE41-NEXT:    retq
1795;
1796; AVX-LABEL: constant_shift_v4i16:
1797; AVX:       # %bb.0:
1798; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,16384,8192,u,u,u,u]
1799; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1800; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
1801; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
1802; AVX-NEXT:    retq
1803;
1804; XOP-LABEL: constant_shift_v4i16:
1805; XOP:       # %bb.0:
1806; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1807; XOP-NEXT:    retq
1808;
1809; AVX512DQ-LABEL: constant_shift_v4i16:
1810; AVX512DQ:       # %bb.0:
1811; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
1812; AVX512DQ-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1813; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
1814; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1815; AVX512DQ-NEXT:    vzeroupper
1816; AVX512DQ-NEXT:    retq
1817;
1818; AVX512BW-LABEL: constant_shift_v4i16:
1819; AVX512BW:       # %bb.0:
1820; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1821; AVX512BW-NEXT:    vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0]
1822; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1823; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1824; AVX512BW-NEXT:    vzeroupper
1825; AVX512BW-NEXT:    retq
1826;
1827; AVX512DQVL-LABEL: constant_shift_v4i16:
1828; AVX512DQVL:       # %bb.0:
1829; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
1830; AVX512DQVL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1831; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
1832; AVX512DQVL-NEXT:    vzeroupper
1833; AVX512DQVL-NEXT:    retq
1834;
1835; AVX512BWVL-LABEL: constant_shift_v4i16:
1836; AVX512BWVL:       # %bb.0:
1837; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1838; AVX512BWVL-NEXT:    retq
1839;
1840; X86-SSE-LABEL: constant_shift_v4i16:
1841; X86-SSE:       # %bb.0:
1842; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1843; X86-SSE-NEXT:    psraw $2, %xmm1
1844; X86-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,0],xmm0[0,0]
1845; X86-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
1846; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,65535,65535,65535]
1847; X86-SSE-NEXT:    movaps %xmm1, %xmm2
1848; X86-SSE-NEXT:    andps %xmm0, %xmm2
1849; X86-SSE-NEXT:    psraw $1, %xmm1
1850; X86-SSE-NEXT:    andnps %xmm1, %xmm0
1851; X86-SSE-NEXT:    orps %xmm2, %xmm0
1852; X86-SSE-NEXT:    retl
1853  %shift = ashr <4 x i16> %a, <i16 0, i16 1, i16 2, i16 3>
1854  ret <4 x i16> %shift
1855}
1856
1857define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind {
; Per-lane constant ashr <2, 3> on <2 x i16>.  With only two distinct
; amounts, most targets emit two immediate psraw ops and select between
; them (pand/pandn/por on SSE2, pblendw with SSE4.1+); AVX512BW uses a
; single vpsravw.
; NOTE(review): CHECK lines were autogenerated by update_llc_test_checks.py —
; regenerate with that script rather than editing them by hand.
1858; SSE2-LABEL: constant_shift_v2i16:
1859; SSE2:       # %bb.0:
1860; SSE2-NEXT:    movdqa %xmm0, %xmm1
1861; SSE2-NEXT:    psraw $3, %xmm1
1862; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
1863; SSE2-NEXT:    psraw $2, %xmm0
1864; SSE2-NEXT:    pand %xmm2, %xmm0
1865; SSE2-NEXT:    pandn %xmm1, %xmm2
1866; SSE2-NEXT:    por %xmm2, %xmm0
1867; SSE2-NEXT:    retq
1868;
1869; SSE41-LABEL: constant_shift_v2i16:
1870; SSE41:       # %bb.0:
1871; SSE41-NEXT:    movdqa %xmm0, %xmm1
1872; SSE41-NEXT:    psraw $3, %xmm1
1873; SSE41-NEXT:    psraw $2, %xmm0
1874; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1875; SSE41-NEXT:    retq
1876;
1877; AVX-LABEL: constant_shift_v2i16:
1878; AVX:       # %bb.0:
1879; AVX-NEXT:    vpsraw $3, %xmm0, %xmm1
1880; AVX-NEXT:    vpsraw $2, %xmm0, %xmm0
1881; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1882; AVX-NEXT:    retq
1883;
1884; XOP-LABEL: constant_shift_v2i16:
1885; XOP:       # %bb.0:
1886; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1887; XOP-NEXT:    retq
1888;
1889; AVX512DQ-LABEL: constant_shift_v2i16:
1890; AVX512DQ:       # %bb.0:
1891; AVX512DQ-NEXT:    vpsraw $3, %xmm0, %xmm1
1892; AVX512DQ-NEXT:    vpsraw $2, %xmm0, %xmm0
1893; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1894; AVX512DQ-NEXT:    retq
1895;
1896; AVX512BW-LABEL: constant_shift_v2i16:
1897; AVX512BW:       # %bb.0:
1898; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1899; AVX512BW-NEXT:    vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0]
1900; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1901; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1902; AVX512BW-NEXT:    vzeroupper
1903; AVX512BW-NEXT:    retq
1904;
1905; AVX512DQVL-LABEL: constant_shift_v2i16:
1906; AVX512DQVL:       # %bb.0:
1907; AVX512DQVL-NEXT:    vpsraw $3, %xmm0, %xmm1
1908; AVX512DQVL-NEXT:    vpsraw $2, %xmm0, %xmm0
1909; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3,4,5,6,7]
1910; AVX512DQVL-NEXT:    retq
1911;
1912; AVX512BWVL-LABEL: constant_shift_v2i16:
1913; AVX512BWVL:       # %bb.0:
1914; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1915; AVX512BWVL-NEXT:    retq
1916;
1917; X86-SSE-LABEL: constant_shift_v2i16:
1918; X86-SSE:       # %bb.0:
1919; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1920; X86-SSE-NEXT:    psraw $3, %xmm1
1921; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,65535,65535,65535,65535]
1922; X86-SSE-NEXT:    psraw $2, %xmm0
1923; X86-SSE-NEXT:    pand %xmm2, %xmm0
1924; X86-SSE-NEXT:    pandn %xmm1, %xmm2
1925; X86-SSE-NEXT:    por %xmm2, %xmm0
1926; X86-SSE-NEXT:    retl
1927  %shift = ashr <2 x i16> %a, <i16 2, i16 3>
1928  ret <2 x i16> %shift
1929}
1930
1931define <8 x i8> @constant_shift_v8i8(<8 x i8> %a) nounwind {
; Per-lane constant ashr <0..7> on <8 x i8>.  SSE/AVX1/AVX2 sign-extend to
; i16, multiply by 2^(8-amt) (pmullw against [256,128,...,2]) so the result
; lands in the high byte, then psrlw $8 and pack back down; XOP uses vpshab,
; AVX512 widens and uses variable word/dword shifts.
; NOTE(review): CHECK lines were autogenerated by update_llc_test_checks.py —
; regenerate with that script rather than editing them by hand.
1932; SSE-LABEL: constant_shift_v8i8:
1933; SSE:       # %bb.0:
1934; SSE-NEXT:    pxor %xmm1, %xmm1
1935; SSE-NEXT:    movdqa %xmm0, %xmm2
1936; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
1937; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1938; SSE-NEXT:    psraw $8, %xmm0
1939; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
1940; SSE-NEXT:    psrlw $8, %xmm0
1941; SSE-NEXT:    packuswb %xmm2, %xmm0
1942; SSE-NEXT:    retq
1943;
1944; AVX1-LABEL: constant_shift_v8i8:
1945; AVX1:       # %bb.0:
1946; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1947; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
1948; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1949; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
1950; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
1951; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1952; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1953; AVX1-NEXT:    retq
1954;
1955; AVX2-LABEL: constant_shift_v8i8:
1956; AVX2:       # %bb.0:
1957; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
1958; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,256,256,256,256,256,256,256]
1959; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1960; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1961; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1962; AVX2-NEXT:    vzeroupper
1963; AVX2-NEXT:    retq
1964;
1965; XOP-LABEL: constant_shift_v8i8:
1966; XOP:       # %bb.0:
1967; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1968; XOP-NEXT:    retq
1969;
1970; AVX512DQ-LABEL: constant_shift_v8i8:
1971; AVX512DQ:       # %bb.0:
1972; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1973; AVX512DQ-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1974; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1975; AVX512DQ-NEXT:    vzeroupper
1976; AVX512DQ-NEXT:    retq
1977;
1978; AVX512BW-LABEL: constant_shift_v8i8:
1979; AVX512BW:       # %bb.0:
1980; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,0,0,0,0,0,0,0,0]
1981; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1982; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1983; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1984; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1985; AVX512BW-NEXT:    vzeroupper
1986; AVX512BW-NEXT:    retq
1987;
1988; AVX512DQVL-LABEL: constant_shift_v8i8:
1989; AVX512DQVL:       # %bb.0:
1990; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1991; AVX512DQVL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1992; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1993; AVX512DQVL-NEXT:    vzeroupper
1994; AVX512DQVL-NEXT:    retq
1995;
1996; AVX512BWVL-LABEL: constant_shift_v8i8:
1997; AVX512BWVL:       # %bb.0:
1998; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1999; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2000; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
2001; AVX512BWVL-NEXT:    vzeroupper
2002; AVX512BWVL-NEXT:    retq
2003;
2004; X86-SSE-LABEL: constant_shift_v8i8:
2005; X86-SSE:       # %bb.0:
2006; X86-SSE-NEXT:    pxor %xmm1, %xmm1
2007; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
2008; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2009; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2010; X86-SSE-NEXT:    psraw $8, %xmm0
2011; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
2012; X86-SSE-NEXT:    psrlw $8, %xmm0
2013; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
2014; X86-SSE-NEXT:    retl
2015  %shift = ashr <8 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
2016  ret <8 x i8> %shift
2017}
2018
2019define <4 x i8> @constant_shift_v4i8(<4 x i8> %a) nounwind {
; Per-lane constant ashr <0, 1, 2, 3> on <4 x i8>.  Same widen + pmullw
; trick as constant_shift_v8i8; the unused upper lanes multiply by 256
; (a no-op before the psrlw $8 truncation).
; NOTE(review): CHECK lines were autogenerated by update_llc_test_checks.py —
; regenerate with that script rather than editing them by hand.
2020; SSE-LABEL: constant_shift_v4i8:
2021; SSE:       # %bb.0:
2022; SSE-NEXT:    pxor %xmm1, %xmm1
2023; SSE-NEXT:    movdqa %xmm0, %xmm2
2024; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2025; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2026; SSE-NEXT:    psraw $8, %xmm0
2027; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,256,256,256,256]
2028; SSE-NEXT:    psrlw $8, %xmm0
2029; SSE-NEXT:    packuswb %xmm2, %xmm0
2030; SSE-NEXT:    retq
2031;
2032; AVX1-LABEL: constant_shift_v4i8:
2033; AVX1:       # %bb.0:
2034; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
2035; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
2036; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2037; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
2038; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,256,256,256,256]
2039; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
2040; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2041; AVX1-NEXT:    retq
2042;
2043; AVX2-LABEL: constant_shift_v4i8:
2044; AVX2:       # %bb.0:
2045; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
2046; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,256,256,256,256,256,256,256,256,256,256,256,256]
2047; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
2048; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2049; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
2050; AVX2-NEXT:    vzeroupper
2051; AVX2-NEXT:    retq
2052;
2053; XOP-LABEL: constant_shift_v4i8:
2054; XOP:       # %bb.0:
2055; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2056; XOP-NEXT:    retq
2057;
2058; AVX512DQ-LABEL: constant_shift_v4i8:
2059; AVX512DQ:       # %bb.0:
2060; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
2061; AVX512DQ-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2062; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
2063; AVX512DQ-NEXT:    vzeroupper
2064; AVX512DQ-NEXT:    retq
2065;
2066; AVX512BW-LABEL: constant_shift_v4i8:
2067; AVX512BW:       # %bb.0:
2068; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,0,0,0,0,0,0,0,0,0,0,0,0]
2069; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
2070; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
2071; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2072; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2073; AVX512BW-NEXT:    vzeroupper
2074; AVX512BW-NEXT:    retq
2075;
2076; AVX512DQVL-LABEL: constant_shift_v4i8:
2077; AVX512DQVL:       # %bb.0:
2078; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
2079; AVX512DQVL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2080; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
2081; AVX512DQVL-NEXT:    vzeroupper
2082; AVX512DQVL-NEXT:    retq
2083;
2084; AVX512BWVL-LABEL: constant_shift_v4i8:
2085; AVX512BWVL:       # %bb.0:
2086; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
2087; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2088; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
2089; AVX512BWVL-NEXT:    vzeroupper
2090; AVX512BWVL-NEXT:    retq
2091;
2092; X86-SSE-LABEL: constant_shift_v4i8:
2093; X86-SSE:       # %bb.0:
2094; X86-SSE-NEXT:    pxor %xmm1, %xmm1
2095; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
2096; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
2097; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2098; X86-SSE-NEXT:    psraw $8, %xmm0
2099; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,256,256,256,256]
2100; X86-SSE-NEXT:    psrlw $8, %xmm0
2101; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
2102; X86-SSE-NEXT:    retl
2103  %shift = ashr <4 x i8> %a, <i8 0, i8 1, i8 2, i8 3>
2104  ret <4 x i8> %shift
2105}
2106
define <2 x i8> @constant_shift_v2i8(<2 x i8> %a) nounwind {
; Non-splat ashr <2 x i8> by <2, 3>. x86 has no per-byte shift instruction,
; so SSE/AVX1 sign-extend each byte to an i16 lane (punpcklbw + psraw $8),
; multiply by per-lane powers of two (64 = 256>>2, 32 = 256>>3), and take the
; high byte back with psrlw $8 + packuswb. AVX2 does the same via vpmovsxbw
; into a ymm. XOP has a native per-element variable shift (vpshab); AVX512
; widens to i32/i16 lanes and uses vpsravd/vpsravw.
; SSE-LABEL: constant_shift_v2i8:
; SSE:       # %bb.0:
; SSE-NEXT:    pxor %xmm1, %xmm1
; SSE-NEXT:    movdqa %xmm0, %xmm2
; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [64,32,256,256,256,256,256,256]
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v2i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [64,32,256,256,256,256,256,256]
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,32,256,256,256,256,256,256,256,256,256,256,256,256,256,256]
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: constant_shift_v2i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v2i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v2i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v2i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQVL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: constant_shift_v2i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pxor %xmm1, %xmm1
; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [64,32,256,256,256,256,256,256]
; X86-SSE-NEXT:    psrlw $8, %xmm0
; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i8> %a, <i8 2, i8 3>
  ret <2 x i8> %shift
}
2194
2195;
2196; Uniform Constant Shifts
2197;
2198
define <2 x i32> @splatconstant_shift_v2i32(<2 x i32> %a) nounwind {
; Uniform ashr <2 x i32> by 5: i32 lanes have a native immediate arithmetic
; shift, so every target lowers this to a single psrad/vpsrad $5.
; SSE-LABEL: splatconstant_shift_v2i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $5, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v2i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrad $5, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v2i32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrad $5, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i32> %a, <i32 5, i32 5>
  ret <2 x i32> %shift
}
2232
define <4 x i16> @splatconstant_shift_v4i16(<4 x i16> %a) nounwind {
; Uniform ashr <4 x i16> by 3: i16 lanes have a native immediate arithmetic
; shift, so every target lowers this to a single psraw/vpsraw $3.
; SSE-LABEL: splatconstant_shift_v4i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v4i16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psraw $3, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <4 x i16> %a, <i16 3, i16 3, i16 3, i16 3>
  ret <4 x i16> %shift
}
2266
define <2 x i16> @splatconstant_shift_v2i16(<2 x i16> %a) nounwind {
; Uniform ashr <2 x i16> by 3: same as the v4i16 case — a single
; psraw/vpsraw $3 on every target (the unused upper lanes are don't-care).
; SSE-LABEL: splatconstant_shift_v2i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v2i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v2i16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psraw $3, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i16> %a, <i16 3, i16 3>
  ret <2 x i16> %shift
}
2300
define <8 x i8> @splatconstant_shift_v8i8(<8 x i8> %a) nounwind {
; Uniform ashr of i8 elements by 3: x86 has no per-byte shift, so the
; lowering does a 16-bit logical shift (psrlw $3), masks off the bits that
; leaked in from the adjacent byte (pand), then sign-extends from the new
; sign position via (x ^ 16) - 16, where 16 = 0x80 >> 3. XOP has a native
; per-byte shift (vpshab); AVX512VL folds the and+xor into one vpternlogd.
; SSE-LABEL: splatconstant_shift_v8i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: splatconstant_shift_v8i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v8i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512DQVL-LABEL: splatconstant_shift_v8i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatconstant_shift_v8i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v8i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlw $3, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-SSE-NEXT:    pxor %xmm1, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <8 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <8 x i8> %shift
}
2370
define <4 x i8> @splatconstant_shift_v4i8(<4 x i8> %a) nounwind {
; Uniform ashr <4 x i8> by 3: identical lowering to the v8i8 case — shift as
; i16 (psrlw $3), mask cross-byte bits (pand), then sign-extend via
; (x ^ 16) - 16; the extra lanes are don't-care. XOP uses native vpshab;
; AVX512VL folds the and+xor into one vpternlogd.
; SSE-LABEL: splatconstant_shift_v4i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: splatconstant_shift_v4i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v4i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512DQVL-LABEL: splatconstant_shift_v4i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatconstant_shift_v4i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v4i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlw $3, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-SSE-NEXT:    pxor %xmm1, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <4 x i8> %a, <i8 3, i8 3, i8 3, i8 3>
  ret <4 x i8> %shift
}
2440
define <2 x i8> @splatconstant_shift_v2i8(<2 x i8> %a) nounwind {
; Uniform ashr <2 x i8> by 3: identical lowering to the v8i8/v4i8 cases —
; shift as i16 (psrlw $3), mask cross-byte bits (pand), then sign-extend via
; (x ^ 16) - 16; the extra lanes are don't-care. XOP uses native vpshab;
; AVX512VL folds the and+xor into one vpternlogd.
; SSE-LABEL: splatconstant_shift_v2i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: splatconstant_shift_v2i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v2i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512DQVL-LABEL: splatconstant_shift_v2i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatconstant_shift_v2i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v2i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlw $3, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-SSE-NEXT:    pxor %xmm1, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i8> %a, <i8 3, i8 3>
  ret <2 x i8> %shift
}
2510