xref: /llvm-project/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll (revision 2a922903bf5d5b0012c1f8f2a5396d44cfff4630)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
12;
13; Just one 32-bit run to make sure we do reasonable things for i64 shifts.
14; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86-SSE
15
16;
17; Variable Shifts
18;
19
; Per-element arithmetic shift right of <2 x i64> by variable (non-uniform)
; shift amounts in %b. The CHECK lines were autogenerated by
; utils/update_llc_test_checks.py -- regenerate rather than hand-edit.
20define <2 x i64> @var_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
21; SSE2-LABEL: var_shift_v2i64:
22; SSE2:       # %bb.0:
23; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
24; SSE2-NEXT:    movdqa %xmm2, %xmm3
25; SSE2-NEXT:    psrlq %xmm1, %xmm3
26; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
27; SSE2-NEXT:    psrlq %xmm4, %xmm2
28; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
29; SSE2-NEXT:    movdqa %xmm0, %xmm3
30; SSE2-NEXT:    psrlq %xmm1, %xmm3
31; SSE2-NEXT:    psrlq %xmm4, %xmm0
32; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
33; SSE2-NEXT:    xorpd %xmm2, %xmm0
34; SSE2-NEXT:    psubq %xmm2, %xmm0
35; SSE2-NEXT:    retq
36;
37; SSE41-LABEL: var_shift_v2i64:
38; SSE41:       # %bb.0:
39; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
40; SSE41-NEXT:    movdqa %xmm2, %xmm3
41; SSE41-NEXT:    psrlq %xmm1, %xmm3
42; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
43; SSE41-NEXT:    psrlq %xmm4, %xmm2
44; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
45; SSE41-NEXT:    movdqa %xmm0, %xmm3
46; SSE41-NEXT:    psrlq %xmm1, %xmm3
47; SSE41-NEXT:    psrlq %xmm4, %xmm0
48; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
49; SSE41-NEXT:    pxor %xmm2, %xmm0
50; SSE41-NEXT:    psubq %xmm2, %xmm0
51; SSE41-NEXT:    retq
52;
53; AVX1-LABEL: var_shift_v2i64:
54; AVX1:       # %bb.0:
55; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
56; AVX1-NEXT:    # xmm2 = mem[0,0]
57; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm3
58; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
59; AVX1-NEXT:    vpsrlq %xmm4, %xmm2, %xmm2
60; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
61; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm1
62; AVX1-NEXT:    vpsrlq %xmm4, %xmm0, %xmm0
63; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
64; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
65; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
66; AVX1-NEXT:    retq
67;
68; AVX2-LABEL: var_shift_v2i64:
69; AVX2:       # %bb.0:
70; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
71; AVX2-NEXT:    vpsrlvq %xmm1, %xmm2, %xmm2
72; AVX2-NEXT:    vpsrlvq %xmm1, %xmm0, %xmm0
73; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
74; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
75; AVX2-NEXT:    retq
76;
77; XOP-LABEL: var_shift_v2i64:
78; XOP:       # %bb.0:
79; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
80; XOP-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
81; XOP-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
82; XOP-NEXT:    retq
83;
84; AVX512-LABEL: var_shift_v2i64:
85; AVX512:       # %bb.0:
86; AVX512-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
87; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
88; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
89; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
90; AVX512-NEXT:    vzeroupper
91; AVX512-NEXT:    retq
92;
93; AVX512VL-LABEL: var_shift_v2i64:
94; AVX512VL:       # %bb.0:
95; AVX512VL-NEXT:    vpsravq %xmm1, %xmm0, %xmm0
96; AVX512VL-NEXT:    retq
97;
98; X86-SSE-LABEL: var_shift_v2i64:
99; X86-SSE:       # %bb.0:
100; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
101; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
102; X86-SSE-NEXT:    psrlq %xmm1, %xmm3
103; X86-SSE-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
104; X86-SSE-NEXT:    psrlq %xmm4, %xmm2
105; X86-SSE-NEXT:    movsd {{.*#+}} xmm2 = xmm3[0],xmm2[1]
106; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
107; X86-SSE-NEXT:    psrlq %xmm1, %xmm3
108; X86-SSE-NEXT:    psrlq %xmm4, %xmm0
109; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm3[0],xmm0[1]
110; X86-SSE-NEXT:    xorpd %xmm2, %xmm0
111; X86-SSE-NEXT:    psubq %xmm2, %xmm0
112; X86-SSE-NEXT:    retl
  ; The IR under test: lane-wise arithmetic (sign-preserving) right shift.
113  %shift = ashr <2 x i64> %a, %b
114  ret <2 x i64> %shift
115}
116
; Per-element arithmetic shift right of <4 x i32> by variable amounts in %b.
; Autogenerated CHECK lines (update_llc_test_checks.py) -- regenerate, do not
; hand-edit.
117define <4 x i32> @var_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
118; SSE2-LABEL: var_shift_v4i32:
119; SSE2:       # %bb.0:
120; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
121; SSE2-NEXT:    movdqa %xmm0, %xmm3
122; SSE2-NEXT:    psrad %xmm2, %xmm3
123; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
124; SSE2-NEXT:    movdqa %xmm0, %xmm2
125; SSE2-NEXT:    psrad %xmm4, %xmm2
126; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
127; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
128; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
129; SSE2-NEXT:    movdqa %xmm0, %xmm4
130; SSE2-NEXT:    psrad %xmm3, %xmm4
131; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
132; SSE2-NEXT:    psrad %xmm1, %xmm0
133; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
134; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
135; SSE2-NEXT:    movaps %xmm2, %xmm0
136; SSE2-NEXT:    retq
137;
138; SSE41-LABEL: var_shift_v4i32:
139; SSE41:       # %bb.0:
140; SSE41-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
141; SSE41-NEXT:    movdqa %xmm0, %xmm3
142; SSE41-NEXT:    psrad %xmm2, %xmm3
143; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
144; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm2[2,3,3,3,4,5,6,7]
145; SSE41-NEXT:    movdqa %xmm0, %xmm5
146; SSE41-NEXT:    psrad %xmm4, %xmm5
147; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm3[0,1,2,3],xmm5[4,5,6,7]
148; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
149; SSE41-NEXT:    movdqa %xmm0, %xmm3
150; SSE41-NEXT:    psrad %xmm1, %xmm3
151; SSE41-NEXT:    pshuflw {{.*#+}} xmm1 = xmm2[0,1,1,1,4,5,6,7]
152; SSE41-NEXT:    psrad %xmm1, %xmm0
153; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
154; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm5[2,3],xmm0[4,5],xmm5[6,7]
155; SSE41-NEXT:    retq
156;
157; AVX1-LABEL: var_shift_v4i32:
158; AVX1:       # %bb.0:
159; AVX1-NEXT:    vpsrldq {{.*#+}} xmm2 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
160; AVX1-NEXT:    vpsrad %xmm2, %xmm0, %xmm2
161; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm3
162; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
163; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
164; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
165; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
166; AVX1-NEXT:    vpsrad %xmm3, %xmm0, %xmm3
167; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
168; AVX1-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
169; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
170; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
171; AVX1-NEXT:    retq
172;
173; AVX2-LABEL: var_shift_v4i32:
174; AVX2:       # %bb.0:
175; AVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
176; AVX2-NEXT:    retq
177;
178; XOPAVX1-LABEL: var_shift_v4i32:
179; XOPAVX1:       # %bb.0:
180; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
181; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm2, %xmm1
182; XOPAVX1-NEXT:    vpshad %xmm1, %xmm0, %xmm0
183; XOPAVX1-NEXT:    retq
184;
185; XOPAVX2-LABEL: var_shift_v4i32:
186; XOPAVX2:       # %bb.0:
187; XOPAVX2-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
188; XOPAVX2-NEXT:    retq
189;
190; AVX512-LABEL: var_shift_v4i32:
191; AVX512:       # %bb.0:
192; AVX512-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
193; AVX512-NEXT:    retq
194;
195; AVX512VL-LABEL: var_shift_v4i32:
196; AVX512VL:       # %bb.0:
197; AVX512VL-NEXT:    vpsravd %xmm1, %xmm0, %xmm0
198; AVX512VL-NEXT:    retq
199;
200; X86-SSE-LABEL: var_shift_v4i32:
201; X86-SSE:       # %bb.0:
202; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7]
203; X86-SSE-NEXT:    movdqa %xmm0, %xmm3
204; X86-SSE-NEXT:    psrad %xmm2, %xmm3
205; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm4 = xmm1[0,1,1,1,4,5,6,7]
206; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
207; X86-SSE-NEXT:    psrad %xmm4, %xmm2
208; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
209; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
210; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7]
211; X86-SSE-NEXT:    movdqa %xmm0, %xmm4
212; X86-SSE-NEXT:    psrad %xmm3, %xmm4
213; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,1,4,5,6,7]
214; X86-SSE-NEXT:    psrad %xmm1, %xmm0
215; X86-SSE-NEXT:    punpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm4[1]
216; X86-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,3],xmm0[0,3]
217; X86-SSE-NEXT:    movaps %xmm2, %xmm0
218; X86-SSE-NEXT:    retl
  ; The IR under test: lane-wise arithmetic (sign-preserving) right shift.
219  %shift = ashr <4 x i32> %a, %b
220  ret <4 x i32> %shift
221}
222
; Per-element arithmetic shift right of <8 x i16> by variable amounts in %b.
; Autogenerated CHECK lines (update_llc_test_checks.py) -- regenerate, do not
; hand-edit.
223define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
224; SSE2-LABEL: var_shift_v8i16:
225; SSE2:       # %bb.0:
226; SSE2-NEXT:    psllw $12, %xmm1
227; SSE2-NEXT:    movdqa %xmm1, %xmm2
228; SSE2-NEXT:    psraw $15, %xmm2
229; SSE2-NEXT:    movdqa %xmm2, %xmm3
230; SSE2-NEXT:    pandn %xmm0, %xmm3
231; SSE2-NEXT:    psraw $8, %xmm0
232; SSE2-NEXT:    pand %xmm2, %xmm0
233; SSE2-NEXT:    por %xmm3, %xmm0
234; SSE2-NEXT:    paddw %xmm1, %xmm1
235; SSE2-NEXT:    movdqa %xmm1, %xmm2
236; SSE2-NEXT:    psraw $15, %xmm2
237; SSE2-NEXT:    movdqa %xmm2, %xmm3
238; SSE2-NEXT:    pandn %xmm0, %xmm3
239; SSE2-NEXT:    psraw $4, %xmm0
240; SSE2-NEXT:    pand %xmm2, %xmm0
241; SSE2-NEXT:    por %xmm3, %xmm0
242; SSE2-NEXT:    paddw %xmm1, %xmm1
243; SSE2-NEXT:    movdqa %xmm1, %xmm2
244; SSE2-NEXT:    psraw $15, %xmm2
245; SSE2-NEXT:    movdqa %xmm2, %xmm3
246; SSE2-NEXT:    pandn %xmm0, %xmm3
247; SSE2-NEXT:    psraw $2, %xmm0
248; SSE2-NEXT:    pand %xmm2, %xmm0
249; SSE2-NEXT:    por %xmm3, %xmm0
250; SSE2-NEXT:    paddw %xmm1, %xmm1
251; SSE2-NEXT:    psraw $15, %xmm1
252; SSE2-NEXT:    movdqa %xmm1, %xmm2
253; SSE2-NEXT:    pandn %xmm0, %xmm2
254; SSE2-NEXT:    psraw $1, %xmm0
255; SSE2-NEXT:    pand %xmm1, %xmm0
256; SSE2-NEXT:    por %xmm2, %xmm0
257; SSE2-NEXT:    retq
258;
259; SSE41-LABEL: var_shift_v8i16:
260; SSE41:       # %bb.0:
261; SSE41-NEXT:    movdqa %xmm0, %xmm2
262; SSE41-NEXT:    movdqa %xmm1, %xmm0
263; SSE41-NEXT:    psllw $12, %xmm0
264; SSE41-NEXT:    psllw $4, %xmm1
265; SSE41-NEXT:    por %xmm1, %xmm0
266; SSE41-NEXT:    movdqa %xmm0, %xmm1
267; SSE41-NEXT:    paddw %xmm0, %xmm1
268; SSE41-NEXT:    movdqa %xmm2, %xmm3
269; SSE41-NEXT:    psraw $8, %xmm3
270; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
271; SSE41-NEXT:    movdqa %xmm2, %xmm3
272; SSE41-NEXT:    psraw $4, %xmm3
273; SSE41-NEXT:    movdqa %xmm1, %xmm0
274; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
275; SSE41-NEXT:    movdqa %xmm2, %xmm3
276; SSE41-NEXT:    psraw $2, %xmm3
277; SSE41-NEXT:    paddw %xmm1, %xmm1
278; SSE41-NEXT:    movdqa %xmm1, %xmm0
279; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
280; SSE41-NEXT:    movdqa %xmm2, %xmm3
281; SSE41-NEXT:    psraw $1, %xmm3
282; SSE41-NEXT:    paddw %xmm1, %xmm1
283; SSE41-NEXT:    movdqa %xmm1, %xmm0
284; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
285; SSE41-NEXT:    movdqa %xmm2, %xmm0
286; SSE41-NEXT:    retq
287;
288; AVX1-LABEL: var_shift_v8i16:
289; AVX1:       # %bb.0:
290; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm2
291; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
292; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
293; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm2
294; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm3
295; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
296; AVX1-NEXT:    vpsraw $4, %xmm0, %xmm1
297; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
298; AVX1-NEXT:    vpsraw $2, %xmm0, %xmm1
299; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
300; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
301; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
302; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
303; AVX1-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0
304; AVX1-NEXT:    retq
305;
306; AVX2-LABEL: var_shift_v8i16:
307; AVX2:       # %bb.0:
308; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
309; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
310; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
311; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
312; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
313; AVX2-NEXT:    vzeroupper
314; AVX2-NEXT:    retq
315;
316; XOP-LABEL: var_shift_v8i16:
317; XOP:       # %bb.0:
318; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
319; XOP-NEXT:    vpsubw %xmm1, %xmm2, %xmm1
320; XOP-NEXT:    vpshaw %xmm1, %xmm0, %xmm0
321; XOP-NEXT:    retq
322;
323; AVX512DQ-LABEL: var_shift_v8i16:
324; AVX512DQ:       # %bb.0:
325; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
326; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
327; AVX512DQ-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
328; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
329; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
330; AVX512DQ-NEXT:    vzeroupper
331; AVX512DQ-NEXT:    retq
332;
333; AVX512BW-LABEL: var_shift_v8i16:
334; AVX512BW:       # %bb.0:
335; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
336; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
337; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
338; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
339; AVX512BW-NEXT:    vzeroupper
340; AVX512BW-NEXT:    retq
341;
342; AVX512DQVL-LABEL: var_shift_v8i16:
343; AVX512DQVL:       # %bb.0:
344; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
345; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
346; AVX512DQVL-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
347; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
348; AVX512DQVL-NEXT:    vzeroupper
349; AVX512DQVL-NEXT:    retq
350;
351; AVX512BWVL-LABEL: var_shift_v8i16:
352; AVX512BWVL:       # %bb.0:
353; AVX512BWVL-NEXT:    vpsravw %xmm1, %xmm0, %xmm0
354; AVX512BWVL-NEXT:    retq
355;
356; X86-SSE-LABEL: var_shift_v8i16:
357; X86-SSE:       # %bb.0:
358; X86-SSE-NEXT:    psllw $12, %xmm1
359; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
360; X86-SSE-NEXT:    psraw $15, %xmm2
361; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
362; X86-SSE-NEXT:    pandn %xmm0, %xmm3
363; X86-SSE-NEXT:    psraw $8, %xmm0
364; X86-SSE-NEXT:    pand %xmm2, %xmm0
365; X86-SSE-NEXT:    por %xmm3, %xmm0
366; X86-SSE-NEXT:    paddw %xmm1, %xmm1
367; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
368; X86-SSE-NEXT:    psraw $15, %xmm2
369; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
370; X86-SSE-NEXT:    pandn %xmm0, %xmm3
371; X86-SSE-NEXT:    psraw $4, %xmm0
372; X86-SSE-NEXT:    pand %xmm2, %xmm0
373; X86-SSE-NEXT:    por %xmm3, %xmm0
374; X86-SSE-NEXT:    paddw %xmm1, %xmm1
375; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
376; X86-SSE-NEXT:    psraw $15, %xmm2
377; X86-SSE-NEXT:    movdqa %xmm2, %xmm3
378; X86-SSE-NEXT:    pandn %xmm0, %xmm3
379; X86-SSE-NEXT:    psraw $2, %xmm0
380; X86-SSE-NEXT:    pand %xmm2, %xmm0
381; X86-SSE-NEXT:    por %xmm3, %xmm0
382; X86-SSE-NEXT:    paddw %xmm1, %xmm1
383; X86-SSE-NEXT:    psraw $15, %xmm1
384; X86-SSE-NEXT:    movdqa %xmm1, %xmm2
385; X86-SSE-NEXT:    pandn %xmm0, %xmm2
386; X86-SSE-NEXT:    psraw $1, %xmm0
387; X86-SSE-NEXT:    pand %xmm1, %xmm0
388; X86-SSE-NEXT:    por %xmm2, %xmm0
389; X86-SSE-NEXT:    retl
  ; The IR under test: lane-wise arithmetic (sign-preserving) right shift.
390  %shift = ashr <8 x i16> %a, %b
391  ret <8 x i16> %shift
392}
393
; Per-element arithmetic shift right of <16 x i8> by variable amounts in %b.
; Autogenerated CHECK lines (update_llc_test_checks.py) -- regenerate, do not
; hand-edit.
394define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
395; SSE2-LABEL: var_shift_v16i8:
396; SSE2:       # %bb.0:
397; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
398; SSE2-NEXT:    psllw $5, %xmm1
399; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
400; SSE2-NEXT:    pxor %xmm3, %xmm3
401; SSE2-NEXT:    pxor %xmm5, %xmm5
402; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
403; SSE2-NEXT:    movdqa %xmm5, %xmm6
404; SSE2-NEXT:    pandn %xmm2, %xmm6
405; SSE2-NEXT:    psraw $4, %xmm2
406; SSE2-NEXT:    pand %xmm5, %xmm2
407; SSE2-NEXT:    por %xmm6, %xmm2
408; SSE2-NEXT:    paddw %xmm4, %xmm4
409; SSE2-NEXT:    pxor %xmm5, %xmm5
410; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
411; SSE2-NEXT:    movdqa %xmm5, %xmm6
412; SSE2-NEXT:    pandn %xmm2, %xmm6
413; SSE2-NEXT:    psraw $2, %xmm2
414; SSE2-NEXT:    pand %xmm5, %xmm2
415; SSE2-NEXT:    por %xmm6, %xmm2
416; SSE2-NEXT:    paddw %xmm4, %xmm4
417; SSE2-NEXT:    pxor %xmm5, %xmm5
418; SSE2-NEXT:    pcmpgtw %xmm4, %xmm5
419; SSE2-NEXT:    movdqa %xmm5, %xmm4
420; SSE2-NEXT:    pandn %xmm2, %xmm4
421; SSE2-NEXT:    psraw $1, %xmm2
422; SSE2-NEXT:    pand %xmm5, %xmm2
423; SSE2-NEXT:    por %xmm4, %xmm2
424; SSE2-NEXT:    psrlw $8, %xmm2
425; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
426; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
427; SSE2-NEXT:    pxor %xmm4, %xmm4
428; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
429; SSE2-NEXT:    movdqa %xmm4, %xmm5
430; SSE2-NEXT:    pandn %xmm0, %xmm5
431; SSE2-NEXT:    psraw $4, %xmm0
432; SSE2-NEXT:    pand %xmm4, %xmm0
433; SSE2-NEXT:    por %xmm5, %xmm0
434; SSE2-NEXT:    paddw %xmm1, %xmm1
435; SSE2-NEXT:    pxor %xmm4, %xmm4
436; SSE2-NEXT:    pcmpgtw %xmm1, %xmm4
437; SSE2-NEXT:    movdqa %xmm4, %xmm5
438; SSE2-NEXT:    pandn %xmm0, %xmm5
439; SSE2-NEXT:    psraw $2, %xmm0
440; SSE2-NEXT:    pand %xmm4, %xmm0
441; SSE2-NEXT:    por %xmm5, %xmm0
442; SSE2-NEXT:    paddw %xmm1, %xmm1
443; SSE2-NEXT:    pcmpgtw %xmm1, %xmm3
444; SSE2-NEXT:    movdqa %xmm3, %xmm1
445; SSE2-NEXT:    pandn %xmm0, %xmm1
446; SSE2-NEXT:    psraw $1, %xmm0
447; SSE2-NEXT:    pand %xmm3, %xmm0
448; SSE2-NEXT:    por %xmm1, %xmm0
449; SSE2-NEXT:    psrlw $8, %xmm0
450; SSE2-NEXT:    packuswb %xmm2, %xmm0
451; SSE2-NEXT:    retq
452;
453; SSE41-LABEL: var_shift_v16i8:
454; SSE41:       # %bb.0:
455; SSE41-NEXT:    movdqa %xmm0, %xmm2
456; SSE41-NEXT:    psllw $5, %xmm1
457; SSE41-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
458; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
459; SSE41-NEXT:    movdqa %xmm3, %xmm4
460; SSE41-NEXT:    psraw $4, %xmm4
461; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
462; SSE41-NEXT:    movdqa %xmm3, %xmm4
463; SSE41-NEXT:    psraw $2, %xmm4
464; SSE41-NEXT:    paddw %xmm0, %xmm0
465; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
466; SSE41-NEXT:    movdqa %xmm3, %xmm4
467; SSE41-NEXT:    psraw $1, %xmm4
468; SSE41-NEXT:    paddw %xmm0, %xmm0
469; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
470; SSE41-NEXT:    psrlw $8, %xmm3
471; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
472; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
473; SSE41-NEXT:    movdqa %xmm1, %xmm2
474; SSE41-NEXT:    psraw $4, %xmm2
475; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
476; SSE41-NEXT:    movdqa %xmm1, %xmm2
477; SSE41-NEXT:    psraw $2, %xmm2
478; SSE41-NEXT:    paddw %xmm0, %xmm0
479; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
480; SSE41-NEXT:    movdqa %xmm1, %xmm2
481; SSE41-NEXT:    psraw $1, %xmm2
482; SSE41-NEXT:    paddw %xmm0, %xmm0
483; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
484; SSE41-NEXT:    psrlw $8, %xmm1
485; SSE41-NEXT:    packuswb %xmm3, %xmm1
486; SSE41-NEXT:    movdqa %xmm1, %xmm0
487; SSE41-NEXT:    retq
488;
489; AVX-LABEL: var_shift_v16i8:
490; AVX:       # %bb.0:
491; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
492; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
493; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
494; AVX-NEXT:    vpsraw $4, %xmm3, %xmm4
495; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
496; AVX-NEXT:    vpsraw $2, %xmm3, %xmm4
497; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
498; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm3
499; AVX-NEXT:    vpsraw $1, %xmm3, %xmm4
500; AVX-NEXT:    vpaddw %xmm2, %xmm2, %xmm2
501; AVX-NEXT:    vpblendvb %xmm2, %xmm4, %xmm3, %xmm2
502; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
503; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
504; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
505; AVX-NEXT:    vpsraw $4, %xmm0, %xmm3
506; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
507; AVX-NEXT:    vpsraw $2, %xmm0, %xmm3
508; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
509; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
510; AVX-NEXT:    vpsraw $1, %xmm0, %xmm3
511; AVX-NEXT:    vpaddw %xmm1, %xmm1, %xmm1
512; AVX-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
513; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
514; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
515; AVX-NEXT:    retq
516;
517; XOP-LABEL: var_shift_v16i8:
518; XOP:       # %bb.0:
519; XOP-NEXT:    vpxor %xmm2, %xmm2, %xmm2
520; XOP-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
521; XOP-NEXT:    vpshab %xmm1, %xmm0, %xmm0
522; XOP-NEXT:    retq
523;
524; AVX512DQ-LABEL: var_shift_v16i8:
525; AVX512DQ:       # %bb.0:
526; AVX512DQ-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
527; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
528; AVX512DQ-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
529; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
530; AVX512DQ-NEXT:    vzeroupper
531; AVX512DQ-NEXT:    retq
532;
533; AVX512BW-LABEL: var_shift_v16i8:
534; AVX512BW:       # %bb.0:
535; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
536; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
537; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
538; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
539; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
540; AVX512BW-NEXT:    vzeroupper
541; AVX512BW-NEXT:    retq
542;
543; AVX512DQVL-LABEL: var_shift_v16i8:
544; AVX512DQVL:       # %bb.0:
545; AVX512DQVL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
546; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
547; AVX512DQVL-NEXT:    vpsravd %zmm1, %zmm0, %zmm0
548; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
549; AVX512DQVL-NEXT:    vzeroupper
550; AVX512DQVL-NEXT:    retq
551;
552; AVX512BWVL-LABEL: var_shift_v16i8:
553; AVX512BWVL:       # %bb.0:
554; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
555; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
556; AVX512BWVL-NEXT:    vpsravw %ymm1, %ymm0, %ymm0
557; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
558; AVX512BWVL-NEXT:    vzeroupper
559; AVX512BWVL-NEXT:    retq
560;
561; X86-SSE-LABEL: var_shift_v16i8:
562; X86-SSE:       # %bb.0:
563; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
564; X86-SSE-NEXT:    psllw $5, %xmm1
565; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
566; X86-SSE-NEXT:    pxor %xmm3, %xmm3
567; X86-SSE-NEXT:    pxor %xmm5, %xmm5
568; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
569; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
570; X86-SSE-NEXT:    pandn %xmm2, %xmm6
571; X86-SSE-NEXT:    psraw $4, %xmm2
572; X86-SSE-NEXT:    pand %xmm5, %xmm2
573; X86-SSE-NEXT:    por %xmm6, %xmm2
574; X86-SSE-NEXT:    paddw %xmm4, %xmm4
575; X86-SSE-NEXT:    pxor %xmm5, %xmm5
576; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
577; X86-SSE-NEXT:    movdqa %xmm5, %xmm6
578; X86-SSE-NEXT:    pandn %xmm2, %xmm6
579; X86-SSE-NEXT:    psraw $2, %xmm2
580; X86-SSE-NEXT:    pand %xmm5, %xmm2
581; X86-SSE-NEXT:    por %xmm6, %xmm2
582; X86-SSE-NEXT:    paddw %xmm4, %xmm4
583; X86-SSE-NEXT:    pxor %xmm5, %xmm5
584; X86-SSE-NEXT:    pcmpgtw %xmm4, %xmm5
585; X86-SSE-NEXT:    movdqa %xmm5, %xmm4
586; X86-SSE-NEXT:    pandn %xmm2, %xmm4
587; X86-SSE-NEXT:    psraw $1, %xmm2
588; X86-SSE-NEXT:    pand %xmm5, %xmm2
589; X86-SSE-NEXT:    por %xmm4, %xmm2
590; X86-SSE-NEXT:    psrlw $8, %xmm2
591; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
592; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
593; X86-SSE-NEXT:    pxor %xmm4, %xmm4
594; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
595; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
596; X86-SSE-NEXT:    pandn %xmm0, %xmm5
597; X86-SSE-NEXT:    psraw $4, %xmm0
598; X86-SSE-NEXT:    pand %xmm4, %xmm0
599; X86-SSE-NEXT:    por %xmm5, %xmm0
600; X86-SSE-NEXT:    paddw %xmm1, %xmm1
601; X86-SSE-NEXT:    pxor %xmm4, %xmm4
602; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm4
603; X86-SSE-NEXT:    movdqa %xmm4, %xmm5
604; X86-SSE-NEXT:    pandn %xmm0, %xmm5
605; X86-SSE-NEXT:    psraw $2, %xmm0
606; X86-SSE-NEXT:    pand %xmm4, %xmm0
607; X86-SSE-NEXT:    por %xmm5, %xmm0
608; X86-SSE-NEXT:    paddw %xmm1, %xmm1
609; X86-SSE-NEXT:    pcmpgtw %xmm1, %xmm3
610; X86-SSE-NEXT:    movdqa %xmm3, %xmm1
611; X86-SSE-NEXT:    pandn %xmm0, %xmm1
612; X86-SSE-NEXT:    psraw $1, %xmm0
613; X86-SSE-NEXT:    pand %xmm3, %xmm0
614; X86-SSE-NEXT:    por %xmm1, %xmm0
615; X86-SSE-NEXT:    psrlw $8, %xmm0
616; X86-SSE-NEXT:    packuswb %xmm2, %xmm0
617; X86-SSE-NEXT:    retl
  ; The IR under test: lane-wise arithmetic (sign-preserving) right shift.
618  %shift = ashr <16 x i8> %a, %b
619  ret <16 x i8> %shift
620}
621
622;
623; Uniform Variable Shifts
624;
625
; Arithmetic shift right of <2 x i64> where every lane uses the same shift
; amount (element 0 of %b, splatted via shufflevector). Autogenerated CHECK
; lines (update_llc_test_checks.py) -- regenerate, do not hand-edit.
626define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
627; SSE-LABEL: splatvar_shift_v2i64:
628; SSE:       # %bb.0:
629; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
630; SSE-NEXT:    psrlq %xmm1, %xmm2
631; SSE-NEXT:    psrlq %xmm1, %xmm0
632; SSE-NEXT:    pxor %xmm2, %xmm0
633; SSE-NEXT:    psubq %xmm2, %xmm0
634; SSE-NEXT:    retq
635;
636; AVX1-LABEL: splatvar_shift_v2i64:
637; AVX1:       # %bb.0:
638; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
639; AVX1-NEXT:    # xmm2 = mem[0,0]
640; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
641; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
642; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
643; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
644; AVX1-NEXT:    retq
645;
646; AVX2-LABEL: splatvar_shift_v2i64:
647; AVX2:       # %bb.0:
648; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
649; AVX2-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
650; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
651; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
652; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
653; AVX2-NEXT:    retq
654;
655; XOPAVX1-LABEL: splatvar_shift_v2i64:
656; XOPAVX1:       # %bb.0:
657; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
658; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
659; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
660; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
661; XOPAVX1-NEXT:    retq
662;
663; XOPAVX2-LABEL: splatvar_shift_v2i64:
664; XOPAVX2:       # %bb.0:
665; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
666; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
667; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
668; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
669; XOPAVX2-NEXT:    retq
670;
671; AVX512-LABEL: splatvar_shift_v2i64:
672; AVX512:       # %bb.0:
673; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
674; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
675; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
676; AVX512-NEXT:    vzeroupper
677; AVX512-NEXT:    retq
678;
679; AVX512VL-LABEL: splatvar_shift_v2i64:
680; AVX512VL:       # %bb.0:
681; AVX512VL-NEXT:    vpsraq %xmm1, %xmm0, %xmm0
682; AVX512VL-NEXT:    retq
683;
684; X86-SSE-LABEL: splatvar_shift_v2i64:
685; X86-SSE:       # %bb.0:
686; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
687; X86-SSE-NEXT:    psrlq %xmm1, %xmm2
688; X86-SSE-NEXT:    psrlq %xmm1, %xmm0
689; X86-SSE-NEXT:    pxor %xmm2, %xmm0
690; X86-SSE-NEXT:    psubq %xmm2, %xmm0
691; X86-SSE-NEXT:    retl
  ; Broadcast lane 0 of %b, then shift every lane of %a by it.
692  %splat = shufflevector <2 x i64> %b, <2 x i64> poison, <2 x i32> zeroinitializer
693  %shift = ashr <2 x i64> %a, %splat
694  ret <2 x i64> %shift
695}
696
; Arithmetic shift right of <4 x i32> where every lane uses the same shift
; amount (element 0 of %b, splatted via shufflevector). Autogenerated CHECK
; lines (update_llc_test_checks.py) -- regenerate, do not hand-edit.
697define <4 x i32> @splatvar_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
698; SSE2-LABEL: splatvar_shift_v4i32:
699; SSE2:       # %bb.0:
700; SSE2-NEXT:    xorps %xmm2, %xmm2
701; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
702; SSE2-NEXT:    psrad %xmm2, %xmm0
703; SSE2-NEXT:    retq
704;
705; SSE41-LABEL: splatvar_shift_v4i32:
706; SSE41:       # %bb.0:
707; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
708; SSE41-NEXT:    psrad %xmm1, %xmm0
709; SSE41-NEXT:    retq
710;
711; AVX-LABEL: splatvar_shift_v4i32:
712; AVX:       # %bb.0:
713; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
714; AVX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
715; AVX-NEXT:    retq
716;
717; XOP-LABEL: splatvar_shift_v4i32:
718; XOP:       # %bb.0:
719; XOP-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
720; XOP-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
721; XOP-NEXT:    retq
722;
723; AVX512-LABEL: splatvar_shift_v4i32:
724; AVX512:       # %bb.0:
725; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
726; AVX512-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
727; AVX512-NEXT:    retq
728;
729; AVX512VL-LABEL: splatvar_shift_v4i32:
730; AVX512VL:       # %bb.0:
731; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
732; AVX512VL-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
733; AVX512VL-NEXT:    retq
734;
735; X86-SSE-LABEL: splatvar_shift_v4i32:
736; X86-SSE:       # %bb.0:
737; X86-SSE-NEXT:    xorps %xmm2, %xmm2
738; X86-SSE-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
739; X86-SSE-NEXT:    psrad %xmm2, %xmm0
740; X86-SSE-NEXT:    retl
  ; Broadcast lane 0 of %b, then shift every lane of %a by it.
741  %splat = shufflevector <4 x i32> %b, <4 x i32> poison, <4 x i32> zeroinitializer
742  %shift = ashr <4 x i32> %a, %splat
743  ret <4 x i32> %shift
744}
745
; Uniform (splatted) variable ashr of <8 x i16>: every lane shifts by %b[0].
; Lowered to a single psraw/vpsraw; the count register's low 64 bits must hold
; the scalar amount, so SSE2 isolates the low word with a pslldq/psrldq pair
; while SSE4.1+ zero-extends lane 0 with (v)pmovzxwq.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — do not
; edit them by hand; rerun the script after codegen changes.
746define <8 x i16> @splatvar_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
747; SSE2-LABEL: splatvar_shift_v8i16:
748; SSE2:       # %bb.0:
749; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
750; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
751; SSE2-NEXT:    psraw %xmm1, %xmm0
752; SSE2-NEXT:    retq
753;
754; SSE41-LABEL: splatvar_shift_v8i16:
755; SSE41:       # %bb.0:
756; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
757; SSE41-NEXT:    psraw %xmm1, %xmm0
758; SSE41-NEXT:    retq
759;
760; AVX-LABEL: splatvar_shift_v8i16:
761; AVX:       # %bb.0:
762; AVX-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
763; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
764; AVX-NEXT:    retq
765;
766; XOP-LABEL: splatvar_shift_v8i16:
767; XOP:       # %bb.0:
768; XOP-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
769; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
770; XOP-NEXT:    retq
771;
772; AVX512-LABEL: splatvar_shift_v8i16:
773; AVX512:       # %bb.0:
774; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
775; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
776; AVX512-NEXT:    retq
777;
778; AVX512VL-LABEL: splatvar_shift_v8i16:
779; AVX512VL:       # %bb.0:
780; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
781; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
782; AVX512VL-NEXT:    retq
783;
784; X86-SSE-LABEL: splatvar_shift_v8i16:
785; X86-SSE:       # %bb.0:
786; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1]
787; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
788; X86-SSE-NEXT:    psraw %xmm1, %xmm0
789; X86-SSE-NEXT:    retl
; IR under test: broadcast lane 0 of %b, then arithmetic shift right.
790  %splat = shufflevector <8 x i16> %b, <8 x i16> poison, <8 x i32> zeroinitializer
791  %shift = ashr <8 x i16> %a, %splat
792  ret <8 x i16> %shift
793}
794
; Uniform (splatted) variable ashr of <16 x i8>: every lane shifts by %b[0].
; x86 has no native byte shifts, so SSE/AVX emulate: do a word-granular psrlw,
; mask off the bits that leaked in from the neighboring byte, then restore the
; sign via the xor/psubb trick with a shifted 0x80-per-byte constant
; (32896 = 0x8080 per word). XOP instead negates the splatted amount and uses
; its native per-byte vpshab; AVX512 widens to i32 (DQ) or i16 (BW) lanes,
; shifts there, and truncates back.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — do not
; edit them by hand; rerun the script after codegen changes.
795define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
796; SSE2-LABEL: splatvar_shift_v16i8:
797; SSE2:       # %bb.0:
798; SSE2-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
799; SSE2-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
800; SSE2-NEXT:    psrlw %xmm1, %xmm0
801; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
802; SSE2-NEXT:    psrlw %xmm1, %xmm2
803; SSE2-NEXT:    psrlw $8, %xmm2
804; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
805; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
806; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
807; SSE2-NEXT:    pand %xmm2, %xmm0
808; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
809; SSE2-NEXT:    psrlw %xmm1, %xmm2
810; SSE2-NEXT:    pxor %xmm2, %xmm0
811; SSE2-NEXT:    psubb %xmm2, %xmm0
812; SSE2-NEXT:    retq
813;
814; SSE41-LABEL: splatvar_shift_v16i8:
815; SSE41:       # %bb.0:
816; SSE41-NEXT:    pmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
817; SSE41-NEXT:    psrlw %xmm1, %xmm0
818; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
819; SSE41-NEXT:    psrlw %xmm1, %xmm2
820; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
821; SSE41-NEXT:    pand %xmm2, %xmm0
822; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
823; SSE41-NEXT:    psrlw %xmm1, %xmm2
824; SSE41-NEXT:    pxor %xmm2, %xmm0
825; SSE41-NEXT:    psubb %xmm2, %xmm0
826; SSE41-NEXT:    retq
827;
828; AVX1-LABEL: splatvar_shift_v16i8:
829; AVX1:       # %bb.0:
830; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
831; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
832; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
833; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
834; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
835; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
836; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
837; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
838; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
839; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
840; AVX1-NEXT:    retq
841;
842; AVX2-LABEL: splatvar_shift_v16i8:
843; AVX2:       # %bb.0:
844; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
845; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
846; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
847; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
848; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
849; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
850; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
851; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
852; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
853; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
854; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
855; AVX2-NEXT:    retq
856;
857; XOPAVX1-LABEL: splatvar_shift_v16i8:
858; XOPAVX1:       # %bb.0:
859; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
860; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
861; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
862; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
863; XOPAVX1-NEXT:    retq
864;
865; XOPAVX2-LABEL: splatvar_shift_v16i8:
866; XOPAVX2:       # %bb.0:
867; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
868; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
869; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
870; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
871; XOPAVX2-NEXT:    retq
872;
873; AVX512DQ-LABEL: splatvar_shift_v16i8:
874; AVX512DQ:       # %bb.0:
875; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
876; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
877; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
878; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
879; AVX512DQ-NEXT:    vzeroupper
880; AVX512DQ-NEXT:    retq
881;
882; AVX512BW-LABEL: splatvar_shift_v16i8:
883; AVX512BW:       # %bb.0:
884; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
885; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
886; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
887; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
888; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
889; AVX512BW-NEXT:    vzeroupper
890; AVX512BW-NEXT:    retq
891;
892; AVX512DQVL-LABEL: splatvar_shift_v16i8:
893; AVX512DQVL:       # %bb.0:
894; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
895; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
896; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
897; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
898; AVX512DQVL-NEXT:    vzeroupper
899; AVX512DQVL-NEXT:    retq
900;
901; AVX512BWVL-LABEL: splatvar_shift_v16i8:
902; AVX512BWVL:       # %bb.0:
903; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
904; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
905; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
906; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
907; AVX512BWVL-NEXT:    vzeroupper
908; AVX512BWVL-NEXT:    retq
909;
910; X86-SSE-LABEL: splatvar_shift_v16i8:
911; X86-SSE:       # %bb.0:
912; X86-SSE-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0]
913; X86-SSE-NEXT:    psrldq {{.*#+}} xmm1 = xmm1[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
914; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
915; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
916; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
917; X86-SSE-NEXT:    psrlw $8, %xmm2
918; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
919; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
920; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
921; X86-SSE-NEXT:    pand %xmm2, %xmm0
922; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
923; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
924; X86-SSE-NEXT:    pxor %xmm2, %xmm0
925; X86-SSE-NEXT:    psubb %xmm2, %xmm0
926; X86-SSE-NEXT:    retl
; IR under test: broadcast lane 0 of %b, then arithmetic shift right.
927  %splat = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer
928  %shift = ashr <16 x i8> %a, %splat
929  ret <16 x i8> %shift
930}
931
932;
933; Uniform Variable Modulo Shifts
934;
935
; Splatted variable ashr of <2 x i64> where the amount is first masked with 63
; (the type's bit width minus one). Pre-AVX512 targets have no 64-bit
; arithmetic shift, so they emulate it: logically shift both the value and a
; sign-bit constant (0x8000000000000000), then xor/psubq to re-sign-extend.
; AVX512 targets use native vpsraq. The explicit 'and' stays as a pand/vpand
; in the output; the splat itself folds into the scalar-count shift form.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — do not
; edit them by hand; rerun the script after codegen changes.
936define <2 x i64> @splatvar_modulo_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
937; SSE-LABEL: splatvar_modulo_shift_v2i64:
938; SSE:       # %bb.0:
939; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
940; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
941; SSE-NEXT:    psrlq %xmm1, %xmm2
942; SSE-NEXT:    psrlq %xmm1, %xmm0
943; SSE-NEXT:    pxor %xmm2, %xmm0
944; SSE-NEXT:    psubq %xmm2, %xmm0
945; SSE-NEXT:    retq
946;
947; AVX1-LABEL: splatvar_modulo_shift_v2i64:
948; AVX1:       # %bb.0:
949; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
950; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
951; AVX1-NEXT:    # xmm2 = mem[0,0]
952; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
953; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
954; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
955; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
956; AVX1-NEXT:    retq
957;
958; AVX2-LABEL: splatvar_modulo_shift_v2i64:
959; AVX2:       # %bb.0:
960; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
961; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
962; AVX2-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
963; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
964; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
965; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
966; AVX2-NEXT:    retq
967;
968; XOPAVX1-LABEL: splatvar_modulo_shift_v2i64:
969; XOPAVX1:       # %bb.0:
970; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
971; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
972; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
973; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
974; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
975; XOPAVX1-NEXT:    retq
976;
977; XOPAVX2-LABEL: splatvar_modulo_shift_v2i64:
978; XOPAVX2:       # %bb.0:
979; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
980; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
981; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
982; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
983; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
984; XOPAVX2-NEXT:    retq
985;
986; AVX512-LABEL: splatvar_modulo_shift_v2i64:
987; AVX512:       # %bb.0:
988; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
989; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
990; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
991; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
992; AVX512-NEXT:    vzeroupper
993; AVX512-NEXT:    retq
994;
995; AVX512VL-LABEL: splatvar_modulo_shift_v2i64:
996; AVX512VL:       # %bb.0:
997; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
998; AVX512VL-NEXT:    vpsraq %xmm1, %xmm0, %xmm0
999; AVX512VL-NEXT:    retq
1000;
1001; X86-SSE-LABEL: splatvar_modulo_shift_v2i64:
1002; X86-SSE:       # %bb.0:
1003; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1004; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
1005; X86-SSE-NEXT:    psrlq %xmm1, %xmm2
1006; X86-SSE-NEXT:    psrlq %xmm1, %xmm0
1007; X86-SSE-NEXT:    pxor %xmm2, %xmm0
1008; X86-SSE-NEXT:    psubq %xmm2, %xmm0
1009; X86-SSE-NEXT:    retl
; IR under test: mask the amount to [0,63], broadcast lane 0, then ashr.
1010  %mod = and <2 x i64> %b, <i64 63, i64 63>
1011  %splat = shufflevector <2 x i64> %mod, <2 x i64> poison, <2 x i32> zeroinitializer
1012  %shift = ashr <2 x i64> %a, %splat
1013  ret <2 x i64> %shift
1014}
1015
; Splatted variable ashr of <4 x i32> with the amount pre-masked by 31.
; Because the mask already clears the bits above the element's low 64-bit
; count range, the extra zero-extend of the splat lane (seen in
; splatvar_shift_v4i32) is not needed: every target emits just pand + psrad.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — do not
; edit them by hand; rerun the script after codegen changes.
1016define <4 x i32> @splatvar_modulo_shift_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
1017; SSE-LABEL: splatvar_modulo_shift_v4i32:
1018; SSE:       # %bb.0:
1019; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1020; SSE-NEXT:    psrad %xmm1, %xmm0
1021; SSE-NEXT:    retq
1022;
1023; AVX-LABEL: splatvar_modulo_shift_v4i32:
1024; AVX:       # %bb.0:
1025; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1026; AVX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1027; AVX-NEXT:    retq
1028;
1029; XOP-LABEL: splatvar_modulo_shift_v4i32:
1030; XOP:       # %bb.0:
1031; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1032; XOP-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1033; XOP-NEXT:    retq
1034;
1035; AVX512-LABEL: splatvar_modulo_shift_v4i32:
1036; AVX512:       # %bb.0:
1037; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1038; AVX512-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1039; AVX512-NEXT:    retq
1040;
1041; AVX512VL-LABEL: splatvar_modulo_shift_v4i32:
1042; AVX512VL:       # %bb.0:
1043; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1044; AVX512VL-NEXT:    vpsrad %xmm1, %xmm0, %xmm0
1045; AVX512VL-NEXT:    retq
1046;
1047; X86-SSE-LABEL: splatvar_modulo_shift_v4i32:
1048; X86-SSE:       # %bb.0:
1049; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1050; X86-SSE-NEXT:    psrad %xmm1, %xmm0
1051; X86-SSE-NEXT:    retl
; IR under test: mask the amount to [0,31], broadcast lane 0, then ashr.
1052  %mod = and <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
1053  %splat = shufflevector <4 x i32> %mod, <4 x i32> poison, <4 x i32> zeroinitializer
1054  %shift = ashr <4 x i32> %a, %splat
1055  ret <4 x i32> %shift
1056}
1057
; Splatted variable ashr of <8 x i16> with the amount pre-masked by 15.
; As with the i32 case, the mask makes the splat-lane extension redundant,
; so every target lowers to just pand + psraw.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — do not
; edit them by hand; rerun the script after codegen changes.
1058define <8 x i16> @splatvar_modulo_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
1059; SSE-LABEL: splatvar_modulo_shift_v8i16:
1060; SSE:       # %bb.0:
1061; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1062; SSE-NEXT:    psraw %xmm1, %xmm0
1063; SSE-NEXT:    retq
1064;
1065; AVX-LABEL: splatvar_modulo_shift_v8i16:
1066; AVX:       # %bb.0:
1067; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1068; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1069; AVX-NEXT:    retq
1070;
1071; XOP-LABEL: splatvar_modulo_shift_v8i16:
1072; XOP:       # %bb.0:
1073; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1074; XOP-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1075; XOP-NEXT:    retq
1076;
1077; AVX512-LABEL: splatvar_modulo_shift_v8i16:
1078; AVX512:       # %bb.0:
1079; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1080; AVX512-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1081; AVX512-NEXT:    retq
1082;
1083; AVX512VL-LABEL: splatvar_modulo_shift_v8i16:
1084; AVX512VL:       # %bb.0:
1085; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1086; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0
1087; AVX512VL-NEXT:    retq
1088;
1089; X86-SSE-LABEL: splatvar_modulo_shift_v8i16:
1090; X86-SSE:       # %bb.0:
1091; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1092; X86-SSE-NEXT:    psraw %xmm1, %xmm0
1093; X86-SSE-NEXT:    retl
; IR under test: mask the amount to [0,15], broadcast lane 0, then ashr.
1094  %mod = and <8 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
1095  %splat = shufflevector <8 x i16> %mod, <8 x i16> poison, <8 x i32> zeroinitializer
1096  %shift = ashr <8 x i16> %a, %splat
1097  ret <8 x i16> %shift
1098}
1099
; Splatted variable ashr of <16 x i8> with the amount pre-masked by 7.
; Same byte-shift emulation as splatvar_shift_v16i8 (word psrlw + leaked-bit
; mask + 0x80 xor/psubb sign fixup; XOP uses negated counts with vpshab;
; AVX512 widens, shifts, truncates), except the explicit 'and' replaces the
; splat-lane extraction sequence at the front.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — do not
; edit them by hand; rerun the script after codegen changes.
1100define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
1101; SSE2-LABEL: splatvar_modulo_shift_v16i8:
1102; SSE2:       # %bb.0:
1103; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1104; SSE2-NEXT:    psrlw %xmm1, %xmm0
1105; SSE2-NEXT:    pcmpeqd %xmm2, %xmm2
1106; SSE2-NEXT:    psrlw %xmm1, %xmm2
1107; SSE2-NEXT:    psrlw $8, %xmm2
1108; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1109; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1110; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1111; SSE2-NEXT:    pand %xmm2, %xmm0
1112; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1113; SSE2-NEXT:    psrlw %xmm1, %xmm2
1114; SSE2-NEXT:    pxor %xmm2, %xmm0
1115; SSE2-NEXT:    psubb %xmm2, %xmm0
1116; SSE2-NEXT:    retq
1117;
1118; SSE41-LABEL: splatvar_modulo_shift_v16i8:
1119; SSE41:       # %bb.0:
1120; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1121; SSE41-NEXT:    psrlw %xmm1, %xmm0
1122; SSE41-NEXT:    pcmpeqd %xmm2, %xmm2
1123; SSE41-NEXT:    psrlw %xmm1, %xmm2
1124; SSE41-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1125; SSE41-NEXT:    pand %xmm2, %xmm0
1126; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1127; SSE41-NEXT:    psrlw %xmm1, %xmm2
1128; SSE41-NEXT:    pxor %xmm2, %xmm0
1129; SSE41-NEXT:    psubb %xmm2, %xmm0
1130; SSE41-NEXT:    retq
1131;
1132; AVX1-LABEL: splatvar_modulo_shift_v16i8:
1133; AVX1:       # %bb.0:
1134; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1135; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1136; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1137; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1138; AVX1-NEXT:    vpshufb {{.*#+}} xmm2 = xmm2[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1139; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1140; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1141; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1142; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1143; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1144; AVX1-NEXT:    retq
1145;
1146; AVX2-LABEL: splatvar_modulo_shift_v16i8:
1147; AVX2:       # %bb.0:
1148; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1149; AVX2-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1150; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1151; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1152; AVX2-NEXT:    vpsrlw $8, %xmm2, %xmm2
1153; AVX2-NEXT:    vpbroadcastb %xmm2, %xmm2
1154; AVX2-NEXT:    vpand %xmm2, %xmm0, %xmm0
1155; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
1156; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1157; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1158; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
1159; AVX2-NEXT:    retq
1160;
1161; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8:
1162; XOPAVX1:       # %bb.0:
1163; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1164; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1165; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1166; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1167; XOPAVX1-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1168; XOPAVX1-NEXT:    retq
1169;
1170; XOPAVX2-LABEL: splatvar_modulo_shift_v16i8:
1171; XOPAVX2:       # %bb.0:
1172; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
1173; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1174; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1175; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1176; XOPAVX2-NEXT:    vpshab %xmm1, %xmm0, %xmm0
1177; XOPAVX2-NEXT:    retq
1178;
1179; AVX512DQ-LABEL: splatvar_modulo_shift_v16i8:
1180; AVX512DQ:       # %bb.0:
1181; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1182; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
1183; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1184; AVX512DQ-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1185; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
1186; AVX512DQ-NEXT:    vzeroupper
1187; AVX512DQ-NEXT:    retq
1188;
1189; AVX512BW-LABEL: splatvar_modulo_shift_v16i8:
1190; AVX512BW:       # %bb.0:
1191; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1192; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
1193; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1194; AVX512BW-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1195; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1196; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1197; AVX512BW-NEXT:    vzeroupper
1198; AVX512BW-NEXT:    retq
1199;
1200; AVX512DQVL-LABEL: splatvar_modulo_shift_v16i8:
1201; AVX512DQVL:       # %bb.0:
1202; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
1203; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
1204; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1205; AVX512DQVL-NEXT:    vpsrad %xmm1, %zmm0, %zmm0
1206; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
1207; AVX512DQVL-NEXT:    vzeroupper
1208; AVX512DQVL-NEXT:    retq
1209;
1210; AVX512BWVL-LABEL: splatvar_modulo_shift_v16i8:
1211; AVX512BWVL:       # %bb.0:
1212; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
1213; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
1214; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1215; AVX512BWVL-NEXT:    vpsraw %xmm1, %ymm0, %ymm0
1216; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
1217; AVX512BWVL-NEXT:    vzeroupper
1218; AVX512BWVL-NEXT:    retq
1219;
1220; X86-SSE-LABEL: splatvar_modulo_shift_v16i8:
1221; X86-SSE:       # %bb.0:
1222; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1223; X86-SSE-NEXT:    psrlw %xmm1, %xmm0
1224; X86-SSE-NEXT:    pcmpeqd %xmm2, %xmm2
1225; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1226; X86-SSE-NEXT:    psrlw $8, %xmm2
1227; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1228; X86-SSE-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
1229; X86-SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
1230; X86-SSE-NEXT:    pand %xmm2, %xmm0
1231; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896]
1232; X86-SSE-NEXT:    psrlw %xmm1, %xmm2
1233; X86-SSE-NEXT:    pxor %xmm2, %xmm0
1234; X86-SSE-NEXT:    psubb %xmm2, %xmm0
1235; X86-SSE-NEXT:    retl
; IR under test: mask the amount to [0,7], broadcast lane 0, then ashr.
1236  %mod = and <16 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
1237  %splat = shufflevector <16 x i8> %mod, <16 x i8> poison, <16 x i32> zeroinitializer
1238  %shift = ashr <16 x i8> %a, %splat
1239  ret <16 x i8> %shift
1240}
1241
1242;
1243; Constant Shifts
1244;
1245
; Non-uniform constant ashr of <2 x i64> (amounts 1 and 7). Pre-AVX512,
; the per-lane logical shifts are done separately (psrlq $1 / psrlq $7) and
; blended, then the sign is restored by xor/psubq against the correspondingly
; shifted sign-bit constants (0x8000000000000000 >> 1 and >> 7). AVX2 folds
; the two logical shifts into one vpsrlvq; XOP and AVX512 have native
; per-lane arithmetic shifts (vpshaq / vpsravq).
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — do not
; edit them by hand; rerun the script after codegen changes.
1246define <2 x i64> @constant_shift_v2i64(<2 x i64> %a) nounwind {
1247; SSE2-LABEL: constant_shift_v2i64:
1248; SSE2:       # %bb.0:
1249; SSE2-NEXT:    movdqa %xmm0, %xmm1
1250; SSE2-NEXT:    psrlq $1, %xmm1
1251; SSE2-NEXT:    psrlq $7, %xmm0
1252; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1253; SSE2-NEXT:    movapd {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
1254; SSE2-NEXT:    xorpd %xmm1, %xmm0
1255; SSE2-NEXT:    psubq %xmm1, %xmm0
1256; SSE2-NEXT:    retq
1257;
1258; SSE41-LABEL: constant_shift_v2i64:
1259; SSE41:       # %bb.0:
1260; SSE41-NEXT:    movdqa %xmm0, %xmm1
1261; SSE41-NEXT:    psrlq $7, %xmm1
1262; SSE41-NEXT:    psrlq $1, %xmm0
1263; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1264; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
1265; SSE41-NEXT:    pxor %xmm1, %xmm0
1266; SSE41-NEXT:    psubq %xmm1, %xmm0
1267; SSE41-NEXT:    retq
1268;
1269; AVX1-LABEL: constant_shift_v2i64:
1270; AVX1:       # %bb.0:
1271; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
1272; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
1273; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1274; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
1275; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1276; AVX1-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
1277; AVX1-NEXT:    retq
1278;
1279; AVX2-LABEL: constant_shift_v2i64:
1280; AVX2:       # %bb.0:
1281; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1282; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [4611686018427387904,72057594037927936]
1283; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
1284; AVX2-NEXT:    vpsubq %xmm1, %xmm0, %xmm0
1285; AVX2-NEXT:    retq
1286;
1287; XOP-LABEL: constant_shift_v2i64:
1288; XOP:       # %bb.0:
1289; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1290; XOP-NEXT:    retq
1291;
1292; AVX512-LABEL: constant_shift_v2i64:
1293; AVX512:       # %bb.0:
1294; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1295; AVX512-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [1,7]
1296; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
1297; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1298; AVX512-NEXT:    vzeroupper
1299; AVX512-NEXT:    retq
1300;
1301; AVX512VL-LABEL: constant_shift_v2i64:
1302; AVX512VL:       # %bb.0:
1303; AVX512VL-NEXT:    vpsravq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1304; AVX512VL-NEXT:    retq
1305;
1306; X86-SSE-LABEL: constant_shift_v2i64:
1307; X86-SSE:       # %bb.0:
1308; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1309; X86-SSE-NEXT:    psrlq $1, %xmm1
1310; X86-SSE-NEXT:    psrlq $7, %xmm0
1311; X86-SSE-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1312; X86-SSE-NEXT:    movapd {{.*#+}} xmm1 = [0,1073741824,0,16777216]
1313; X86-SSE-NEXT:    xorpd %xmm1, %xmm0
1314; X86-SSE-NEXT:    psubq %xmm1, %xmm0
1315; X86-SSE-NEXT:    retl
; IR under test: constant-vector arithmetic shift right.
1316  %shift = ashr <2 x i64> %a, <i64 1, i64 7>
1317  ret <2 x i64> %shift
1318}
1319
; Non-uniform constant ashr of <4 x i32> (amounts 4,5,6,7). Targets without
; per-lane variable shifts (SSE2/SSE4.1/AVX1) emit one psrad per distinct
; amount and recombine the lanes with unpack/shufps (SSE2) or pblendw
; (SSE4.1/AVX1). AVX2+ and XOP use a single per-lane shift
; (vpsravd / vpshad) with a constant-pool amount vector.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — do not
; edit them by hand; rerun the script after codegen changes.
1320define <4 x i32> @constant_shift_v4i32(<4 x i32> %a) nounwind {
1321; SSE2-LABEL: constant_shift_v4i32:
1322; SSE2:       # %bb.0:
1323; SSE2-NEXT:    movdqa %xmm0, %xmm1
1324; SSE2-NEXT:    psrad $7, %xmm1
1325; SSE2-NEXT:    movdqa %xmm0, %xmm2
1326; SSE2-NEXT:    psrad $6, %xmm2
1327; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1328; SSE2-NEXT:    movdqa %xmm0, %xmm1
1329; SSE2-NEXT:    psrad $5, %xmm1
1330; SSE2-NEXT:    psrad $4, %xmm0
1331; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1332; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
1333; SSE2-NEXT:    retq
1334;
1335; SSE41-LABEL: constant_shift_v4i32:
1336; SSE41:       # %bb.0:
1337; SSE41-NEXT:    movdqa %xmm0, %xmm1
1338; SSE41-NEXT:    psrad $7, %xmm1
1339; SSE41-NEXT:    movdqa %xmm0, %xmm2
1340; SSE41-NEXT:    psrad $5, %xmm2
1341; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1342; SSE41-NEXT:    movdqa %xmm0, %xmm1
1343; SSE41-NEXT:    psrad $6, %xmm1
1344; SSE41-NEXT:    psrad $4, %xmm0
1345; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1346; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1347; SSE41-NEXT:    retq
1348;
1349; AVX1-LABEL: constant_shift_v4i32:
1350; AVX1:       # %bb.0:
1351; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm1
1352; AVX1-NEXT:    vpsrad $5, %xmm0, %xmm2
1353; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1354; AVX1-NEXT:    vpsrad $6, %xmm0, %xmm2
1355; AVX1-NEXT:    vpsrad $4, %xmm0, %xmm0
1356; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1357; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1358; AVX1-NEXT:    retq
1359;
1360; AVX2-LABEL: constant_shift_v4i32:
1361; AVX2:       # %bb.0:
1362; AVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1363; AVX2-NEXT:    retq
1364;
1365; XOPAVX1-LABEL: constant_shift_v4i32:
1366; XOPAVX1:       # %bb.0:
1367; XOPAVX1-NEXT:    vpshad {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1368; XOPAVX1-NEXT:    retq
1369;
1370; XOPAVX2-LABEL: constant_shift_v4i32:
1371; XOPAVX2:       # %bb.0:
1372; XOPAVX2-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1373; XOPAVX2-NEXT:    retq
1374;
1375; AVX512-LABEL: constant_shift_v4i32:
1376; AVX512:       # %bb.0:
1377; AVX512-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1378; AVX512-NEXT:    retq
1379;
1380; AVX512VL-LABEL: constant_shift_v4i32:
1381; AVX512VL:       # %bb.0:
1382; AVX512VL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1383; AVX512VL-NEXT:    retq
1384;
1385; X86-SSE-LABEL: constant_shift_v4i32:
1386; X86-SSE:       # %bb.0:
1387; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1388; X86-SSE-NEXT:    psrad $7, %xmm1
1389; X86-SSE-NEXT:    movdqa %xmm0, %xmm2
1390; X86-SSE-NEXT:    psrad $6, %xmm2
1391; X86-SSE-NEXT:    punpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm1[1]
1392; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1393; X86-SSE-NEXT:    psrad $5, %xmm1
1394; X86-SSE-NEXT:    psrad $4, %xmm0
1395; X86-SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
1396; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3]
1397; X86-SSE-NEXT:    retl
; IR under test: constant-vector arithmetic shift right.
1398  %shift = ashr <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
1399  ret <4 x i32> %shift
1400}
1401
; Non-uniform constant ashr of <8 x i16> (amounts 0..7). SSE2 progressively
; combines psraw $4 / $2 / $1 results with shuffles and an and/andn/or blend.
; SSE4.1/AVX implement most lanes as a pmulhw by 2^(16-amt) (the multiply-high
; result equals the arithmetic shift), then patch lanes 0 (shift by 0 — the
; original value) and 1 (psraw $1) with pblendw. XOP uses native vpshaw;
; AVX512DQ widens to i32 for vpsravd; AVX512BW uses per-lane vpsravw.
; NOTE: CHECK lines are autogenerated by update_llc_test_checks.py — do not
; edit them by hand; rerun the script after codegen changes.
1402define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
1403; SSE2-LABEL: constant_shift_v8i16:
1404; SSE2:       # %bb.0:
1405; SSE2-NEXT:    movdqa %xmm0, %xmm1
1406; SSE2-NEXT:    psraw $4, %xmm1
1407; SSE2-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1408; SSE2-NEXT:    movapd %xmm1, %xmm2
1409; SSE2-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
1410; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
1411; SSE2-NEXT:    psraw $2, %xmm1
1412; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1413; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
1414; SSE2-NEXT:    movaps %xmm2, %xmm1
1415; SSE2-NEXT:    andps %xmm0, %xmm1
1416; SSE2-NEXT:    psraw $1, %xmm2
1417; SSE2-NEXT:    andnps %xmm2, %xmm0
1418; SSE2-NEXT:    orps %xmm1, %xmm0
1419; SSE2-NEXT:    retq
1420;
1421; SSE41-LABEL: constant_shift_v8i16:
1422; SSE41:       # %bb.0:
1423; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [u,u,16384,8192,4096,2048,1024,512]
1424; SSE41-NEXT:    pmulhw %xmm0, %xmm1
1425; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1426; SSE41-NEXT:    psraw $1, %xmm0
1427; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
1428; SSE41-NEXT:    retq
1429;
1430; AVX-LABEL: constant_shift_v8i16:
1431; AVX:       # %bb.0:
1432; AVX-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,16384,8192,4096,2048,1024,512]
1433; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1434; AVX-NEXT:    vpsraw $1, %xmm0, %xmm0
1435; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5,6,7]
1436; AVX-NEXT:    retq
1437;
1438; XOP-LABEL: constant_shift_v8i16:
1439; XOP:       # %bb.0:
1440; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1441; XOP-NEXT:    retq
1442;
1443; AVX512DQ-LABEL: constant_shift_v8i16:
1444; AVX512DQ:       # %bb.0:
1445; AVX512DQ-NEXT:    vpmovsxwd %xmm0, %ymm0
1446; AVX512DQ-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1447; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
1448; AVX512DQ-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
1449; AVX512DQ-NEXT:    vzeroupper
1450; AVX512DQ-NEXT:    retq
1451;
1452; AVX512BW-LABEL: constant_shift_v8i16:
1453; AVX512BW:       # %bb.0:
1454; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1455; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
1456; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
1457; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1458; AVX512BW-NEXT:    vzeroupper
1459; AVX512BW-NEXT:    retq
1460;
1461; AVX512DQVL-LABEL: constant_shift_v8i16:
1462; AVX512DQVL:       # %bb.0:
1463; AVX512DQVL-NEXT:    vpmovsxwd %xmm0, %ymm0
1464; AVX512DQVL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1465; AVX512DQVL-NEXT:    vpmovdw %ymm0, %xmm0
1466; AVX512DQVL-NEXT:    vzeroupper
1467; AVX512DQVL-NEXT:    retq
1468;
1469; AVX512BWVL-LABEL: constant_shift_v8i16:
1470; AVX512BWVL:       # %bb.0:
1471; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1472; AVX512BWVL-NEXT:    retq
1473;
1474; X86-SSE-LABEL: constant_shift_v8i16:
1475; X86-SSE:       # %bb.0:
1476; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
1477; X86-SSE-NEXT:    psraw $4, %xmm1
1478; X86-SSE-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
1479; X86-SSE-NEXT:    movapd %xmm1, %xmm2
1480; X86-SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm1[2,3]
1481; X86-SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3,2,3]
1482; X86-SSE-NEXT:    psraw $2, %xmm1
1483; X86-SSE-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
1484; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = [65535,0,65535,0,65535,0,65535,0]
1485; X86-SSE-NEXT:    movaps %xmm2, %xmm1
1486; X86-SSE-NEXT:    andps %xmm0, %xmm1
1487; X86-SSE-NEXT:    psraw $1, %xmm2
1488; X86-SSE-NEXT:    andnps %xmm2, %xmm0
1489; X86-SSE-NEXT:    orps %xmm1, %xmm0
1490; X86-SSE-NEXT:    retl
; IR under test: constant-vector arithmetic shift right.
1491  %shift = ashr <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
1492  ret <8 x i16> %shift
1493}
; Pair-wise ashr: adjacent i16 lanes share a shift amount <7,7,1,1,3,3,6,6>.
define <8 x i16> @constant_shift_v8i16_pairs(<8 x i16> %a) nounwind {
; SSE2-LABEL: constant_shift_v8i16_pairs:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [512,512,u,u,8192,8192,1024,1024]
; SSE2-NEXT:    pmulhw %xmm0, %xmm1
; SSE2-NEXT:    psraw $1, %xmm0
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v8i16_pairs:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psraw $1, %xmm1
; SSE41-NEXT:    pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [512,512,u,u,8192,8192,1024,1024]
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v8i16_pairs:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsraw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,512,u,u,8192,8192,1024,1024]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v8i16_pairs:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [256,256,16384,16384,4096,4096,512,512]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: constant_shift_v8i16_pairs:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshaw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v8i16_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [256,256,16384,16384,4096,4096,512,512]
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v8i16_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [7,7,1,1,3,3,6,6]
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v8i16_pairs:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [256,256,16384,16384,4096,4096,512,512]
; AVX512DQVL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512DQVL-NEXT:    vpsubw %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v8i16_pairs:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: constant_shift_v8i16_pairs:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [512,512,u,u,8192,8192,1024,1024]
; X86-SSE-NEXT:    pmulhw %xmm0, %xmm1
; X86-SSE-NEXT:    psraw $1, %xmm0
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,0]
; X86-SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3]
; X86-SSE-NEXT:    retl
  %shift = ashr <8 x i16> %a, <i16 7, i16 7, i16 1, i16 1, i16 3, i16 3, i16 6, i16 6>
  ret <8 x i16> %shift
}
; Per-lane ashr with distinct i8 amounts <0..7, 7..0> (lowered via i16 widening
; and pmullw/psrlw on SSE/AVX1).
define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: constant_shift_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    movdqa %xmm0, %xmm1
; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; SSE-NEXT:    psraw $8, %xmm1
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,4,8,16,32,64,128,256]
; SSE-NEXT:    psrlw $8, %xmm1
; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; SSE-NEXT:    psraw $8, %xmm0
; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [256,128,64,32,16,8,4,2]
; SSE-NEXT:    psrlw $8, %xmm0
; SSE-NEXT:    packuswb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsraw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256]
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsraw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,2,4,8,16,32,64,128,256]
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; XOP-LABEL: constant_shift_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v16i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQ-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQ-NEXT:    vzeroupper
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,7,6,5,4,3,2,1,0]
; AVX512BW-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BW-NEXT:    vpsravw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovsxbd %xmm0, %zmm0
; AVX512DQVL-NEXT:    vpsravd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512DQVL-NEXT:    vzeroupper
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovsxbw %xmm0, %ymm0
; AVX512BWVL-NEXT:    vpsravw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BWVL-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512BWVL-NEXT:    vzeroupper
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: constant_shift_v16i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
; X86-SSE-NEXT:    psraw $8, %xmm1
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2,4,8,16,32,64,128,256]
; X86-SSE-NEXT:    psrlw $8, %xmm1
; X86-SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; X86-SSE-NEXT:    psraw $8, %xmm0
; X86-SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [256,128,64,32,16,8,4,2]
; X86-SSE-NEXT:    psrlw $8, %xmm0
; X86-SSE-NEXT:    packuswb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <16 x i8> %shift
}
; Pair-wise i8 ashr: adjacent byte lanes share a shift amount
; <1,1,4,4,7,7,3,3,2,2,0,0,5,5,6,6>.
define <16 x i8> @constant_shift_v16i8_pairs(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8_pairs:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
; SSE2-NEXT:    pxor %xmm1, %xmm0
; SSE2-NEXT:    psubb %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v16i8_pairs:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [32768,4096,512,8192,16384,u,2048,1024]
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX-LABEL: constant_shift_v16i8_pairs:
; AVX:       # %bb.0:
; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32768,4096,512,8192,16384,u,2048,1024]
; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
; AVX-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: constant_shift_v16i8_pairs:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512DQ-LABEL: constant_shift_v16i8_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32768,4096,512,8192,16384,u,2048,1024]
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
; AVX512DQ-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v16i8_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [1,4,7,3,2,0,5,6]
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
; AVX512BW-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: constant_shift_v16i8_pairs:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [32768,4096,512,8192,16384,u,2048,1024]
; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4],xmm0[5],xmm1[6,7]
; AVX512DQVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
; AVX512DQVL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: constant_shift_v16i8_pairs:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512BWVL-NEXT:    vmovdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
; AVX512BWVL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: constant_shift_v16i8_pairs:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,65535,65535]
; X86-SSE-NEXT:    pandn %xmm0, %xmm1
; X86-SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    por %xmm1, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [64,64,8,8,1,1,16,16,32,32,128,128,4,4,2,2]
; X86-SSE-NEXT:    pxor %xmm1, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <16 x i8> %a, <i8 1, i8 1, i8 4, i8 4, i8 7, i8 7, i8 3, i8 3, i8 2, i8 2, i8 0, i8 0, i8 5, i8 5, i8 6, i8 6>
  ret <16 x i8> %shift
}
; Quad-wise i8 ashr: each group of four byte lanes shares a shift amount
; <0 x4, 1 x4, 3 x4, 2 x4>.
define <16 x i8> @constant_shift_v16i8_quads(<16 x i8> %a) nounwind {
; SSE2-LABEL: constant_shift_v16i8_quads:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [u,u,32768,32768,8192,8192,16384,16384]
; SSE2-NEXT:    pmulhuw %xmm0, %xmm1
; SSE2-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; SSE2-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE2-NEXT:    movaps {{.*#+}} xmm0 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32]
; SSE2-NEXT:    xorps %xmm0, %xmm1
; SSE2-NEXT:    psubb %xmm0, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: constant_shift_v16i8_quads:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [u,u,32768,32768,8192,8192,16384,16384]
; SSE41-NEXT:    pmulhuw %xmm0, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE41-NEXT:    movdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32]
; SSE41-NEXT:    pxor %xmm1, %xmm0
; SSE41-NEXT:    psubb %xmm1, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: constant_shift_v16i8_quads:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,32768,32768,8192,8192,16384,16384]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7]
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_shift_v16i8_quads:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: constant_shift_v16i8_quads:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: constant_shift_v16i8_quads:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32]
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: constant_shift_v16i8_quads:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32]
; AVX512VL-NEXT:    vpternlogq {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512VL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: constant_shift_v16i8_quads:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [u,u,32768,32768,8192,8192,16384,16384]
; X86-SSE-NEXT:    pmulhuw %xmm0, %xmm1
; X86-SSE-NEXT:    movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
; X86-SSE-NEXT:    andps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE-NEXT:    movaps {{.*#+}} xmm0 = [128,128,128,128,64,64,64,64,16,16,16,16,32,32,32,32]
; X86-SSE-NEXT:    xorps %xmm0, %xmm1
; X86-SSE-NEXT:    psubb %xmm0, %xmm1
; X86-SSE-NEXT:    movdqa %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <16 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3, i8 2, i8 2, i8 2, i8 2>
  ret <16 x i8> %shift
}

;
; Uniform Constant Shifts
;

; Uniform ashr by 7 on <2 x i64>; pre-AVX512 there is no vector i64 arithmetic
; shift, so it is emulated with psrad/psrlq plus a blend/shuffle.
define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
; SSE2-LABEL: splatconstant_shift_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; SSE2-NEXT:    psrad $7, %xmm1
; SSE2-NEXT:    psrlq $7, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    retq
;
; SSE41-LABEL: splatconstant_shift_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psrad $7, %xmm1
; SSE41-NEXT:    psrlq $7, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; SSE41-NEXT:    retq
;
; AVX1-LABEL: splatconstant_shift_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrad $7, %xmm0, %xmm1
; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrad $7, %xmm0, %xmm1
; AVX2-NEXT:    vpsrlq $7, %xmm0, %xmm0
; AVX2-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
; AVX2-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshaq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v2i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vpsraq $7, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraq $7, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v2i64:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,3,2,3]
; X86-SSE-NEXT:    psrad $7, %xmm1
; X86-SSE-NEXT:    psrlq $7, %xmm0
; X86-SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE-NEXT:    retl
  %shift = ashr <2 x i64> %a, <i64 7, i64 7>
  ret <2 x i64> %shift
}
; Uniform ashr by 5 on <4 x i32>; maps directly to a single psrad/vpsrad.
define <4 x i32> @splatconstant_shift_v4i32(<4 x i32> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psrad $5, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrad $5, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v4i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrad $5, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v4i32:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrad $5, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <4 x i32> %a, <i32 5, i32 5, i32 5, i32 5>
  ret <4 x i32> %shift
}
; Uniform ashr by 3 on <8 x i16>; maps directly to a single psraw/vpsraw.
define <8 x i16> @splatconstant_shift_v8i16(<8 x i16> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psraw $3, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_shift_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsraw $3, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v8i16:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_shift_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsraw $3, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v8i16:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psraw $3, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <8 x i16> %shift
}
; Uniform ashr by 3 on <16 x i8>; x86 has no byte shifts, so it is lowered as
; a logical word shift + mask + xor/sub sign-fixup.
define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) nounwind {
; SSE-LABEL: splatconstant_shift_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $3, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; SSE-NEXT:    pxor %xmm1, %xmm0
; SSE-NEXT:    psubb %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: splatconstant_shift_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX1-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_shift_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOP-LABEL: splatconstant_shift_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpshab {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOP-NEXT:    retq
;
; AVX512-LABEL: splatconstant_shift_v16i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX512-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512-NEXT:    retq
;
; AVX512DQVL-LABEL: splatconstant_shift_v16i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512DQVL-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512DQVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512DQVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: splatconstant_shift_v16i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlw $3, %xmm0, %xmm0
; AVX512BWVL-NEXT:    vpbroadcastb {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BWVL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm1 ^ (xmm0 & mem)
; AVX512BWVL-NEXT:    vpsubb %xmm1, %xmm0, %xmm0
; AVX512BWVL-NEXT:    retq
;
; X86-SSE-LABEL: splatconstant_shift_v16i8:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    psrlw $3, %xmm0
; X86-SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; X86-SSE-NEXT:    pxor %xmm1, %xmm0
; X86-SSE-NEXT:    psubb %xmm1, %xmm0
; X86-SSE-NEXT:    retl
  %shift = ashr <16 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <16 x i8> %shift
}
; Regression test for PR52719: ashr of <2 x i64> by a zero-extended splatted
; i32 amount (the zext guarantees the amount is in range).
define <2 x i64> @PR52719(<2 x i64> %a0, i32 %a1) {
; SSE-LABEL: PR52719:
; SSE:       # %bb.0:
; SSE-NEXT:    movd %edi, %xmm1
; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; SSE-NEXT:    psrlq %xmm1, %xmm2
; SSE-NEXT:    psrlq %xmm1, %xmm0
; SSE-NEXT:    pxor %xmm2, %xmm0
; SSE-NEXT:    psubq %xmm2, %xmm0
; SSE-NEXT:    retq
;
; AVX1-LABEL: PR52719:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vmovd %edi, %xmm1
; AVX1-NEXT:    vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT:    # xmm2 = mem[0,0]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: PR52719:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovd %edi, %xmm1
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
; AVX2-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX2-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    vpxor %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpsubq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: PR52719:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vmovd %edi, %xmm1
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX1-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: PR52719:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vmovd %edi, %xmm1
; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm1
; XOPAVX2-NEXT:    vpshaq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: PR52719:
; AVX512:       # %bb.0:
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512-NEXT:    vmovd %edi, %xmm1
; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
; AVX512-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT:    vzeroupper
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: PR52719:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovd %edi, %xmm1
; AVX512VL-NEXT:    vpsraq %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; X86-SSE-LABEL: PR52719:
; X86-SSE:       # %bb.0:
; X86-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; X86-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648]
; X86-SSE-NEXT:    psrlq %xmm1, %xmm2
; X86-SSE-NEXT:    psrlq %xmm1, %xmm0
; X86-SSE-NEXT:    pxor %xmm2, %xmm0
; X86-SSE-NEXT:    psubq %xmm2, %xmm0
; X86-SSE-NEXT:    retl
  %vec = insertelement <2 x i32> poison, i32 %a1, i64 0
  %splat = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
  %zext = zext <2 x i32> %splat to <2 x i64>
  %ashr = ashr <2 x i64> %a0, %zext
  ret <2 x i64> %ashr
}