xref: /llvm-project/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll (revision 5c181a9191bfb758575329ff7eb8db4fc46ffac9)
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=AVX512,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512DQVL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VL,AVX512BWVL
;
; 32-bit runs to make sure we do reasonable things for i64 shifts.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=X86-AVX1
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2

;
; Variable Shifts
;
; Per-element (variable-count) lshr of <4 x i64>. AVX2+ has a native vpsrlvq;
; AVX1 must split into 128-bit halves and blend two scalar-count vpsrlq results;
; XOP uses vpshlq with negated counts.
define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm4
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X86-AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm2
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm3
; X86-AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlvq %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <4 x i64> %a, %b
  ret <4 x i64> %shift
}
; Per-element (variable-count) lshr of <8 x i32>. AVX2+ lowers to vpsrlvd;
; AVX1 extracts four scalar counts per 128-bit half and blends four vpsrld results.
define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshld %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: var_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: var_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpsrlq $32, %xmm3, %xmm5
; X86-AVX1-NEXT:    vpsrld %xmm5, %xmm2, %xmm5
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3]
; X86-AVX1-NEXT:    vpsrld %xmm6, %xmm2, %xmm6
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
; X86-AVX1-NEXT:    vpsrld %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7]
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7]
; X86-AVX1-NEXT:    vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; X86-AVX1-NEXT:    vpsrld %xmm3, %xmm0, %xmm3
; X86-AVX1-NEXT:    vpsrlq $32, %xmm1, %xmm4
; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; X86-AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
; X86-AVX1-NEXT:    vpsrld %xmm4, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7]
; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <8 x i32> %a, %b
  ret <8 x i32> %shift
}
; Per-element (variable-count) lshr of <16 x i16>. No native 16-bit variable
; shift before AVX-512BW: AVX1 uses a 4-step shift-and-blend ladder keyed off
; the count bits; AVX2 widens to i32 lanes for vpsrlvd; AVX512BW uses vpsrlvw.
define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_shift_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
; AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
; AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v16i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v16i16:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQVL-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQVL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQVL-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v16i16:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v16i16:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; X86-AVX1-NEXT:    vpsllw $12, %xmm2, %xmm3
; X86-AVX1-NEXT:    vpsllw $4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpor %xmm3, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpaddw %xmm2, %xmm2, %xmm3
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; X86-AVX1-NEXT:    vpsrlw $8, %xmm4, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm2, %xmm5, %xmm4, %xmm2
; X86-AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm4
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm4, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsllw $12, %xmm1, %xmm3
; X86-AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpaddw %xmm1, %xmm1, %xmm3
; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm4, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm1
; X86-AVX1-NEXT:    vpaddw %xmm3, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpblendvb %xmm3, %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v16i16:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; X86-AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15]
; X86-AVX2-NEXT:    vpsrlvd %ymm3, %ymm4, %ymm3
; X86-AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; X86-AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11]
; X86-AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <16 x i16> %a, %b
  ret <16 x i16> %shift
}
; Per-element (variable-count) lshr of <32 x i8>. No native byte shifts on x86:
; lowering uses a 4/2/1-bit shift-and-mask ladder selected by vpblendvb on the
; vpsllw $5'd count bits; AVX512BW widens to i16 lanes for vpsrlvw; XOP has vpshlb.
define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_shift_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $2, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_shift_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: var_shift_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_shift_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm4
; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm4, %xmm2
; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm3, %xmm1
; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512DQ-LABEL: var_shift_v32i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512DQVL-LABEL: var_shift_v32i8:
; AVX512DQVL:       # %bb.0:
; AVX512DQVL-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrlw $2, %ymm0, %ymm2
; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512DQVL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    vpsrlw $1, %ymm0, %ymm2
; AVX512DQVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm2
; AVX512DQVL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQVL-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512DQVL-NEXT:    retq
;
; AVX512BWVL-LABEL: var_shift_v32i8:
; AVX512BWVL:       # %bb.0:
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
; AVX512BWVL-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BWVL-NEXT:    retq
;
; X86-AVX1-LABEL: var_shift_v32i8:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; X86-AVX1-NEXT:    vpsrlw $4, %xmm3, %xmm4
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; X86-AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm4
; X86-AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; X86-AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw $2, %xmm3, %xmm4
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; X86-AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw $1, %xmm3, %xmm4
; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; X86-AVX1-NEXT:    vpand %xmm7, %xmm4, %xmm4
; X86-AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; X86-AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm3, %xmm3
; X86-AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm4
; X86-AVX1-NEXT:    vpand %xmm2, %xmm4, %xmm2
; X86-AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $2, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpand %xmm6, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm2
; X86-AVX1-NEXT:    vpand %xmm7, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; X86-AVX1-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: var_shift_v32i8:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrlw $2, %ymm0, %ymm2
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm2
; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2
; X86-AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; X86-AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %shift = lshr <32 x i8> %a, %b
  ret <32 x i8> %shift
}

;
; Uniform Variable Shifts
;

; lshr of <4 x i64> by a splatted (uniform) count: all targets can use the
; scalar-count form vpsrlq, taking the count from the low qword of %xmm1.
define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v4i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v4i64:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v4i64:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <4 x i64> %b, <4 x i64> poison, <4 x i32> zeroinitializer
  %shift = lshr <4 x i64> %a, %splat
  ret <4 x i64> %shift
}
; lshr of <8 x i32> by a splatted (uniform) count: the count's low dword is
; zero-extended to a qword (vpmovzxdq) and fed to the scalar-count form vpsrld.
define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_shift_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_shift_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_shift_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_shift_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; XOPAVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_shift_v8i32:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_shift_v8i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; X86-AVX1-LABEL: splatvar_shift_v8i32:
; X86-AVX1:       # %bb.0:
; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; X86-AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; X86-AVX1-NEXT:    retl
;
; X86-AVX2-LABEL: splatvar_shift_v8i32:
; X86-AVX2:       # %bb.0:
; X86-AVX2-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; X86-AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
; X86-AVX2-NEXT:    retl
  %splat = shufflevector <8 x i32> %b, <8 x i32> poison, <8 x i32> zeroinitializer
  %shift = lshr <8 x i32> %a, %splat
  ret <8 x i32> %shift
}
641
642define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
643; AVX1-LABEL: splatvar_shift_v16i16:
644; AVX1:       # %bb.0:
645; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
646; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
647; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
648; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
649; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
650; AVX1-NEXT:    retq
651;
652; AVX2-LABEL: splatvar_shift_v16i16:
653; AVX2:       # %bb.0:
654; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
655; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
656; AVX2-NEXT:    retq
657;
658; XOPAVX1-LABEL: splatvar_shift_v16i16:
659; XOPAVX1:       # %bb.0:
660; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
661; XOPAVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
662; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
663; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
664; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
665; XOPAVX1-NEXT:    retq
666;
667; XOPAVX2-LABEL: splatvar_shift_v16i16:
668; XOPAVX2:       # %bb.0:
669; XOPAVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
670; XOPAVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
671; XOPAVX2-NEXT:    retq
672;
673; AVX512-LABEL: splatvar_shift_v16i16:
674; AVX512:       # %bb.0:
675; AVX512-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
676; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
677; AVX512-NEXT:    retq
678;
679; AVX512VL-LABEL: splatvar_shift_v16i16:
680; AVX512VL:       # %bb.0:
681; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
682; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
683; AVX512VL-NEXT:    retq
684;
685; X86-AVX1-LABEL: splatvar_shift_v16i16:
686; X86-AVX1:       # %bb.0:
687; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
688; X86-AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
689; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
690; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
691; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
692; X86-AVX1-NEXT:    retl
693;
694; X86-AVX2-LABEL: splatvar_shift_v16i16:
695; X86-AVX2:       # %bb.0:
696; X86-AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
697; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
698; X86-AVX2-NEXT:    retl
  ; IR under test: splat lane 0 of %b across all 16 lanes, then lshr each i16 lane of %a by it.
  ; CHECK lines above are autogenerated by update_llc_test_checks.py -- do not edit by hand.
699  %splat = shufflevector <16 x i16> %b, <16 x i16> poison, <16 x i32> zeroinitializer
700  %shift = lshr <16 x i16> %a, %splat
701  ret <16 x i16> %shift
702}
703
704define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
705; AVX1-LABEL: splatvar_shift_v32i8:
706; AVX1:       # %bb.0:
707; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
708; AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
709; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
710; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
711; AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
712; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
713; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
714; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
715; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
716; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
717; AVX1-NEXT:    retq
718;
719; AVX2-LABEL: splatvar_shift_v32i8:
720; AVX2:       # %bb.0:
721; AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
722; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
723; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
724; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
725; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
726; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
727; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
728; AVX2-NEXT:    retq
729;
730; XOPAVX1-LABEL: splatvar_shift_v32i8:
731; XOPAVX1:       # %bb.0:
732; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
733; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
734; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
735; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
736; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
737; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
738; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
739; XOPAVX1-NEXT:    retq
740;
741; XOPAVX2-LABEL: splatvar_shift_v32i8:
742; XOPAVX2:       # %bb.0:
743; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
744; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
745; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
746; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
747; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
748; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
749; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
750; XOPAVX2-NEXT:    retq
751;
752; AVX512DQ-LABEL: splatvar_shift_v32i8:
753; AVX512DQ:       # %bb.0:
754; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
755; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
756; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
757; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
758; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
759; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
760; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
761; AVX512DQ-NEXT:    retq
762;
763; AVX512BW-LABEL: splatvar_shift_v32i8:
764; AVX512BW:       # %bb.0:
765; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
766; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
767; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
768; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
769; AVX512BW-NEXT:    retq
770;
771; AVX512DQVL-LABEL: splatvar_shift_v32i8:
772; AVX512DQVL:       # %bb.0:
773; AVX512DQVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
774; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
775; AVX512DQVL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
776; AVX512DQVL-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
777; AVX512DQVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
778; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %ymm1
779; AVX512DQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
780; AVX512DQVL-NEXT:    retq
781;
782; AVX512BWVL-LABEL: splatvar_shift_v32i8:
783; AVX512BWVL:       # %bb.0:
784; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
785; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
786; AVX512BWVL-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
787; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
788; AVX512BWVL-NEXT:    retq
789;
790; X86-AVX1-LABEL: splatvar_shift_v32i8:
791; X86-AVX1:       # %bb.0:
792; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
793; X86-AVX1-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
794; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
795; X86-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
796; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
797; X86-AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
798; X86-AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
799; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
800; X86-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
801; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
802; X86-AVX1-NEXT:    retl
803;
804; X86-AVX2-LABEL: splatvar_shift_v32i8:
805; X86-AVX2:       # %bb.0:
806; X86-AVX2-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
807; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
808; X86-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
809; X86-AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
810; X86-AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
811; X86-AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
812; X86-AVX2-NEXT:    retl
813; X86-AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
814  %splat = shufflevector <32 x i8> %b, <32 x i8> poison, <32 x i32> zeroinitializer
815  %shift = lshr <32 x i8> %a, %splat
816  ret <32 x i8> %shift
817}
818
819;
820; Uniform Variable Modulo Shifts
821;
822
823define <4 x i64> @splatvar_modulo_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
824; AVX1-LABEL: splatvar_modulo_shift_v4i64:
825; AVX1:       # %bb.0:
826; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
827; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
828; AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
829; AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
830; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
831; AVX1-NEXT:    retq
832;
833; AVX2-LABEL: splatvar_modulo_shift_v4i64:
834; AVX2:       # %bb.0:
835; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
836; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
837; AVX2-NEXT:    retq
838;
839; XOPAVX1-LABEL: splatvar_modulo_shift_v4i64:
840; XOPAVX1:       # %bb.0:
841; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
842; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
843; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
844; XOPAVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
845; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
846; XOPAVX1-NEXT:    retq
847;
848; XOPAVX2-LABEL: splatvar_modulo_shift_v4i64:
849; XOPAVX2:       # %bb.0:
850; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
851; XOPAVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
852; XOPAVX2-NEXT:    retq
853;
854; AVX512-LABEL: splatvar_modulo_shift_v4i64:
855; AVX512:       # %bb.0:
856; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
857; AVX512-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
858; AVX512-NEXT:    retq
859;
860; AVX512VL-LABEL: splatvar_modulo_shift_v4i64:
861; AVX512VL:       # %bb.0:
862; AVX512VL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm1
863; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
864; AVX512VL-NEXT:    retq
865;
866; X86-AVX1-LABEL: splatvar_modulo_shift_v4i64:
867; X86-AVX1:       # %bb.0:
868; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
869; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
870; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm2, %xmm2
871; X86-AVX1-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0
872; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
873; X86-AVX1-NEXT:    retl
874;
875; X86-AVX2-LABEL: splatvar_modulo_shift_v4i64:
876; X86-AVX2:       # %bb.0:
877; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
878; X86-AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
879; X86-AVX2-NEXT:    retl
  ; IR under test: mask %b to the legal shift range (& 63), splat lane 0, lshr each i64 lane.
  ; CHECK lines above are autogenerated by update_llc_test_checks.py -- do not edit by hand.
880  %mod = and <4 x i64> %b, <i64 63, i64 63, i64 63, i64 63>
881  %splat = shufflevector <4 x i64> %mod, <4 x i64> poison, <4 x i32> zeroinitializer
882  %shift = lshr <4 x i64> %a, %splat
883  ret <4 x i64> %shift
884}
885
886define <8 x i32> @splatvar_modulo_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
887; AVX1-LABEL: splatvar_modulo_shift_v8i32:
888; AVX1:       # %bb.0:
889; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
890; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
891; AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
892; AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
893; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
894; AVX1-NEXT:    retq
895;
896; AVX2-LABEL: splatvar_modulo_shift_v8i32:
897; AVX2:       # %bb.0:
898; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
899; AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
900; AVX2-NEXT:    retq
901;
902; XOPAVX1-LABEL: splatvar_modulo_shift_v8i32:
903; XOPAVX1:       # %bb.0:
904; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
905; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
906; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
907; XOPAVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
908; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
909; XOPAVX1-NEXT:    retq
910;
911; XOPAVX2-LABEL: splatvar_modulo_shift_v8i32:
912; XOPAVX2:       # %bb.0:
913; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
914; XOPAVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
915; XOPAVX2-NEXT:    retq
916;
917; AVX512-LABEL: splatvar_modulo_shift_v8i32:
918; AVX512:       # %bb.0:
919; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
920; AVX512-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
921; AVX512-NEXT:    retq
922;
923; AVX512VL-LABEL: splatvar_modulo_shift_v8i32:
924; AVX512VL:       # %bb.0:
925; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
926; AVX512VL-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
927; AVX512VL-NEXT:    retq
928;
929; X86-AVX1-LABEL: splatvar_modulo_shift_v8i32:
930; X86-AVX1:       # %bb.0:
931; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
932; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
933; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm2, %xmm2
934; X86-AVX1-NEXT:    vpsrld %xmm1, %xmm0, %xmm0
935; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
936; X86-AVX1-NEXT:    retl
937;
938; X86-AVX2-LABEL: splatvar_modulo_shift_v8i32:
939; X86-AVX2:       # %bb.0:
940; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
941; X86-AVX2-NEXT:    vpsrld %xmm1, %ymm0, %ymm0
942; X86-AVX2-NEXT:    retl
  ; IR under test: mask %b to the legal shift range (& 31), splat lane 0, lshr each i32 lane.
  ; CHECK lines above are autogenerated by update_llc_test_checks.py -- do not edit by hand.
943  %mod = and <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
944  %splat = shufflevector <8 x i32> %mod, <8 x i32> poison, <8 x i32> zeroinitializer
945  %shift = lshr <8 x i32> %a, %splat
946  ret <8 x i32> %shift
947}
948
949define <16 x i16> @splatvar_modulo_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
950; AVX1-LABEL: splatvar_modulo_shift_v16i16:
951; AVX1:       # %bb.0:
952; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
953; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
954; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
955; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
956; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
957; AVX1-NEXT:    retq
958;
959; AVX2-LABEL: splatvar_modulo_shift_v16i16:
960; AVX2:       # %bb.0:
961; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
962; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
963; AVX2-NEXT:    retq
964;
965; XOPAVX1-LABEL: splatvar_modulo_shift_v16i16:
966; XOPAVX1:       # %bb.0:
967; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
968; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
969; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
970; XOPAVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
971; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
972; XOPAVX1-NEXT:    retq
973;
974; XOPAVX2-LABEL: splatvar_modulo_shift_v16i16:
975; XOPAVX2:       # %bb.0:
976; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
977; XOPAVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
978; XOPAVX2-NEXT:    retq
979;
980; AVX512-LABEL: splatvar_modulo_shift_v16i16:
981; AVX512:       # %bb.0:
982; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
983; AVX512-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
984; AVX512-NEXT:    retq
985;
986; AVX512VL-LABEL: splatvar_modulo_shift_v16i16:
987; AVX512VL:       # %bb.0:
988; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
989; AVX512VL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
990; AVX512VL-NEXT:    retq
991;
992; X86-AVX1-LABEL: splatvar_modulo_shift_v16i16:
993; X86-AVX1:       # %bb.0:
994; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
995; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
996; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
997; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
998; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
999; X86-AVX1-NEXT:    retl
1000;
1001; X86-AVX2-LABEL: splatvar_modulo_shift_v16i16:
1002; X86-AVX2:       # %bb.0:
1003; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1004; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
1005; X86-AVX2-NEXT:    retl
  ; IR under test: mask %b to the legal shift range (& 15), splat lane 0, lshr each i16 lane.
  ; CHECK lines above are autogenerated by update_llc_test_checks.py -- do not edit by hand.
1006  %mod = and <16 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
1007  %splat = shufflevector <16 x i16> %mod, <16 x i16> poison, <16 x i32> zeroinitializer
1008  %shift = lshr <16 x i16> %a, %splat
1009  ret <16 x i16> %shift
1010}
1011
1012define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
1013; AVX1-LABEL: splatvar_modulo_shift_v32i8:
1014; AVX1:       # %bb.0:
1015; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1016; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1017; AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1018; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
1019; AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
1020; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1021; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1022; AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1023; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1024; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1025; AVX1-NEXT:    retq
1026;
1027; AVX2-LABEL: splatvar_modulo_shift_v32i8:
1028; AVX2:       # %bb.0:
1029; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1030; AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
1031; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1032; AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1033; AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
1034; AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
1035; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1036; AVX2-NEXT:    retq
1037;
1038; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8:
1039; XOPAVX1:       # %bb.0:
1040; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1041; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
1042; XOPAVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1043; XOPAVX1-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1044; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1045; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
1046; XOPAVX1-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
1047; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1048; XOPAVX1-NEXT:    retq
1049;
1050; XOPAVX2-LABEL: splatvar_modulo_shift_v32i8:
1051; XOPAVX2:       # %bb.0:
1052; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
1053; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1054; XOPAVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1055; XOPAVX2-NEXT:    vpsubb %xmm1, %xmm2, %xmm1
1056; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
1057; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm2, %xmm2
1058; XOPAVX2-NEXT:    vpshlb %xmm1, %xmm0, %xmm0
1059; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
1060; XOPAVX2-NEXT:    retq
1061;
1062; AVX512DQ-LABEL: splatvar_modulo_shift_v32i8:
1063; AVX512DQ:       # %bb.0:
1064; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1065; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
1066; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1067; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1068; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
1069; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
1070; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
1071; AVX512DQ-NEXT:    retq
1072;
1073; AVX512BW-LABEL: splatvar_modulo_shift_v32i8:
1074; AVX512BW:       # %bb.0:
1075; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1076; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1077; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1078; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
1079; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1080; AVX512BW-NEXT:    retq
1081;
1082; AVX512DQVL-LABEL: splatvar_modulo_shift_v32i8:
1083; AVX512DQVL:       # %bb.0:
1084; AVX512DQVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1085; AVX512DQVL-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
1086; AVX512DQVL-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1087; AVX512DQVL-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1088; AVX512DQVL-NEXT:    vpsrlw $8, %xmm1, %xmm1
1089; AVX512DQVL-NEXT:    vpbroadcastb %xmm1, %ymm1
1090; AVX512DQVL-NEXT:    vpand %ymm1, %ymm0, %ymm0
1091; AVX512DQVL-NEXT:    retq
1092;
1093; AVX512BWVL-LABEL: splatvar_modulo_shift_v32i8:
1094; AVX512BWVL:       # %bb.0:
1095; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1096; AVX512BWVL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
1097; AVX512BWVL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
1098; AVX512BWVL-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
1099; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
1100; AVX512BWVL-NEXT:    retq
1101;
1102; X86-AVX1-LABEL: splatvar_modulo_shift_v32i8:
1103; X86-AVX1:       # %bb.0:
1104; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
1105; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1106; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm2, %xmm2
1107; X86-AVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
1108; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm3, %xmm3
1109; X86-AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
1110; X86-AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
1111; X86-AVX1-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0
1112; X86-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1113; X86-AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
1114; X86-AVX1-NEXT:    retl
1115;
1116; X86-AVX2-LABEL: splatvar_modulo_shift_v32i8:
1117; X86-AVX2:       # %bb.0:
1118; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1119; X86-AVX2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
1120; X86-AVX2-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
1121; X86-AVX2-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
1122; X86-AVX2-NEXT:    vpsrlw $8, %xmm1, %xmm1
1123; X86-AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
1124; X86-AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
1125; X86-AVX2-NEXT:    retl
  ; IR under test: mask %b to the legal shift range (& 7), splat lane 0, lshr each i8 lane.
  ; CHECK lines above are autogenerated by update_llc_test_checks.py -- do not edit by hand.
1126  %mod = and <32 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
1127  %splat = shufflevector <32 x i8> %mod, <32 x i8> poison, <32 x i32> zeroinitializer
1128  %shift = lshr <32 x i8> %a, %splat
1129  ret <32 x i8> %shift
1130}
1131
1132;
1133; Constant Shifts
1134;
1135
1136define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind {
1137; AVX1-LABEL: constant_shift_v4i64:
1138; AVX1:       # %bb.0:
1139; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1140; AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
1141; AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
1142; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1143; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
1144; AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
1145; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1146; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1147; AVX1-NEXT:    retq
1148;
1149; AVX2-LABEL: constant_shift_v4i64:
1150; AVX2:       # %bb.0:
1151; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1152; AVX2-NEXT:    retq
1153;
1154; XOPAVX1-LABEL: constant_shift_v4i64:
1155; XOPAVX1:       # %bb.0:
1156; XOPAVX1-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1157; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1158; XOPAVX1-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1159; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1160; XOPAVX1-NEXT:    retq
1161;
1162; XOPAVX2-LABEL: constant_shift_v4i64:
1163; XOPAVX2:       # %bb.0:
1164; XOPAVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1165; XOPAVX2-NEXT:    retq
1166;
1167; AVX512-LABEL: constant_shift_v4i64:
1168; AVX512:       # %bb.0:
1169; AVX512-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1170; AVX512-NEXT:    retq
1171;
1172; AVX512VL-LABEL: constant_shift_v4i64:
1173; AVX512VL:       # %bb.0:
1174; AVX512VL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1175; AVX512VL-NEXT:    retq
1176;
1177; X86-AVX1-LABEL: constant_shift_v4i64:
1178; X86-AVX1:       # %bb.0:
1179; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1180; X86-AVX1-NEXT:    vpsrlq $62, %xmm1, %xmm2
1181; X86-AVX1-NEXT:    vpsrlq $31, %xmm1, %xmm1
1182; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1183; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm2
1184; X86-AVX1-NEXT:    vpsrlq $1, %xmm0, %xmm0
1185; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1186; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1187; X86-AVX1-NEXT:    retl
1188;
1189; X86-AVX2-LABEL: constant_shift_v4i64:
1190; X86-AVX2:       # %bb.0:
1191; X86-AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1192; X86-AVX2-NEXT:    retl
  ; IR under test: lshr with a non-uniform constant shift-amount vector <1,7,31,62>.
  ; CHECK lines above are autogenerated by update_llc_test_checks.py -- do not edit by hand.
1193  %shift = lshr <4 x i64> %a, <i64 1, i64 7, i64 31, i64 62>
1194  ret <4 x i64> %shift
1195}
1196
1197define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind {
1198; AVX1-LABEL: constant_shift_v8i32:
1199; AVX1:       # %bb.0:
1200; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
1201; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
1202; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1203; AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
1204; AVX1-NEXT:    vpsrld $4, %xmm0, %xmm3
1205; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1206; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1207; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1208; AVX1-NEXT:    vpsrld $7, %xmm0, %xmm2
1209; AVX1-NEXT:    vpsrld $9, %xmm0, %xmm3
1210; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1211; AVX1-NEXT:    vpsrld $8, %xmm0, %xmm0
1212; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1213; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1214; AVX1-NEXT:    retq
1215;
1216; AVX2-LABEL: constant_shift_v8i32:
1217; AVX2:       # %bb.0:
1218; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1219; AVX2-NEXT:    retq
1220;
1221; XOPAVX1-LABEL: constant_shift_v8i32:
1222; XOPAVX1:       # %bb.0:
1223; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1224; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1225; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1226; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1227; XOPAVX1-NEXT:    retq
1228;
1229; XOPAVX2-LABEL: constant_shift_v8i32:
1230; XOPAVX2:       # %bb.0:
1231; XOPAVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1232; XOPAVX2-NEXT:    retq
1233;
1234; AVX512-LABEL: constant_shift_v8i32:
1235; AVX512:       # %bb.0:
1236; AVX512-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1237; AVX512-NEXT:    retq
1238;
1239; AVX512VL-LABEL: constant_shift_v8i32:
1240; AVX512VL:       # %bb.0:
1241; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1242; AVX512VL-NEXT:    retq
1243;
1244; X86-AVX1-LABEL: constant_shift_v8i32:
1245; X86-AVX1:       # %bb.0:
1246; X86-AVX1-NEXT:    vpsrld $7, %xmm0, %xmm1
1247; X86-AVX1-NEXT:    vpsrld $5, %xmm0, %xmm2
1248; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1249; X86-AVX1-NEXT:    vpsrld $6, %xmm0, %xmm2
1250; X86-AVX1-NEXT:    vpsrld $4, %xmm0, %xmm3
1251; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1252; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1253; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1254; X86-AVX1-NEXT:    vpsrld $7, %xmm0, %xmm2
1255; X86-AVX1-NEXT:    vpsrld $9, %xmm0, %xmm3
1256; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1257; X86-AVX1-NEXT:    vpsrld $8, %xmm0, %xmm0
1258; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
1259; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1260; X86-AVX1-NEXT:    retl
1261;
1262; X86-AVX2-LABEL: constant_shift_v8i32:
1263; X86-AVX2:       # %bb.0:
1264; X86-AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1265; X86-AVX2-NEXT:    retl
  ; IR under test: lshr with a non-uniform constant shift-amount vector <4,5,6,7,8,9,8,7>.
  ; CHECK lines above are autogenerated by update_llc_test_checks.py -- do not edit by hand.
1266  %shift = lshr <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
1267  ret <8 x i32> %shift
1268}
1269
1270define <16 x i16> @constant_shift_v16i16_pairs(<16 x i16> %a) nounwind {
1271; AVX1-LABEL: constant_shift_v16i16_pairs:
1272; AVX1:       # %bb.0:
1273; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,u,32768,32768,16384,16384,8192,8192]
1274; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1275; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1276; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [4096,4096,2048,2048,1024,1024,512,512]
1277; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1278; AVX1-NEXT:    retq
1279;
1280; AVX2-LABEL: constant_shift_v16i16_pairs:
1281; AVX2:       # %bb.0:
1282; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1283; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1284; AVX2-NEXT:    retq
1285;
1286; XOPAVX1-LABEL: constant_shift_v16i16_pairs:
1287; XOPAVX1:       # %bb.0:
1288; XOPAVX1-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1289; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1290; XOPAVX1-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1291; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1292; XOPAVX1-NEXT:    retq
1293;
1294; XOPAVX2-LABEL: constant_shift_v16i16_pairs:
1295; XOPAVX2:       # %bb.0:
1296; XOPAVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,u,32768,32768,16384,16384,8192,8192,4096,4096,2048,2048,1024,1024,512,512]
1297; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
1298; XOPAVX2-NEXT:    retq
1299;
1300; AVX512DQ-LABEL: constant_shift_v16i16_pairs:
1301; AVX512DQ:       # %bb.0:
1302; AVX512DQ-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1303; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1304; AVX512DQ-NEXT:    retq
1305;
1306; AVX512BW-LABEL: constant_shift_v16i16_pairs:
1307; AVX512BW:       # %bb.0:
1308; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1309; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1310; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
1311; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1312; AVX512BW-NEXT:    retq
1313;
1314; AVX512DQVL-LABEL: constant_shift_v16i16_pairs:
1315; AVX512DQVL:       # %bb.0:
1316; AVX512DQVL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1317; AVX512DQVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1318; AVX512DQVL-NEXT:    retq
1319;
1320; AVX512BWVL-LABEL: constant_shift_v16i16_pairs:
1321; AVX512BWVL:       # %bb.0:
1322; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1323; AVX512BWVL-NEXT:    retq
1324;
1325; X86-AVX1-LABEL: constant_shift_v16i16_pairs:
1326; X86-AVX1:       # %bb.0:
1327; X86-AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,u,32768,32768,16384,16384,8192,8192]
1328; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7]
1329; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1330; X86-AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [4096,4096,2048,2048,1024,1024,512,512]
1331; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1332; X86-AVX1-NEXT:    retl
1333;
1334; X86-AVX2-LABEL: constant_shift_v16i16_pairs:
1335; X86-AVX2:       # %bb.0:
1336; X86-AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1337; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1338; X86-AVX2-NEXT:    retl
  ; IR under test: constant shift amounts repeated in adjacent-lane pairs (0,0,1,1,...,7,7),
  ; so codegen can treat each i16 pair as a wider element.
  ; CHECK lines above are autogenerated by update_llc_test_checks.py -- do not edit by hand.
1339  %shift = lshr <16 x i16> %a, <i16 0, i16 0, i16 1, i16 1, i16 2, i16 2, i16 3, i16 3, i16 4, i16 4, i16 5, i16 5, i16 6, i16 6, i16 7, i16 7>
1340  ret <16 x i16> %shift
1341}
1342
; Non-uniform per-lane lshr of <16 x i16> by the constants 0..15.
; Without AVX512BW there is no variable word shift, so lowering uses an
; unsigned multiply-high (vpmulhuw) by power-of-two constants; lane 0
; (shift-by-0 would need a 2^16 multiplier, shown as 'u') is blended back
; from the source. AVX512BW/BWVL use the native variable shift vpsrlvw.
1343define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind {
1344; AVX1-LABEL: constant_shift_v16i16:
1345; AVX1:       # %bb.0:
1346; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512]
1347; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1348; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1349; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
1350; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1351; AVX1-NEXT:    retq
1352;
1353; AVX2-LABEL: constant_shift_v16i16:
1354; AVX2:       # %bb.0:
1355; AVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
1356; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1357; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1358; AVX2-NEXT:    retq
1359;
1360; XOPAVX1-LABEL: constant_shift_v16i16:
1361; XOPAVX1:       # %bb.0:
1362; XOPAVX1-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1363; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1364; XOPAVX1-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1365; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1366; XOPAVX1-NEXT:    retq
1367;
1368; XOPAVX2-LABEL: constant_shift_v16i16:
1369; XOPAVX2:       # %bb.0:
1370; XOPAVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
1371; XOPAVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1372; XOPAVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1373; XOPAVX2-NEXT:    retq
1374;
1375; AVX512DQ-LABEL: constant_shift_v16i16:
1376; AVX512DQ:       # %bb.0:
1377; AVX512DQ-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
1378; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1379; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1380; AVX512DQ-NEXT:    retq
1381;
1382; AVX512BW-LABEL: constant_shift_v16i16:
1383; AVX512BW:       # %bb.0:
1384; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1385; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
1386; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
1387; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
1388; AVX512BW-NEXT:    retq
1389;
1390; AVX512DQVL-LABEL: constant_shift_v16i16:
1391; AVX512DQVL:       # %bb.0:
1392; AVX512DQVL-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
1393; AVX512DQVL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1394; AVX512DQVL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1395; AVX512DQVL-NEXT:    retq
1396;
1397; AVX512BWVL-LABEL: constant_shift_v16i16:
1398; AVX512BWVL:       # %bb.0:
1399; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1400; AVX512BWVL-NEXT:    retq
1401;
1402; X86-AVX1-LABEL: constant_shift_v16i16:
1403; X86-AVX1:       # %bb.0:
1404; X86-AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [u,32768,16384,8192,4096,2048,1024,512]
1405; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1406; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1407; X86-AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [256,128,64,32,16,8,4,2]
1408; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1409; X86-AVX1-NEXT:    retl
1410;
1411; X86-AVX2-LABEL: constant_shift_v16i16:
1412; X86-AVX2:       # %bb.0:
1413; X86-AVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 # [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
1414; X86-AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
1415; X86-AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
1416; X86-AVX2-NEXT:    retl
1417  %shift = lshr <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
1418  ret <16 x i16> %shift
1419}
1420
; Per-byte lshr where each adjacent byte pair shares the same shift amount
; ([7,7,2,2,4,4,6,6,1,1,2,2,3,3,4,4] per 128-bit half), so lowering can work
; at i16 granularity: vpmulhuw by powers of two, then vpand to clear the
; bits that crossed a byte boundary. XOP targets instead use vpshlb with
; negated (i.e. right) per-byte shift counts; AVX512BW(VL) use vpsrlvw + mask.
1421define <32 x i8> @constant_shift_v32i8_pairs(<32 x i8> %a) nounwind {
1422; AVX1-LABEL: constant_shift_v32i8_pairs:
1423; AVX1:       # %bb.0:
1424; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1425; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [512,16384,4096,1024,32768,16384,8192,4096]
1426; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm1
1427; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15]
1428; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1429; AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm0
1430; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1431; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1432; AVX1-NEXT:    retq
1433;
1434; AVX2-LABEL: constant_shift_v32i8_pairs:
1435; AVX2:       # %bb.0:
1436; AVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
1437; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1438; AVX2-NEXT:    retq
1439;
1440; XOPAVX1-LABEL: constant_shift_v32i8_pairs:
1441; XOPAVX1:       # %bb.0:
1442; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1443; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252]
1444; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
1445; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
1446; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1447; XOPAVX1-NEXT:    retq
1448;
1449; XOPAVX2-LABEL: constant_shift_v32i8_pairs:
1450; XOPAVX2:       # %bb.0:
1451; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1452; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [249,249,254,254,252,252,250,250,255,255,254,254,253,253,252,252]
1453; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
1454; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
1455; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1456; XOPAVX2-NEXT:    retq
1457;
1458; AVX512DQ-LABEL: constant_shift_v32i8_pairs:
1459; AVX512DQ:       # %bb.0:
1460; AVX512DQ-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
1461; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1462; AVX512DQ-NEXT:    retq
1463;
1464; AVX512BW-LABEL: constant_shift_v32i8_pairs:
1465; AVX512BW:       # %bb.0:
1466; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
1467; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [7,2,4,6,1,2,3,4,7,2,4,6,1,2,3,4]
1468; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
1469; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
1470; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1471; AVX512BW-NEXT:    retq
1472;
1473; AVX512DQVL-LABEL: constant_shift_v32i8_pairs:
1474; AVX512DQVL:       # %bb.0:
1475; AVX512DQVL-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
1476; AVX512DQVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1477; AVX512DQVL-NEXT:    retq
1478;
1479; AVX512BWVL-LABEL: constant_shift_v32i8_pairs:
1480; AVX512BWVL:       # %bb.0:
1481; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1482; AVX512BWVL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1483; AVX512BWVL-NEXT:    retq
1484;
1485; X86-AVX1-LABEL: constant_shift_v32i8_pairs:
1486; X86-AVX1:       # %bb.0:
1487; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1488; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [512,16384,4096,1024,32768,16384,8192,4096]
1489; X86-AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm1
1490; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15]
1491; X86-AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
1492; X86-AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm0
1493; X86-AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
1494; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1495; X86-AVX1-NEXT:    retl
1496;
1497; X86-AVX2-LABEL: constant_shift_v32i8_pairs:
1498; X86-AVX2:       # %bb.0:
1499; X86-AVX2-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
1500; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1501; X86-AVX2-NEXT:    retl
1502  %shift = lshr <32 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
1503  ret <32 x i8> %shift
1504}
1505
; Per-byte lshr where each group of four bytes shares a shift amount
; ([3,3,3,3,2,2,2,2,1,1,1,1,0,0,0,0] then [7..4] quads), so most targets
; lower at i32 granularity with vpsrlvd followed by vpand to clear bits
; shifted across byte boundaries. AVX1 (no vpsrlvd) uses vpmulhuw by powers
; of two at i16 granularity plus a blend for the shift-by-0 quad ('u' lanes);
; XOP uses the native per-byte vpshlb.
1506define <32 x i8> @constant_shift_v32i8_quads(<32 x i8> %a) nounwind {
1507; AVX1-LABEL: constant_shift_v32i8_quads:
1508; AVX1:       # %bb.0:
1509; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 # [8192,8192,16384,16384,32768,32768,u,u]
1510; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
1511; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1512; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1513; AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [512,512,1024,1024,2048,2048,4096,4096]
1514; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1515; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1516; AVX1-NEXT:    retq
1517;
1518; AVX2-LABEL: constant_shift_v32i8_quads:
1519; AVX2:       # %bb.0:
1520; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1521; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1522; AVX2-NEXT:    retq
1523;
1524; XOPAVX1-LABEL: constant_shift_v32i8_quads:
1525; XOPAVX1:       # %bb.0:
1526; XOPAVX1-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1527; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1528; XOPAVX1-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1529; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1530; XOPAVX1-NEXT:    retq
1531;
1532; XOPAVX2-LABEL: constant_shift_v32i8_quads:
1533; XOPAVX2:       # %bb.0:
1534; XOPAVX2-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1535; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
1536; XOPAVX2-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1537; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
1538; XOPAVX2-NEXT:    retq
1539;
1540; AVX512-LABEL: constant_shift_v32i8_quads:
1541; AVX512:       # %bb.0:
1542; AVX512-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1543; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1544; AVX512-NEXT:    retq
1545;
1546; AVX512VL-LABEL: constant_shift_v32i8_quads:
1547; AVX512VL:       # %bb.0:
1548; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1549; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1550; AVX512VL-NEXT:    retq
1551;
1552; X86-AVX1-LABEL: constant_shift_v32i8_quads:
1553; X86-AVX1:       # %bb.0:
1554; X86-AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 # [8192,8192,16384,16384,32768,32768,u,u]
1555; X86-AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
1556; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1
1557; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1558; X86-AVX1-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 # [512,512,1024,1024,2048,2048,4096,4096]
1559; X86-AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
1560; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1561; X86-AVX1-NEXT:    retl
1562;
1563; X86-AVX2-LABEL: constant_shift_v32i8_quads:
1564; X86-AVX2:       # %bb.0:
1565; X86-AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1566; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1567; X86-AVX2-NEXT:    retl
1568  %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 2, i8 2, i8 2, i8 2, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0, i8 7, i8 7, i8 7, i8 7, i8 6, i8 6, i8 6, i8 6, i8 5, i8 5, i8 5, i8 5, i8 4, i8 4, i8 4, i8 4>
1569  ret <32 x i8> %shift
1570}
1571
; Fully non-uniform per-byte lshr (no pair/quad structure). The generic
; lowering zero-extends bytes to words by unpacking against zero, multiplies
; each word by the per-lane power of two (so the shifted byte lands in the
; high half), shifts right by 8 and repacks with vpackuswb. XOP targets use
; vpshlb with negated per-byte counts; AVX512BW(VL) zero-extend the whole
; vector to 512-bit words, vpsrlvw, then truncate back with vpmovwb.
1572define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind {
1573; AVX1-LABEL: constant_shift_v32i8:
1574; AVX1:       # %bb.0:
1575; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1576; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1577; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1578; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256]
1579; AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
1580; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
1581; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1582; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2]
1583; AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
1584; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
1585; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
1586; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1587; AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
1588; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
1589; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1590; AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm0
1591; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1592; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1593; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1594; AVX1-NEXT:    retq
1595;
1596; AVX2-LABEL: constant_shift_v32i8:
1597; AVX2:       # %bb.0:
1598; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1599; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1600; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
1601; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
1602; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1603; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
1604; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1605; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
1606; AVX2-NEXT:    retq
1607;
1608; XOPAVX1-LABEL: constant_shift_v32i8:
1609; XOPAVX1:       # %bb.0:
1610; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1611; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
1612; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
1613; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
1614; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1615; XOPAVX1-NEXT:    retq
1616;
1617; XOPAVX2-LABEL: constant_shift_v32i8:
1618; XOPAVX2:       # %bb.0:
1619; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1620; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,255,254,253,252,251,250,249,249,250,251,252,253,254,255,0]
1621; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
1622; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
1623; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1624; XOPAVX2-NEXT:    retq
1625;
1626; AVX512DQ-LABEL: constant_shift_v32i8:
1627; AVX512DQ:       # %bb.0:
1628; AVX512DQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1629; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1630; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
1631; AVX512DQ-NEXT:    vpsrlw $8, %ymm2, %ymm2
1632; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1633; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
1634; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
1635; AVX512DQ-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
1636; AVX512DQ-NEXT:    retq
1637;
1638; AVX512BW-LABEL: constant_shift_v32i8:
1639; AVX512BW:       # %bb.0:
1640; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1641; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1642; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
1643; AVX512BW-NEXT:    retq
1644;
1645; AVX512DQVL-LABEL: constant_shift_v32i8:
1646; AVX512DQVL:       # %bb.0:
1647; AVX512DQVL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1648; AVX512DQVL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1649; AVX512DQVL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
1650; AVX512DQVL-NEXT:    vpsrlw $8, %ymm2, %ymm2
1651; AVX512DQVL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1652; AVX512DQVL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
1653; AVX512DQVL-NEXT:    vpsrlw $8, %ymm0, %ymm0
1654; AVX512DQVL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
1655; AVX512DQVL-NEXT:    retq
1656;
1657; AVX512BWVL-LABEL: constant_shift_v32i8:
1658; AVX512BWVL:       # %bb.0:
1659; AVX512BWVL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
1660; AVX512BWVL-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
1661; AVX512BWVL-NEXT:    vpmovwb %zmm0, %ymm0
1662; AVX512BWVL-NEXT:    retq
1663;
1664; X86-AVX1-LABEL: constant_shift_v32i8:
1665; X86-AVX1:       # %bb.0:
1666; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1667; X86-AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
1668; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
1669; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [2,4,8,16,32,64,128,256]
1670; X86-AVX1-NEXT:    vpmullw %xmm4, %xmm3, %xmm3
1671; X86-AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
1672; X86-AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
1673; X86-AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [256,128,64,32,16,8,4,2]
1674; X86-AVX1-NEXT:    vpmullw %xmm5, %xmm1, %xmm1
1675; X86-AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
1676; X86-AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
1677; X86-AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
1678; X86-AVX1-NEXT:    vpmullw %xmm4, %xmm2, %xmm2
1679; X86-AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
1680; X86-AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
1681; X86-AVX1-NEXT:    vpmullw %xmm5, %xmm0, %xmm0
1682; X86-AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
1683; X86-AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
1684; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1685; X86-AVX1-NEXT:    retl
1686;
1687; X86-AVX2-LABEL: constant_shift_v32i8:
1688; X86-AVX2:       # %bb.0:
1689; X86-AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1690; X86-AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31]
1691; X86-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm2, %ymm2 # [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
1692; X86-AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
1693; X86-AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23]
1694; X86-AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 # [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
1695; X86-AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
1696; X86-AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
1697; X86-AVX2-NEXT:    retl
1698  %shift = lshr <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
1699  ret <32 x i8> %shift
1700}
1701
1702;
1703; Uniform Constant Shifts
1704;
1705
; Uniform (splat) lshr of <4 x i64> by 7: a single vpsrlq immediate on
; AVX2/AVX512. AVX1 has no 256-bit integer shifts, so it splits the vector
; into two 128-bit halves and shifts each with vpsrlq.
1706define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
1707; AVX1-LABEL: splatconstant_shift_v4i64:
1708; AVX1:       # %bb.0:
1709; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
1710; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1711; AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
1712; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1713; AVX1-NEXT:    retq
1714;
1715; AVX2-LABEL: splatconstant_shift_v4i64:
1716; AVX2:       # %bb.0:
1717; AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
1718; AVX2-NEXT:    retq
1719;
1720; XOPAVX1-LABEL: splatconstant_shift_v4i64:
1721; XOPAVX1:       # %bb.0:
1722; XOPAVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
1723; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1724; XOPAVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
1725; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1726; XOPAVX1-NEXT:    retq
1727;
1728; XOPAVX2-LABEL: splatconstant_shift_v4i64:
1729; XOPAVX2:       # %bb.0:
1730; XOPAVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
1731; XOPAVX2-NEXT:    retq
1732;
1733; AVX512-LABEL: splatconstant_shift_v4i64:
1734; AVX512:       # %bb.0:
1735; AVX512-NEXT:    vpsrlq $7, %ymm0, %ymm0
1736; AVX512-NEXT:    retq
1737;
1738; AVX512VL-LABEL: splatconstant_shift_v4i64:
1739; AVX512VL:       # %bb.0:
1740; AVX512VL-NEXT:    vpsrlq $7, %ymm0, %ymm0
1741; AVX512VL-NEXT:    retq
1742;
1743; X86-AVX1-LABEL: splatconstant_shift_v4i64:
1744; X86-AVX1:       # %bb.0:
1745; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm1
1746; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1747; X86-AVX1-NEXT:    vpsrlq $7, %xmm0, %xmm0
1748; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1749; X86-AVX1-NEXT:    retl
1750;
1751; X86-AVX2-LABEL: splatconstant_shift_v4i64:
1752; X86-AVX2:       # %bb.0:
1753; X86-AVX2-NEXT:    vpsrlq $7, %ymm0, %ymm0
1754; X86-AVX2-NEXT:    retl
1755  %shift = lshr <4 x i64> %a, <i64 7, i64 7, i64 7, i64 7>
1756  ret <4 x i64> %shift
1757}
1758
; Uniform (splat) lshr of <8 x i32> by 5: a single vpsrld immediate on
; AVX2/AVX512; AVX1 splits into two 128-bit halves as usual.
1759define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind {
1760; AVX1-LABEL: splatconstant_shift_v8i32:
1761; AVX1:       # %bb.0:
1762; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
1763; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1764; AVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
1765; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1766; AVX1-NEXT:    retq
1767;
1768; AVX2-LABEL: splatconstant_shift_v8i32:
1769; AVX2:       # %bb.0:
1770; AVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
1771; AVX2-NEXT:    retq
1772;
1773; XOPAVX1-LABEL: splatconstant_shift_v8i32:
1774; XOPAVX1:       # %bb.0:
1775; XOPAVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
1776; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1777; XOPAVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
1778; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1779; XOPAVX1-NEXT:    retq
1780;
1781; XOPAVX2-LABEL: splatconstant_shift_v8i32:
1782; XOPAVX2:       # %bb.0:
1783; XOPAVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
1784; XOPAVX2-NEXT:    retq
1785;
1786; AVX512-LABEL: splatconstant_shift_v8i32:
1787; AVX512:       # %bb.0:
1788; AVX512-NEXT:    vpsrld $5, %ymm0, %ymm0
1789; AVX512-NEXT:    retq
1790;
1791; AVX512VL-LABEL: splatconstant_shift_v8i32:
1792; AVX512VL:       # %bb.0:
1793; AVX512VL-NEXT:    vpsrld $5, %ymm0, %ymm0
1794; AVX512VL-NEXT:    retq
1795;
1796; X86-AVX1-LABEL: splatconstant_shift_v8i32:
1797; X86-AVX1:       # %bb.0:
1798; X86-AVX1-NEXT:    vpsrld $5, %xmm0, %xmm1
1799; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1800; X86-AVX1-NEXT:    vpsrld $5, %xmm0, %xmm0
1801; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1802; X86-AVX1-NEXT:    retl
1803;
1804; X86-AVX2-LABEL: splatconstant_shift_v8i32:
1805; X86-AVX2:       # %bb.0:
1806; X86-AVX2-NEXT:    vpsrld $5, %ymm0, %ymm0
1807; X86-AVX2-NEXT:    retl
1808  %shift = lshr <8 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
1809  ret <8 x i32> %shift
1810}
1811
; Uniform (splat) lshr of <16 x i16> by 3: a single vpsrlw immediate on
; AVX2/AVX512; AVX1 splits into two 128-bit halves as usual.
1812define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind {
1813; AVX1-LABEL: splatconstant_shift_v16i16:
1814; AVX1:       # %bb.0:
1815; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
1816; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1817; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
1818; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1819; AVX1-NEXT:    retq
1820;
1821; AVX2-LABEL: splatconstant_shift_v16i16:
1822; AVX2:       # %bb.0:
1823; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
1824; AVX2-NEXT:    retq
1825;
1826; XOPAVX1-LABEL: splatconstant_shift_v16i16:
1827; XOPAVX1:       # %bb.0:
1828; XOPAVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
1829; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1830; XOPAVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
1831; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1832; XOPAVX1-NEXT:    retq
1833;
1834; XOPAVX2-LABEL: splatconstant_shift_v16i16:
1835; XOPAVX2:       # %bb.0:
1836; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
1837; XOPAVX2-NEXT:    retq
1838;
1839; AVX512-LABEL: splatconstant_shift_v16i16:
1840; AVX512:       # %bb.0:
1841; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
1842; AVX512-NEXT:    retq
1843;
1844; AVX512VL-LABEL: splatconstant_shift_v16i16:
1845; AVX512VL:       # %bb.0:
1846; AVX512VL-NEXT:    vpsrlw $3, %ymm0, %ymm0
1847; AVX512VL-NEXT:    retq
1848;
1849; X86-AVX1-LABEL: splatconstant_shift_v16i16:
1850; X86-AVX1:       # %bb.0:
1851; X86-AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm1
1852; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
1853; X86-AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
1854; X86-AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
1855; X86-AVX1-NEXT:    retl
1856;
1857; X86-AVX2-LABEL: splatconstant_shift_v16i16:
1858; X86-AVX2:       # %bb.0:
1859; X86-AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
1860; X86-AVX2-NEXT:    retl
1861  %shift = lshr <16 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
1862  ret <16 x i16> %shift
1863}
1864
; Uniform (splat) lshr of <32 x i8> by 3. x86 has no per-byte shift
; instruction, so the shift is done at word granularity (vpsrlw $3) and
; each byte is then masked with 31 (0x1F) to drop the bits pulled in from
; the neighbouring byte. XOPAVX1 instead uses vpshlb with a splat of -3
; (253, vpshlb treats negative counts as right shifts).
1865define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind {
1866; AVX1-LABEL: splatconstant_shift_v32i8:
1867; AVX1:       # %bb.0:
1868; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1869; AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
1870; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
1871; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
1872; AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
1873; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1874; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1875; AVX1-NEXT:    retq
1876;
1877; AVX2-LABEL: splatconstant_shift_v32i8:
1878; AVX2:       # %bb.0:
1879; AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
1880; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1881; AVX2-NEXT:    retq
1882;
1883; XOPAVX1-LABEL: splatconstant_shift_v32i8:
1884; XOPAVX1:       # %bb.0:
1885; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1886; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [253,253,253,253,253,253,253,253,253,253,253,253,253,253,253,253]
1887; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
1888; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
1889; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1890; XOPAVX1-NEXT:    retq
1891;
1892; XOPAVX2-LABEL: splatconstant_shift_v32i8:
1893; XOPAVX2:       # %bb.0:
1894; XOPAVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
1895; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1896; XOPAVX2-NEXT:    retq
1897;
1898; AVX512-LABEL: splatconstant_shift_v32i8:
1899; AVX512:       # %bb.0:
1900; AVX512-NEXT:    vpsrlw $3, %ymm0, %ymm0
1901; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
1902; AVX512-NEXT:    retq
1903;
1904; AVX512VL-LABEL: splatconstant_shift_v32i8:
1905; AVX512VL:       # %bb.0:
1906; AVX512VL-NEXT:    vpsrlw $3, %ymm0, %ymm0
1907; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
1908; AVX512VL-NEXT:    retq
1909;
1910; X86-AVX1-LABEL: splatconstant_shift_v32i8:
1911; X86-AVX1:       # %bb.0:
1912; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1913; X86-AVX1-NEXT:    vpsrlw $3, %xmm1, %xmm1
1914; X86-AVX1-NEXT:    vbroadcastss {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31]
1915; X86-AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
1916; X86-AVX1-NEXT:    vpsrlw $3, %xmm0, %xmm0
1917; X86-AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
1918; X86-AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
1919; X86-AVX1-NEXT:    retl
1920;
1921; X86-AVX2-LABEL: splatconstant_shift_v32i8:
1922; X86-AVX2:       # %bb.0:
1923; X86-AVX2-NEXT:    vpsrlw $3, %ymm0, %ymm0
1924; X86-AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
1925; X86-AVX2-NEXT:    retl
1926  %shift = lshr <32 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
1927  ret <32 x i8> %shift
1928}
1929
1930;
1931; Special Cases
1932;
1933
; Special case: lshr of <4 x i64> by exactly 32, i.e. "move the high 32 bits
; of each i64 into the low 32 bits and zero the top half". Targets with
; 256-bit integer shifts (AVX2/XOP-AVX2/AVX512/AVX512VL, and the 32-bit
; X86-AVX2 run) use a single vpsrlq $32. AVX1 (and X86-AVX1) lack 256-bit
; integer shifts, so the same result is produced with float shuffles:
; interleave the odd 32-bit elements of %ymm0 with zero, then reorder.
1934define <4 x i64> @shift32_v4i64(<4 x i64> %a) nounwind {
1935; AVX1-LABEL: shift32_v4i64:
1936; AVX1:       # %bb.0:
1937; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1938; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
1939; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1940; AVX1-NEXT:    retq
1941;
1942; AVX2-LABEL: shift32_v4i64:
1943; AVX2:       # %bb.0:
1944; AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
1945; AVX2-NEXT:    retq
1946;
1947; XOPAVX1-LABEL: shift32_v4i64:
1948; XOPAVX1:       # %bb.0:
1949; XOPAVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1950; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
1951; XOPAVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1952; XOPAVX1-NEXT:    retq
1953;
1954; XOPAVX2-LABEL: shift32_v4i64:
1955; XOPAVX2:       # %bb.0:
1956; XOPAVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
1957; XOPAVX2-NEXT:    retq
1958;
1959; AVX512-LABEL: shift32_v4i64:
1960; AVX512:       # %bb.0:
1961; AVX512-NEXT:    vpsrlq $32, %ymm0, %ymm0
1962; AVX512-NEXT:    retq
1963;
1964; AVX512VL-LABEL: shift32_v4i64:
1965; AVX512VL:       # %bb.0:
1966; AVX512VL-NEXT:    vpsrlq $32, %ymm0, %ymm0
1967; AVX512VL-NEXT:    retq
1968;
1969; X86-AVX1-LABEL: shift32_v4i64:
1970; X86-AVX1:       # %bb.0:
1971; X86-AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
1972; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm1[1,3],ymm0[5,7],ymm1[5,7]
1973; X86-AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2,1,3,4,6,5,7]
1974; X86-AVX1-NEXT:    retl
1975;
1976; X86-AVX2-LABEL: shift32_v4i64:
1977; X86-AVX2:       # %bb.0:
1978; X86-AVX2-NEXT:    vpsrlq $32, %ymm0, %ymm0
1979; X86-AVX2-NEXT:    retl
; IR under test: per-element shift amount equal to half the element width.
1980  %shift = lshr <4 x i64> %a, <i64 32, i64 32, i64 32, i64 32>
1981  ret <4 x i64> %shift
1982}
1983
; shift + trunc + shift combine: (trunc (lshr x, 24) to i32) >> 12 extracts
; bits [36..55] of each i64, so codegen folds the two shifts into a single
; 64-bit lshr by 36 (24 + 12) followed by a truncating shuffle/vpmovqd and a
; mask with 1048575 (0xFFFFF = 2^20-1, the 20 surviving bits). The AVX1/XOP
; 128-bit paths do the vpsrlq per half and truncate with vshufps; AVX512
; uses vpmovqd. vzeroupper is emitted since the function returns a 128-bit
; value after using 256-bit registers.
1984define <4 x i32> @sh_trunc_sh_vec(<4 x i64> %x) {
1985; AVX1-LABEL: sh_trunc_sh_vec:
1986; AVX1:       # %bb.0:
1987; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
1988; AVX1-NEXT:    vpsrlq $36, %xmm1, %xmm1
1989; AVX1-NEXT:    vpsrlq $36, %xmm0, %xmm0
1990; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
1991; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1992; AVX1-NEXT:    vzeroupper
1993; AVX1-NEXT:    retq
1994;
1995; AVX2-LABEL: sh_trunc_sh_vec:
1996; AVX2:       # %bb.0:
1997; AVX2-NEXT:    vpsrlq $36, %ymm0, %ymm0
1998; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1999; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2000; AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
2001; AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
2002; AVX2-NEXT:    vzeroupper
2003; AVX2-NEXT:    retq
2004;
2005; XOPAVX1-LABEL: sh_trunc_sh_vec:
2006; XOPAVX1:       # %bb.0:
2007; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2008; XOPAVX1-NEXT:    vpsrlq $36, %xmm1, %xmm1
2009; XOPAVX1-NEXT:    vpsrlq $36, %xmm0, %xmm0
2010; XOPAVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2011; XOPAVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2012; XOPAVX1-NEXT:    vzeroupper
2013; XOPAVX1-NEXT:    retq
2014;
2015; XOPAVX2-LABEL: sh_trunc_sh_vec:
2016; XOPAVX2:       # %bb.0:
2017; XOPAVX2-NEXT:    vpsrlq $36, %ymm0, %ymm0
2018; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2019; XOPAVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2020; XOPAVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
2021; XOPAVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
2022; XOPAVX2-NEXT:    vzeroupper
2023; XOPAVX2-NEXT:    retq
2024;
2025; AVX512-LABEL: sh_trunc_sh_vec:
2026; AVX512:       # %bb.0:
2027; AVX512-NEXT:    vpsrlq $36, %ymm0, %ymm0
2028; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
2029; AVX512-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
2030; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
2031; AVX512-NEXT:    vzeroupper
2032; AVX512-NEXT:    retq
2033;
2034; AVX512VL-LABEL: sh_trunc_sh_vec:
2035; AVX512VL:       # %bb.0:
2036; AVX512VL-NEXT:    vpsrlq $36, %ymm0, %ymm0
2037; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
2038; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
2039; AVX512VL-NEXT:    vzeroupper
2040; AVX512VL-NEXT:    retq
2041;
2042; X86-AVX1-LABEL: sh_trunc_sh_vec:
2043; X86-AVX1:       # %bb.0:
2044; X86-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
2045; X86-AVX1-NEXT:    vpsrlq $36, %xmm1, %xmm1
2046; X86-AVX1-NEXT:    vpsrlq $36, %xmm0, %xmm0
2047; X86-AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2048; X86-AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
2049; X86-AVX1-NEXT:    vzeroupper
2050; X86-AVX1-NEXT:    retl
2051;
2052; X86-AVX2-LABEL: sh_trunc_sh_vec:
2053; X86-AVX2:       # %bb.0:
2054; X86-AVX2-NEXT:    vpsrlq $36, %ymm0, %ymm0
2055; X86-AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
2056; X86-AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
2057; X86-AVX2-NEXT:    vbroadcastss {{.*#+}} xmm1 = [1048575,1048575,1048575,1048575]
2058; X86-AVX2-NEXT:    vandps %xmm1, %xmm0, %xmm0
2059; X86-AVX2-NEXT:    vzeroupper
2060; X86-AVX2-NEXT:    retl
; IR under test: two shifts separated by a truncate; together they select
; bits 36..55 of each input element.
2061  %s = lshr <4 x i64> %x, <i64 24, i64 24, i64 24, i64 24>
2062  %t = trunc <4 x i64> %s to <4 x i32>
2063  %r = lshr <4 x i32> %t, <i32 12, i32 12, i32 12, i32 12>
2064  ret <4 x i32> %r
2065}
2066