; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW

;
; Variable Shifts
;

define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvq %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, %b
  ret <8 x i64> %shift
}

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, %b
  ret <16 x i32> %shift
}

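; Without AVX512BW there is no 512-bit vpsrlvw, so each 256-bit half is
; zero-extended to i32 lanes (vpmovzxwd), shifted with vpsrlvd, and truncated
; back with vpmovdw.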
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsrlvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, %b
  ret <32 x i16> %shift
}

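; x86 has no per-byte shift. The AVX512DQ lowering shifts whole words by 4, 2
; and 1, masks off the bits pulled in from the neighbouring byte, and selects
; each step with vpblendvb keyed on the amount bits (moved into the sign bit
; by vpsllw $5, then advanced with vpaddb). AVX512BW uses the same ladder but
; selects through mask registers (vpmovb2m + vmovdqu8).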
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpsrlw $4, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
; AVX512DQ-NEXT:    vpsllw $5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $2, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63]
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $1, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127]
; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $2, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $1, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm7, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, %b
  ret <64 x i8> %shift
}

;
; Uniform Variable Shifts
;

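; A uniform (splatted) amount can use the shift-by-scalar instruction forms,
; which take the amount from the low 64 bits of an XMM register.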
define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = lshr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = lshr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = lshr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

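; Byte splat shifts still go through vpsrlw on words; the bits leaking in
; from the neighbouring byte are cleared with a mask built by shifting
; all-ones (vpcmpeqd) by the same amount and broadcasting its high byte.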
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = lshr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Uniform Variable Modulo Shifts
;

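; The explicit modulo (an 'and' with bitwidth-1) is applied to the scalar
; amount with a single 128-bit vpand rather than masking the full 512-bit
; amount vector.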
define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpsrlq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <8 x i64> %mod, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = lshr <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpsrld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <16 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <16 x i32> %mod, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = lshr <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <32 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <32 x i16> %mod, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = lshr <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsrlw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsrlw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <64 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <64 x i8> %mod, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = lshr <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Constant Shifts
;

define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
  ret <8 x i64> %shift
}

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift
}

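; A constant per-lane word shift can be expressed as a multiply-high: for
; k > 0, x >> k is the high half of x * 2^(16-k) (vpmulhuw). The amount-0
; lanes, whose multiplier would be 2^16, are blended back from the source.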
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7]
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm2
; AVX512DQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3,4,5,6,7]
; AVX512DQ-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7]
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
}

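; When adjacent word lanes share a shift amount, the whole vector can instead
; be shifted as dwords (vpsrlvd) with the bits crossing each word boundary
; masked off, avoiding the multiply expansion on AVX512DQ.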
define <32 x i16> @constant_shift_v32i16_pairs(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 8, i16 8, i16 8, i16 8, i16 9, i16 9, i16 9, i16 9, i16 10, i16 10, i16 10, i16 10, i16 11, i16 11, i16 11, i16 11, i16 12, i16 12, i16 12, i16 12, i16 13, i16 13, i16 13, i16 13, i16 14, i16 14, i16 14, i16 14, i16 15, i16 15, i16 15, i16 15>
  ret <32 x i16> %shift
}

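; Byte shifts where each pair of bytes shares an amount become word shifts:
; vpmulhuw on AVX512DQ or vpsrlvw on AVX512BW, followed by an AND clearing
; the bits shifted across the byte boundaries.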
define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [512,16384,4096,1024,32768,16384,8192,4096,512,16384,4096,1024,32768,16384,8192,4096]
; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} zmm1 = [1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15,1,1,63,63,15,15,3,3,127,127,63,63,31,31,15,15]
; AVX512DQ-NEXT:    # zmm1 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512DQ-NEXT:    vpandq %zmm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4, i8 7, i8 7, i8 2, i8 2, i8 4, i8 4, i8 6, i8 6, i8 1, i8 1, i8 2, i8 2, i8 3, i8 3, i8 4, i8 4>
  ret <64 x i8> %shift
}

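; With one amount per dword, both targets use vpsrlvd plus a mask, hence the
; shared ALL check lines.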
define <64 x i8> @constant_shift_v64i8_quads(<64 x i8> %a) nounwind {
; ALL-LABEL: constant_shift_v64i8_quads:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 5, i8 5, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3, i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 5, i8 5, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}

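; Arbitrary per-byte amounts: unpack bytes to words against zero, scale by
; 2^(8-k) (vpmullw on AVX512DQ, vpsllvw on AVX512BW) so the result lands in
; the high byte, shift right by 8 and repack with vpackuswb.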
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [2,4,8,16,32,64,128,256,2,4,8,16,32,64,128,256]
; AVX512DQ-NEXT:    # ymm4 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm5 = [256,128,64,32,16,8,4,2,256,128,64,32,16,8,4,2]
; AVX512DQ-NEXT:    # ymm5 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15],ymm0[24],ymm2[24],ymm0[25],ymm2[25],ymm0[26],ymm2[26],ymm0[27],ymm2[27],ymm0[28],ymm2[28],ymm0[29],ymm2[29],ymm0[30],ymm2[30],ymm0[31],ymm2[31]
; AVX512DQ-NEXT:    vpmullw %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23]
; AVX512DQ-NEXT:    vpmullw %ymm5, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpxor %xmm1, %xmm1, %xmm1
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31],zmm0[40],zmm1[40],zmm0[41],zmm1[41],zmm0[42],zmm1[42],zmm0[43],zmm1[43],zmm0[44],zmm1[44],zmm0[45],zmm1[45],zmm0[46],zmm1[46],zmm0[47],zmm1[47],zmm0[56],zmm1[56],zmm0[57],zmm1[57],zmm0[58],zmm1[58],zmm0[59],zmm1[59],zmm0[60],zmm1[60],zmm0[61],zmm1[61],zmm0[62],zmm1[62],zmm0[63],zmm1[63]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[32],zmm1[32],zmm0[33],zmm1[33],zmm0[34],zmm1[34],zmm0[35],zmm1[35],zmm0[36],zmm1[36],zmm0[37],zmm1[37],zmm0[38],zmm1[38],zmm0[39],zmm1[39],zmm0[48],zmm1[48],zmm0[49],zmm1[49],zmm0[50],zmm1[50],zmm0[51],zmm1[51],zmm0[52],zmm1[52],zmm0[53],zmm1[53],zmm0[54],zmm1[54],zmm0[55],zmm1[55]
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <64 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrlq $7, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %shift
}

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsrld $5, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = lshr <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift
}

define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift
}

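; A uniform constant byte shift is a word shift plus one broadcast AND to
; drop the bits pulled across the byte boundaries.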
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsrlw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = lshr <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}