; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefixes=ALL,AVX512DQ
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=ALL,AVX512BW

;
; Variable Shifts
;

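; AVX512F already provides 512-bit variable shifts at dword/qword granularity
; (vpsllvd/vpsllvq), so the i32 and i64 cases below lower to a single
; instruction on both targets.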
define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: var_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsllvq %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <8 x i64> %a, %b
  ret <8 x i64> %shift
}

define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: var_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <16 x i32> %a, %b
  ret <16 x i32> %shift
}

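; vpsllvw requires AVX512BW. Without it, each 256-bit half is zero-extended to
; <16 x i32>, shifted with vpsllvd, and truncated back with vpmovdw.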
define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm2 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm3 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsllvd %zmm2, %zmm3, %zmm2
; AVX512DQ-NEXT:    vpmovdw %zmm2, %ymm2
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero
; AVX512DQ-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm2, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = shl <32 x i16> %a, %b
  ret <32 x i16> %shift
}

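; There is no byte-granularity variable shift. The AVX512DQ lowering moves the
; shift-amount bits into the byte sign-bit position with vpsllw $5 and uses
; three vpblendvb rounds to conditionally shift by 4, 2 and 1, doubling the
; selector with vpaddb between rounds. AVX512BW performs the same selection
; with vpmovb2m masks and masked moves instead of blends.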
define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: var_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpsllw $4, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm1, %ymm5
; AVX512DQ-NEXT:    vpsllw $5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsllw $2, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpaddb %ymm2, %ymm2, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm5, %ymm5, %ymm5
; AVX512DQ-NEXT:    vpblendvb %ymm5, %ymm3, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm4, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpand %ymm6, %ymm3, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
; AVX512DQ-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: var_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsllw $5, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpsllw $2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm2
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vmovdqu8 %zmm2, %zmm0 {%k1}
; AVX512BW-NEXT:    vpaddb %zmm1, %zmm1, %zmm1
; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
; AVX512BW-NEXT:    vpaddb %zmm0, %zmm0, %zmm0 {%k1}
; AVX512BW-NEXT:    retq
  %shift = shl <64 x i8> %a, %b
  ret <64 x i8> %shift
}

;
; Uniform Variable Shifts
;

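; A splatted shift amount can use the legacy shift-by-xmm-count encodings
; (e.g. vpsllq %xmm1, %zmm0), which read the count from the low 64 bits of an
; xmm register; narrower element counts are zero-extended first so the upper
; bits of the count are known zero.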
define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <8 x i64> %b, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = shl <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
; ALL-NEXT:    vpslld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %splat = shufflevector <16 x i32> %b, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = shl <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512DQ-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <32 x i16> %b, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = shl <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

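; Splatted byte shifts are performed at word granularity; the bits that cross
; in from the neighboring byte are cleared with a mask computed by shifting an
; all-ones vector (vpcmpeqd) by the same amount and broadcasting its low byte.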
define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512DQ-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %splat = shufflevector <64 x i8> %b, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = shl <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Uniform Variable Modulo Shifts
;

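; These variants mask the shift amount to bitwidth-1 before splatting; the
; CHECK lines show the mask applied once, as a vpand on the scalar count in an
; xmm register, after which the splat shift patterns above apply unchanged.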
define <8 x i64> @splatvar_modulo_shift_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpsllq %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <8 x i64> %b, <i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63, i64 63>
  %splat = shufflevector <8 x i64> %mod, <8 x i64> poison, <8 x i32> zeroinitializer
  %shift = shl <8 x i64> %a, %splat
  ret <8 x i64> %shift
}

define <16 x i32> @splatvar_modulo_shift_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
; ALL-LABEL: splatvar_modulo_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; ALL-NEXT:    vpslld %xmm1, %zmm0, %zmm0
; ALL-NEXT:    retq
  %mod = and <16 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
  %splat = shufflevector <16 x i32> %mod, <16 x i32> poison, <16 x i32> zeroinitializer
  %shift = shl <16 x i32> %a, %splat
  ret <16 x i32> %shift
}

define <32 x i16> @splatvar_modulo_shift_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <32 x i16> %b, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
  %splat = shufflevector <32 x i16> %mod, <32 x i16> poison, <32 x i32> zeroinitializer
  %shift = shl <32 x i16> %a, %splat
  ret <32 x i16> %shift
}

define <64 x i8> @splatvar_modulo_shift_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
; AVX512DQ-LABEL: splatvar_modulo_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
; AVX512DQ-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512DQ-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
; AVX512DQ-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512DQ-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
; AVX512DQ-NEXT:    vpbroadcastb %xmm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm1, %zmm1
; AVX512DQ-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_modulo_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX512BW-NEXT:    vpsllw %xmm1, %xmm2, %xmm1
; AVX512BW-NEXT:    vpbroadcastb %xmm1, %zmm1
; AVX512BW-NEXT:    vpandq %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %mod = and <64 x i8> %b, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7>
  %splat = shufflevector <64 x i8> %mod, <64 x i8> poison, <64 x i32> zeroinitializer
  %shift = shl <64 x i8> %a, %splat
  ret <64 x i8> %shift
}

;
; Constant Shifts
;

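; Per-lane constant amounts are loaded straight from the constant pool as the
; second operand of the variable-shift instructions.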
define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: constant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <8 x i64> %a, <i64 1, i64 7, i64 31, i64 62, i64 1, i64 7, i64 31, i64 62>
  ret <8 x i64> %shift
}

define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: constant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 8, i32 7>
  ret <16 x i32> %shift
}

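; A left shift by a constant is a multiply by a power of two, so without
; vpsllvw the AVX512DQ lowering uses vpmullw per 256-bit half.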
define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  ret <32 x i16> %shift
}

define <32 x i16> @constant_shift_v32i16_pairs(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v32i16_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v32i16_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = shl <32 x i16> %a, <i16 0, i16 0, i16 1, i16 1, i16 3, i16 3, i16 2, i16 2, i16 6, i16 6, i16 7, i16 7, i16 5, i16 5, i16 4, i16 4, i16 12, i16 12, i16 13, i16 13, i16 15, i16 15, i16 14, i16 14, i16 10, i16 10, i16 11, i16 11, i16 9, i16 9, i16 8, i16 8>
  ret <32 x i16> %shift
}

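; Constant byte shifts use vpmaddubsw: one multiply handles the even bytes
; (the odd multipliers are zero) and a second handles the odd bytes, which is
; moved into the high byte of each word with vpsllw $8 and merged with
; vpternlogd.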
define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
; AVX512DQ-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmaddubsw %ymm2, %ymm1, %ymm3
; AVX512DQ-NEXT:    vpmaddubsw %ymm2, %ymm0, %ymm2
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm3, %zmm2, %zmm2
; AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512DQ-NEXT:    # ymm3 = mem[0,1,0,1]
; AVX512DQ-NEXT:    vpmaddubsw %ymm3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpsllw $8, %ymm0, %ymm0
; AVX512DQ-NEXT:    vpmaddubsw %ymm3, %ymm1, %ymm1
; AVX512DQ-NEXT:    vpsllw $8, %ymm1, %ymm1
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
; AVX512DQ-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem)
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0,1,0,4,0,16,0,64,0,128,0,32,0,8,0,2,0]
; AVX512BW-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1,0,2,0,8,0,32,0,128,0,64,0,16,0,4,0,1]
; AVX512BW-NEXT:    vpsllw $8, %zmm0, %zmm0
; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem)
; AVX512BW-NEXT:    retq
  %shift = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>
  ret <64 x i8> %shift
}

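; When adjacent bytes share a shift amount, the vector can be shifted at word
; granularity (vpmullw per half on AVX512DQ, vpsllvw on AVX512BW), followed by
; a mask to clear the bits carried across the byte boundary.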
define <64 x i8> @constant_shift_v64i8_pairs(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: constant_shift_v64i8_pairs:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 # [8,128,64,4,128,1,128,2,32,1,16,128,64,2,16,1]
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64,8,16,2,4,64,16,2,2,32,32,64,4,64,16,16]
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: constant_shift_v64i8_pairs:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 7, i8 7, i8 6, i8 6, i8 2, i8 2, i8 7, i8 7, i8 0, i8 0, i8 7, i8 7, i8 1, i8 1, i8 5, i8 5, i8 0, i8 0, i8 4, i8 4, i8 7, i8 7, i8 6, i8 6, i8 1, i8 1, i8 4, i8 4, i8 0, i8 0, i8 6, i8 6, i8 3, i8 3, i8 4, i8 4, i8 1, i8 1, i8 2, i8 2, i8 6, i8 6, i8 4, i8 4, i8 1, i8 1, i8 1, i8 1, i8 5, i8 5, i8 5, i8 5, i8 6, i8 6, i8 2, i8 2, i8 6, i8 6, i8 4, i8 4, i8 4, i8 4>
  ret <64 x i8> %shift
}

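; With four equal amounts per dword lane, both targets can use vpsllvd plus a
; mask.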
define <64 x i8> @constant_shift_v64i8_quads(<64 x i8> %a) nounwind {
; ALL-LABEL: constant_shift_v64i8_quads:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <64 x i8> %a, <i8 0, i8 0, i8 0, i8 0, i8 1, i8 1, i8 1, i8 1, i8 3, i8 3, i8 3, i8 3, i8 2, i8 2, i8 2, i8 2, i8 6, i8 6, i8 6, i8 6, i8 7, i8 7, i8 7, i8 7, i8 5, i8 5, i8 5, i8 5, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 5, i8 5, i8 5, i8 5, i8 7, i8 7, i8 7, i8 7, i8 6, i8 6, i8 6, i8 6, i8 2, i8 2, i8 2, i8 2, i8 3, i8 3, i8 3, i8 3, i8 1, i8 1, i8 1, i8 1, i8 0, i8 0, i8 0, i8 0>
  ret <64 x i8> %shift
}

;
; Uniform Constant Shifts
;

define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v8i64:
; ALL:       # %bb.0:
; ALL-NEXT:    vpsllq $7, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <8 x i64> %a, <i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7, i64 7>
  ret <8 x i64> %shift
}

define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) nounwind {
; ALL-LABEL: splatconstant_shift_v16i32:
; ALL:       # %bb.0:
; ALL-NEXT:    vpslld $5, %zmm0, %zmm0
; ALL-NEXT:    retq
  %shift = shl <16 x i32> %a, <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
  ret <16 x i32> %shift
}

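; Immediate word shifts on zmm registers (vpsllw $imm) require AVX512BW, so
; the AVX512DQ lowering splits into two 256-bit halves.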
define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v32i16:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsllw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsllw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v32i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = shl <32 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3, i16 3>
  ret <32 x i16> %shift
}

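; A uniform constant byte shift is a word shift by the same immediate plus a
; broadcast byte mask to clear the bits shifted in from the neighboring byte.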
define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) nounwind {
; AVX512DQ-LABEL: splatconstant_shift_v64i8:
; AVX512DQ:       # %bb.0:
; AVX512DQ-NEXT:    vpsllw $3, %ymm0, %ymm1
; AVX512DQ-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
; AVX512DQ-NEXT:    vpsllw $3, %ymm0, %ymm0
; AVX512DQ-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
; AVX512DQ-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512DQ-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_shift_v64i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $3, %zmm0, %zmm0
; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
; AVX512BW-NEXT:    retq
  %shift = shl <64 x i8> %a, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3>
  ret <64 x i8> %shift
}
