1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLBW
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512VBMI2
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLVBMI2
8
9;
10; Variable Rotates
11;
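; Each test builds a rotate out of the canonical shl/lshr/or idiom, e.g. for i64
;   %or = (%a << %b) | (%a >> (64 - %b))
; with a per-element variable amount. The i64/i32 cases are expected to fold to
; a single vprolvq/vprolvd on every subtarget; the i16/i8 cases fall back to
; shift-based expansions that vary with AVX512BW/VBMI2 availability.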
12
13define <8 x i64> @var_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
14; AVX512-LABEL: var_rotate_v8i64:
15; AVX512:       # %bb.0:
16; AVX512-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
17; AVX512-NEXT:    retq
18  %b64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %b
19  %shl = shl <8 x i64> %a, %b
20  %lshr = lshr <8 x i64> %a, %b64
21  %or = or <8 x i64> %shl, %lshr
22  ret <8 x i64> %or
23}
24
25define <16 x i32> @var_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
26; AVX512-LABEL: var_rotate_v16i32:
27; AVX512:       # %bb.0:
28; AVX512-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
29; AVX512-NEXT:    retq
30  %b32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
31  %shl = shl <16 x i32> %a, %b
32  %lshr = lshr <16 x i32> %a, %b32
33  %or = or <16 x i32> %shl, %lshr
34  ret <16 x i32> %or
35}
36
37define <32 x i16> @var_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
38; AVX512F-LABEL: var_rotate_v32i16:
39; AVX512F:       # %bb.0:
40; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
41; AVX512F-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
42; AVX512F-NEXT:    vpand %ymm3, %ymm2, %ymm2
43; AVX512F-NEXT:    vpxor %xmm4, %xmm4, %xmm4
44; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
45; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
46; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
47; AVX512F-NEXT:    vpsllvd %ymm5, %ymm7, %ymm5
48; AVX512F-NEXT:    vpsrld $16, %ymm5, %ymm5
49; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
50; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
51; AVX512F-NEXT:    vpsllvd %ymm2, %ymm6, %ymm2
52; AVX512F-NEXT:    vpsrld $16, %ymm2, %ymm2
53; AVX512F-NEXT:    vpackusdw %ymm5, %ymm2, %ymm2
54; AVX512F-NEXT:    vpand %ymm3, %ymm1, %ymm1
55; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
56; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
57; AVX512F-NEXT:    vpsllvd %ymm3, %ymm5, %ymm3
58; AVX512F-NEXT:    vpsrld $16, %ymm3, %ymm3
59; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
60; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
61; AVX512F-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
62; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
63; AVX512F-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
64; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
65; AVX512F-NEXT:    retq
66;
67; AVX512VL-LABEL: var_rotate_v32i16:
68; AVX512VL:       # %bb.0:
69; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm2
70; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
71; AVX512VL-NEXT:    vpand %ymm3, %ymm2, %ymm2
72; AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
73; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm2[4],ymm4[4],ymm2[5],ymm4[5],ymm2[6],ymm4[6],ymm2[7],ymm4[7],ymm2[12],ymm4[12],ymm2[13],ymm4[13],ymm2[14],ymm4[14],ymm2[15],ymm4[15]
74; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm6
75; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm7 = ymm6[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
76; AVX512VL-NEXT:    vpsllvd %ymm5, %ymm7, %ymm5
77; AVX512VL-NEXT:    vpsrld $16, %ymm5, %ymm5
78; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11]
79; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm6 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
80; AVX512VL-NEXT:    vpsllvd %ymm2, %ymm6, %ymm2
81; AVX512VL-NEXT:    vpsrld $16, %ymm2, %ymm2
82; AVX512VL-NEXT:    vpackusdw %ymm5, %ymm2, %ymm2
83; AVX512VL-NEXT:    vpand %ymm3, %ymm1, %ymm1
84; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm4[4],ymm1[5],ymm4[5],ymm1[6],ymm4[6],ymm1[7],ymm4[7],ymm1[12],ymm4[12],ymm1[13],ymm4[13],ymm1[14],ymm4[14],ymm1[15],ymm4[15]
85; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm5 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
86; AVX512VL-NEXT:    vpsllvd %ymm3, %ymm5, %ymm3
87; AVX512VL-NEXT:    vpsrld $16, %ymm3, %ymm3
88; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[2],ymm4[2],ymm1[3],ymm4[3],ymm1[8],ymm4[8],ymm1[9],ymm4[9],ymm1[10],ymm4[10],ymm1[11],ymm4[11]
89; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
90; AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
91; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
92; AVX512VL-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
93; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
94; AVX512VL-NEXT:    retq
95;
96; AVX512BW-LABEL: var_rotate_v32i16:
97; AVX512BW:       # %bb.0:
98; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
99; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
100; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
101; AVX512BW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
102; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
103; AVX512BW-NEXT:    vporq %zmm0, %zmm2, %zmm0
104; AVX512BW-NEXT:    retq
105;
106; AVX512VLBW-LABEL: var_rotate_v32i16:
107; AVX512VLBW:       # %bb.0:
108; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
109; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
110; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} zmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
111; AVX512VLBW-NEXT:    vpsubw %zmm1, %zmm3, %zmm1
112; AVX512VLBW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
113; AVX512VLBW-NEXT:    vporq %zmm0, %zmm2, %zmm0
114; AVX512VLBW-NEXT:    retq
115;
116; AVX512VBMI2-LABEL: var_rotate_v32i16:
117; AVX512VBMI2:       # %bb.0:
118; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
119; AVX512VBMI2-NEXT:    retq
120;
121; AVX512VLVBMI2-LABEL: var_rotate_v32i16:
122; AVX512VLVBMI2:       # %bb.0:
123; AVX512VLVBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
124; AVX512VLVBMI2-NEXT:    retq
125  %b16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
126  %shl = shl <32 x i16> %a, %b
127  %lshr = lshr <32 x i16> %a, %b16
128  %or = or <32 x i16> %shl, %lshr
129  ret <32 x i16> %or
130}
131
132define <64 x i8> @var_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
133; AVX512F-LABEL: var_rotate_v64i8:
134; AVX512F:       # %bb.0:
135; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
136; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm3
137; AVX512F-NEXT:    vpsllw $4, %ymm2, %ymm4
138; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
139; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
140; AVX512F-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
141; AVX512F-NEXT:    vpsllw $5, %ymm3, %ymm3
142; AVX512F-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
143; AVX512F-NEXT:    vpsrlw $6, %ymm2, %ymm4
144; AVX512F-NEXT:    vpsllw $2, %ymm2, %ymm6
145; AVX512F-NEXT:    vpbroadcastd {{.*#+}} zmm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268]
146; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm6 = zmm4 ^ (zmm7 & (zmm6 ^ zmm4))
147; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
148; AVX512F-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
149; AVX512F-NEXT:    vpsrlw $7, %ymm2, %ymm4
150; AVX512F-NEXT:    vpbroadcastb {{.*#+}} ymm6 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
151; AVX512F-NEXT:    vpand %ymm6, %ymm4, %ymm4
152; AVX512F-NEXT:    vpaddb %ymm2, %ymm2, %ymm8
153; AVX512F-NEXT:    vpor %ymm4, %ymm8, %ymm4
154; AVX512F-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
155; AVX512F-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
156; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
157; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm4
158; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm5 & (zmm4 ^ zmm3))
159; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
160; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
161; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm3
162; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm4
163; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm4 = zmm3 ^ (zmm7 & (zmm4 ^ zmm3))
164; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
165; AVX512F-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
166; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm3
167; AVX512F-NEXT:    vpand %ymm6, %ymm3, %ymm3
168; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
169; AVX512F-NEXT:    vpor %ymm3, %ymm4, %ymm3
170; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
171; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
172; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
173; AVX512F-NEXT:    retq
174;
175; AVX512VL-LABEL: var_rotate_v64i8:
176; AVX512VL:       # %bb.0:
177; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
178; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm3
179; AVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm4
180; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm5 = [4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160,4042322160]
181; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
182; AVX512VL-NEXT:    vextracti64x4 $1, %zmm1, %ymm3
183; AVX512VL-NEXT:    vpsllw $5, %ymm3, %ymm3
184; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm4, %ymm2, %ymm2
185; AVX512VL-NEXT:    vpsrlw $6, %ymm2, %ymm4
186; AVX512VL-NEXT:    vpsllw $2, %ymm2, %ymm6
187; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm7 = [4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268,4244438268]
188; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm6 = ymm4 ^ (ymm7 & (ymm6 ^ ymm4))
189; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
190; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
191; AVX512VL-NEXT:    vpsrlw $7, %ymm2, %ymm4
192; AVX512VL-NEXT:    vpaddb %ymm2, %ymm2, %ymm6
193; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} ymm8 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
194; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm6 = ymm6 | (ymm4 & ymm8)
195; AVX512VL-NEXT:    vpaddb %ymm3, %ymm3, %ymm3
196; AVX512VL-NEXT:    vpblendvb %ymm3, %ymm6, %ymm2, %ymm2
197; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
198; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm4
199; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm5 & (ymm4 ^ ymm3))
200; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
201; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
202; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
203; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm4
204; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm4 = ymm3 ^ (ymm7 & (ymm4 ^ ymm3))
205; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
206; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
207; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm3
208; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm4
209; AVX512VL-NEXT:    vpternlogq {{.*#+}} ymm4 = ymm4 | (ymm3 & ymm8)
210; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
211; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
212; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
213; AVX512VL-NEXT:    retq
214;
215; AVX512BW-LABEL: var_rotate_v64i8:
216; AVX512BW:       # %bb.0:
217; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
218; AVX512BW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
219; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
220; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
221; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm4, %zmm3
222; AVX512BW-NEXT:    vpsrlw $8, %zmm3, %zmm3
223; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
224; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
225; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
226; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
227; AVX512BW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
228; AVX512BW-NEXT:    retq
229;
230; AVX512VLBW-LABEL: var_rotate_v64i8:
231; AVX512VLBW:       # %bb.0:
232; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
233; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
234; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
235; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
236; AVX512VLBW-NEXT:    vpsllvw %zmm3, %zmm4, %zmm3
237; AVX512VLBW-NEXT:    vpsrlw $8, %zmm3, %zmm3
238; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
239; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
240; AVX512VLBW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
241; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
242; AVX512VLBW-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
243; AVX512VLBW-NEXT:    retq
244;
245; AVX512VBMI2-LABEL: var_rotate_v64i8:
246; AVX512VBMI2:       # %bb.0:
247; AVX512VBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
248; AVX512VBMI2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
249; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
250; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
251; AVX512VBMI2-NEXT:    vpsllvw %zmm3, %zmm4, %zmm3
252; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm3, %zmm3
253; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
254; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
255; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
256; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
257; AVX512VBMI2-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
258; AVX512VBMI2-NEXT:    retq
259;
260; AVX512VLVBMI2-LABEL: var_rotate_v64i8:
261; AVX512VLVBMI2:       # %bb.0:
262; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm1
263; AVX512VLVBMI2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
264; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm3 = zmm1[8],zmm2[8],zmm1[9],zmm2[9],zmm1[10],zmm2[10],zmm1[11],zmm2[11],zmm1[12],zmm2[12],zmm1[13],zmm2[13],zmm1[14],zmm2[14],zmm1[15],zmm2[15],zmm1[24],zmm2[24],zmm1[25],zmm2[25],zmm1[26],zmm2[26],zmm1[27],zmm2[27],zmm1[28],zmm2[28],zmm1[29],zmm2[29],zmm1[30],zmm2[30],zmm1[31],zmm2[31],zmm1[40],zmm2[40],zmm1[41],zmm2[41],zmm1[42],zmm2[42],zmm1[43],zmm2[43],zmm1[44],zmm2[44],zmm1[45],zmm2[45],zmm1[46],zmm2[46],zmm1[47],zmm2[47],zmm1[56],zmm2[56],zmm1[57],zmm2[57],zmm1[58],zmm2[58],zmm1[59],zmm2[59],zmm1[60],zmm2[60],zmm1[61],zmm2[61],zmm1[62],zmm2[62],zmm1[63],zmm2[63]
265; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm4 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
266; AVX512VLVBMI2-NEXT:    vpsllvw %zmm3, %zmm4, %zmm3
267; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm3, %zmm3
268; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm1 = zmm1[0],zmm2[0],zmm1[1],zmm2[1],zmm1[2],zmm2[2],zmm1[3],zmm2[3],zmm1[4],zmm2[4],zmm1[5],zmm2[5],zmm1[6],zmm2[6],zmm1[7],zmm2[7],zmm1[16],zmm2[16],zmm1[17],zmm2[17],zmm1[18],zmm2[18],zmm1[19],zmm2[19],zmm1[20],zmm2[20],zmm1[21],zmm2[21],zmm1[22],zmm2[22],zmm1[23],zmm2[23],zmm1[32],zmm2[32],zmm1[33],zmm2[33],zmm1[34],zmm2[34],zmm1[35],zmm2[35],zmm1[36],zmm2[36],zmm1[37],zmm2[37],zmm1[38],zmm2[38],zmm1[39],zmm2[39],zmm1[48],zmm2[48],zmm1[49],zmm2[49],zmm1[50],zmm2[50],zmm1[51],zmm2[51],zmm1[52],zmm2[52],zmm1[53],zmm2[53],zmm1[54],zmm2[54],zmm1[55],zmm2[55]
269; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
270; AVX512VLVBMI2-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
271; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
272; AVX512VLVBMI2-NEXT:    vpackuswb %zmm3, %zmm0, %zmm0
273; AVX512VLVBMI2-NEXT:    retq
274  %b8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
275  %shl = shl <64 x i8> %a, %b
276  %lshr = lshr <64 x i8> %a, %b8
277  %or = or <64 x i8> %shl, %lshr
278  ret <64 x i8> %or
279}
280
281;
282; Uniform Variable Rotates
283;
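; Same idiom, but the rotate amount is a splat of element 0 of %b: the i64/i32
; cases are expected to become a broadcast feeding vprolvq/vprolvd, while the
; narrower types can keep the splatted amount in an XMM register and use the
; scalar-amount vpsllw/vpsrlw forms (or a broadcast plus vpshldvw with VBMI2).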
284
285define <8 x i64> @splatvar_rotate_v8i64(<8 x i64> %a, <8 x i64> %b) nounwind {
286; AVX512-LABEL: splatvar_rotate_v8i64:
287; AVX512:       # %bb.0:
288; AVX512-NEXT:    vpbroadcastq %xmm1, %zmm1
289; AVX512-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
290; AVX512-NEXT:    retq
291  %splat = shufflevector <8 x i64> %b, <8 x i64> undef, <8 x i32> zeroinitializer
292  %splat64 = sub <8 x i64> <i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64, i64 64>, %splat
293  %shl = shl <8 x i64> %a, %splat
294  %lshr = lshr <8 x i64> %a, %splat64
295  %or = or <8 x i64> %shl, %lshr
296  ret <8 x i64> %or
297}
298
299define <16 x i32> @splatvar_rotate_v16i32(<16 x i32> %a, <16 x i32> %b) nounwind {
300; AVX512-LABEL: splatvar_rotate_v16i32:
301; AVX512:       # %bb.0:
302; AVX512-NEXT:    vpbroadcastd %xmm1, %zmm1
303; AVX512-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
304; AVX512-NEXT:    retq
305  %splat = shufflevector <16 x i32> %b, <16 x i32> undef, <16 x i32> zeroinitializer
306  %splat32 = sub <16 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
307  %shl = shl <16 x i32> %a, %splat
308  %lshr = lshr <16 x i32> %a, %splat32
309  %or = or <16 x i32> %shl, %lshr
310  ret <16 x i32> %or
311}
312
313define <32 x i16> @splatvar_rotate_v32i16(<32 x i16> %a, <32 x i16> %b) nounwind {
314; AVX512F-LABEL: splatvar_rotate_v32i16:
315; AVX512F:       # %bb.0:
316; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
317; AVX512F-NEXT:    vpandn %xmm2, %xmm1, %xmm3
318; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
319; AVX512F-NEXT:    vpsrlw $1, %ymm4, %ymm5
320; AVX512F-NEXT:    vpsrlw %xmm3, %ymm5, %ymm5
321; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm6
322; AVX512F-NEXT:    vpsrlw %xmm3, %ymm6, %ymm3
323; AVX512F-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
324; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
325; AVX512F-NEXT:    vpsllw %xmm1, %ymm4, %ymm2
326; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
327; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
328; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
329; AVX512F-NEXT:    retq
330;
331; AVX512VL-LABEL: splatvar_rotate_v32i16:
332; AVX512VL:       # %bb.0:
333; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
334; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm3
335; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm4
336; AVX512VL-NEXT:    vpsrlw $1, %ymm4, %ymm5
337; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm5, %ymm5
338; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm6
339; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm6, %ymm3
340; AVX512VL-NEXT:    vinserti64x4 $1, %ymm5, %zmm3, %zmm3
341; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
342; AVX512VL-NEXT:    vpsllw %xmm1, %ymm4, %ymm2
343; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
344; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
345; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
346; AVX512VL-NEXT:    retq
347;
348; AVX512BW-LABEL: splatvar_rotate_v32i16:
349; AVX512BW:       # %bb.0:
350; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
351; AVX512BW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
352; AVX512BW-NEXT:    vpsrlw $1, %zmm0, %zmm4
353; AVX512BW-NEXT:    vpsrlw %xmm3, %zmm4, %zmm3
354; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
355; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
356; AVX512BW-NEXT:    vporq %zmm3, %zmm0, %zmm0
357; AVX512BW-NEXT:    retq
358;
359; AVX512VLBW-LABEL: splatvar_rotate_v32i16:
360; AVX512VLBW:       # %bb.0:
361; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
362; AVX512VLBW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
363; AVX512VLBW-NEXT:    vpsrlw $1, %zmm0, %zmm4
364; AVX512VLBW-NEXT:    vpsrlw %xmm3, %zmm4, %zmm3
365; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
366; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
367; AVX512VLBW-NEXT:    vporq %zmm3, %zmm0, %zmm0
368; AVX512VLBW-NEXT:    retq
369;
370; AVX512VBMI2-LABEL: splatvar_rotate_v32i16:
371; AVX512VBMI2:       # %bb.0:
372; AVX512VBMI2-NEXT:    vpbroadcastw %xmm1, %zmm1
373; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
374; AVX512VBMI2-NEXT:    retq
375;
376; AVX512VLVBMI2-LABEL: splatvar_rotate_v32i16:
377; AVX512VLVBMI2:       # %bb.0:
378; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm1, %zmm1
379; AVX512VLVBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
380; AVX512VLVBMI2-NEXT:    retq
381  %splat = shufflevector <32 x i16> %b, <32 x i16> undef, <32 x i32> zeroinitializer
382  %splat16 = sub <32 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
383  %shl = shl <32 x i16> %a, %splat
384  %lshr = lshr <32 x i16> %a, %splat16
385  %or = or <32 x i16> %shl, %lshr
386  ret <32 x i16> %or
387}
388
389define <64 x i8> @splatvar_rotate_v64i8(<64 x i8> %a, <64 x i8> %b) nounwind {
390; AVX512F-LABEL: splatvar_rotate_v64i8:
391; AVX512F:       # %bb.0:
392; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
393; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
394; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
395; AVX512F-NEXT:    vpsllw %xmm1, %ymm3, %ymm3
396; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
397; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
398; AVX512F-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
399; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
400; AVX512F-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
401; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
402; AVX512F-NEXT:    vpsllw %xmm1, %ymm3, %ymm3
403; AVX512F-NEXT:    vpsrlw $8, %ymm3, %ymm3
404; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
405; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
406; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
407; AVX512F-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
408; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
409; AVX512F-NEXT:    retq
410;
411; AVX512VL-LABEL: splatvar_rotate_v64i8:
412; AVX512VL:       # %bb.0:
413; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
414; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
415; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
416; AVX512VL-NEXT:    vpsllw %xmm1, %ymm3, %ymm3
417; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
418; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm2 = ymm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
419; AVX512VL-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
420; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
421; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
422; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
423; AVX512VL-NEXT:    vpsllw %xmm1, %ymm3, %ymm3
424; AVX512VL-NEXT:    vpsrlw $8, %ymm3, %ymm3
425; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
426; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
427; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
428; AVX512VL-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
429; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
430; AVX512VL-NEXT:    retq
431;
432; AVX512BW-LABEL: splatvar_rotate_v64i8:
433; AVX512BW:       # %bb.0:
434; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
435; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
436; AVX512BW-NEXT:    vpsllw %xmm1, %zmm2, %zmm2
437; AVX512BW-NEXT:    vpsrlw $8, %zmm2, %zmm2
438; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
439; AVX512BW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
440; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
441; AVX512BW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
442; AVX512BW-NEXT:    retq
443;
444; AVX512VLBW-LABEL: splatvar_rotate_v64i8:
445; AVX512VLBW:       # %bb.0:
446; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
447; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
448; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm2, %zmm2
449; AVX512VLBW-NEXT:    vpsrlw $8, %zmm2, %zmm2
450; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
451; AVX512VLBW-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
452; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
453; AVX512VLBW-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
454; AVX512VLBW-NEXT:    retq
455;
456; AVX512VBMI2-LABEL: splatvar_rotate_v64i8:
457; AVX512VBMI2:       # %bb.0:
458; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
459; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
460; AVX512VBMI2-NEXT:    vpsllw %xmm1, %zmm2, %zmm2
461; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm2, %zmm2
462; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
463; AVX512VBMI2-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
464; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
465; AVX512VBMI2-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
466; AVX512VBMI2-NEXT:    retq
467;
468; AVX512VLVBMI2-LABEL: splatvar_rotate_v64i8:
469; AVX512VLVBMI2:       # %bb.0:
470; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm2 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
471; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
472; AVX512VLVBMI2-NEXT:    vpsllw %xmm1, %zmm2, %zmm2
473; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm2, %zmm2
474; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
475; AVX512VLVBMI2-NEXT:    vpsllw %xmm1, %zmm0, %zmm0
476; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
477; AVX512VLVBMI2-NEXT:    vpackuswb %zmm2, %zmm0, %zmm0
478; AVX512VLVBMI2-NEXT:    retq
479  %splat = shufflevector <64 x i8> %b, <64 x i8> undef, <64 x i32> zeroinitializer
480  %splat8 = sub <64 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
481  %shl = shl <64 x i8> %a, %splat
482  %lshr = lshr <64 x i8> %a, %splat8
483  %or = or <64 x i8> %shl, %lshr
484  ret <64 x i8> %or
485}
486
487;
488; Constant Rotates
489;
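; Rotates by per-element constant amounts. The i64/i32 cases are expected to
; fold to vprolvq/vprolvd with a constant-pool operand; the i16/i8 cases are
; expanded through multiplies (vpmulhuw/vpmullw) or variable word shifts
; (vpsllvw/vpsrlvw), depending on the subtarget.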
490
491define <8 x i64> @constant_rotate_v8i64(<8 x i64> %a) nounwind {
492; AVX512-LABEL: constant_rotate_v8i64:
493; AVX512:       # %bb.0:
494; AVX512-NEXT:    vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
495; AVX512-NEXT:    retq
496  %shl = shl <8 x i64> %a, <i64 4, i64 14, i64 50, i64 60, i64 4, i64 14, i64 50, i64 60>
497  %lshr = lshr <8 x i64> %a, <i64 60, i64 50, i64 14, i64 4, i64 60, i64 50, i64 14, i64 4>
498  %or = or <8 x i64> %shl, %lshr
499  ret <8 x i64> %or
500}
501
502define <16 x i32> @constant_rotate_v16i32(<16 x i32> %a) nounwind {
503; AVX512-LABEL: constant_rotate_v16i32:
504; AVX512:       # %bb.0:
505; AVX512-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
506; AVX512-NEXT:    retq
507  %shl = shl <16 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
508  %lshr = lshr <16 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
509  %or = or <16 x i32> %shl, %lshr
510  ret <16 x i32> %or
511}
512
513define <32 x i16> @constant_rotate_v32i16(<32 x i16> %a) nounwind {
514; AVX512F-LABEL: constant_rotate_v32i16:
515; AVX512F:       # %bb.0:
516; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
517; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
518; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
519; AVX512F-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm4
520; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
521; AVX512F-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
522; AVX512F-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
523; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
524; AVX512F-NEXT:    vporq %zmm3, %zmm0, %zmm0
525; AVX512F-NEXT:    retq
526;
527; AVX512VL-LABEL: constant_rotate_v32i16:
528; AVX512VL:       # %bb.0:
529; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
530; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
531; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm1, %ymm3
532; AVX512VL-NEXT:    vpmulhuw %ymm2, %ymm0, %ymm4
533; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm4, %zmm3
534; AVX512VL-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
535; AVX512VL-NEXT:    vpmullw %ymm2, %ymm0, %ymm0
536; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
537; AVX512VL-NEXT:    vporq %zmm3, %zmm0, %zmm0
538; AVX512VL-NEXT:    retq
539;
540; AVX512BW-LABEL: constant_rotate_v32i16:
541; AVX512BW:       # %bb.0:
542; AVX512BW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
543; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
544; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
545; AVX512BW-NEXT:    retq
546;
547; AVX512VLBW-LABEL: constant_rotate_v32i16:
548; AVX512VLBW:       # %bb.0:
549; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1
550; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
551; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
552; AVX512VLBW-NEXT:    retq
553;
554; AVX512VBMI2-LABEL: constant_rotate_v32i16:
555; AVX512VBMI2:       # %bb.0:
556; AVX512VBMI2-NEXT:    vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
557; AVX512VBMI2-NEXT:    retq
558;
559; AVX512VLVBMI2-LABEL: constant_rotate_v32i16:
560; AVX512VLVBMI2:       # %bb.0:
561; AVX512VLVBMI2-NEXT:    vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
562; AVX512VLVBMI2-NEXT:    retq
563  %shl = shl <32 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15, i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
564  %lshr = lshr <32 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1, i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
565  %or = or <32 x i16> %shl, %lshr
566  ret <32 x i16> %or
567}
568
569define <64 x i8> @constant_rotate_v64i8(<64 x i8> %a) nounwind {
570; AVX512F-LABEL: constant_rotate_v64i8:
571; AVX512F:       # %bb.0:
572; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
573; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
574; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
575; AVX512F-NEXT:    # ymm3 = mem[0,1,0,1]
576; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
577; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
578; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
579; AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
580; AVX512F-NEXT:    # ymm4 = mem[0,1,0,1]
581; AVX512F-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
582; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
583; AVX512F-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
584; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
585; AVX512F-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
586; AVX512F-NEXT:    vpsrlw $8, %ymm2, %ymm2
587; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
588; AVX512F-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
589; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
590; AVX512F-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
591; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
592; AVX512F-NEXT:    retq
593;
594; AVX512VL-LABEL: constant_rotate_v64i8:
595; AVX512VL:       # %bb.0:
596; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
597; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
598; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm3 = [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
599; AVX512VL-NEXT:    # ymm3 = mem[0,1,0,1]
600; AVX512VL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
601; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
602; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
603; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm4 = [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
604; AVX512VL-NEXT:    # ymm4 = mem[0,1,0,1]
605; AVX512VL-NEXT:    vpmullw %ymm4, %ymm1, %ymm1
606; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
607; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm1, %ymm1
608; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
609; AVX512VL-NEXT:    vpmullw %ymm3, %ymm2, %ymm2
610; AVX512VL-NEXT:    vpsrlw $8, %ymm2, %ymm2
611; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
612; AVX512VL-NEXT:    vpmullw %ymm4, %ymm0, %ymm0
613; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
614; AVX512VL-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
615; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
616; AVX512VL-NEXT:    retq
617;
618; AVX512BW-LABEL: constant_rotate_v64i8:
619; AVX512BW:       # %bb.0:
620; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
621; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
622; AVX512BW-NEXT:    vpsrlw $8, %zmm1, %zmm1
623; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
624; AVX512BW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
625; AVX512BW-NEXT:    vpsrlw $8, %zmm0, %zmm0
626; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
627; AVX512BW-NEXT:    retq
628;
629; AVX512VLBW-LABEL: constant_rotate_v64i8:
630; AVX512VLBW:       # %bb.0:
631; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
632; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
633; AVX512VLBW-NEXT:    vpsrlw $8, %zmm1, %zmm1
634; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
635; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
636; AVX512VLBW-NEXT:    vpsrlw $8, %zmm0, %zmm0
637; AVX512VLBW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
638; AVX512VLBW-NEXT:    retq
639;
640; AVX512VBMI2-LABEL: constant_rotate_v64i8:
641; AVX512VBMI2:       # %bb.0:
642; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
643; AVX512VBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
644; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm1, %zmm1
645; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
646; AVX512VBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
647; AVX512VBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
648; AVX512VBMI2-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
649; AVX512VBMI2-NEXT:    retq
650;
651; AVX512VLVBMI2-LABEL: constant_rotate_v64i8:
652; AVX512VLVBMI2:       # %bb.0:
653; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} zmm1 = zmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31,40,40,41,41,42,42,43,43,44,44,45,45,46,46,47,47,56,56,57,57,58,58,59,59,60,60,61,61,62,62,63,63]
654; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
655; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm1, %zmm1
656; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} zmm0 = zmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23,32,32,33,33,34,34,35,35,36,36,37,37,38,38,39,39,48,48,49,49,50,50,51,51,52,52,53,53,54,54,55,55]
657; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
658; AVX512VLVBMI2-NEXT:    vpsrlw $8, %zmm0, %zmm0
659; AVX512VLVBMI2-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
660; AVX512VLVBMI2-NEXT:    retq
661  %shl = shl <64 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
662  %lshr = lshr <64 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
663  %or = or <64 x i8> %shl, %lshr
664  ret <64 x i8> %or
665}
666
667;
668; Uniform Constant Rotates
669;
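; Rotates where every lane uses the same constant amount. The i64/i32 cases are
; expected to select the immediate forms vprolq/vprold; the i16/i8 cases lower
; to a shift pair combined with vpor/vpternlog (or vpshldw with VBMI2 for i16).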
670
671define <8 x i64> @splatconstant_rotate_v8i64(<8 x i64> %a) nounwind {
672; AVX512-LABEL: splatconstant_rotate_v8i64:
673; AVX512:       # %bb.0:
674; AVX512-NEXT:    vprolq $14, %zmm0, %zmm0
675; AVX512-NEXT:    retq
676  %shl = shl <8 x i64> %a, <i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14, i64 14>
677  %lshr = lshr <8 x i64> %a, <i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50, i64 50>
678  %or = or <8 x i64> %shl, %lshr
679  ret <8 x i64> %or
680}
681
682define <16 x i32> @splatconstant_rotate_v16i32(<16 x i32> %a) nounwind {
683; AVX512-LABEL: splatconstant_rotate_v16i32:
684; AVX512:       # %bb.0:
685; AVX512-NEXT:    vprold $4, %zmm0, %zmm0
686; AVX512-NEXT:    retq
687  %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
688  %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
689  %or = or <16 x i32> %shl, %lshr
690  ret <16 x i32> %or
691}
692
693define <32 x i16> @splatconstant_rotate_v32i16(<32 x i16> %a) nounwind {
694; AVX512F-LABEL: splatconstant_rotate_v32i16:
695; AVX512F:       # %bb.0:
696; AVX512F-NEXT:    vpsrlw $9, %ymm0, %ymm1
697; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
698; AVX512F-NEXT:    vpsrlw $9, %ymm2, %ymm3
699; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
700; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
701; AVX512F-NEXT:    vpsllw $7, %ymm2, %ymm2
702; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
703; AVX512F-NEXT:    vporq %zmm1, %zmm0, %zmm0
704; AVX512F-NEXT:    retq
705;
706; AVX512VL-LABEL: splatconstant_rotate_v32i16:
707; AVX512VL:       # %bb.0:
708; AVX512VL-NEXT:    vpsrlw $9, %ymm0, %ymm1
709; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
710; AVX512VL-NEXT:    vpsrlw $9, %ymm2, %ymm3
711; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
712; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0
713; AVX512VL-NEXT:    vpsllw $7, %ymm2, %ymm2
714; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
715; AVX512VL-NEXT:    vporq %zmm1, %zmm0, %zmm0
716; AVX512VL-NEXT:    retq
717;
718; AVX512BW-LABEL: splatconstant_rotate_v32i16:
719; AVX512BW:       # %bb.0:
720; AVX512BW-NEXT:    vpsrlw $9, %zmm0, %zmm1
721; AVX512BW-NEXT:    vpsllw $7, %zmm0, %zmm0
722; AVX512BW-NEXT:    vporq %zmm1, %zmm0, %zmm0
723; AVX512BW-NEXT:    retq
724;
725; AVX512VLBW-LABEL: splatconstant_rotate_v32i16:
726; AVX512VLBW:       # %bb.0:
727; AVX512VLBW-NEXT:    vpsrlw $9, %zmm0, %zmm1
728; AVX512VLBW-NEXT:    vpsllw $7, %zmm0, %zmm0
729; AVX512VLBW-NEXT:    vporq %zmm1, %zmm0, %zmm0
730; AVX512VLBW-NEXT:    retq
731;
732; AVX512VBMI2-LABEL: splatconstant_rotate_v32i16:
733; AVX512VBMI2:       # %bb.0:
734; AVX512VBMI2-NEXT:    vpshldw $7, %zmm0, %zmm0, %zmm0
735; AVX512VBMI2-NEXT:    retq
736;
737; AVX512VLVBMI2-LABEL: splatconstant_rotate_v32i16:
738; AVX512VLVBMI2:       # %bb.0:
739; AVX512VLVBMI2-NEXT:    vpshldw $7, %zmm0, %zmm0, %zmm0
740; AVX512VLVBMI2-NEXT:    retq
741  %shl = shl <32 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
742  %lshr = lshr <32 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
743  %or = or <32 x i16> %shl, %lshr
744  ret <32 x i16> %or
745}
746
747define <64 x i8> @splatconstant_rotate_v64i8(<64 x i8> %a) nounwind {
748; AVX512F-LABEL: splatconstant_rotate_v64i8:
749; AVX512F:       # %bb.0:
750; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
751; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
752; AVX512F-NEXT:    vpsllw $4, %ymm2, %ymm3
753; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
754; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
755; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
756; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
757; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
758; AVX512F-NEXT:    retq
759;
760; AVX512VL-LABEL: splatconstant_rotate_v64i8:
761; AVX512VL:       # %bb.0:
762; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
763; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
764; AVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm3
765; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
766; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
767; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
768; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
769; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
770; AVX512VL-NEXT:    retq
771;
772; AVX512BW-LABEL: splatconstant_rotate_v64i8:
773; AVX512BW:       # %bb.0:
774; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
775; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
776; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
777; AVX512BW-NEXT:    retq
778;
779; AVX512VLBW-LABEL: splatconstant_rotate_v64i8:
780; AVX512VLBW:       # %bb.0:
781; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
782; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
783; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
784; AVX512VLBW-NEXT:    retq
785;
786; AVX512VBMI2-LABEL: splatconstant_rotate_v64i8:
787; AVX512VBMI2:       # %bb.0:
788; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
789; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
790; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
791; AVX512VBMI2-NEXT:    retq
792;
793; AVX512VLVBMI2-LABEL: splatconstant_rotate_v64i8:
794; AVX512VLVBMI2:       # %bb.0:
795; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
796; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
797; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
798; AVX512VLVBMI2-NEXT:    retq
799  %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
800  %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
801  %or = or <64 x i8> %shl, %lshr
802  ret <64 x i8> %or
803}
804
805;
806; Masked Uniform Constant Rotates
807;
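; As above, but both halves of the rotate are additionally masked with AND
; constants; the two masks are expected to merge into at most one AND (or fold
; into a vpternlog) after the rotate/shift lowering.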
808
809define <8 x i64> @splatconstant_rotate_mask_v8i64(<8 x i64> %a) nounwind {
810; AVX512-LABEL: splatconstant_rotate_mask_v8i64:
811; AVX512:       # %bb.0:
812; AVX512-NEXT:    vpsrlq $49, %zmm0, %zmm0
813; AVX512-NEXT:    vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
814; AVX512-NEXT:    retq
815  %shl = shl <8 x i64> %a, <i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15, i64 15>
816  %lshr = lshr <8 x i64> %a, <i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49, i64 49>
817  %rmask = and <8 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255, i64 255, i64 127, i64 127, i64 255>
818  %lmask = and <8 x i64> %shl, <i64 33, i64 65, i64 129, i64 257, i64 33, i64 65, i64 129, i64 257>
819  %or = or <8 x i64> %lmask, %rmask
820  ret <8 x i64> %or
821}
822
823define <16 x i32> @splatconstant_rotate_mask_v16i32(<16 x i32> %a) nounwind {
824; AVX512-LABEL: splatconstant_rotate_mask_v16i32:
825; AVX512:       # %bb.0:
826; AVX512-NEXT:    vprold $4, %zmm0, %zmm0
827; AVX512-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
828; AVX512-NEXT:    retq
829  %shl = shl <16 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
830  %lshr = lshr <16 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
831  %rmask = and <16 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511, i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
832  %lmask = and <16 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3, i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
833  %or = or <16 x i32> %lmask, %rmask
834  ret <16 x i32> %or
835}
836
837define <32 x i16> @splatconstant_rotate_mask_v32i16(<32 x i16> %a) nounwind {
838; AVX512F-LABEL: splatconstant_rotate_mask_v32i16:
839; AVX512F:       # %bb.0:
840; AVX512F-NEXT:    vpsllw $5, %ymm0, %ymm1
841; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
842; AVX512F-NEXT:    vpsllw $5, %ymm2, %ymm3
843; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
844; AVX512F-NEXT:    vpsrlw $11, %ymm0, %ymm0
845; AVX512F-NEXT:    vpsrlw $11, %ymm2, %ymm2
846; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
847; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
848; AVX512F-NEXT:    retq
849;
850; AVX512VL-LABEL: splatconstant_rotate_mask_v32i16:
851; AVX512VL:       # %bb.0:
852; AVX512VL-NEXT:    vpsllw $5, %ymm0, %ymm1
853; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
854; AVX512VL-NEXT:    vpsllw $5, %ymm2, %ymm3
855; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
856; AVX512VL-NEXT:    vpsrlw $11, %ymm0, %ymm0
857; AVX512VL-NEXT:    vpsrlw $11, %ymm2, %ymm2
858; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
859; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
860; AVX512VL-NEXT:    retq
861;
862; AVX512BW-LABEL: splatconstant_rotate_mask_v32i16:
863; AVX512BW:       # %bb.0:
864; AVX512BW-NEXT:    vpsllw $5, %zmm0, %zmm1
865; AVX512BW-NEXT:    vpsrlw $11, %zmm0, %zmm0
866; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
867; AVX512BW-NEXT:    retq
868;
869; AVX512VLBW-LABEL: splatconstant_rotate_mask_v32i16:
870; AVX512VLBW:       # %bb.0:
871; AVX512VLBW-NEXT:    vpsllw $5, %zmm0, %zmm1
872; AVX512VLBW-NEXT:    vpsrlw $11, %zmm0, %zmm0
873; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = mem & (zmm0 | zmm1)
874; AVX512VLBW-NEXT:    retq
875;
876; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v32i16:
877; AVX512VBMI2:       # %bb.0:
878; AVX512VBMI2-NEXT:    vpshldw $5, %zmm0, %zmm0, %zmm0
879; AVX512VBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
880; AVX512VBMI2-NEXT:    retq
881;
882; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v32i16:
883; AVX512VLVBMI2:       # %bb.0:
884; AVX512VLVBMI2-NEXT:    vpshldw $5, %zmm0, %zmm0, %zmm0
885; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
886; AVX512VLVBMI2-NEXT:    retq
887  %shl = shl <32 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
888  %lshr = lshr <32 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
889  %rmask = and <32 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
890  %lmask = and <32 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
891  %or = or <32 x i16> %lmask, %rmask
892  ret <32 x i16> %or
893}
894
895define <64 x i8> @splatconstant_rotate_mask_v64i8(<64 x i8> %a) nounwind {
896; AVX512F-LABEL: splatconstant_rotate_mask_v64i8:
897; AVX512F:       # %bb.0:
898; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm1
899; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
900; AVX512F-NEXT:    vpsllw $4, %ymm2, %ymm3
901; AVX512F-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
902; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm0
903; AVX512F-NEXT:    vpsrlw $4, %ymm2, %ymm2
904; AVX512F-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
905; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
906; AVX512F-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
907; AVX512F-NEXT:    retq
908;
909; AVX512VL-LABEL: splatconstant_rotate_mask_v64i8:
910; AVX512VL:       # %bb.0:
911; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm1
912; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm2
913; AVX512VL-NEXT:    vpsllw $4, %ymm2, %ymm3
914; AVX512VL-NEXT:    vinserti64x4 $1, %ymm3, %zmm1, %zmm1
915; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm0
916; AVX512VL-NEXT:    vpsrlw $4, %ymm2, %ymm2
917; AVX512VL-NEXT:    vinserti64x4 $1, %ymm2, %zmm0, %zmm0
918; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
919; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
920; AVX512VL-NEXT:    retq
921;
922; AVX512BW-LABEL: splatconstant_rotate_mask_v64i8:
923; AVX512BW:       # %bb.0:
924; AVX512BW-NEXT:    vpsllw $4, %zmm0, %zmm1
925; AVX512BW-NEXT:    vpsrlw $4, %zmm0, %zmm0
926; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
927; AVX512BW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
928; AVX512BW-NEXT:    retq
929;
930; AVX512VLBW-LABEL: splatconstant_rotate_mask_v64i8:
931; AVX512VLBW:       # %bb.0:
932; AVX512VLBW-NEXT:    vpsllw $4, %zmm0, %zmm1
933; AVX512VLBW-NEXT:    vpsrlw $4, %zmm0, %zmm0
934; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
935; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
936; AVX512VLBW-NEXT:    retq
937;
938; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v64i8:
939; AVX512VBMI2:       # %bb.0:
940; AVX512VBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
941; AVX512VBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
942; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
943; AVX512VBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
944; AVX512VBMI2-NEXT:    retq
945;
946; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v64i8:
947; AVX512VLVBMI2:       # %bb.0:
948; AVX512VLVBMI2-NEXT:    vpsllw $4, %zmm0, %zmm1
949; AVX512VLVBMI2-NEXT:    vpsrlw $4, %zmm0, %zmm0
950; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
951; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm0, %zmm0
952; AVX512VLVBMI2-NEXT:    retq
953  %shl = shl <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
954  %lshr = lshr <64 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
955  %rmask = and <64 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
956  %lmask = and <64 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
957  %or = or <64 x i8> %lmask, %rmask
958  ret <64 x i8> %or
959}
960