; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=XOPAVX2

;
; Variable Rotates
;

define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: var_rotate_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT:    vpsubq %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm5
; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT:    vpsllq %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm4 = xmm6[0,1,2,3],xmm4[4,5,6,7]
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlq %xmm2, %xmm5, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm2, %xmm5, %xmm2
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm4
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm4[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} ymm2 = [64,64,64,64]
; AVX2-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsllvq %ymm1, %ymm0, %ymm1
; AVX2-NEXT:    vpsrlvq %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: var_rotate_v4i64:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: var_rotate_v4i64:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotq %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %b
  %shl = shl <4 x i64> %a, %b
  %lshr = lshr <4 x i64> %a, %b64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: var_rotate_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm2[1,1,3,3]
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
; AVX1-NEXT:    vpshufd {{.*#+}} xmm7 = xmm6[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm5, %xmm7, %xmm5
; AVX1-NEXT:    vpmuludq %xmm2, %xmm6, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm6 = xmm2[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm5 = xmm5[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2,3],xmm2[4,5],xmm5[6,7]
; AVX1-NEXT:    vpor %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7]
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm2 = [31,31,31,31,31,31,31,31]
; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpbroadcastd {{.*#+}} ymm3 = [32,32,32,32,32,32,32,32]
; AVX2-NEXT:    vpsubd %ymm1, %ymm3, %ymm1
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: var_rotate_v8i32:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: var_rotate_v8i32:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotd %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %b
  %shl = shl <8 x i32> %a, %b
  %lshr = lshr <8 x i32> %a, %b32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: var_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm4, %xmm6
; AVX1-NEXT:    vpmullw %xmm2, %xmm4, %xmm2
; AVX1-NEXT:    vpor %xmm6, %xmm2, %xmm2
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
; AVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
; AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
; AVX1-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
; AVX1-NEXT:    vpackusdw %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX2-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX2-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; AVX2-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX2-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX512F-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512F-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX512F-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512F-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15]
; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} ymm4 = ymm0[4,4,5,5,6,6,7,7,12,12,13,13,14,14,15,15]
; AVX512VL-NEXT:    vpsllvd %ymm3, %ymm4, %ymm3
; AVX512VL-NEXT:    vpsrld $16, %ymm3, %ymm3
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11]
; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
; AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpackusdw %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512BW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2
; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} ymm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
; AVX512VLBW-NEXT:    vpsubw %ymm1, %ymm3, %ymm1
; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_rotate_v16i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v16i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
  %shl = shl <16 x i16> %a, %b
  %lshr = lshr <16 x i16> %a, %b16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: var_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm2, %xmm5
; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm5
; AVX1-NEXT:    vpor %xmm3, %xmm5, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
; AVX1-NEXT:    vpsllw $5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $6, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252]
; AVX1-NEXT:    vpandn %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsllw $2, %xmm2, %xmm7
; AVX1-NEXT:    vpand %xmm6, %xmm7, %xmm7
; AVX1-NEXT:    vpor %xmm3, %xmm7, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $7, %xmm2, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm7 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm8
; AVX1-NEXT:    vpor %xmm3, %xmm8, %xmm3
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm3
; AVX1-NEXT:    vpandn %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm5
; AVX1-NEXT:    vpand %xmm4, %xmm5, %xmm4
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $6, %xmm0, %xmm3
; AVX1-NEXT:    vpandn %xmm3, %xmm6, %xmm3
; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm4
; AVX1-NEXT:    vpand %xmm6, %xmm4, %xmm4
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $7, %xmm0, %xmm3
; AVX1-NEXT:    vpand %xmm7, %xmm3, %xmm3
; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm4
; AVX1-NEXT:    vpor %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
; AVX1-NEXT:    vpblendvb %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $6, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpsllw $2, %ymm0, %ymm3
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $7, %ymm0, %ymm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX2-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
; AVX2-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX2-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_rotate_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX512F-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
; AVX512F-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpsllw $2, %ymm0, %ymm2
; AVX512F-NEXT:    vpsrlw $6, %ymm0, %ymm3
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm3 = zmm3 ^ (mem & (zmm3 ^ zmm2))
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    vpsrlw $7, %ymm0, %ymm2
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
; AVX512F-NEXT:    vpor %ymm2, %ymm3, %ymm2
; AVX512F-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512F-NEXT:    vpblendvb %ymm1, %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_rotate_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $4, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $4, %ymm0, %ymm3
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512VL-NEXT:    vpsllw $5, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsllw $2, %ymm0, %ymm2
; AVX512VL-NEXT:    vpsrlw $6, %ymm0, %ymm3
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    vpsrlw $7, %ymm0, %ymm2
; AVX512VL-NEXT:    vpaddb %ymm0, %ymm0, %ymm3
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem)
; AVX512VL-NEXT:    vpaddb %ymm1, %ymm1, %ymm1
; AVX512VL-NEXT:    vpblendvb %ymm1, %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_rotate_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512BW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX512BW-NEXT:    vpsllvw %zmm4, %zmm2, %zmm2
; AVX512BW-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: var_rotate_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLBW-NEXT:    vpsllvw %ymm3, %ymm4, %ymm3
; AVX512VLBW-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_rotate_v32i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VBMI2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11],ymm1[12],ymm3[12],ymm1[13],ymm3[13],ymm1[14],ymm3[14],ymm1[15],ymm3[15],ymm1[24],ymm3[24],ymm1[25],ymm3[25],ymm1[26],ymm3[26],ymm1[27],ymm3[27],ymm1[28],ymm3[28],ymm1[29],ymm3[29],ymm1[30],ymm3[30],ymm1[31],ymm3[31]
; AVX512VBMI2-NEXT:    vpsllvw %zmm4, %zmm2, %zmm2
; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23]
; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VBMI2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_rotate_v32i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm3 = ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15],ymm1[24],ymm2[24],ymm1[25],ymm2[25],ymm1[26],ymm2[26],ymm1[27],ymm2[27],ymm1[28],ymm2[28],ymm1[29],ymm2[29],ymm1[30],ymm2[30],ymm1[31],ymm2[31]
; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm4 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLVBMI2-NEXT:    vpsllvw %ymm3, %ymm4, %ymm3
; AVX512VLVBMI2-NEXT:    vpsrlw $8, %ymm3, %ymm3
; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[16],ymm2[16],ymm1[17],ymm2[17],ymm1[18],ymm2[18],ymm1[19],ymm2[19],ymm1[20],ymm2[20],ymm1[21],ymm2[21],ymm1[22],ymm2[22],ymm1[23],ymm2[23]
; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLVBMI2-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    vpackuswb %ymm3, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: var_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; XOPAVX1-NEXT:    vprotb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_rotate_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm3
; XOPAVX2-NEXT:    vprotb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %b8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
  %shl = shl <32 x i8> %a, %b
  %lshr = lshr <32 x i8> %a, %b8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Uniform Variable Rotates
;
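; Note (annotation, not autogenerated): here the rotate amount is a splat of
; element 0 of %b, so the lowering can use a single scalar count (e.g.
; vpsllq/vpsrlq with an xmm count) or broadcast it for vprolv*, instead of
; per-element variable shifts.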

define <4 x i64> @splatvar_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [64,64]
; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT:    vpsllq %xmm1, %xmm3, %xmm4
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm1, %ymm1
; AVX1-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [64,64]
; AVX2-NEXT:    vpsubq %xmm1, %xmm3, %xmm1
; AVX2-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm0, %ymm2, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: splatvar_rotate_v4i64:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512NOVLX-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: splatvar_rotate_v4i64:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vpbroadcastq %xmm1, %ymm1
; AVX512VLX-NEXT:    vprolvq %ymm1, %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
; XOPAVX1-NEXT:    vprotq %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotq %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer
  %splat64 = sub <4 x i64> <i64 64, i64 64, i64 64, i64 64>, %splat
  %shl = shl <4 x i64> %a, %splat
  %lshr = lshr <4 x i64> %a, %splat64
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
; AVX1-NEXT:    vpsllq %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm0[2,2,3,3]
; AVX1-NEXT:    vpsllq %xmm1, %xmm4, %xmm4
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,1,1]
; AVX1-NEXT:    vpsllq %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7]
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpshufd {{.*#+}} ymm2 = ymm0[2,2,3,3,6,6,7,7]
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpsllq %xmm1, %ymm2, %ymm2
; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,0,1,1,4,4,5,5]
; AVX2-NEXT:    vpsllq %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[1,3],ymm2[1,3],ymm0[5,7],ymm2[5,7]
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: splatvar_rotate_v8i32:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX512NOVLX-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: splatvar_rotate_v8i32:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vpbroadcastd %xmm1, %ymm1
; AVX512VLX-NEXT:    vprolvd %ymm1, %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT:    vprotd %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotd %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer
  %splat32 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %splat
  %shl = shl <8 x i32> %a, %splat
  %lshr = lshr <8 x i32> %a, %splat32
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [15,0]
; AVX1-NEXT:    vpandn %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT:    vpsrlw $1, %xmm4, %xmm5
; AVX1-NEXT:    vpsrlw %xmm3, %xmm5, %xmm5
; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsllw %xmm1, %xmm4, %xmm2
; AVX1-NEXT:    vpor %xmm5, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $1, %xmm0, %xmm4
; AVX1-NEXT:    vpsrlw %xmm3, %xmm4, %xmm3
; AVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX2-NEXT:    vpandn %xmm2, %xmm1, %xmm3
; AVX2-NEXT:    vpsrlw $1, %ymm0, %ymm4
; AVX2-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm3, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatvar_rotate_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512F-NEXT:    vpandn %xmm2, %xmm1, %xmm3
; AVX512F-NEXT:    vpsrlw $1, %ymm0, %ymm4
; AVX512F-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512F-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm3, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatvar_rotate_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT:    vpsrlw $1, %ymm0, %ymm4
; AVX512VL-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm3, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatvar_rotate_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512BW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
; AVX512BW-NEXT:    vpsrlw $1, %ymm0, %ymm4
; AVX512BW-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm3, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatvar_rotate_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [15,0,0,0,15,0,0,0]
; AVX512VLBW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
; AVX512VLBW-NEXT:    vpsrlw $1, %ymm0, %ymm4
; AVX512VLBW-NEXT:    vpsrlw %xmm3, %ymm4, %ymm3
; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm3, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatvar_rotate_v16i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT:    vpbroadcastw %xmm1, %ymm1
; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatvar_rotate_v16i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm1, %ymm1
; AVX512VLVBMI2-NEXT:    vpshldvw %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
; XOPAVX1-NEXT:    vprotw %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotw %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer
  %splat16 = sub <16 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
  %shl = shl <16 x i16> %a, %splat
  %lshr = lshr <16 x i16> %a, %splat16
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @splatvar_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
; AVX1-LABEL: splatvar_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsllw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsllw %xmm1, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpsllw %xmm1, %xmm3, %xmm3
; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatvar_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX2-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
; AVX2-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: splatvar_rotate_v32i8:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX512-NEXT:    vpsllw %xmm1, %ymm2, %ymm2
; AVX512-NEXT:    vpsrlw $8, %ymm2, %ymm2
; AVX512-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512-NEXT:    vpsllw %xmm1, %ymm0, %ymm0
; AVX512-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512-NEXT:    vpackuswb %ymm2, %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: splatvar_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; XOPAVX1-NEXT:    vprotb %xmm1, %xmm2, %xmm2
; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatvar_rotate_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
; XOPAVX2-NEXT:    vprotb %xmm1, %xmm2, %xmm2
; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer
  %splat8 = sub <32 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
  %shl = shl <32 x i8> %a, %splat
  %lshr = lshr <32 x i8> %a, %splat8
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}

;
; Constant Rotates
;
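; Note (annotation, not autogenerated): with compile-time rotate amounts the
; shifts fold to immediates or to multiplies by power-of-two constants from
; the constant pool (vpmullw/vpmulhuw), and VL targets can feed vprolv*/vprot*
; directly from a constant-pool memory operand.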

define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: constant_rotate_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm2
; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm3
; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm3
; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm4
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT:    vpsllq $60, %xmm1, %xmm3
; AVX1-NEXT:    vpsllq $50, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm3
; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm3[4,5,6,7]
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: constant_rotate_v4i64:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT:    vpmovsxbq {{.*#+}} ymm1 = [4,14,50,60]
; AVX512NOVLX-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: constant_rotate_v4i64:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 4, i64 14, i64 50, i64 60>
  %lshr = lshr <4 x i64> %a, <i64 60, i64 50, i64 14, i64 4>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}

define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: constant_rotate_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[1,1,3,3]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,3],xmm3[4,5],xmm2[6,7]
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: constant_rotate_v8i32:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT:    vpmovsxbd {{.*#+}} ymm1 = [4,5,6,7,8,9,10,11]
; AVX512NOVLX-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: constant_rotate_v8i32:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}

define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: constant_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [256,512,1024,2048,4096,8192,16384,32768]
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm1, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT:    vpmulhuw %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX2-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX2-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512F-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768]
; AVX512VL-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2
; AVX512VL-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm2
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %ymm2, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: constant_rotate_v16i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT:    vpmovsxbw {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v16i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
  %lshr = lshr <16 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9, i16 8, i16 7, i16 6, i16 5, i16 4, i16 3, i16 2, i16 1>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}

define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: constant_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = [1,128,64,32,16,8,4,2]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128]
; AVX1-NEXT:    vpmullw %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpmullw %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: constant_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: constant_rotate_v32i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
; AVX512F-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512F-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX512F-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512F-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: constant_rotate_v32i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [1,128,64,32,16,8,4,2,1,128,64,32,16,8,4,2]
; AVX512VL-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1,2,4,8,16,32,64,128,1,2,4,8,16,32,64,128]
; AVX512VL-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VL-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: constant_rotate_v32i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
; AVX512BW-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
; AVX512BW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
; AVX512BW-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: constant_rotate_v32i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VLBW-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: constant_rotate_v32i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = [0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0,3,0,2,0,1,0]
; AVX512VBMI2-NEXT:    # ymm1 = mem[0,1,0,1]
; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm2 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VBMI2-NEXT:    vbroadcasti128 {{.*#+}} ymm2 = [0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0,0,0,1,0,2,0,3,0,4,0,5,0,6,0,7,0]
; AVX512VBMI2-NEXT:    # ymm2 = mem[0,1,0,1]
; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VBMI2-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VBMI2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: constant_rotate_v32i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} ymm1 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31]
; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX512VLVBMI2-NEXT:    vpsrlw $8, %ymm1, %ymm1
; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23]
; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: constant_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
; XOPAVX1-NEXT:    vprotb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vprotb %xmm2, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: constant_rotate_v32i8:
; XOPAVX2:       # %bb.0:
1171; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
1172; XOPAVX2-NEXT:    vmovdqa {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
1173; XOPAVX2-NEXT:    vprotb %xmm2, %xmm1, %xmm1
1174; XOPAVX2-NEXT:    vprotb %xmm2, %xmm0, %xmm0
1175; XOPAVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
1176; XOPAVX2-NEXT:    retq
1177  %shl = shl <32 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1178  %lshr = lshr <32 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1179  %or = or <32 x i8> %shl, %lshr
1180  ret <32 x i8> %or
1181}
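; NB: outside of XOP there is no per-byte rotate instruction, so the constant
; rotate above is lowered by duplicating each byte into both halves of a
; 16-bit lane (vpunpck{l,h}bw with itself) and scaling by 2^amt (vpmullw, or
; vpsllvw where avx512bw is available) so the rotated byte lands in the high
; half, then repacking with vpsrlw $8 + vpackuswb. XOP lowers straight to
; vprotb; the rotate-by-8 lanes print as 0 there, since rotate amounts are
; taken modulo the element width.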

;
; Uniform Constant Rotates
;

define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT:    vpsrlq $50, %xmm2, %xmm3
; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm0
; AVX1-NEXT:    vpsllq $14, %xmm2, %xmm2
; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $50, %ymm0, %ymm1
; AVX2-NEXT:    vpsllq $14, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_v4i64:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT:    vprolq $14, %zmm0, %zmm0
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: splatconstant_rotate_v4i64:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vprolq $14, %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotq $14, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 14, i64 14, i64 14, i64 14>
  %lshr = lshr <4 x i64> %a, <i64 50, i64 50, i64 50, i64 50>
  %or = or <4 x i64> %shl, %lshr
  ret <4 x i64> %or
}
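; NB: a splat (uniform) rotate amount maps directly onto the AVX512 rotate
; instructions: vprolq on ymm with VL, widened to zmm without it. AVX2 falls
; back to shift/shift/or, AVX1 additionally splits into 128-bit halves, and
; XOP uses vprotq on each half.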

define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_v8i32:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: splatconstant_rotate_v8i32:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %or = or <8 x i32> %shl, %lshr
  ret <8 x i32> %or
}
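; NB: same shape as the v4i64 case, at dword granularity: vprold with AVX512,
; vprotd with XOP.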

define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $9, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $7, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $9, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $7, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $9, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $9, %ymm0, %ymm1
; AVX512F-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $9, %ymm0, %ymm1
; AVX512VL-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $9, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlw $9, %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpsllw $7, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_v16i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshldw $7, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_v16i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldw $7, %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw $7, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
  %lshr = lshr <16 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
  %or = or <16 x i16> %shl, %lshr
  ret <16 x i16> %or
}
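; NB: AVX-512 has no vprolw, so even AVX512BW keeps the shift/shift/or
; expansion for word rotates; VBMI2 instead expresses the rotate as a funnel
; shift with both operands equal (vpshldw $7, x, x == rotl(x, 7)).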

define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_v32i8:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: splatconstant_rotate_v32i8:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512VLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %or = or <32 x i8> %shl, %lshr
  ret <32 x i8> %or
}
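; NB: x86 has no byte-granular vector shifts either, so the byte rotate is
; built from word shifts corrected with a 0xF0/0x0F nibble mask; AVX512 folds
; the two masked halves into a single bit-select vpternlogd.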

;
; Masked Uniform Constant Rotates
;

define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v4i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm1
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT:    vpsrlq $49, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v4i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlq $49, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512-LABEL: splatconstant_rotate_mask_v4i64:
; AVX512:       # %bb.0:
; AVX512-NEXT:    vpsrlq $49, %ymm0, %ymm0
; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpsrlq $49, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vpsrlq $49, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v4i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpsrlq $49, %ymm0, %ymm0
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <4 x i64> %a, <i64 15, i64 15, i64 15, i64 15>
  %lshr = lshr <4 x i64> %a, <i64 49, i64 49, i64 49, i64 49>
  %rmask = and <4 x i64> %lshr, <i64 255, i64 127, i64 127, i64 255>
  %lmask = and <4 x i64> %shl, <i64 33, i64 65, i64 129, i64 257>
  %or = or <4 x i64> %lmask, %rmask
  ret <4 x i64> %or
}
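; NB: every set bit of the shl-side mask (<33,65,129,257>) lies below the
; shift amount of 15, so the left half of the rotate folds to zero and all
; targets emit just the logical right shift plus mask.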

define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v8i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrld $28, %xmm0, %xmm2
; AVX1-NEXT:    vpslld $4, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v8i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrld $28, %ymm0, %ymm1
; AVX2-NEXT:    vpslld $4, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512NOVLX-NEXT:    vprold $4, %zmm0, %zmm0
; AVX512NOVLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: splatconstant_rotate_mask_v8i32:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vprold $4, %ymm0, %ymm0
; AVX512VLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v8i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotd $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <8 x i32> %a, <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
  %lshr = lshr <8 x i32> %a, <i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28, i32 28>
  %rmask = and <8 x i32> %lshr, <i32 3, i32 7, i32 15, i32 31, i32 63, i32 127, i32 255, i32 511>
  %lmask = and <8 x i32> %shl, <i32 511, i32 255, i32 127, i32 63, i32 31, i32 15, i32 7, i32 3>
  %or = or <8 x i32> %lmask, %rmask
  ret <8 x i32> %or
}
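; NB: here both halves survive; because the shl and lshr results occupy
; disjoint bit ranges, the two per-side masks merge into a single constant
; applied with one vpand/vandps after the rotate.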

define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v16i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $11, %xmm1, %xmm2
; AVX1-NEXT:    vpsllw $5, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $11, %xmm0, %xmm2
; AVX1-NEXT:    vpsllw $5, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $11, %ymm0, %ymm1
; AVX2-NEXT:    vpsllw $5, %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $11, %ymm0, %ymm1
; AVX512F-NEXT:    vpsllw $5, %ymm0, %ymm0
; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $5, %ymm0, %ymm1
; AVX512VL-NEXT:    vpsrlw $11, %ymm0, %ymm0
; AVX512VL-NEXT:    vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1)
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $11, %ymm0, %ymm1
; AVX512BW-NEXT:    vpsllw $5, %ymm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512BW-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $5, %ymm0, %ymm1
; AVX512VLBW-NEXT:    vpsrlw $11, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} ymm0 = mem & (ymm0 | ymm1)
; AVX512VLBW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshldw $5, %zmm0, %zmm0, %zmm0
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v16i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshldw $5, %ymm0, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotw $5, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotw $5, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v16i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotw $5, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotw $5, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <16 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
  %lshr = lshr <16 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
  %rmask = and <16 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
  %lmask = and <16 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
  %or = or <16 x i16> %lmask, %rmask
  ret <16 x i16> %or
}
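; NB: with VL available the or of the two shift halves and the merged mask
; fold into a single vpternlogd (mem & (srl | shl)); targets without VL keep
; the separate vpor + vpand.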

define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
; AVX1-LABEL: splatconstant_rotate_mask_v32i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm2
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm1, %xmm1
; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
; AVX1-NEXT:    vpor %xmm2, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm2
; AVX1-NEXT:    vpandn %xmm2, %xmm3, %xmm2
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: splatconstant_rotate_mask_v32i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpsrlw $4, %ymm0, %ymm1
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX2-NEXT:    vpsllw $4, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-NEXT:    retq
;
; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512NOVLX:       # %bb.0:
; AVX512NOVLX-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512NOVLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
; AVX512NOVLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX512NOVLX-NEXT:    retq
;
; AVX512VLX-LABEL: splatconstant_rotate_mask_v32i8:
; AVX512VLX:       # %bb.0:
; AVX512VLX-NEXT:    vpsllw $4, %ymm0, %ymm1
; AVX512VLX-NEXT:    vpsrlw $4, %ymm0, %ymm0
; AVX512VLX-NEXT:    vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
; AVX512VLX-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
; AVX512VLX-NEXT:    retq
;
; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX1-NEXT:    vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: splatconstant_rotate_mask_v32i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm1
; XOPAVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
; XOPAVX2-NEXT:    vprotb $4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
; XOPAVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; XOPAVX2-NEXT:    retq
  %shl = shl <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %lshr = lshr <32 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
  %rmask = and <32 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
  %lmask = and <32 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
  %or = or <32 x i8> %lmask, %rmask
  ret <32 x i8> %or
}
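; NB: as in splatconstant_rotate_v32i8, the byte rotate itself is a
; vpternlogd bit-select on AVX512; the splat byte mask is then applied with a
; trailing vpand, or a dword-broadcast vpandd under VL.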