xref: /llvm-project/llvm/test/CodeGen/X86/vector-rotate-128.ll (revision a25f2cb3e6953691fade076c8e0ccebf1016d3d9)
1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VL
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512BW
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLBW
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512,AVX512NOVLX,AVX512VBMI2
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VLX,AVX512VLVBMI2
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
14
15; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
16; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
17
18;
19; Variable Rotates
20;
21
22define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
23; SSE2-LABEL: var_rotate_v2i64:
24; SSE2:       # %bb.0:
25; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
26; SSE2-NEXT:    psubq %xmm1, %xmm2
27; SSE2-NEXT:    movdqa %xmm0, %xmm3
28; SSE2-NEXT:    psllq %xmm1, %xmm3
29; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
30; SSE2-NEXT:    movdqa %xmm0, %xmm4
31; SSE2-NEXT:    psllq %xmm1, %xmm4
32; SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
33; SSE2-NEXT:    movdqa %xmm0, %xmm1
34; SSE2-NEXT:    psrlq %xmm2, %xmm1
35; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
36; SSE2-NEXT:    psrlq %xmm2, %xmm0
37; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
38; SSE2-NEXT:    orpd %xmm4, %xmm0
39; SSE2-NEXT:    retq
40;
41; SSE41-LABEL: var_rotate_v2i64:
42; SSE41:       # %bb.0:
43; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm2 = [64,64]
44; SSE41-NEXT:    psubq %xmm1, %xmm2
45; SSE41-NEXT:    movdqa %xmm0, %xmm3
46; SSE41-NEXT:    psllq %xmm1, %xmm3
47; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
48; SSE41-NEXT:    movdqa %xmm0, %xmm4
49; SSE41-NEXT:    psllq %xmm1, %xmm4
50; SSE41-NEXT:    pblendw {{.*#+}} xmm4 = xmm3[0,1,2,3],xmm4[4,5,6,7]
51; SSE41-NEXT:    movdqa %xmm0, %xmm1
52; SSE41-NEXT:    psrlq %xmm2, %xmm1
53; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
54; SSE41-NEXT:    psrlq %xmm2, %xmm0
55; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
56; SSE41-NEXT:    por %xmm4, %xmm0
57; SSE41-NEXT:    retq
58;
59; AVX1-LABEL: var_rotate_v2i64:
60; AVX1:       # %bb.0:
61; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [64,64]
62; AVX1-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
63; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm3
64; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
65; AVX1-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
66; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3],xmm1[4,5,6,7]
67; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm3
68; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
69; AVX1-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
70; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
71; AVX1-NEXT:    vpor %xmm0, %xmm1, %xmm0
72; AVX1-NEXT:    retq
73;
74; AVX2-LABEL: var_rotate_v2i64:
75; AVX2:       # %bb.0:
76; AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [64,64]
77; AVX2-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
78; AVX2-NEXT:    vpsllvq %xmm1, %xmm0, %xmm1
79; AVX2-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
80; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
81; AVX2-NEXT:    retq
82;
83; AVX512NOVLX-LABEL: var_rotate_v2i64:
84; AVX512NOVLX:       # %bb.0:
85; AVX512NOVLX-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
86; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
87; AVX512NOVLX-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
88; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
89; AVX512NOVLX-NEXT:    vzeroupper
90; AVX512NOVLX-NEXT:    retq
91;
92; AVX512VLX-LABEL: var_rotate_v2i64:
93; AVX512VLX:       # %bb.0:
94; AVX512VLX-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
95; AVX512VLX-NEXT:    retq
96;
97; XOP-LABEL: var_rotate_v2i64:
98; XOP:       # %bb.0:
99; XOP-NEXT:    vprotq %xmm1, %xmm0, %xmm0
100; XOP-NEXT:    retq
101;
102; X86-SSE2-LABEL: var_rotate_v2i64:
103; X86-SSE2:       # %bb.0:
104; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,0,64,0]
105; X86-SSE2-NEXT:    psubq %xmm1, %xmm2
106; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
107; X86-SSE2-NEXT:    psllq %xmm1, %xmm3
108; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
109; X86-SSE2-NEXT:    movdqa %xmm0, %xmm4
110; X86-SSE2-NEXT:    psllq %xmm1, %xmm4
111; X86-SSE2-NEXT:    movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1]
112; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
113; X86-SSE2-NEXT:    psrlq %xmm2, %xmm1
114; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
115; X86-SSE2-NEXT:    psrlq %xmm2, %xmm0
116; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
117; X86-SSE2-NEXT:    orpd %xmm4, %xmm0
118; X86-SSE2-NEXT:    retl
119  %b64 = sub <2 x i64> <i64 64, i64 64>, %b
120  %shl = shl <2 x i64> %a, %b
121  %lshr = lshr <2 x i64> %a, %b64
122  %or = or <2 x i64> %shl, %lshr
123  ret <2 x i64> %or
124}
125
126define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
127; SSE2-LABEL: var_rotate_v4i32:
128; SSE2:       # %bb.0:
129; SSE2-NEXT:    pslld $23, %xmm1
130; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
131; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
132; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
133; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
134; SSE2-NEXT:    pmuludq %xmm1, %xmm0
135; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
136; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
137; SSE2-NEXT:    pmuludq %xmm2, %xmm1
138; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
139; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
140; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
141; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
142; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
143; SSE2-NEXT:    por %xmm3, %xmm0
144; SSE2-NEXT:    retq
145;
146; SSE41-LABEL: var_rotate_v4i32:
147; SSE41:       # %bb.0:
148; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
149; SSE41-NEXT:    pslld $23, %xmm1
150; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
151; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
152; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
153; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
154; SSE41-NEXT:    pmuludq %xmm2, %xmm3
155; SSE41-NEXT:    pmuludq %xmm1, %xmm0
156; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
157; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
158; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
159; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
160; SSE41-NEXT:    por %xmm1, %xmm0
161; SSE41-NEXT:    retq
162;
163; AVX1-LABEL: var_rotate_v4i32:
164; AVX1:       # %bb.0:
165; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
166; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
167; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
168; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
169; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
170; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
171; AVX1-NEXT:    vpmuludq %xmm3, %xmm2, %xmm2
172; AVX1-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0
173; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
174; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
175; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
176; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
177; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
178; AVX1-NEXT:    retq
179;
180; AVX2-LABEL: var_rotate_v4i32:
181; AVX2:       # %bb.0:
182; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
183; AVX2-NEXT:    vpand %xmm2, %xmm1, %xmm1
184; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm2
185; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
186; AVX2-NEXT:    vpsubd %xmm1, %xmm3, %xmm1
187; AVX2-NEXT:    vpsrlvd %xmm1, %xmm0, %xmm0
188; AVX2-NEXT:    vpor %xmm0, %xmm2, %xmm0
189; AVX2-NEXT:    retq
190;
191; AVX512NOVLX-LABEL: var_rotate_v4i32:
192; AVX512NOVLX:       # %bb.0:
193; AVX512NOVLX-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
194; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
195; AVX512NOVLX-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
196; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
197; AVX512NOVLX-NEXT:    vzeroupper
198; AVX512NOVLX-NEXT:    retq
199;
200; AVX512VLX-LABEL: var_rotate_v4i32:
201; AVX512VLX:       # %bb.0:
202; AVX512VLX-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
203; AVX512VLX-NEXT:    retq
204;
205; XOP-LABEL: var_rotate_v4i32:
206; XOP:       # %bb.0:
207; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
208; XOP-NEXT:    retq
209;
210; X86-SSE2-LABEL: var_rotate_v4i32:
211; X86-SSE2:       # %bb.0:
212; X86-SSE2-NEXT:    pslld $23, %xmm1
213; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
214; X86-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
215; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
216; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
217; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
218; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
219; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
220; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm1
221; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
222; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
223; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
224; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
225; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
226; X86-SSE2-NEXT:    por %xmm3, %xmm0
227; X86-SSE2-NEXT:    retl
228  %b32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %b
229  %shl = shl <4 x i32> %a, %b
230  %lshr = lshr <4 x i32> %a, %b32
231  %or = or <4 x i32> %shl, %lshr
232  ret <4 x i32> %or
233}
234
235define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
236; SSE2-LABEL: var_rotate_v8i16:
237; SSE2:       # %bb.0:
238; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
239; SSE2-NEXT:    movdqa %xmm1, %xmm2
240; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
241; SSE2-NEXT:    pslld $23, %xmm2
242; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
243; SSE2-NEXT:    paddd %xmm3, %xmm2
244; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
245; SSE2-NEXT:    pslld $16, %xmm2
246; SSE2-NEXT:    psrad $16, %xmm2
247; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
248; SSE2-NEXT:    pslld $23, %xmm1
249; SSE2-NEXT:    paddd %xmm3, %xmm1
250; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
251; SSE2-NEXT:    pslld $16, %xmm1
252; SSE2-NEXT:    psrad $16, %xmm1
253; SSE2-NEXT:    packssdw %xmm2, %xmm1
254; SSE2-NEXT:    movdqa %xmm0, %xmm2
255; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
256; SSE2-NEXT:    pmullw %xmm1, %xmm0
257; SSE2-NEXT:    por %xmm2, %xmm0
258; SSE2-NEXT:    retq
259;
260; SSE41-LABEL: var_rotate_v8i16:
261; SSE41:       # %bb.0:
262; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
263; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
264; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
265; SSE41-NEXT:    pslld $23, %xmm1
266; SSE41-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
267; SSE41-NEXT:    paddd %xmm3, %xmm1
268; SSE41-NEXT:    cvttps2dq %xmm1, %xmm1
269; SSE41-NEXT:    pslld $23, %xmm2
270; SSE41-NEXT:    paddd %xmm3, %xmm2
271; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
272; SSE41-NEXT:    packusdw %xmm1, %xmm2
273; SSE41-NEXT:    movdqa %xmm0, %xmm1
274; SSE41-NEXT:    pmulhuw %xmm2, %xmm1
275; SSE41-NEXT:    pmullw %xmm2, %xmm0
276; SSE41-NEXT:    por %xmm1, %xmm0
277; SSE41-NEXT:    retq
278;
279; AVX1-LABEL: var_rotate_v8i16:
280; AVX1:       # %bb.0:
281; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
282; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4,4,5,5,6,6,7,7]
283; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
284; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
285; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
286; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
287; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
288; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
289; AVX1-NEXT:    vpaddd %xmm3, %xmm1, %xmm1
290; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
291; AVX1-NEXT:    vpackusdw %xmm2, %xmm1, %xmm1
292; AVX1-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
293; AVX1-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
294; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
295; AVX1-NEXT:    retq
296;
297; AVX2-LABEL: var_rotate_v8i16:
298; AVX2:       # %bb.0:
299; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
300; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
301; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
302; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4,4,5,5,6,6,7,7]
303; AVX2-NEXT:    vpsllvd %xmm2, %xmm3, %xmm2
304; AVX2-NEXT:    vpsrld $16, %xmm2, %xmm2
305; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
306; AVX2-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
307; AVX2-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
308; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm0
309; AVX2-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
310; AVX2-NEXT:    retq
311;
312; AVX512F-LABEL: var_rotate_v8i16:
313; AVX512F:       # %bb.0:
314; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
315; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
316; AVX512F-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
317; AVX512F-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4,4,5,5,6,6,7,7]
318; AVX512F-NEXT:    vpsllvd %xmm2, %xmm3, %xmm2
319; AVX512F-NEXT:    vpsrld $16, %xmm2, %xmm2
320; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
321; AVX512F-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
322; AVX512F-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
323; AVX512F-NEXT:    vpsrld $16, %xmm0, %xmm0
324; AVX512F-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
325; AVX512F-NEXT:    retq
326;
327; AVX512VL-LABEL: var_rotate_v8i16:
328; AVX512VL:       # %bb.0:
329; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
330; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
331; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
332; AVX512VL-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm0[4,4,5,5,6,6,7,7]
333; AVX512VL-NEXT:    vpsllvd %xmm2, %xmm3, %xmm2
334; AVX512VL-NEXT:    vpsrld $16, %xmm2, %xmm2
335; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
336; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
337; AVX512VL-NEXT:    vpsllvd %xmm1, %xmm0, %xmm0
338; AVX512VL-NEXT:    vpsrld $16, %xmm0, %xmm0
339; AVX512VL-NEXT:    vpackusdw %xmm2, %xmm0, %xmm0
340; AVX512VL-NEXT:    retq
341;
342; AVX512BW-LABEL: var_rotate_v8i16:
343; AVX512BW:       # %bb.0:
344; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
345; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
346; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm2
347; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
348; AVX512BW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
349; AVX512BW-NEXT:    vpsrlvw %zmm1, %zmm0, %zmm0
350; AVX512BW-NEXT:    vpor %xmm0, %xmm2, %xmm0
351; AVX512BW-NEXT:    vzeroupper
352; AVX512BW-NEXT:    retq
353;
354; AVX512VLBW-LABEL: var_rotate_v8i16:
355; AVX512VLBW:       # %bb.0:
356; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
357; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2
358; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16]
359; AVX512VLBW-NEXT:    vpsubw %xmm1, %xmm3, %xmm1
360; AVX512VLBW-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0
361; AVX512VLBW-NEXT:    vpor %xmm0, %xmm2, %xmm0
362; AVX512VLBW-NEXT:    retq
363;
364; AVX512VBMI2-LABEL: var_rotate_v8i16:
365; AVX512VBMI2:       # %bb.0:
366; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
367; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
368; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
369; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
370; AVX512VBMI2-NEXT:    vzeroupper
371; AVX512VBMI2-NEXT:    retq
372;
373; AVX512VLVBMI2-LABEL: var_rotate_v8i16:
374; AVX512VLVBMI2:       # %bb.0:
375; AVX512VLVBMI2-NEXT:    vpshldvw %xmm1, %xmm0, %xmm0
376; AVX512VLVBMI2-NEXT:    retq
377;
378; XOP-LABEL: var_rotate_v8i16:
379; XOP:       # %bb.0:
380; XOP-NEXT:    vprotw %xmm1, %xmm0, %xmm0
381; XOP-NEXT:    retq
382;
383; X86-SSE2-LABEL: var_rotate_v8i16:
384; X86-SSE2:       # %bb.0:
385; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
386; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
387; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
388; X86-SSE2-NEXT:    pslld $23, %xmm2
389; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
390; X86-SSE2-NEXT:    paddd %xmm3, %xmm2
391; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
392; X86-SSE2-NEXT:    pslld $16, %xmm2
393; X86-SSE2-NEXT:    psrad $16, %xmm2
394; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3]
395; X86-SSE2-NEXT:    pslld $23, %xmm1
396; X86-SSE2-NEXT:    paddd %xmm3, %xmm1
397; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
398; X86-SSE2-NEXT:    pslld $16, %xmm1
399; X86-SSE2-NEXT:    psrad $16, %xmm1
400; X86-SSE2-NEXT:    packssdw %xmm2, %xmm1
401; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
402; X86-SSE2-NEXT:    pmulhuw %xmm1, %xmm2
403; X86-SSE2-NEXT:    pmullw %xmm1, %xmm0
404; X86-SSE2-NEXT:    por %xmm2, %xmm0
405; X86-SSE2-NEXT:    retl
406  %b16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %b
407  %shl = shl <8 x i16> %a, %b
408  %lshr = lshr <8 x i16> %a, %b16
409  %or = or <8 x i16> %shl, %lshr
410  ret <8 x i16> %or
411}
412
413define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
414; SSE2-LABEL: var_rotate_v16i8:
415; SSE2:       # %bb.0:
416; SSE2-NEXT:    movdqa %xmm0, %xmm2
417; SSE2-NEXT:    psllw $5, %xmm1
418; SSE2-NEXT:    pxor %xmm0, %xmm0
419; SSE2-NEXT:    pxor %xmm3, %xmm3
420; SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
421; SSE2-NEXT:    movdqa %xmm2, %xmm4
422; SSE2-NEXT:    psrlw $4, %xmm4
423; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
424; SSE2-NEXT:    movdqa %xmm2, %xmm5
425; SSE2-NEXT:    psllw $4, %xmm5
426; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
427; SSE2-NEXT:    por %xmm4, %xmm5
428; SSE2-NEXT:    pand %xmm3, %xmm5
429; SSE2-NEXT:    pandn %xmm2, %xmm3
430; SSE2-NEXT:    por %xmm5, %xmm3
431; SSE2-NEXT:    movdqa %xmm3, %xmm2
432; SSE2-NEXT:    psrlw $6, %xmm2
433; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
434; SSE2-NEXT:    movdqa %xmm3, %xmm4
435; SSE2-NEXT:    psllw $2, %xmm4
436; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
437; SSE2-NEXT:    por %xmm2, %xmm4
438; SSE2-NEXT:    paddb %xmm1, %xmm1
439; SSE2-NEXT:    pxor %xmm2, %xmm2
440; SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
441; SSE2-NEXT:    pand %xmm2, %xmm4
442; SSE2-NEXT:    pandn %xmm3, %xmm2
443; SSE2-NEXT:    por %xmm4, %xmm2
444; SSE2-NEXT:    movdqa %xmm2, %xmm3
445; SSE2-NEXT:    psrlw $7, %xmm3
446; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
447; SSE2-NEXT:    movdqa %xmm2, %xmm4
448; SSE2-NEXT:    paddb %xmm2, %xmm4
449; SSE2-NEXT:    por %xmm3, %xmm4
450; SSE2-NEXT:    paddb %xmm1, %xmm1
451; SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
452; SSE2-NEXT:    pand %xmm0, %xmm4
453; SSE2-NEXT:    pandn %xmm2, %xmm0
454; SSE2-NEXT:    por %xmm4, %xmm0
455; SSE2-NEXT:    retq
456;
457; SSE41-LABEL: var_rotate_v16i8:
458; SSE41:       # %bb.0:
459; SSE41-NEXT:    movdqa %xmm1, %xmm2
460; SSE41-NEXT:    movdqa %xmm0, %xmm1
461; SSE41-NEXT:    psrlw $4, %xmm0
462; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
463; SSE41-NEXT:    movdqa %xmm1, %xmm3
464; SSE41-NEXT:    psllw $4, %xmm3
465; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
466; SSE41-NEXT:    por %xmm0, %xmm3
467; SSE41-NEXT:    psllw $5, %xmm2
468; SSE41-NEXT:    movdqa %xmm2, %xmm0
469; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
470; SSE41-NEXT:    movdqa %xmm1, %xmm0
471; SSE41-NEXT:    psrlw $6, %xmm0
472; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
473; SSE41-NEXT:    movdqa %xmm1, %xmm3
474; SSE41-NEXT:    psllw $2, %xmm3
475; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
476; SSE41-NEXT:    por %xmm0, %xmm3
477; SSE41-NEXT:    paddb %xmm2, %xmm2
478; SSE41-NEXT:    movdqa %xmm2, %xmm0
479; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
480; SSE41-NEXT:    movdqa %xmm1, %xmm0
481; SSE41-NEXT:    psrlw $7, %xmm0
482; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
483; SSE41-NEXT:    movdqa %xmm1, %xmm3
484; SSE41-NEXT:    paddb %xmm1, %xmm3
485; SSE41-NEXT:    por %xmm0, %xmm3
486; SSE41-NEXT:    paddb %xmm2, %xmm2
487; SSE41-NEXT:    movdqa %xmm2, %xmm0
488; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
489; SSE41-NEXT:    movdqa %xmm1, %xmm0
490; SSE41-NEXT:    retq
491;
492; AVX-LABEL: var_rotate_v16i8:
493; AVX:       # %bb.0:
494; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm2
495; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
496; AVX-NEXT:    vpsllw $4, %xmm0, %xmm3
497; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
498; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
499; AVX-NEXT:    vpsllw $5, %xmm1, %xmm1
500; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
501; AVX-NEXT:    vpsrlw $6, %xmm0, %xmm2
502; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
503; AVX-NEXT:    vpsllw $2, %xmm0, %xmm3
504; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
505; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
506; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
507; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
508; AVX-NEXT:    vpsrlw $7, %xmm0, %xmm2
509; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
510; AVX-NEXT:    vpaddb %xmm0, %xmm0, %xmm3
511; AVX-NEXT:    vpor %xmm2, %xmm3, %xmm2
512; AVX-NEXT:    vpaddb %xmm1, %xmm1, %xmm1
513; AVX-NEXT:    vpblendvb %xmm1, %xmm2, %xmm0, %xmm0
514; AVX-NEXT:    retq
515;
516; AVX512F-LABEL: var_rotate_v16i8:
517; AVX512F:       # %bb.0:
518; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
519; AVX512F-NEXT:    vpslld $8, %zmm0, %zmm2
520; AVX512F-NEXT:    vpord %zmm2, %zmm0, %zmm0
521; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
522; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
523; AVX512F-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
524; AVX512F-NEXT:    vpsrld $8, %zmm0, %zmm0
525; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
526; AVX512F-NEXT:    vzeroupper
527; AVX512F-NEXT:    retq
528;
529; AVX512VL-LABEL: var_rotate_v16i8:
530; AVX512VL:       # %bb.0:
531; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
532; AVX512VL-NEXT:    vpslld $8, %zmm0, %zmm2
533; AVX512VL-NEXT:    vpord %zmm2, %zmm0, %zmm0
534; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
535; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
536; AVX512VL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm0
537; AVX512VL-NEXT:    vpsrld $8, %zmm0, %zmm0
538; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
539; AVX512VL-NEXT:    vzeroupper
540; AVX512VL-NEXT:    retq
541;
542; AVX512BW-LABEL: var_rotate_v16i8:
543; AVX512BW:       # %bb.0:
544; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
545; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
546; AVX512BW-NEXT:    vpxor %xmm3, %xmm3, %xmm3
547; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
548; AVX512BW-NEXT:    vpsllvw %zmm3, %zmm2, %zmm2
549; AVX512BW-NEXT:    vpsrlw $8, %xmm2, %xmm2
550; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
551; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
552; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
553; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm0
554; AVX512BW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
555; AVX512BW-NEXT:    vzeroupper
556; AVX512BW-NEXT:    retq
557;
558; AVX512VLBW-LABEL: var_rotate_v16i8:
559; AVX512VLBW:       # %bb.0:
560; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
561; AVX512VLBW-NEXT:    vpxor %xmm2, %xmm2, %xmm2
562; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
563; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
564; AVX512VLBW-NEXT:    vpsllvw %xmm2, %xmm3, %xmm2
565; AVX512VLBW-NEXT:    vpsrlw $8, %xmm2, %xmm2
566; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
567; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
568; AVX512VLBW-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0
569; AVX512VLBW-NEXT:    vpsrlw $8, %xmm0, %xmm0
570; AVX512VLBW-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
571; AVX512VLBW-NEXT:    retq
572;
573; AVX512VBMI2-LABEL: var_rotate_v16i8:
574; AVX512VBMI2:       # %bb.0:
575; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
576; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
577; AVX512VBMI2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
578; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm3[8],xmm1[9],xmm3[9],xmm1[10],xmm3[10],xmm1[11],xmm3[11],xmm1[12],xmm3[12],xmm1[13],xmm3[13],xmm1[14],xmm3[14],xmm1[15],xmm3[15]
579; AVX512VBMI2-NEXT:    vpsllvw %zmm3, %zmm2, %zmm2
580; AVX512VBMI2-NEXT:    vpsrlw $8, %xmm2, %xmm2
581; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
582; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
583; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
584; AVX512VBMI2-NEXT:    vpsrlw $8, %xmm0, %xmm0
585; AVX512VBMI2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
586; AVX512VBMI2-NEXT:    vzeroupper
587; AVX512VBMI2-NEXT:    retq
588;
589; AVX512VLVBMI2-LABEL: var_rotate_v16i8:
590; AVX512VLVBMI2:       # %bb.0:
591; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
592; AVX512VLVBMI2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
593; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
594; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
595; AVX512VLVBMI2-NEXT:    vpsllvw %xmm2, %xmm3, %xmm2
596; AVX512VLVBMI2-NEXT:    vpsrlw $8, %xmm2, %xmm2
597; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
598; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
599; AVX512VLVBMI2-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0
600; AVX512VLVBMI2-NEXT:    vpsrlw $8, %xmm0, %xmm0
601; AVX512VLVBMI2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
602; AVX512VLVBMI2-NEXT:    retq
603;
604; XOP-LABEL: var_rotate_v16i8:
605; XOP:       # %bb.0:
606; XOP-NEXT:    vprotb %xmm1, %xmm0, %xmm0
607; XOP-NEXT:    retq
608;
609; X86-SSE2-LABEL: var_rotate_v16i8:
610; X86-SSE2:       # %bb.0:
611; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
612; X86-SSE2-NEXT:    psllw $5, %xmm1
613; X86-SSE2-NEXT:    pxor %xmm0, %xmm0
614; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
615; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm3
616; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
617; X86-SSE2-NEXT:    psrlw $4, %xmm4
618; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
619; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
620; X86-SSE2-NEXT:    psllw $4, %xmm5
621; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm5
622; X86-SSE2-NEXT:    por %xmm4, %xmm5
623; X86-SSE2-NEXT:    pand %xmm3, %xmm5
624; X86-SSE2-NEXT:    pandn %xmm2, %xmm3
625; X86-SSE2-NEXT:    por %xmm5, %xmm3
626; X86-SSE2-NEXT:    movdqa %xmm3, %xmm2
627; X86-SSE2-NEXT:    psrlw $6, %xmm2
628; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
629; X86-SSE2-NEXT:    movdqa %xmm3, %xmm4
630; X86-SSE2-NEXT:    psllw $2, %xmm4
631; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm4
632; X86-SSE2-NEXT:    por %xmm2, %xmm4
633; X86-SSE2-NEXT:    paddb %xmm1, %xmm1
634; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
635; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm2
636; X86-SSE2-NEXT:    pand %xmm2, %xmm4
637; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
638; X86-SSE2-NEXT:    por %xmm4, %xmm2
639; X86-SSE2-NEXT:    movdqa %xmm2, %xmm3
640; X86-SSE2-NEXT:    psrlw $7, %xmm3
641; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
642; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
643; X86-SSE2-NEXT:    paddb %xmm2, %xmm4
644; X86-SSE2-NEXT:    por %xmm3, %xmm4
645; X86-SSE2-NEXT:    paddb %xmm1, %xmm1
646; X86-SSE2-NEXT:    pcmpgtb %xmm1, %xmm0
647; X86-SSE2-NEXT:    pand %xmm0, %xmm4
648; X86-SSE2-NEXT:    pandn %xmm2, %xmm0
649; X86-SSE2-NEXT:    por %xmm4, %xmm0
650; X86-SSE2-NEXT:    retl
651  %b8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %b
652  %shl = shl <16 x i8> %a, %b
653  %lshr = lshr <16 x i8> %a, %b8
654  %or = or <16 x i8> %shl, %lshr
655  ret <16 x i8> %or
656}
657
658;
659; Uniform Variable Rotates
660;
661
662define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
663; SSE2-LABEL: splatvar_rotate_v2i64:
664; SSE2:       # %bb.0:
665; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [64,64]
666; SSE2-NEXT:    psubq %xmm1, %xmm2
667; SSE2-NEXT:    movdqa %xmm0, %xmm3
668; SSE2-NEXT:    psllq %xmm1, %xmm3
669; SSE2-NEXT:    psrlq %xmm2, %xmm0
670; SSE2-NEXT:    por %xmm3, %xmm0
671; SSE2-NEXT:    retq
672;
673; SSE41-LABEL: splatvar_rotate_v2i64:
674; SSE41:       # %bb.0:
675; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm2 = [64,64]
676; SSE41-NEXT:    psubq %xmm1, %xmm2
677; SSE41-NEXT:    movdqa %xmm0, %xmm3
678; SSE41-NEXT:    psllq %xmm1, %xmm3
679; SSE41-NEXT:    psrlq %xmm2, %xmm0
680; SSE41-NEXT:    por %xmm3, %xmm0
681; SSE41-NEXT:    retq
682;
683; AVX-LABEL: splatvar_rotate_v2i64:
684; AVX:       # %bb.0:
685; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [64,64]
686; AVX-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
687; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm1
688; AVX-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
689; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
690; AVX-NEXT:    retq
691;
692; AVX512NOVLX-LABEL: splatvar_rotate_v2i64:
693; AVX512NOVLX:       # %bb.0:
694; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
695; AVX512NOVLX-NEXT:    vpbroadcastq %xmm1, %xmm1
696; AVX512NOVLX-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
697; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
698; AVX512NOVLX-NEXT:    vzeroupper
699; AVX512NOVLX-NEXT:    retq
700;
701; AVX512VLX-LABEL: splatvar_rotate_v2i64:
702; AVX512VLX:       # %bb.0:
703; AVX512VLX-NEXT:    vpbroadcastq %xmm1, %xmm1
704; AVX512VLX-NEXT:    vprolvq %xmm1, %xmm0, %xmm0
705; AVX512VLX-NEXT:    retq
706;
707; XOPAVX1-LABEL: splatvar_rotate_v2i64:
708; XOPAVX1:       # %bb.0:
709; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1]
710; XOPAVX1-NEXT:    vprotq %xmm1, %xmm0, %xmm0
711; XOPAVX1-NEXT:    retq
712;
713; XOPAVX2-LABEL: splatvar_rotate_v2i64:
714; XOPAVX2:       # %bb.0:
715; XOPAVX2-NEXT:    vpbroadcastq %xmm1, %xmm1
716; XOPAVX2-NEXT:    vprotq %xmm1, %xmm0, %xmm0
717; XOPAVX2-NEXT:    retq
718;
719; X86-SSE2-LABEL: splatvar_rotate_v2i64:
720; X86-SSE2:       # %bb.0:
721; X86-SSE2-NEXT:    movd {{.*#+}} xmm2 = [64,0,0,0]
722; X86-SSE2-NEXT:    psubq %xmm1, %xmm2
723; X86-SSE2-NEXT:    movdqa %xmm0, %xmm3
724; X86-SSE2-NEXT:    psllq %xmm1, %xmm3
725; X86-SSE2-NEXT:    psrlq %xmm2, %xmm0
726; X86-SSE2-NEXT:    por %xmm3, %xmm0
727; X86-SSE2-NEXT:    retl
728  %splat = shufflevector <2 x i64> %b, <2 x i64> undef, <2 x i32> zeroinitializer
729  %splat64 = sub <2 x i64> <i64 64, i64 64>, %splat
730  %shl = shl <2 x i64> %a, %splat
731  %lshr = lshr <2 x i64> %a, %splat64
732  %or = or <2 x i64> %shl, %lshr
733  ret <2 x i64> %or
734}
735
736define <4 x i32> @splatvar_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
737; SSE-LABEL: splatvar_rotate_v4i32:
738; SSE:       # %bb.0:
739; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
740; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
741; SSE-NEXT:    psllq %xmm1, %xmm2
742; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
743; SSE-NEXT:    psllq %xmm1, %xmm0
744; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
745; SSE-NEXT:    retq
746;
747; AVX-LABEL: splatvar_rotate_v4i32:
748; AVX:       # %bb.0:
749; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
750; AVX-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
751; AVX-NEXT:    vpsllq %xmm1, %xmm2, %xmm2
752; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
753; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0
754; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
755; AVX-NEXT:    retq
756;
757; AVX512NOVLX-LABEL: splatvar_rotate_v4i32:
758; AVX512NOVLX:       # %bb.0:
759; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
760; AVX512NOVLX-NEXT:    vpbroadcastd %xmm1, %xmm1
761; AVX512NOVLX-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
762; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
763; AVX512NOVLX-NEXT:    vzeroupper
764; AVX512NOVLX-NEXT:    retq
765;
766; AVX512VLX-LABEL: splatvar_rotate_v4i32:
767; AVX512VLX:       # %bb.0:
768; AVX512VLX-NEXT:    vpbroadcastd %xmm1, %xmm1
769; AVX512VLX-NEXT:    vprolvd %xmm1, %xmm0, %xmm0
770; AVX512VLX-NEXT:    retq
771;
772; XOPAVX1-LABEL: splatvar_rotate_v4i32:
773; XOPAVX1:       # %bb.0:
774; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
775; XOPAVX1-NEXT:    vprotd %xmm1, %xmm0, %xmm0
776; XOPAVX1-NEXT:    retq
777;
778; XOPAVX2-LABEL: splatvar_rotate_v4i32:
779; XOPAVX2:       # %bb.0:
780; XOPAVX2-NEXT:    vpbroadcastd %xmm1, %xmm1
781; XOPAVX2-NEXT:    vprotd %xmm1, %xmm0, %xmm0
782; XOPAVX2-NEXT:    retq
783;
784; X86-SSE2-LABEL: splatvar_rotate_v4i32:
785; X86-SSE2:       # %bb.0:
786; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
787; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
788; X86-SSE2-NEXT:    psllq %xmm1, %xmm2
789; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
790; X86-SSE2-NEXT:    psllq %xmm1, %xmm0
791; X86-SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
792; X86-SSE2-NEXT:    retl
793  %splat = shufflevector <4 x i32> %b, <4 x i32> undef, <4 x i32> zeroinitializer
794  %splat32 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %splat
795  %shl = shl <4 x i32> %a, %splat
796  %lshr = lshr <4 x i32> %a, %splat32
797  %or = or <4 x i32> %shl, %lshr
798  ret <4 x i32> %or
799}
800
801define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
802; SSE2-LABEL: splatvar_rotate_v8i16:
803; SSE2:       # %bb.0:
804; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
805; SSE2-NEXT:    movdqa %xmm0, %xmm2
806; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
807; SSE2-NEXT:    pslld %xmm1, %xmm2
808; SSE2-NEXT:    psrad $16, %xmm2
809; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
810; SSE2-NEXT:    pslld %xmm1, %xmm0
811; SSE2-NEXT:    psrad $16, %xmm0
812; SSE2-NEXT:    packssdw %xmm2, %xmm0
813; SSE2-NEXT:    retq
814;
815; SSE41-LABEL: splatvar_rotate_v8i16:
816; SSE41:       # %bb.0:
817; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm2 = [15,0]
818; SSE41-NEXT:    movdqa %xmm1, %xmm3
819; SSE41-NEXT:    pandn %xmm2, %xmm3
820; SSE41-NEXT:    movdqa %xmm0, %xmm4
821; SSE41-NEXT:    psrlw $1, %xmm4
822; SSE41-NEXT:    psrlw %xmm3, %xmm4
823; SSE41-NEXT:    pand %xmm2, %xmm1
824; SSE41-NEXT:    psllw %xmm1, %xmm0
825; SSE41-NEXT:    por %xmm4, %xmm0
826; SSE41-NEXT:    retq
827;
828; AVX-LABEL: splatvar_rotate_v8i16:
829; AVX:       # %bb.0:
830; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [15,0]
831; AVX-NEXT:    vpandn %xmm2, %xmm1, %xmm3
832; AVX-NEXT:    vpsrlw $1, %xmm0, %xmm4
833; AVX-NEXT:    vpsrlw %xmm3, %xmm4, %xmm3
834; AVX-NEXT:    vpand %xmm2, %xmm1, %xmm1
835; AVX-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
836; AVX-NEXT:    vpor %xmm3, %xmm0, %xmm0
837; AVX-NEXT:    retq
838;
839; AVX512F-LABEL: splatvar_rotate_v8i16:
840; AVX512F:       # %bb.0:
841; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [15,0]
842; AVX512F-NEXT:    vpandn %xmm2, %xmm1, %xmm3
843; AVX512F-NEXT:    vpsrlw $1, %xmm0, %xmm4
844; AVX512F-NEXT:    vpsrlw %xmm3, %xmm4, %xmm3
845; AVX512F-NEXT:    vpand %xmm2, %xmm1, %xmm1
846; AVX512F-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
847; AVX512F-NEXT:    vpor %xmm3, %xmm0, %xmm0
848; AVX512F-NEXT:    retq
849;
850; AVX512VL-LABEL: splatvar_rotate_v8i16:
851; AVX512VL:       # %bb.0:
852; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [15,0]
853; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm3
854; AVX512VL-NEXT:    vpsrlw $1, %xmm0, %xmm4
855; AVX512VL-NEXT:    vpsrlw %xmm3, %xmm4, %xmm3
856; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm1
857; AVX512VL-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
858; AVX512VL-NEXT:    vpor %xmm3, %xmm0, %xmm0
859; AVX512VL-NEXT:    retq
860;
861; AVX512BW-LABEL: splatvar_rotate_v8i16:
862; AVX512BW:       # %bb.0:
863; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [15,0]
864; AVX512BW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
865; AVX512BW-NEXT:    vpsrlw $1, %xmm0, %xmm4
866; AVX512BW-NEXT:    vpsrlw %xmm3, %xmm4, %xmm3
867; AVX512BW-NEXT:    vpand %xmm2, %xmm1, %xmm1
868; AVX512BW-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
869; AVX512BW-NEXT:    vpor %xmm3, %xmm0, %xmm0
870; AVX512BW-NEXT:    retq
871;
872; AVX512VLBW-LABEL: splatvar_rotate_v8i16:
873; AVX512VLBW:       # %bb.0:
874; AVX512VLBW-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [15,0]
875; AVX512VLBW-NEXT:    vpandn %xmm2, %xmm1, %xmm3
876; AVX512VLBW-NEXT:    vpsrlw $1, %xmm0, %xmm4
877; AVX512VLBW-NEXT:    vpsrlw %xmm3, %xmm4, %xmm3
878; AVX512VLBW-NEXT:    vpand %xmm2, %xmm1, %xmm1
879; AVX512VLBW-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
880; AVX512VLBW-NEXT:    vpor %xmm3, %xmm0, %xmm0
881; AVX512VLBW-NEXT:    retq
882;
883; AVX512VBMI2-LABEL: splatvar_rotate_v8i16:
884; AVX512VBMI2:       # %bb.0:
885; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
886; AVX512VBMI2-NEXT:    vpbroadcastw %xmm1, %xmm1
887; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
888; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
889; AVX512VBMI2-NEXT:    vzeroupper
890; AVX512VBMI2-NEXT:    retq
891;
892; AVX512VLVBMI2-LABEL: splatvar_rotate_v8i16:
893; AVX512VLVBMI2:       # %bb.0:
894; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm1, %xmm1
895; AVX512VLVBMI2-NEXT:    vpshldvw %xmm1, %xmm0, %xmm0
896; AVX512VLVBMI2-NEXT:    retq
897;
898; XOPAVX1-LABEL: splatvar_rotate_v8i16:
899; XOPAVX1:       # %bb.0:
900; XOPAVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
901; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
902; XOPAVX1-NEXT:    vprotw %xmm1, %xmm0, %xmm0
903; XOPAVX1-NEXT:    retq
904;
905; XOPAVX2-LABEL: splatvar_rotate_v8i16:
906; XOPAVX2:       # %bb.0:
907; XOPAVX2-NEXT:    vpbroadcastw %xmm1, %xmm1
908; XOPAVX2-NEXT:    vprotw %xmm1, %xmm0, %xmm0
909; XOPAVX2-NEXT:    retq
910;
911; X86-SSE2-LABEL: splatvar_rotate_v8i16:
912; X86-SSE2:       # %bb.0:
913; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
914; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
915; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
916; X86-SSE2-NEXT:    pslld %xmm1, %xmm2
917; X86-SSE2-NEXT:    psrad $16, %xmm2
918; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
919; X86-SSE2-NEXT:    pslld %xmm1, %xmm0
920; X86-SSE2-NEXT:    psrad $16, %xmm0
921; X86-SSE2-NEXT:    packssdw %xmm2, %xmm0
922; X86-SSE2-NEXT:    retl
923  %splat = shufflevector <8 x i16> %b, <8 x i16> undef, <8 x i32> zeroinitializer
924  %splat16 = sub <8 x i16> <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>, %splat
925  %shl = shl <8 x i16> %a, %splat
926  %lshr = lshr <8 x i16> %a, %splat16
927  %or = or <8 x i16> %shl, %lshr
928  ret <8 x i16> %or
929}
930
931define <16 x i8> @splatvar_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
932; SSE-LABEL: splatvar_rotate_v16i8:
933; SSE:       # %bb.0:
934; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
935; SSE-NEXT:    movdqa %xmm0, %xmm2
936; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
937; SSE-NEXT:    psllw %xmm1, %xmm2
938; SSE-NEXT:    psrlw $8, %xmm2
939; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
940; SSE-NEXT:    psllw %xmm1, %xmm0
941; SSE-NEXT:    psrlw $8, %xmm0
942; SSE-NEXT:    packuswb %xmm2, %xmm0
943; SSE-NEXT:    retq
944;
945; AVX-LABEL: splatvar_rotate_v16i8:
946; AVX:       # %bb.0:
947; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
948; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
949; AVX-NEXT:    vpsllw %xmm1, %xmm2, %xmm2
950; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
951; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
952; AVX-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
953; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
954; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
955; AVX-NEXT:    retq
956;
957; AVX512-LABEL: splatvar_rotate_v16i8:
958; AVX512:       # %bb.0:
959; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
960; AVX512-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
961; AVX512-NEXT:    vpsllw %xmm1, %xmm2, %xmm2
962; AVX512-NEXT:    vpsrlw $8, %xmm2, %xmm2
963; AVX512-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
964; AVX512-NEXT:    vpsllw %xmm1, %xmm0, %xmm0
965; AVX512-NEXT:    vpsrlw $8, %xmm0, %xmm0
966; AVX512-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
967; AVX512-NEXT:    retq
968;
969; XOPAVX1-LABEL: splatvar_rotate_v16i8:
970; XOPAVX1:       # %bb.0:
971; XOPAVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
972; XOPAVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
973; XOPAVX1-NEXT:    vprotb %xmm1, %xmm0, %xmm0
974; XOPAVX1-NEXT:    retq
975;
976; XOPAVX2-LABEL: splatvar_rotate_v16i8:
977; XOPAVX2:       # %bb.0:
978; XOPAVX2-NEXT:    vpbroadcastb %xmm1, %xmm1
979; XOPAVX2-NEXT:    vprotb %xmm1, %xmm0, %xmm0
980; XOPAVX2-NEXT:    retq
981;
982; X86-SSE2-LABEL: splatvar_rotate_v16i8:
983; X86-SSE2:       # %bb.0:
984; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
985; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
986; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
987; X86-SSE2-NEXT:    psllw %xmm1, %xmm2
988; X86-SSE2-NEXT:    psrlw $8, %xmm2
989; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
990; X86-SSE2-NEXT:    psllw %xmm1, %xmm0
991; X86-SSE2-NEXT:    psrlw $8, %xmm0
992; X86-SSE2-NEXT:    packuswb %xmm2, %xmm0
993; X86-SSE2-NEXT:    retl
994  %splat = shufflevector <16 x i8> %b, <16 x i8> undef, <16 x i32> zeroinitializer
995  %splat8 = sub <16 x i8> <i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8, i8 8>, %splat
996  %shl = shl <16 x i8> %a, %splat
997  %lshr = lshr <16 x i8> %a, %splat8
998  %or = or <16 x i8> %shl, %lshr
999  ret <16 x i8> %or
1000}
1001
1002;
1003; Constant Rotates
1004;
1005
1006define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
1007; SSE2-LABEL: constant_rotate_v2i64:
1008; SSE2:       # %bb.0:
1009; SSE2-NEXT:    movdqa %xmm0, %xmm1
1010; SSE2-NEXT:    psrlq $60, %xmm1
1011; SSE2-NEXT:    movdqa %xmm0, %xmm2
1012; SSE2-NEXT:    psrlq $50, %xmm2
1013; SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1014; SSE2-NEXT:    movdqa %xmm0, %xmm1
1015; SSE2-NEXT:    psllq $4, %xmm1
1016; SSE2-NEXT:    psllq $14, %xmm0
1017; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1018; SSE2-NEXT:    orpd %xmm2, %xmm0
1019; SSE2-NEXT:    retq
1020;
1021; SSE41-LABEL: constant_rotate_v2i64:
1022; SSE41:       # %bb.0:
1023; SSE41-NEXT:    movdqa %xmm0, %xmm1
1024; SSE41-NEXT:    psrlq $50, %xmm1
1025; SSE41-NEXT:    movdqa %xmm0, %xmm2
1026; SSE41-NEXT:    psrlq $60, %xmm2
1027; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1028; SSE41-NEXT:    movdqa %xmm0, %xmm1
1029; SSE41-NEXT:    psllq $14, %xmm1
1030; SSE41-NEXT:    psllq $4, %xmm0
1031; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1032; SSE41-NEXT:    por %xmm2, %xmm0
1033; SSE41-NEXT:    retq
1034;
1035; AVX1-LABEL: constant_rotate_v2i64:
1036; AVX1:       # %bb.0:
1037; AVX1-NEXT:    vpsrlq $50, %xmm0, %xmm1
1038; AVX1-NEXT:    vpsrlq $60, %xmm0, %xmm2
1039; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7]
1040; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm2
1041; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm0
1042; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1043; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1044; AVX1-NEXT:    retq
1045;
1046; AVX2-LABEL: constant_rotate_v2i64:
1047; AVX2:       # %bb.0:
1048; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1049; AVX2-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1050; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1051; AVX2-NEXT:    retq
1052;
1053; AVX512NOVLX-LABEL: constant_rotate_v2i64:
1054; AVX512NOVLX:       # %bb.0:
1055; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1056; AVX512NOVLX-NEXT:    vpmovsxbq {{.*#+}} xmm1 = [4,14]
1057; AVX512NOVLX-NEXT:    vprolvq %zmm1, %zmm0, %zmm0
1058; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1059; AVX512NOVLX-NEXT:    vzeroupper
1060; AVX512NOVLX-NEXT:    retq
1061;
1062; AVX512VLX-LABEL: constant_rotate_v2i64:
1063; AVX512VLX:       # %bb.0:
1064; AVX512VLX-NEXT:    vprolvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1065; AVX512VLX-NEXT:    retq
1066;
1067; XOP-LABEL: constant_rotate_v2i64:
1068; XOP:       # %bb.0:
1069; XOP-NEXT:    vprotq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1070; XOP-NEXT:    retq
1071;
1072; X86-SSE2-LABEL: constant_rotate_v2i64:
1073; X86-SSE2:       # %bb.0:
1074; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1075; X86-SSE2-NEXT:    psrlq $60, %xmm1
1076; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
1077; X86-SSE2-NEXT:    psrlq $50, %xmm2
1078; X86-SSE2-NEXT:    movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1]
1079; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1080; X86-SSE2-NEXT:    psllq $4, %xmm1
1081; X86-SSE2-NEXT:    psllq $14, %xmm0
1082; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1083; X86-SSE2-NEXT:    orpd %xmm2, %xmm0
1084; X86-SSE2-NEXT:    retl
1085  %shl = shl <2 x i64> %a, <i64 4, i64 14>
1086  %lshr = lshr <2 x i64> %a, <i64 60, i64 50>
1087  %or = or <2 x i64> %shl, %lshr
1088  ret <2 x i64> %or
1089}
1090
1091define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
1092; SSE2-LABEL: constant_rotate_v4i32:
1093; SSE2:       # %bb.0:
1094; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1095; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1096; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
1097; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1098; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
1099; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1100; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1101; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1102; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1103; SSE2-NEXT:    por %xmm2, %xmm0
1104; SSE2-NEXT:    retq
1105;
1106; SSE41-LABEL: constant_rotate_v4i32:
1107; SSE41:       # %bb.0:
1108; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1109; SSE41-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1110; SSE41-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1111; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1112; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1113; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
1114; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1115; SSE41-NEXT:    por %xmm2, %xmm0
1116; SSE41-NEXT:    retq
1117;
1118; AVX1-LABEL: constant_rotate_v4i32:
1119; AVX1:       # %bb.0:
1120; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1121; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1122; AVX1-NEXT:    vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1123; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1124; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7]
1125; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,2,2]
1126; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
1127; AVX1-NEXT:    vpor %xmm2, %xmm0, %xmm0
1128; AVX1-NEXT:    retq
1129;
1130; AVX2-LABEL: constant_rotate_v4i32:
1131; AVX2:       # %bb.0:
1132; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1133; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1134; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1135; AVX2-NEXT:    retq
1136;
1137; AVX512NOVLX-LABEL: constant_rotate_v4i32:
1138; AVX512NOVLX:       # %bb.0:
1139; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1140; AVX512NOVLX-NEXT:    vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7]
1141; AVX512NOVLX-NEXT:    vprolvd %zmm1, %zmm0, %zmm0
1142; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1143; AVX512NOVLX-NEXT:    vzeroupper
1144; AVX512NOVLX-NEXT:    retq
1145;
1146; AVX512VLX-LABEL: constant_rotate_v4i32:
1147; AVX512VLX:       # %bb.0:
1148; AVX512VLX-NEXT:    vprolvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1149; AVX512VLX-NEXT:    retq
1150;
1151; XOP-LABEL: constant_rotate_v4i32:
1152; XOP:       # %bb.0:
1153; XOP-NEXT:    vprotd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1154; XOP-NEXT:    retq
1155;
1156; X86-SSE2-LABEL: constant_rotate_v4i32:
1157; X86-SSE2:       # %bb.0:
1158; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
1159; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1160; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,3,2,3]
1161; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1162; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[1,3,2,3]
1163; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
1164; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1165; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
1166; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
1167; X86-SSE2-NEXT:    por %xmm2, %xmm0
1168; X86-SSE2-NEXT:    retl
1169  %shl = shl <4 x i32> %a, <i32 4, i32 5, i32 6, i32 7>
1170  %lshr = lshr <4 x i32> %a, <i32 28, i32 27, i32 26, i32 25>
1171  %or = or <4 x i32> %shl, %lshr
1172  ret <4 x i32> %or
1173}
1174
1175define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
1176; SSE2-LABEL: constant_rotate_v8i16:
1177; SSE2:       # %bb.0:
1178; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1179; SSE2-NEXT:    movdqa %xmm0, %xmm2
1180; SSE2-NEXT:    pmulhuw %xmm1, %xmm2
1181; SSE2-NEXT:    pmullw %xmm1, %xmm0
1182; SSE2-NEXT:    por %xmm2, %xmm0
1183; SSE2-NEXT:    retq
1184;
1185; SSE41-LABEL: constant_rotate_v8i16:
1186; SSE41:       # %bb.0:
1187; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1188; SSE41-NEXT:    movdqa %xmm0, %xmm2
1189; SSE41-NEXT:    pmulhuw %xmm1, %xmm2
1190; SSE41-NEXT:    pmullw %xmm1, %xmm0
1191; SSE41-NEXT:    por %xmm2, %xmm0
1192; SSE41-NEXT:    retq
1193;
1194; AVX-LABEL: constant_rotate_v8i16:
1195; AVX:       # %bb.0:
1196; AVX-NEXT:    vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1197; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
1198; AVX-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1199; AVX-NEXT:    vpor %xmm2, %xmm0, %xmm0
1200; AVX-NEXT:    retq
1201;
1202; AVX512F-LABEL: constant_rotate_v8i16:
1203; AVX512F:       # %bb.0:
1204; AVX512F-NEXT:    vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1205; AVX512F-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
1206; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1207; AVX512F-NEXT:    vpor %xmm2, %xmm0, %xmm0
1208; AVX512F-NEXT:    retq
1209;
1210; AVX512VL-LABEL: constant_rotate_v8i16:
1211; AVX512VL:       # %bb.0:
1212; AVX512VL-NEXT:    vmovdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1213; AVX512VL-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2
1214; AVX512VL-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
1215; AVX512VL-NEXT:    vpor %xmm2, %xmm0, %xmm0
1216; AVX512VL-NEXT:    retq
1217;
1218; AVX512BW-LABEL: constant_rotate_v8i16:
1219; AVX512BW:       # %bb.0:
1220; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1221; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
1222; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
1223; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm2
1224; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm0
1225; AVX512BW-NEXT:    vpor %xmm2, %xmm0, %xmm0
1226; AVX512BW-NEXT:    vzeroupper
1227; AVX512BW-NEXT:    retq
1228;
1229; AVX512VLBW-LABEL: constant_rotate_v8i16:
1230; AVX512VLBW:       # %bb.0:
1231; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1232; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1233; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1234; AVX512VLBW-NEXT:    retq
1235;
1236; AVX512VBMI2-LABEL: constant_rotate_v8i16:
1237; AVX512VBMI2:       # %bb.0:
1238; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1239; AVX512VBMI2-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
1240; AVX512VBMI2-NEXT:    vpshldvw %zmm1, %zmm0, %zmm0
1241; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1242; AVX512VBMI2-NEXT:    vzeroupper
1243; AVX512VBMI2-NEXT:    retq
1244;
1245; AVX512VLVBMI2-LABEL: constant_rotate_v8i16:
1246; AVX512VLVBMI2:       # %bb.0:
1247; AVX512VLVBMI2-NEXT:    vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1248; AVX512VLVBMI2-NEXT:    retq
1249;
1250; XOP-LABEL: constant_rotate_v8i16:
1251; XOP:       # %bb.0:
1252; XOP-NEXT:    vprotw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1253; XOP-NEXT:    retq
1254;
1255; X86-SSE2-LABEL: constant_rotate_v8i16:
1256; X86-SSE2:       # %bb.0:
1257; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128]
1258; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
1259; X86-SSE2-NEXT:    pmulhuw %xmm1, %xmm2
1260; X86-SSE2-NEXT:    pmullw %xmm1, %xmm0
1261; X86-SSE2-NEXT:    por %xmm2, %xmm0
1262; X86-SSE2-NEXT:    retl
1263  %shl = shl <8 x i16> %a, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>
1264  %lshr = lshr <8 x i16> %a, <i16 16, i16 15, i16 14, i16 13, i16 12, i16 11, i16 10, i16 9>
1265  %or = or <8 x i16> %shl, %lshr
1266  ret <8 x i16> %or
1267}
1268
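; Constant per-lane i8 rotates: each byte is unpacked to a word alongside a copy
; of itself, multiplied by the power-of-two rotate amount, and the high byte of
; the product (the rotated value) is packed back down.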
1269define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
1270; SSE-LABEL: constant_rotate_v16i8:
1271; SSE:       # %bb.0:
1272; SSE-NEXT:    movdqa %xmm0, %xmm1
1273; SSE-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1274; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,128,64,32,16,8,4,2]
1275; SSE-NEXT:    psrlw $8, %xmm1
1276; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1277; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128]
1278; SSE-NEXT:    psrlw $8, %xmm0
1279; SSE-NEXT:    packuswb %xmm1, %xmm0
1280; SSE-NEXT:    retq
1281;
1282; AVX-LABEL: constant_rotate_v16i8:
1283; AVX:       # %bb.0:
1284; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1285; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2]
1286; AVX-NEXT:    vpsrlw $8, %xmm1, %xmm1
1287; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1288; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
1289; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1290; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1291; AVX-NEXT:    retq
1292;
1293; AVX512F-LABEL: constant_rotate_v16i8:
1294; AVX512F:       # %bb.0:
1295; AVX512F-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1296; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2]
1297; AVX512F-NEXT:    vpsrlw $8, %xmm1, %xmm1
1298; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1299; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
1300; AVX512F-NEXT:    vpsrlw $8, %xmm0, %xmm0
1301; AVX512F-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1302; AVX512F-NEXT:    retq
1303;
1304; AVX512VL-LABEL: constant_rotate_v16i8:
1305; AVX512VL:       # %bb.0:
1306; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1307; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [1,128,64,32,16,8,4,2]
1308; AVX512VL-NEXT:    vpsrlw $8, %xmm1, %xmm1
1309; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1310; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
1311; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
1312; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1313; AVX512VL-NEXT:    retq
1314;
1315; AVX512BW-LABEL: constant_rotate_v16i8:
1316; AVX512BW:       # %bb.0:
1317; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1]
1318; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1319; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
1320; AVX512BW-NEXT:    vpsrlw $8, %xmm1, %xmm1
1321; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1322; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1323; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
1324; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm0
1325; AVX512BW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1326; AVX512BW-NEXT:    vzeroupper
1327; AVX512BW-NEXT:    retq
1328;
1329; AVX512VLBW-LABEL: constant_rotate_v16i8:
1330; AVX512VLBW:       # %bb.0:
1331; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1332; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1333; AVX512VLBW-NEXT:    vpsrlw $8, %xmm1, %xmm1
1334; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1335; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1336; AVX512VLBW-NEXT:    vpsrlw $8, %xmm0, %xmm0
1337; AVX512VLBW-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1338; AVX512VLBW-NEXT:    retq
1339;
1340; AVX512VBMI2-LABEL: constant_rotate_v16i8:
1341; AVX512VBMI2:       # %bb.0:
1342; AVX512VBMI2-NEXT:    vpmovsxbw {{.*#+}} xmm1 = [0,7,6,5,4,3,2,1]
1343; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1344; AVX512VBMI2-NEXT:    vpsllvw %zmm1, %zmm2, %zmm1
1345; AVX512VBMI2-NEXT:    vpsrlw $8, %xmm1, %xmm1
1346; AVX512VBMI2-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1347; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1348; AVX512VBMI2-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
1349; AVX512VBMI2-NEXT:    vpsrlw $8, %xmm0, %xmm0
1350; AVX512VBMI2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1351; AVX512VBMI2-NEXT:    vzeroupper
1352; AVX512VBMI2-NEXT:    retq
1353;
1354; AVX512VLVBMI2-LABEL: constant_rotate_v16i8:
1355; AVX512VLVBMI2:       # %bb.0:
1356; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1357; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1358; AVX512VLVBMI2-NEXT:    vpsrlw $8, %xmm1, %xmm1
1359; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1360; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1361; AVX512VLVBMI2-NEXT:    vpsrlw $8, %xmm0, %xmm0
1362; AVX512VLVBMI2-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
1363; AVX512VLVBMI2-NEXT:    retq
1364;
1365; XOP-LABEL: constant_rotate_v16i8:
1366; XOP:       # %bb.0:
1367; XOP-NEXT:    vprotb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1368; XOP-NEXT:    retq
1369;
1370; X86-SSE2-LABEL: constant_rotate_v16i8:
1371; X86-SSE2:       # %bb.0:
1372; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1373; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1374; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,128,64,32,16,8,4,2]
1375; X86-SSE2-NEXT:    psrlw $8, %xmm1
1376; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1377; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128]
1378; X86-SSE2-NEXT:    psrlw $8, %xmm0
1379; X86-SSE2-NEXT:    packuswb %xmm1, %xmm0
1380; X86-SSE2-NEXT:    retl
1381  %shl = shl <16 x i8> %a, <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>
1382  %lshr = lshr <16 x i8> %a, <i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>
1383  %or = or <16 x i8> %shl, %lshr
1384  ret <16 x i8> %or
1385}
1386
1387;
1388; Uniform Constant Rotates
1389;
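; Every lane rotates by the same immediate, so these lower to an immediate shift
; pair plus an OR, or to a single vprol/vprot/vpshld where available.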
1390
1391define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
1392; SSE-LABEL: splatconstant_rotate_v2i64:
1393; SSE:       # %bb.0:
1394; SSE-NEXT:    movdqa %xmm0, %xmm1
1395; SSE-NEXT:    psrlq $50, %xmm1
1396; SSE-NEXT:    psllq $14, %xmm0
1397; SSE-NEXT:    por %xmm1, %xmm0
1398; SSE-NEXT:    retq
1399;
1400; AVX-LABEL: splatconstant_rotate_v2i64:
1401; AVX:       # %bb.0:
1402; AVX-NEXT:    vpsrlq $50, %xmm0, %xmm1
1403; AVX-NEXT:    vpsllq $14, %xmm0, %xmm0
1404; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1405; AVX-NEXT:    retq
1406;
1407; AVX512NOVLX-LABEL: splatconstant_rotate_v2i64:
1408; AVX512NOVLX:       # %bb.0:
1409; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1410; AVX512NOVLX-NEXT:    vprolq $14, %zmm0, %zmm0
1411; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1412; AVX512NOVLX-NEXT:    vzeroupper
1413; AVX512NOVLX-NEXT:    retq
1414;
1415; AVX512VLX-LABEL: splatconstant_rotate_v2i64:
1416; AVX512VLX:       # %bb.0:
1417; AVX512VLX-NEXT:    vprolq $14, %xmm0, %xmm0
1418; AVX512VLX-NEXT:    retq
1419;
1420; XOP-LABEL: splatconstant_rotate_v2i64:
1421; XOP:       # %bb.0:
1422; XOP-NEXT:    vprotq $14, %xmm0, %xmm0
1423; XOP-NEXT:    retq
1424;
1425; X86-SSE2-LABEL: splatconstant_rotate_v2i64:
1426; X86-SSE2:       # %bb.0:
1427; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1428; X86-SSE2-NEXT:    psrlq $50, %xmm1
1429; X86-SSE2-NEXT:    psllq $14, %xmm0
1430; X86-SSE2-NEXT:    por %xmm1, %xmm0
1431; X86-SSE2-NEXT:    retl
1432  %shl = shl <2 x i64> %a, <i64 14, i64 14>
1433  %lshr = lshr <2 x i64> %a, <i64 50, i64 50>
1434  %or = or <2 x i64> %shl, %lshr
1435  ret <2 x i64> %or
1436}
1437
1438define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
1439; SSE-LABEL: splatconstant_rotate_v4i32:
1440; SSE:       # %bb.0:
1441; SSE-NEXT:    movdqa %xmm0, %xmm1
1442; SSE-NEXT:    psrld $28, %xmm1
1443; SSE-NEXT:    pslld $4, %xmm0
1444; SSE-NEXT:    por %xmm1, %xmm0
1445; SSE-NEXT:    retq
1446;
1447; AVX-LABEL: splatconstant_rotate_v4i32:
1448; AVX:       # %bb.0:
1449; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
1450; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
1451; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1452; AVX-NEXT:    retq
1453;
1454; AVX512NOVLX-LABEL: splatconstant_rotate_v4i32:
1455; AVX512NOVLX:       # %bb.0:
1456; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1457; AVX512NOVLX-NEXT:    vprold $4, %zmm0, %zmm0
1458; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1459; AVX512NOVLX-NEXT:    vzeroupper
1460; AVX512NOVLX-NEXT:    retq
1461;
1462; AVX512VLX-LABEL: splatconstant_rotate_v4i32:
1463; AVX512VLX:       # %bb.0:
1464; AVX512VLX-NEXT:    vprold $4, %xmm0, %xmm0
1465; AVX512VLX-NEXT:    retq
1466;
1467; XOP-LABEL: splatconstant_rotate_v4i32:
1468; XOP:       # %bb.0:
1469; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
1470; XOP-NEXT:    retq
1471;
1472; X86-SSE2-LABEL: splatconstant_rotate_v4i32:
1473; X86-SSE2:       # %bb.0:
1474; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1475; X86-SSE2-NEXT:    psrld $28, %xmm1
1476; X86-SSE2-NEXT:    pslld $4, %xmm0
1477; X86-SSE2-NEXT:    por %xmm1, %xmm0
1478; X86-SSE2-NEXT:    retl
1479  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
1480  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
1481  %or = or <4 x i32> %shl, %lshr
1482  ret <4 x i32> %or
1483}
1484
1485define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
1486; SSE-LABEL: splatconstant_rotate_v8i16:
1487; SSE:       # %bb.0:
1488; SSE-NEXT:    movdqa %xmm0, %xmm1
1489; SSE-NEXT:    psrlw $9, %xmm1
1490; SSE-NEXT:    psllw $7, %xmm0
1491; SSE-NEXT:    por %xmm1, %xmm0
1492; SSE-NEXT:    retq
1493;
1494; AVX-LABEL: splatconstant_rotate_v8i16:
1495; AVX:       # %bb.0:
1496; AVX-NEXT:    vpsrlw $9, %xmm0, %xmm1
1497; AVX-NEXT:    vpsllw $7, %xmm0, %xmm0
1498; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1499; AVX-NEXT:    retq
1500;
1501; AVX512F-LABEL: splatconstant_rotate_v8i16:
1502; AVX512F:       # %bb.0:
1503; AVX512F-NEXT:    vpsrlw $9, %xmm0, %xmm1
1504; AVX512F-NEXT:    vpsllw $7, %xmm0, %xmm0
1505; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1506; AVX512F-NEXT:    retq
1507;
1508; AVX512VL-LABEL: splatconstant_rotate_v8i16:
1509; AVX512VL:       # %bb.0:
1510; AVX512VL-NEXT:    vpsrlw $9, %xmm0, %xmm1
1511; AVX512VL-NEXT:    vpsllw $7, %xmm0, %xmm0
1512; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1513; AVX512VL-NEXT:    retq
1514;
1515; AVX512BW-LABEL: splatconstant_rotate_v8i16:
1516; AVX512BW:       # %bb.0:
1517; AVX512BW-NEXT:    vpsrlw $9, %xmm0, %xmm1
1518; AVX512BW-NEXT:    vpsllw $7, %xmm0, %xmm0
1519; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1520; AVX512BW-NEXT:    retq
1521;
1522; AVX512VLBW-LABEL: splatconstant_rotate_v8i16:
1523; AVX512VLBW:       # %bb.0:
1524; AVX512VLBW-NEXT:    vpsrlw $9, %xmm0, %xmm1
1525; AVX512VLBW-NEXT:    vpsllw $7, %xmm0, %xmm0
1526; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1527; AVX512VLBW-NEXT:    retq
1528;
1529; AVX512VBMI2-LABEL: splatconstant_rotate_v8i16:
1530; AVX512VBMI2:       # %bb.0:
1531; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1532; AVX512VBMI2-NEXT:    vpshldw $7, %zmm0, %zmm0, %zmm0
1533; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1534; AVX512VBMI2-NEXT:    vzeroupper
1535; AVX512VBMI2-NEXT:    retq
1536;
1537; AVX512VLVBMI2-LABEL: splatconstant_rotate_v8i16:
1538; AVX512VLVBMI2:       # %bb.0:
1539; AVX512VLVBMI2-NEXT:    vpshldw $7, %xmm0, %xmm0, %xmm0
1540; AVX512VLVBMI2-NEXT:    retq
1541;
1542; XOP-LABEL: splatconstant_rotate_v8i16:
1543; XOP:       # %bb.0:
1544; XOP-NEXT:    vprotw $7, %xmm0, %xmm0
1545; XOP-NEXT:    retq
1546;
1547; X86-SSE2-LABEL: splatconstant_rotate_v8i16:
1548; X86-SSE2:       # %bb.0:
1549; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1550; X86-SSE2-NEXT:    psrlw $9, %xmm1
1551; X86-SSE2-NEXT:    psllw $7, %xmm0
1552; X86-SSE2-NEXT:    por %xmm1, %xmm0
1553; X86-SSE2-NEXT:    retl
1554  %shl = shl <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
1555  %lshr = lshr <8 x i16> %a, <i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9, i16 9>
1556  %or = or <8 x i16> %shl, %lshr
1557  ret <8 x i16> %or
1558}
1559
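; There are no per-byte shift instructions, so the i8 case uses word shifts and
; masks off the bits that cross byte boundaries (folded into vpternlogd on AVX512).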
1560define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
1561; SSE-LABEL: splatconstant_rotate_v16i8:
1562; SSE:       # %bb.0:
1563; SSE-NEXT:    movdqa %xmm0, %xmm1
1564; SSE-NEXT:    psrlw $4, %xmm1
1565; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1566; SSE-NEXT:    psllw $4, %xmm0
1567; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1568; SSE-NEXT:    por %xmm1, %xmm0
1569; SSE-NEXT:    retq
1570;
1571; AVX-LABEL: splatconstant_rotate_v16i8:
1572; AVX:       # %bb.0:
1573; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
1574; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1575; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
1576; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1577; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1578; AVX-NEXT:    retq
1579;
1580; AVX512NOVLX-LABEL: splatconstant_rotate_v16i8:
1581; AVX512NOVLX:       # %bb.0:
1582; AVX512NOVLX-NEXT:    vpsllw $4, %xmm0, %xmm1
1583; AVX512NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1584; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
1585; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1586; AVX512NOVLX-NEXT:    vzeroupper
1587; AVX512NOVLX-NEXT:    retq
1588;
1589; AVX512VLX-LABEL: splatconstant_rotate_v16i8:
1590; AVX512VLX:       # %bb.0:
1591; AVX512VLX-NEXT:    vpsllw $4, %xmm0, %xmm1
1592; AVX512VLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1593; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
1594; AVX512VLX-NEXT:    retq
1595;
1596; XOP-LABEL: splatconstant_rotate_v16i8:
1597; XOP:       # %bb.0:
1598; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
1599; XOP-NEXT:    retq
1600;
1601; X86-SSE2-LABEL: splatconstant_rotate_v16i8:
1602; X86-SSE2:       # %bb.0:
1603; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1604; X86-SSE2-NEXT:    psrlw $4, %xmm1
1605; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1606; X86-SSE2-NEXT:    psllw $4, %xmm0
1607; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1608; X86-SSE2-NEXT:    por %xmm1, %xmm0
1609; X86-SSE2-NEXT:    retl
1610  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1611  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1612  %or = or <16 x i8> %shl, %lshr
1613  ret <16 x i8> %or
1614}
1615
1616;
1617; Masked Uniform Constant Rotates
1618;
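; As above, but the rotated result is masked; the masks can let part of the
; rotate fold away (v2i64 keeps only the masked right shift).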
1619
1620define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
1621; SSE-LABEL: splatconstant_rotate_mask_v2i64:
1622; SSE:       # %bb.0:
1623; SSE-NEXT:    psrlq $49, %xmm0
1624; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1625; SSE-NEXT:    retq
1626;
1627; AVX-LABEL: splatconstant_rotate_mask_v2i64:
1628; AVX:       # %bb.0:
1629; AVX-NEXT:    vpsrlq $49, %xmm0, %xmm0
1630; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1631; AVX-NEXT:    retq
1632;
1633; AVX512-LABEL: splatconstant_rotate_mask_v2i64:
1634; AVX512:       # %bb.0:
1635; AVX512-NEXT:    vpsrlq $49, %xmm0, %xmm0
1636; AVX512-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1637; AVX512-NEXT:    retq
1638;
1639; XOP-LABEL: splatconstant_rotate_mask_v2i64:
1640; XOP:       # %bb.0:
1641; XOP-NEXT:    vpsrlq $49, %xmm0, %xmm0
1642; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1643; XOP-NEXT:    retq
1644;
1645; X86-SSE2-LABEL: splatconstant_rotate_mask_v2i64:
1646; X86-SSE2:       # %bb.0:
1647; X86-SSE2-NEXT:    psrlq $49, %xmm0
1648; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1649; X86-SSE2-NEXT:    retl
1650  %shl = shl <2 x i64> %a, <i64 15, i64 15>
1651  %lshr = lshr <2 x i64> %a, <i64 49, i64 49>
1652  %rmask = and <2 x i64> %lshr, <i64 255, i64 127>
1653  %lmask = and <2 x i64> %shl, <i64 65, i64 33>
1654  %or = or <2 x i64> %lmask, %rmask
1655  ret <2 x i64> %or
1656}
1657
1658define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
1659; SSE-LABEL: splatconstant_rotate_mask_v4i32:
1660; SSE:       # %bb.0:
1661; SSE-NEXT:    movdqa %xmm0, %xmm1
1662; SSE-NEXT:    psrld $28, %xmm1
1663; SSE-NEXT:    pslld $4, %xmm0
1664; SSE-NEXT:    por %xmm1, %xmm0
1665; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1666; SSE-NEXT:    retq
1667;
1668; AVX-LABEL: splatconstant_rotate_mask_v4i32:
1669; AVX:       # %bb.0:
1670; AVX-NEXT:    vpsrld $28, %xmm0, %xmm1
1671; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
1672; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1673; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1674; AVX-NEXT:    retq
1675;
1676; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v4i32:
1677; AVX512NOVLX:       # %bb.0:
1678; AVX512NOVLX-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1679; AVX512NOVLX-NEXT:    vprold $4, %zmm0, %zmm0
1680; AVX512NOVLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1681; AVX512NOVLX-NEXT:    vzeroupper
1682; AVX512NOVLX-NEXT:    retq
1683;
1684; AVX512VLX-LABEL: splatconstant_rotate_mask_v4i32:
1685; AVX512VLX:       # %bb.0:
1686; AVX512VLX-NEXT:    vprold $4, %xmm0, %xmm0
1687; AVX512VLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1688; AVX512VLX-NEXT:    retq
1689;
1690; XOP-LABEL: splatconstant_rotate_mask_v4i32:
1691; XOP:       # %bb.0:
1692; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
1693; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1694; XOP-NEXT:    retq
1695;
1696; X86-SSE2-LABEL: splatconstant_rotate_mask_v4i32:
1697; X86-SSE2:       # %bb.0:
1698; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1699; X86-SSE2-NEXT:    psrld $28, %xmm1
1700; X86-SSE2-NEXT:    pslld $4, %xmm0
1701; X86-SSE2-NEXT:    por %xmm1, %xmm0
1702; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1703; X86-SSE2-NEXT:    retl
1704  %shl = shl <4 x i32> %a, <i32 4, i32 4, i32 4, i32 4>
1705  %lshr = lshr <4 x i32> %a, <i32 28, i32 28, i32 28, i32 28>
1706  %rmask = and <4 x i32> %lshr, <i32 127, i32 255, i32 511, i32 1023>
1707  %lmask = and <4 x i32> %shl, <i32 1023, i32 511, i32 255, i32 127>
1708  %or = or <4 x i32> %lmask, %rmask
1709  ret <4 x i32> %or
1710}
1711
1712define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
1713; SSE-LABEL: splatconstant_rotate_mask_v8i16:
1714; SSE:       # %bb.0:
1715; SSE-NEXT:    movdqa %xmm0, %xmm1
1716; SSE-NEXT:    psrlw $11, %xmm1
1717; SSE-NEXT:    psllw $5, %xmm0
1718; SSE-NEXT:    por %xmm1, %xmm0
1719; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1720; SSE-NEXT:    retq
1721;
1722; AVX-LABEL: splatconstant_rotate_mask_v8i16:
1723; AVX:       # %bb.0:
1724; AVX-NEXT:    vpsrlw $11, %xmm0, %xmm1
1725; AVX-NEXT:    vpsllw $5, %xmm0, %xmm0
1726; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1727; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1728; AVX-NEXT:    retq
1729;
1730; AVX512F-LABEL: splatconstant_rotate_mask_v8i16:
1731; AVX512F:       # %bb.0:
1732; AVX512F-NEXT:    vpsrlw $11, %xmm0, %xmm1
1733; AVX512F-NEXT:    vpsllw $5, %xmm0, %xmm0
1734; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1735; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1736; AVX512F-NEXT:    retq
1737;
1738; AVX512VL-LABEL: splatconstant_rotate_mask_v8i16:
1739; AVX512VL:       # %bb.0:
1740; AVX512VL-NEXT:    vpsllw $5, %xmm0, %xmm1
1741; AVX512VL-NEXT:    vpsrlw $11, %xmm0, %xmm0
1742; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1)
1743; AVX512VL-NEXT:    retq
1744;
1745; AVX512BW-LABEL: splatconstant_rotate_mask_v8i16:
1746; AVX512BW:       # %bb.0:
1747; AVX512BW-NEXT:    vpsrlw $11, %xmm0, %xmm1
1748; AVX512BW-NEXT:    vpsllw $5, %xmm0, %xmm0
1749; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1750; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1751; AVX512BW-NEXT:    retq
1752;
1753; AVX512VLBW-LABEL: splatconstant_rotate_mask_v8i16:
1754; AVX512VLBW:       # %bb.0:
1755; AVX512VLBW-NEXT:    vpsllw $5, %xmm0, %xmm1
1756; AVX512VLBW-NEXT:    vpsrlw $11, %xmm0, %xmm0
1757; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = mem & (xmm0 | xmm1)
1758; AVX512VLBW-NEXT:    retq
1759;
1760; AVX512VBMI2-LABEL: splatconstant_rotate_mask_v8i16:
1761; AVX512VBMI2:       # %bb.0:
1762; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1763; AVX512VBMI2-NEXT:    vpshldw $5, %zmm0, %zmm0, %zmm0
1764; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1765; AVX512VBMI2-NEXT:    vzeroupper
1766; AVX512VBMI2-NEXT:    retq
1767;
1768; AVX512VLVBMI2-LABEL: splatconstant_rotate_mask_v8i16:
1769; AVX512VLVBMI2:       # %bb.0:
1770; AVX512VLVBMI2-NEXT:    vpshldw $5, %xmm0, %xmm0, %xmm0
1771; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1772; AVX512VLVBMI2-NEXT:    retq
1773;
1774; XOP-LABEL: splatconstant_rotate_mask_v8i16:
1775; XOP:       # %bb.0:
1776; XOP-NEXT:    vprotw $5, %xmm0, %xmm0
1777; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1778; XOP-NEXT:    retq
1779;
1780; X86-SSE2-LABEL: splatconstant_rotate_mask_v8i16:
1781; X86-SSE2:       # %bb.0:
1782; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1783; X86-SSE2-NEXT:    psrlw $11, %xmm1
1784; X86-SSE2-NEXT:    psllw $5, %xmm0
1785; X86-SSE2-NEXT:    por %xmm1, %xmm0
1786; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1787; X86-SSE2-NEXT:    retl
1788  %shl = shl <8 x i16> %a, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
1789  %lshr = lshr <8 x i16> %a, <i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11, i16 11>
1790  %rmask = and <8 x i16> %lshr, <i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55, i16 55>
1791  %lmask = and <8 x i16> %shl, <i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33, i16 33>
1792  %or = or <8 x i16> %lmask, %rmask
1793  ret <8 x i16> %or
1794}
1795
1796define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
1797; SSE-LABEL: splatconstant_rotate_mask_v16i8:
1798; SSE:       # %bb.0:
1799; SSE-NEXT:    movdqa %xmm0, %xmm1
1800; SSE-NEXT:    psrlw $4, %xmm1
1801; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1802; SSE-NEXT:    psllw $4, %xmm0
1803; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1804; SSE-NEXT:    por %xmm1, %xmm0
1805; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1806; SSE-NEXT:    retq
1807;
1808; AVX-LABEL: splatconstant_rotate_mask_v16i8:
1809; AVX:       # %bb.0:
1810; AVX-NEXT:    vpsrlw $4, %xmm0, %xmm1
1811; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1812; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
1813; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1814; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1815; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1816; AVX-NEXT:    retq
1817;
1818; AVX512NOVLX-LABEL: splatconstant_rotate_mask_v16i8:
1819; AVX512NOVLX:       # %bb.0:
1820; AVX512NOVLX-NEXT:    vpsllw $4, %xmm0, %xmm1
1821; AVX512NOVLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1822; AVX512NOVLX-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm1))
1823; AVX512NOVLX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1824; AVX512NOVLX-NEXT:    vzeroupper
1825; AVX512NOVLX-NEXT:    retq
1826;
1827; AVX512VLX-LABEL: splatconstant_rotate_mask_v16i8:
1828; AVX512VLX:       # %bb.0:
1829; AVX512VLX-NEXT:    vpsllw $4, %xmm0, %xmm1
1830; AVX512VLX-NEXT:    vpsrlw $4, %xmm0, %xmm0
1831; AVX512VLX-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm1))
1832; AVX512VLX-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
1833; AVX512VLX-NEXT:    retq
1834;
1835; XOP-LABEL: splatconstant_rotate_mask_v16i8:
1836; XOP:       # %bb.0:
1837; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
1838; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1839; XOP-NEXT:    retq
1840;
1841; X86-SSE2-LABEL: splatconstant_rotate_mask_v16i8:
1842; X86-SSE2:       # %bb.0:
1843; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1844; X86-SSE2-NEXT:    psrlw $4, %xmm1
1845; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1846; X86-SSE2-NEXT:    psllw $4, %xmm0
1847; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1848; X86-SSE2-NEXT:    por %xmm1, %xmm0
1849; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1850; X86-SSE2-NEXT:    retl
1851  %shl = shl <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1852  %lshr = lshr <16 x i8> %a, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
1853  %rmask = and <16 x i8> %lshr, <i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55, i8 55>
1854  %lmask = and <16 x i8> %shl, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
1855  %or = or <16 x i8> %lmask, %rmask
1856  ret <16 x i8> %or
1857}
1858
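; Only the bits kept by the final AND are demanded; the lowering keeps the shifts
; and OR as separate operations rather than forming a rotate.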
1859define <4 x i32> @rot16_demandedbits(<4 x i32> %x, <4 x i32> %y) nounwind {
1860; X86-LABEL: rot16_demandedbits:
1861; X86:       # %bb.0:
1862; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
1863; X86-NEXT:    movl %eax, %ecx
1864; X86-NEXT:    shrl $11, %ecx
1865; X86-NEXT:    shll $5, %eax
1866; X86-NEXT:    orl %ecx, %eax
1867; X86-NEXT:    andl $65536, %eax # imm = 0x10000
1868; X86-NEXT:    retl
1869;
1870; X64-LABEL: rot16_demandedbits:
1871; X64:       # %bb.0:
1872; X64-NEXT:    movl %edi, %eax
1873; X64-NEXT:    movl %edi, %ecx
1874; X64-NEXT:    shrl $11, %ecx
1875; X64-NEXT:    shll $5, %eax
1876; X64-NEXT:    orl %ecx, %eax
1877; X64-NEXT:    andl $65536, %eax # imm = 0x10000
1878; X64-NEXT:    retq
1879; SSE2-LABEL: rot16_demandedbits:
1880; SSE2:       # %bb.0:
1881; SSE2-NEXT:    movdqa %xmm0, %xmm1
1882; SSE2-NEXT:    psrld $11, %xmm1
1883; SSE2-NEXT:    pslld $11, %xmm0
1884; SSE2-NEXT:    por %xmm1, %xmm0
1885; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1886; SSE2-NEXT:    retq
1887;
1888; SSE41-LABEL: rot16_demandedbits:
1889; SSE41:       # %bb.0:
1890; SSE41-NEXT:    movdqa %xmm0, %xmm1
1891; SSE41-NEXT:    psrld $11, %xmm1
1892; SSE41-NEXT:    pslld $11, %xmm0
1893; SSE41-NEXT:    por %xmm1, %xmm0
1894; SSE41-NEXT:    pxor %xmm1, %xmm1
1895; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1896; SSE41-NEXT:    retq
1897;
1898; AVX-LABEL: rot16_demandedbits:
1899; AVX:       # %bb.0:
1900; AVX-NEXT:    vpsrld $11, %xmm0, %xmm1
1901; AVX-NEXT:    vpslld $11, %xmm0, %xmm0
1902; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1903; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1904; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1905; AVX-NEXT:    retq
1906;
1907; AVX512-LABEL: rot16_demandedbits:
1908; AVX512:       # %bb.0:
1909; AVX512-NEXT:    vpsrld $11, %xmm0, %xmm1
1910; AVX512-NEXT:    vpslld $11, %xmm0, %xmm0
1911; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
1912; AVX512-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1913; AVX512-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1914; AVX512-NEXT:    retq
1915;
1916; XOP-LABEL: rot16_demandedbits:
1917; XOP:       # %bb.0:
1918; XOP-NEXT:    vpsrld $11, %xmm0, %xmm1
1919; XOP-NEXT:    vpslld $11, %xmm0, %xmm0
1920; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1921; XOP-NEXT:    vpxor %xmm1, %xmm1, %xmm1
1922; XOP-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
1923; XOP-NEXT:    retq
1924;
1925; X86-SSE2-LABEL: rot16_demandedbits:
1926; X86-SSE2:       # %bb.0:
1927; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1928; X86-SSE2-NEXT:    psrld $11, %xmm1
1929; X86-SSE2-NEXT:    pslld $11, %xmm0
1930; X86-SSE2-NEXT:    por %xmm1, %xmm0
1931; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1932; X86-SSE2-NEXT:    retl
1933  %t0 = lshr <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
1934  %t1 = shl <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
1935  %t2 = or <4 x i32> %t0, %t1
1936  %t3 = and <4 x i32> %t2, <i32 65535, i32 65535, i32 65535, i32 65535>
1937  ret <4 x i32> %t3
1938}
1939
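; The shift/or is computed in 32 bits and only truncated to i16 at the end; no
; 16-bit rotate is formed.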
1940define <4 x i16> @rot16_trunc(<4 x i32> %x, <4 x i32> %y) nounwind {
1941; SSE2-LABEL: rot16_trunc:
1942; SSE2:       # %bb.0:
1943; SSE2-NEXT:    movdqa %xmm0, %xmm1
1944; SSE2-NEXT:    psrld $11, %xmm1
1945; SSE2-NEXT:    pslld $5, %xmm0
1946; SSE2-NEXT:    por %xmm1, %xmm0
1947; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1948; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
1949; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1950; SSE2-NEXT:    retq
1951;
1952; SSE41-LABEL: rot16_trunc:
1953; SSE41:       # %bb.0:
1954; SSE41-NEXT:    movdqa %xmm0, %xmm1
1955; SSE41-NEXT:    psrld $11, %xmm1
1956; SSE41-NEXT:    pslld $5, %xmm0
1957; SSE41-NEXT:    por %xmm1, %xmm0
1958; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1959; SSE41-NEXT:    retq
1960;
1961; AVX-LABEL: rot16_trunc:
1962; AVX:       # %bb.0:
1963; AVX-NEXT:    vpsrld $11, %xmm0, %xmm1
1964; AVX-NEXT:    vpslld $5, %xmm0, %xmm0
1965; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1966; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1967; AVX-NEXT:    retq
1968;
1969; AVX512NOVLX-LABEL: rot16_trunc:
1970; AVX512NOVLX:       # %bb.0:
1971; AVX512NOVLX-NEXT:    vpsrld $11, %xmm0, %xmm1
1972; AVX512NOVLX-NEXT:    vpslld $5, %xmm0, %xmm0
1973; AVX512NOVLX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1974; AVX512NOVLX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1975; AVX512NOVLX-NEXT:    retq
1976;
1977; AVX512VLX-LABEL: rot16_trunc:
1978; AVX512VLX:       # %bb.0:
1979; AVX512VLX-NEXT:    vpsrld $11, %xmm0, %xmm1
1980; AVX512VLX-NEXT:    vpslld $5, %xmm0, %xmm0
1981; AVX512VLX-NEXT:    vpor %xmm0, %xmm1, %xmm0
1982; AVX512VLX-NEXT:    vpmovdw %xmm0, %xmm0
1983; AVX512VLX-NEXT:    retq
1984;
1985; XOP-LABEL: rot16_trunc:
1986; XOP:       # %bb.0:
1987; XOP-NEXT:    vpsrld $11, %xmm0, %xmm1
1988; XOP-NEXT:    vpslld $5, %xmm0, %xmm0
1989; XOP-NEXT:    vpor %xmm0, %xmm1, %xmm0
1990; XOP-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
1991; XOP-NEXT:    retq
1992;
1993; X86-SSE2-LABEL: rot16_trunc:
1994; X86-SSE2:       # %bb.0:
1995; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1996; X86-SSE2-NEXT:    psrld $11, %xmm1
1997; X86-SSE2-NEXT:    pslld $5, %xmm0
1998; X86-SSE2-NEXT:    por %xmm1, %xmm0
1999; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
2000; X86-SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
2001; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
2002; X86-SSE2-NEXT:    retl
2003  %t0 = lshr <4 x i32> %x, <i32 11, i32 11, i32 11, i32 11>
2004  %t1 = shl <4 x i32> %x, <i32 5, i32 5, i32 5, i32 5>
2005  %t2 = or <4 x i32> %t0, %t1
2006  %t3 = trunc <4 x i32> %t2 to <4 x i16>
2007  ret <4 x i16> %t3
2008}
2009