; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX512VLVBMI2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2

; Just one 32-bit run to make sure we do reasonable things for i64 cases.
; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2

declare <2 x i64> @llvm.fshr.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
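;
; Note: per LangRef, llvm.fshr(%x, %y, %amt) is a funnel shift right: for each
; element, %x:%y is treated as a double-width value (%x in the high half), that
; value is shifted right by %amt modulo the element bit width, and the low half
; of the result is returned.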

;
; Variable Shifts
;

define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v2i64:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [63,63]
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    pand %xmm3, %xmm4
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    psrlq %xmm4, %xmm5
; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE2-NEXT:    psrlq %xmm4, %xmm1
; SSE2-NEXT:    shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
; SSE2-NEXT:    pandn %xmm3, %xmm2
; SSE2-NEXT:    paddq %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm0, %xmm1
; SSE2-NEXT:    psllq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE2-NEXT:    psllq %xmm2, %xmm0
; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; SSE2-NEXT:    orpd %xmm5, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_funnnel_v2i64:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm3 = [63,63]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pand %xmm3, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm5
; SSE41-NEXT:    psrlq %xmm4, %xmm5
; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; SSE41-NEXT:    psrlq %xmm4, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pandn %xmm3, %xmm2
; SSE41-NEXT:    paddq %xmm0, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm1
; SSE41-NEXT:    psllq %xmm2, %xmm1
; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; SSE41-NEXT:    psllq %xmm2, %xmm0
; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
; SSE41-NEXT:    por %xmm5, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_funnnel_v2i64:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_funnnel_v2i64:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_funnnel_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_funnnel_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_funnnel_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_funnnel_v2i64:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshrdvq %zmm2, %zmm0, %zmm1
; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: var_funnnel_v2i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshrdvq %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: var_funnnel_v2i64:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_funnnel_v2i64:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X86-SSE2-LABEL: var_funnnel_v2i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [63,0,63,0]
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
; X86-SSE2-NEXT:    pand %xmm4, %xmm5
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
; X86-SSE2-NEXT:    psrlq %xmm5, %xmm3
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; X86-SSE2-NEXT:    psrlq %xmm5, %xmm1
; X86-SSE2-NEXT:    shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
; X86-SSE2-NEXT:    pandn %xmm4, %xmm2
; X86-SSE2-NEXT:    paddq %xmm0, %xmm0
; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
; X86-SSE2-NEXT:    psllq %xmm2, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
; X86-SSE2-NEXT:    psllq %xmm2, %xmm0
; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
; X86-SSE2-NEXT:    orpd %xmm3, %xmm0
; X86-SSE2-NEXT:    retl
  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
  ret <2 x i64> %res
}

define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v4i32:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [31,31,31,31]
; SSE2-NEXT:    movdqa %xmm2, %xmm5
; SSE2-NEXT:    pand %xmm4, %xmm5
; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    psrld %xmm3, %xmm6
; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm1, %xmm3
; SSE2-NEXT:    psrld %xmm7, %xmm3
; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    psrld %xmm6, %xmm7
; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
; SSE2-NEXT:    psrld %xmm5, %xmm1
; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
; SSE2-NEXT:    pandn %xmm4, %xmm2
; SSE2-NEXT:    pslld $23, %xmm2
; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    cvttps2dq %xmm2, %xmm1
; SSE2-NEXT:    paddd %xmm0, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm1, %xmm0
; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; SSE2-NEXT:    pmuludq %xmm2, %xmm1
; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_funnnel_v4i32:
; SSE41:       # %bb.0:
; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm3 = [31,31,31,31]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pand %xmm3, %xmm4
; SSE41-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm6
; SSE41-NEXT:    psrld %xmm5, %xmm6
; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm8
; SSE41-NEXT:    psrld %xmm7, %xmm8
; SSE41-NEXT:    pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    movdqa %xmm1, %xmm6
; SSE41-NEXT:    psrld %xmm4, %xmm6
; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
; SSE41-NEXT:    psrld %xmm4, %xmm1
; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
; SSE41-NEXT:    pandn %xmm3, %xmm2
; SSE41-NEXT:    pslld $23, %xmm2
; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE41-NEXT:    cvttps2dq %xmm2, %xmm1
; SSE41-NEXT:    paddd %xmm0, %xmm0
; SSE41-NEXT:    pmulld %xmm1, %xmm0
; SSE41-NEXT:    por %xmm6, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_funnnel_v4i32:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
; AVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm6
; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
; AVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_funnnel_v4i32:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_funnnel_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_funnnel_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_funnnel_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_funnnel_v4i32:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: var_funnnel_v4i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshrdvd %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: var_funnnel_v4i32:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpshld %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpshld %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_funnnel_v4i32:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpaddd %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X86-SSE2-LABEL: var_funnnel_v4i32:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [31,31,31,31]
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
; X86-SSE2-NEXT:    pand %xmm4, %xmm5
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
; X86-SSE2-NEXT:    psrld %xmm3, %xmm6
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
; X86-SSE2-NEXT:    psrld %xmm7, %xmm3
; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
; X86-SSE2-NEXT:    psrld %xmm6, %xmm7
; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
; X86-SSE2-NEXT:    psrld %xmm5, %xmm1
; X86-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
; X86-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
; X86-SSE2-NEXT:    pandn %xmm4, %xmm2
; X86-SSE2-NEXT:    pslld $23, %xmm2
; X86-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm1
; X86-SSE2-NEXT:    paddd %xmm0, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm1
; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; X86-SSE2-NEXT:    por %xmm3, %xmm0
; X86-SSE2-NEXT:    retl
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
  ret <4 x i32> %res
}

define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v8i16:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa %xmm2, %xmm4
; SSE2-NEXT:    psllw $12, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm3
; SSE2-NEXT:    psraw $15, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm5
; SSE2-NEXT:    pandn %xmm1, %xmm5
; SSE2-NEXT:    psrlw $8, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    pandn %xmm3, %xmm5
; SSE2-NEXT:    psrlw $4, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    psraw $15, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    pandn %xmm3, %xmm5
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    por %xmm5, %xmm3
; SSE2-NEXT:    paddw %xmm4, %xmm4
; SSE2-NEXT:    psraw $15, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    pandn %xmm3, %xmm1
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    pand %xmm4, %xmm3
; SSE2-NEXT:    por %xmm1, %xmm3
; SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE2-NEXT:    movdqa %xmm2, %xmm1
; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; SSE2-NEXT:    pslld $23, %xmm1
; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; SSE2-NEXT:    paddd %xmm4, %xmm1
; SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; SSE2-NEXT:    pslld $16, %xmm1
; SSE2-NEXT:    psrad $16, %xmm1
; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; SSE2-NEXT:    pslld $23, %xmm2
; SSE2-NEXT:    paddd %xmm4, %xmm2
; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
; SSE2-NEXT:    pslld $16, %xmm2
; SSE2-NEXT:    psrad $16, %xmm2
; SSE2-NEXT:    packssdw %xmm1, %xmm2
; SSE2-NEXT:    paddw %xmm0, %xmm0
; SSE2-NEXT:    pmullw %xmm2, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_funnnel_v8i16:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm0, %xmm3
; SSE41-NEXT:    pmovsxbw {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15]
; SSE41-NEXT:    movdqa %xmm2, %xmm4
; SSE41-NEXT:    pand %xmm5, %xmm4
; SSE41-NEXT:    psllw $4, %xmm4
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    psllw $12, %xmm0
; SSE41-NEXT:    por %xmm4, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    paddw %xmm0, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm6
; SSE41-NEXT:    psrlw $8, %xmm6
; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm6
; SSE41-NEXT:    psrlw $4, %xmm6
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm6
; SSE41-NEXT:    psrlw $2, %xmm6
; SSE41-NEXT:    paddw %xmm4, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm6
; SSE41-NEXT:    psrlw $1, %xmm6
; SSE41-NEXT:    paddw %xmm4, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
; SSE41-NEXT:    pandn %xmm5, %xmm2
; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
; SSE41-NEXT:    pslld $23, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; SSE41-NEXT:    paddd %xmm4, %xmm2
; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
; SSE41-NEXT:    pslld $23, %xmm0
; SSE41-NEXT:    paddd %xmm4, %xmm0
; SSE41-NEXT:    cvttps2dq %xmm0, %xmm0
; SSE41-NEXT:    packusdw %xmm2, %xmm0
; SSE41-NEXT:    paddw %xmm3, %xmm3
; SSE41-NEXT:    pmullw %xmm0, %xmm3
; SSE41-NEXT:    por %xmm1, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_funnnel_v8i16:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpsllw $4, %xmm4, %xmm4
; AVX1-NEXT:    vpsllw $12, %xmm2, %xmm5
; AVX1-NEXT:    vpor %xmm5, %xmm4, %xmm4
; AVX1-NEXT:    vpaddw %xmm4, %xmm4, %xmm5
; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm6
; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm4
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm4
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm4
; AVX1-NEXT:    vpaddw %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4,4,5,5,6,6,7,7]
; AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; AVX1-NEXT:    vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
; AVX1-NEXT:    vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
; AVX1-NEXT:    vpackusdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_funnnel_v8i16:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
; AVX2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX2-NEXT:    vzeroupper
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_funnnel_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512F-NEXT:    vpslld $16, %ymm0, %ymm0
; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512F-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_funnnel_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512VL-NEXT:    vpslld $16, %ymm0, %ymm0
; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
; AVX512VL-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_funnnel_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512BW-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_funnnel_v8i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshrdvw %zmm2, %zmm0, %zmm1
; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: var_funnnel_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512VLBW-NEXT:    vpsrlvw %xmm4, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512VLBW-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpsllvw %xmm2, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshrdvw %xmm2, %xmm0, %xmm1
; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: var_funnnel_v8i16:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_funnnel_v8i16:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X86-SSE2-LABEL: var_funnnel_v8i16:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
; X86-SSE2-NEXT:    psllw $12, %xmm4
; X86-SSE2-NEXT:    movdqa %xmm4, %xmm3
; X86-SSE2-NEXT:    psraw $15, %xmm3
; X86-SSE2-NEXT:    movdqa %xmm3, %xmm5
; X86-SSE2-NEXT:    pandn %xmm1, %xmm5
; X86-SSE2-NEXT:    psrlw $8, %xmm1
; X86-SSE2-NEXT:    pand %xmm1, %xmm3
; X86-SSE2-NEXT:    por %xmm5, %xmm3
; X86-SSE2-NEXT:    paddw %xmm4, %xmm4
; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
; X86-SSE2-NEXT:    psraw $15, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
; X86-SSE2-NEXT:    psrlw $4, %xmm3
; X86-SSE2-NEXT:    pand %xmm1, %xmm3
; X86-SSE2-NEXT:    por %xmm5, %xmm3
; X86-SSE2-NEXT:    paddw %xmm4, %xmm4
; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
; X86-SSE2-NEXT:    psraw $15, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
; X86-SSE2-NEXT:    pandn %xmm3, %xmm5
; X86-SSE2-NEXT:    psrlw $2, %xmm3
; X86-SSE2-NEXT:    pand %xmm1, %xmm3
; X86-SSE2-NEXT:    por %xmm5, %xmm3
; X86-SSE2-NEXT:    paddw %xmm4, %xmm4
; X86-SSE2-NEXT:    psraw $15, %xmm4
; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
; X86-SSE2-NEXT:    pandn %xmm3, %xmm1
; X86-SSE2-NEXT:    psrlw $1, %xmm3
; X86-SSE2-NEXT:    pand %xmm4, %xmm3
; X86-SSE2-NEXT:    por %xmm1, %xmm3
; X86-SSE2-NEXT:    pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm1
; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
; X86-SSE2-NEXT:    pslld $23, %xmm1
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
; X86-SSE2-NEXT:    paddd %xmm4, %xmm1
; X86-SSE2-NEXT:    cvttps2dq %xmm1, %xmm1
; X86-SSE2-NEXT:    pslld $16, %xmm1
; X86-SSE2-NEXT:    psrad $16, %xmm1
; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
; X86-SSE2-NEXT:    pslld $23, %xmm2
; X86-SSE2-NEXT:    paddd %xmm4, %xmm2
; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
; X86-SSE2-NEXT:    pslld $16, %xmm2
; X86-SSE2-NEXT:    psrad $16, %xmm2
; X86-SSE2-NEXT:    packssdw %xmm1, %xmm2
; X86-SSE2-NEXT:    paddw %xmm0, %xmm0
; X86-SSE2-NEXT:    pmullw %xmm2, %xmm0
; X86-SSE2-NEXT:    por %xmm3, %xmm0
; X86-SSE2-NEXT:    retl
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
  ret <8 x i16> %res
}

define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
; SSE2-LABEL: var_funnnel_v16i8:
; SSE2:       # %bb.0:
; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE2-NEXT:    movdqa %xmm2, %xmm6
; SSE2-NEXT:    pand %xmm5, %xmm6
; SSE2-NEXT:    psllw $5, %xmm6
; SSE2-NEXT:    pxor %xmm4, %xmm4
; SSE2-NEXT:    pxor %xmm3, %xmm3
; SSE2-NEXT:    pcmpgtb %xmm6, %xmm3
; SSE2-NEXT:    movdqa %xmm3, %xmm7
; SSE2-NEXT:    pandn %xmm1, %xmm7
; SSE2-NEXT:    psrlw $4, %xmm1
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    paddb %xmm6, %xmm6
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm7
; SSE2-NEXT:    pandn %xmm3, %xmm7
; SSE2-NEXT:    psrlw $2, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm7, %xmm3
; SSE2-NEXT:    paddb %xmm6, %xmm6
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm6
; SSE2-NEXT:    pandn %xmm3, %xmm6
; SSE2-NEXT:    psrlw $1, %xmm3
; SSE2-NEXT:    pand %xmm1, %xmm3
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE2-NEXT:    por %xmm6, %xmm3
; SSE2-NEXT:    pandn %xmm5, %xmm2
; SSE2-NEXT:    psllw $5, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psllw $4, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pxor %xmm1, %xmm1
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; SSE2-NEXT:    movdqa %xmm1, %xmm5
; SSE2-NEXT:    pandn %xmm0, %xmm5
; SSE2-NEXT:    psllw $2, %xmm0
; SSE2-NEXT:    pand %xmm1, %xmm0
; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE2-NEXT:    por %xmm5, %xmm0
; SSE2-NEXT:    paddb %xmm2, %xmm2
; SSE2-NEXT:    pcmpgtb %xmm2, %xmm4
; SSE2-NEXT:    movdqa %xmm4, %xmm1
; SSE2-NEXT:    pandn %xmm0, %xmm1
; SSE2-NEXT:    paddb %xmm0, %xmm0
; SSE2-NEXT:    pand %xmm4, %xmm0
; SSE2-NEXT:    por %xmm1, %xmm0
; SSE2-NEXT:    por %xmm3, %xmm0
; SSE2-NEXT:    retq
;
; SSE41-LABEL: var_funnnel_v16i8:
; SSE41:       # %bb.0:
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    movdqa %xmm0, %xmm2
; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pand %xmm5, %xmm0
; SSE41-NEXT:    psllw $5, %xmm0
; SSE41-NEXT:    movdqa %xmm0, %xmm4
; SSE41-NEXT:    paddb %xmm0, %xmm4
; SSE41-NEXT:    movdqa %xmm1, %xmm6
; SSE41-NEXT:    psrlw $4, %xmm6
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm6
; SSE41-NEXT:    psrlw $2, %xmm6
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
; SSE41-NEXT:    movdqa %xmm1, %xmm6
; SSE41-NEXT:    psrlw $1, %xmm6
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
; SSE41-NEXT:    paddb %xmm4, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm1
; SSE41-NEXT:    pandn %xmm5, %xmm3
; SSE41-NEXT:    psllw $5, %xmm3
; SSE41-NEXT:    movdqa %xmm3, %xmm4
; SSE41-NEXT:    paddb %xmm3, %xmm4
; SSE41-NEXT:    paddb %xmm2, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm5
; SSE41-NEXT:    psllw $4, %xmm5
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
; SSE41-NEXT:    movdqa %xmm3, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    psllw $2, %xmm3
; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm3
; SSE41-NEXT:    paddb %xmm2, %xmm3
; SSE41-NEXT:    paddb %xmm4, %xmm4
; SSE41-NEXT:    movdqa %xmm4, %xmm0
; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
; SSE41-NEXT:    por %xmm1, %xmm2
; SSE41-NEXT:    movdqa %xmm2, %xmm0
; SSE41-NEXT:    retq
;
; AVX1-LABEL: var_funnnel_v16i8:
; AVX1:       # %bb.0:
; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX1-NEXT:    vpsllw $5, %xmm4, %xmm4
; AVX1-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm6
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
; AVX1-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $2, %xmm1, %xmm4
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpsrlw $1, %xmm1, %xmm4
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX1-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX1-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX1-NEXT:    vpsllw $5, %xmm2, %xmm2
; AVX1-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $4, %xmm0, %xmm4
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX1-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX1-NEXT:    vpsllw $2, %xmm0, %xmm2
; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
; AVX1-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
; AVX1-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX1-NEXT:    retq
;
; AVX2-LABEL: var_funnnel_v16i8:
; AVX2:       # %bb.0:
; AVX2-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX2-NEXT:    vpsllw $5, %xmm4, %xmm4
; AVX2-NEXT:    vpaddb %xmm4, %xmm4, %xmm5
; AVX2-NEXT:    vpsrlw $4, %xmm1, %xmm6
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6, %xmm6
; AVX2-NEXT:    vpblendvb %xmm4, %xmm6, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $2, %xmm1, %xmm4
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpsrlw $1, %xmm1, %xmm4
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX2-NEXT:    vpaddb %xmm5, %xmm5, %xmm5
; AVX2-NEXT:    vpblendvb %xmm5, %xmm4, %xmm1, %xmm1
; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX2-NEXT:    vpsllw $5, %xmm2, %xmm2
; AVX2-NEXT:    vpaddb %xmm2, %xmm2, %xmm3
; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $4, %xmm0, %xmm4
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4, %xmm4
; AVX2-NEXT:    vpblendvb %xmm2, %xmm4, %xmm0, %xmm0
; AVX2-NEXT:    vpsllw $2, %xmm0, %xmm2
; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX2-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm2
; AVX2-NEXT:    vpaddb %xmm3, %xmm3, %xmm3
; AVX2-NEXT:    vpblendvb %xmm3, %xmm2, %xmm0, %xmm0
; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT:    retq
;
; AVX512F-LABEL: var_funnnel_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512F-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512F-NEXT:    vpsllvd %zmm2, %zmm0, %zmm0
; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: var_funnnel_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
; AVX512VL-NEXT:    vpsrlvd %zmm4, %zmm1, %zmm1
; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
; AVX512VL-NEXT:    vpsllvd %zmm2, %zmm0, %zmm0
; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
; AVX512VL-NEXT:    vzeroupper
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: var_funnnel_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512BW-NEXT:    vpsllw $8, %ymm0, %ymm0
; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: var_funnnel_v16i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
; AVX512VBMI2-NEXT:    vpermt2b %zmm0, %zmm3, %zmm1
; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VBMI2-NEXT:    vpsrlvw %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: var_funnnel_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLBW-NEXT:    vpsllw $8, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
; AVX512VLBW-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0
; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512VLBW-NEXT:    vzeroupper
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
; AVX512VLVBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
; AVX512VLVBMI2-NEXT:    vpermi2b %ymm0, %ymm1, %ymm3
; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
; AVX512VLVBMI2-NEXT:    vpsrlvw %ymm0, %ymm3, %ymm0
; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
; AVX512VLVBMI2-NEXT:    vzeroupper
; AVX512VLVBMI2-NEXT:    retq
;
; XOPAVX1-LABEL: var_funnnel_v16i8:
; XOPAVX1:       # %bb.0:
; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
; XOPAVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm0, %xmm0
; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX1-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOPAVX1-NEXT:    retq
;
; XOPAVX2-LABEL: var_funnnel_v16i8:
; XOPAVX2:       # %bb.0:
; XOPAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
; XOPAVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm0, %xmm0
; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
; XOPAVX2-NEXT:    vpsubb %xmm2, %xmm3, %xmm2
; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOPAVX2-NEXT:    retq
;
; X86-SSE2-LABEL: var_funnnel_v16i8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
; X86-SSE2-NEXT:    movdqa %xmm2, %xmm6
; X86-SSE2-NEXT:    pand %xmm5, %xmm6
; X86-SSE2-NEXT:    psllw $5, %xmm6
; X86-SSE2-NEXT:    pxor %xmm4, %xmm4
; X86-SSE2-NEXT:    pxor %xmm3, %xmm3
; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm3
; X86-SSE2-NEXT:    movdqa %xmm3, %xmm7
; X86-SSE2-NEXT:    pandn %xmm1, %xmm7
; X86-SSE2-NEXT:    psrlw $4, %xmm1
; X86-SSE2-NEXT:    pand %xmm1, %xmm3
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT:    por %xmm7, %xmm3
; X86-SSE2-NEXT:    paddb %xmm6, %xmm6
; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
; X86-SSE2-NEXT:    pandn %xmm3, %xmm7
; X86-SSE2-NEXT:    psrlw $2, %xmm3
; X86-SSE2-NEXT:    pand %xmm1, %xmm3
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT:    por %xmm7, %xmm3
; X86-SSE2-NEXT:    paddb %xmm6, %xmm6
; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
; X86-SSE2-NEXT:    pcmpgtb %xmm6, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
; X86-SSE2-NEXT:    pandn %xmm3, %xmm6
; X86-SSE2-NEXT:    psrlw $1, %xmm3
; X86-SSE2-NEXT:    pand %xmm1, %xmm3
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT:    por %xmm6, %xmm3
; X86-SSE2-NEXT:    pandn %xmm5, %xmm2
; X86-SSE2-NEXT:    psllw $5, %xmm2
; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; X86-SSE2-NEXT:    paddb %xmm0, %xmm0
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
; X86-SSE2-NEXT:    pandn %xmm0, %xmm5
; X86-SSE2-NEXT:    psllw $4, %xmm0
; X86-SSE2-NEXT:    pand %xmm1, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    por %xmm5, %xmm0
; X86-SSE2-NEXT:    paddb %xmm2, %xmm2
; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm1
; X86-SSE2-NEXT:    movdqa %xmm1, %xmm5
; X86-SSE2-NEXT:    pandn %xmm0, %xmm5
; X86-SSE2-NEXT:    psllw $2, %xmm0
; X86-SSE2-NEXT:    pand %xmm1, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    por %xmm5, %xmm0
; X86-SSE2-NEXT:    paddb %xmm2, %xmm2
; X86-SSE2-NEXT:    pcmpgtb %xmm2, %xmm4
; X86-SSE2-NEXT:    movdqa %xmm4, %xmm1
; X86-SSE2-NEXT:    pandn %xmm0, %xmm1
; X86-SSE2-NEXT:    paddb %xmm0, %xmm0
; X86-SSE2-NEXT:    pand %xmm4, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    por %xmm3, %xmm0
; X86-SSE2-NEXT:    retl
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
  ret <16 x i8> %res
}

;
; Uniform Variable Shifts
;

define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
1092; SSE2-LABEL: splatvar_funnnel_v2i64:
1093; SSE2:       # %bb.0:
1094; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [63,63]
1095; SSE2-NEXT:    movdqa %xmm2, %xmm4
1096; SSE2-NEXT:    pand %xmm3, %xmm4
1097; SSE2-NEXT:    psrlq %xmm4, %xmm1
1098; SSE2-NEXT:    pandn %xmm3, %xmm2
1099; SSE2-NEXT:    paddq %xmm0, %xmm0
1100; SSE2-NEXT:    psllq %xmm2, %xmm0
1101; SSE2-NEXT:    por %xmm1, %xmm0
1102; SSE2-NEXT:    retq
1103;
1104; SSE41-LABEL: splatvar_funnnel_v2i64:
1105; SSE41:       # %bb.0:
1106; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm3 = [63,63]
1107; SSE41-NEXT:    movdqa %xmm2, %xmm4
1108; SSE41-NEXT:    pand %xmm3, %xmm4
1109; SSE41-NEXT:    psrlq %xmm4, %xmm1
1110; SSE41-NEXT:    pandn %xmm3, %xmm2
1111; SSE41-NEXT:    paddq %xmm0, %xmm0
1112; SSE41-NEXT:    psllq %xmm2, %xmm0
1113; SSE41-NEXT:    por %xmm1, %xmm0
1114; SSE41-NEXT:    retq
1115;
1116; AVX-LABEL: splatvar_funnnel_v2i64:
1117; AVX:       # %bb.0:
1118; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
1119; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm4
1120; AVX-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1121; AVX-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1122; AVX-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
1123; AVX-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1124; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1125; AVX-NEXT:    retq
1126;
1127; AVX512F-LABEL: splatvar_funnnel_v2i64:
1128; AVX512F:       # %bb.0:
1129; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
1130; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
1131; AVX512F-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1132; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1133; AVX512F-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
1134; AVX512F-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1135; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1136; AVX512F-NEXT:    retq
1137;
1138; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1139; AVX512VL:       # %bb.0:
1140; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
1141; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
1142; AVX512VL-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1143; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1144; AVX512VL-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
1145; AVX512VL-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1146; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1147; AVX512VL-NEXT:    retq
1148;
1149; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1150; AVX512BW:       # %bb.0:
1151; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
1152; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
1153; AVX512BW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1154; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1155; AVX512BW-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
1156; AVX512BW-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1157; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1158; AVX512BW-NEXT:    retq
1159;
1160; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
1161; AVX512VBMI2:       # %bb.0:
1162; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1163; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1164; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
1165; AVX512VBMI2-NEXT:    vpshrdvq %zmm2, %zmm0, %zmm1
1166; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1167; AVX512VBMI2-NEXT:    vzeroupper
1168; AVX512VBMI2-NEXT:    retq
1169;
1170; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1171; AVX512VLBW:       # %bb.0:
1172; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
1173; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
1174; AVX512VLBW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1175; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1176; AVX512VLBW-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
1177; AVX512VLBW-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1178; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1179; AVX512VLBW-NEXT:    retq
1180;
1181; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
1182; AVX512VLVBMI2:       # %bb.0:
1183; AVX512VLVBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
1184; AVX512VLVBMI2-NEXT:    vpshrdvq %xmm2, %xmm0, %xmm1
1185; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1186; AVX512VLVBMI2-NEXT:    retq
1187;
1188; XOP-LABEL: splatvar_funnnel_v2i64:
1189; XOP:       # %bb.0:
1190; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
1191; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm4
1192; XOP-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1193; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1194; XOP-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
1195; XOP-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1196; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1197; XOP-NEXT:    retq
1198;
1199; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
1200; X86-SSE2:       # %bb.0:
1201; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [63,0,63,0]
1202; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
1203; X86-SSE2-NEXT:    pand %xmm3, %xmm4
1204; X86-SSE2-NEXT:    psrlq %xmm4, %xmm1
1205; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
1206; X86-SSE2-NEXT:    paddq %xmm0, %xmm0
1207; X86-SSE2-NEXT:    psllq %xmm2, %xmm0
1208; X86-SSE2-NEXT:    por %xmm1, %xmm0
1209; X86-SSE2-NEXT:    retl
1210  %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1211  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
1212  ret <2 x i64> %res
1213}
1214
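; v4i32: the i32 lanes of %y and %x are interleaved into i64 lanes
; (punpckldq/punpckhdq), a single 64-bit logical right shift by the masked
; splat amount performs the funnel shift for two elements at once, and the
; low halves are recombined with shufps.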
1215define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1216; SSE-LABEL: splatvar_funnnel_v4i32:
1217; SSE:       # %bb.0:
1218; SSE-NEXT:    movdqa %xmm1, %xmm3
1219; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1220; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1221; SSE-NEXT:    psrlq %xmm2, %xmm3
1222; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1223; SSE-NEXT:    psrlq %xmm2, %xmm1
1224; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
1225; SSE-NEXT:    movaps %xmm1, %xmm0
1226; SSE-NEXT:    retq
1227;
1228; AVX-LABEL: splatvar_funnnel_v4i32:
1229; AVX:       # %bb.0:
1230; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1231; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1232; AVX-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
1233; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1234; AVX-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
1235; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1236; AVX-NEXT:    retq
1237;
1238; AVX512F-LABEL: splatvar_funnnel_v4i32:
1239; AVX512F:       # %bb.0:
1240; AVX512F-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1241; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1242; AVX512F-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
1243; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1244; AVX512F-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
1245; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1246; AVX512F-NEXT:    retq
1247;
1248; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1249; AVX512VL:       # %bb.0:
1250; AVX512VL-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1251; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1252; AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
1253; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1254; AVX512VL-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
1255; AVX512VL-NEXT:    vpmovqd %ymm0, %xmm0
1256; AVX512VL-NEXT:    vzeroupper
1257; AVX512VL-NEXT:    retq
1258;
1259; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1260; AVX512BW:       # %bb.0:
1261; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1262; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1263; AVX512BW-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
1264; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1265; AVX512BW-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
1266; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1267; AVX512BW-NEXT:    retq
1268;
1269; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
1270; AVX512VBMI2:       # %bb.0:
1271; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1272; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1273; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
1274; AVX512VBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
1275; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1276; AVX512VBMI2-NEXT:    vzeroupper
1277; AVX512VBMI2-NEXT:    retq
1278;
1279; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1280; AVX512VLBW:       # %bb.0:
1281; AVX512VLBW-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1282; AVX512VLBW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1283; AVX512VLBW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
1284; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1285; AVX512VLBW-NEXT:    vpsrlq %xmm1, %ymm0, %ymm0
1286; AVX512VLBW-NEXT:    vpmovqd %ymm0, %xmm0
1287; AVX512VLBW-NEXT:    vzeroupper
1288; AVX512VLBW-NEXT:    retq
1289;
1290; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1291; AVX512VLVBMI2:       # %bb.0:
1292; AVX512VLVBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
1293; AVX512VLVBMI2-NEXT:    vpshrdvd %xmm2, %xmm0, %xmm1
1294; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1295; AVX512VLVBMI2-NEXT:    retq
1296;
1297; XOP-LABEL: splatvar_funnnel_v4i32:
1298; XOP:       # %bb.0:
1299; XOP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1300; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1301; XOP-NEXT:    vpsrlq %xmm2, %xmm3, %xmm3
1302; XOP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1303; XOP-NEXT:    vpsrlq %xmm2, %xmm0, %xmm0
1304; XOP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm3[0,2]
1305; XOP-NEXT:    retq
1306;
1307; X86-SSE2-LABEL: splatvar_funnnel_v4i32:
1308; X86-SSE2:       # %bb.0:
1309; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
1310; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1311; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1312; X86-SSE2-NEXT:    psrlq %xmm2, %xmm3
1313; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1314; X86-SSE2-NEXT:    psrlq %xmm2, %xmm1
1315; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
1316; X86-SSE2-NEXT:    movaps %xmm1, %xmm0
1317; X86-SSE2-NEXT:    retl
1318  %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1319  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
1320  ret <4 x i32> %res
1321}
1322
1323define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1324; SSE2-LABEL: splatvar_funnnel_v8i16:
1325; SSE2:       # %bb.0:
1326; SSE2-NEXT:    movd {{.*#+}} xmm3 = [15,0,0,0]
1327; SSE2-NEXT:    movdqa %xmm2, %xmm4
1328; SSE2-NEXT:    pand %xmm3, %xmm4
1329; SSE2-NEXT:    psrlw %xmm4, %xmm1
1330; SSE2-NEXT:    pandn %xmm3, %xmm2
1331; SSE2-NEXT:    paddw %xmm0, %xmm0
1332; SSE2-NEXT:    psllw %xmm2, %xmm0
1333; SSE2-NEXT:    por %xmm1, %xmm0
1334; SSE2-NEXT:    retq
1335;
1336; SSE41-LABEL: splatvar_funnnel_v8i16:
1337; SSE41:       # %bb.0:
1338; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm3 = [15,0]
1339; SSE41-NEXT:    movdqa %xmm2, %xmm4
1340; SSE41-NEXT:    pand %xmm3, %xmm4
1341; SSE41-NEXT:    psrlw %xmm4, %xmm1
1342; SSE41-NEXT:    pandn %xmm3, %xmm2
1343; SSE41-NEXT:    paddw %xmm0, %xmm0
1344; SSE41-NEXT:    psllw %xmm2, %xmm0
1345; SSE41-NEXT:    por %xmm1, %xmm0
1346; SSE41-NEXT:    retq
1347;
1348; AVX-LABEL: splatvar_funnnel_v8i16:
1349; AVX:       # %bb.0:
1350; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1351; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm4
1352; AVX-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1353; AVX-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1354; AVX-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1355; AVX-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1356; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1357; AVX-NEXT:    retq
1358;
1359; AVX512F-LABEL: splatvar_funnnel_v8i16:
1360; AVX512F:       # %bb.0:
1361; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1362; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
1363; AVX512F-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1364; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1365; AVX512F-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1366; AVX512F-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1367; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1368; AVX512F-NEXT:    retq
1369;
1370; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1371; AVX512VL:       # %bb.0:
1372; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1373; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
1374; AVX512VL-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1375; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1376; AVX512VL-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1377; AVX512VL-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1378; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1379; AVX512VL-NEXT:    retq
1380;
1381; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1382; AVX512BW:       # %bb.0:
1383; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1384; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm4
1385; AVX512BW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1386; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1387; AVX512BW-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1388; AVX512BW-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1389; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1390; AVX512BW-NEXT:    retq
1391;
1392; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1393; AVX512VBMI2:       # %bb.0:
1394; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1395; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1396; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
1397; AVX512VBMI2-NEXT:    vpshrdvw %zmm2, %zmm0, %zmm1
1398; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1399; AVX512VBMI2-NEXT:    vzeroupper
1400; AVX512VBMI2-NEXT:    retq
1401;
1402; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1403; AVX512VLBW:       # %bb.0:
1404; AVX512VLBW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1405; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm4
1406; AVX512VLBW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1407; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1408; AVX512VLBW-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1409; AVX512VLBW-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1410; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1411; AVX512VLBW-NEXT:    retq
1412;
1413; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1414; AVX512VLVBMI2:       # %bb.0:
1415; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
1416; AVX512VLVBMI2-NEXT:    vpshrdvw %xmm2, %xmm0, %xmm1
1417; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1418; AVX512VLVBMI2-NEXT:    retq
1419;
1420; XOP-LABEL: splatvar_funnnel_v8i16:
1421; XOP:       # %bb.0:
1422; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1423; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm4
1424; XOP-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1425; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm2
1426; XOP-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1427; XOP-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1428; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1429; XOP-NEXT:    retq
1430;
1431; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
1432; X86-SSE2:       # %bb.0:
1433; X86-SSE2-NEXT:    movd {{.*#+}} xmm3 = [15,0,0,0]
1434; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
1435; X86-SSE2-NEXT:    pand %xmm3, %xmm4
1436; X86-SSE2-NEXT:    psrlw %xmm4, %xmm1
1437; X86-SSE2-NEXT:    pandn %xmm3, %xmm2
1438; X86-SSE2-NEXT:    paddw %xmm0, %xmm0
1439; X86-SSE2-NEXT:    psllw %xmm2, %xmm0
1440; X86-SSE2-NEXT:    por %xmm1, %xmm0
1441; X86-SSE2-NEXT:    retl
1442  %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
1443  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
1444  ret <8 x i16> %res
1445}
1446
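; v16i8: there is no byte-granularity shift, so the %y and %x bytes are
; interleaved into words (punpcklbw/punpckhbw), shifted right as i16 by the
; masked splat amount, masked back to 8 bits and repacked with packuswb.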
1447define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1448; SSE2-LABEL: splatvar_funnnel_v16i8:
1449; SSE2:       # %bb.0:
1450; SSE2-NEXT:    movdqa %xmm1, %xmm4
1451; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1452; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1453; SSE2-NEXT:    psrlw %xmm2, %xmm4
1454; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1455; SSE2-NEXT:    pand %xmm3, %xmm4
1456; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1457; SSE2-NEXT:    psrlw %xmm2, %xmm1
1458; SSE2-NEXT:    pand %xmm1, %xmm3
1459; SSE2-NEXT:    packuswb %xmm4, %xmm3
1460; SSE2-NEXT:    movdqa %xmm3, %xmm0
1461; SSE2-NEXT:    retq
1462;
1463; SSE41-LABEL: splatvar_funnnel_v16i8:
1464; SSE41:       # %bb.0:
1465; SSE41-NEXT:    movdqa %xmm1, %xmm4
1466; SSE41-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1467; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1468; SSE41-NEXT:    psrlw %xmm2, %xmm4
1469; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1470; SSE41-NEXT:    pand %xmm3, %xmm4
1471; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1472; SSE41-NEXT:    psrlw %xmm2, %xmm1
1473; SSE41-NEXT:    pand %xmm1, %xmm3
1474; SSE41-NEXT:    packuswb %xmm4, %xmm3
1475; SSE41-NEXT:    movdqa %xmm3, %xmm0
1476; SSE41-NEXT:    retq
1477;
1478; AVX1-LABEL: splatvar_funnnel_v16i8:
1479; AVX1:       # %bb.0:
1480; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1481; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1482; AVX1-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
1483; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1484; AVX1-NEXT:    vpand %xmm4, %xmm3, %xmm3
1485; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1486; AVX1-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
1487; AVX1-NEXT:    vpand %xmm4, %xmm0, %xmm0
1488; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1489; AVX1-NEXT:    retq
1490;
1491; AVX2-LABEL: splatvar_funnnel_v16i8:
1492; AVX2:       # %bb.0:
1493; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1494; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1495; AVX2-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
1496; AVX2-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1497; AVX2-NEXT:    vpand %xmm4, %xmm3, %xmm3
1498; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1499; AVX2-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
1500; AVX2-NEXT:    vpand %xmm4, %xmm0, %xmm0
1501; AVX2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1502; AVX2-NEXT:    retq
1503;
1504; AVX512F-LABEL: splatvar_funnnel_v16i8:
1505; AVX512F:       # %bb.0:
1506; AVX512F-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1507; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1508; AVX512F-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
1509; AVX512F-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1510; AVX512F-NEXT:    vpand %xmm4, %xmm3, %xmm3
1511; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1512; AVX512F-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
1513; AVX512F-NEXT:    vpand %xmm4, %xmm0, %xmm0
1514; AVX512F-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1515; AVX512F-NEXT:    retq
1516;
1517; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1518; AVX512VL:       # %bb.0:
1519; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1520; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1521; AVX512VL-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
1522; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1523; AVX512VL-NEXT:    vpand %xmm4, %xmm3, %xmm3
1524; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1525; AVX512VL-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
1526; AVX512VL-NEXT:    vpand %xmm4, %xmm0, %xmm0
1527; AVX512VL-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1528; AVX512VL-NEXT:    retq
1529;
1530; AVX512BW-LABEL: splatvar_funnnel_v16i8:
1531; AVX512BW:       # %bb.0:
1532; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1533; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1534; AVX512BW-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
1535; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
1536; AVX512BW-NEXT:    vpand %xmm4, %xmm3, %xmm3
1537; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1538; AVX512BW-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
1539; AVX512BW-NEXT:    vpand %xmm4, %xmm0, %xmm0
1540; AVX512BW-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1541; AVX512BW-NEXT:    retq
1542;
1543; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
1544; AVX512VBMI2:       # %bb.0:
1545; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} xmm3 = [0,2,4,6,8,10,12,14,64,66,68,70,72,74,76,78]
1546; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1547; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1548; AVX512VBMI2-NEXT:    vpsrlw %xmm2, %xmm4, %xmm4
1549; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1550; AVX512VBMI2-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
1551; AVX512VBMI2-NEXT:    vpermt2b %zmm4, %zmm3, %zmm0
1552; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1553; AVX512VBMI2-NEXT:    vzeroupper
1554; AVX512VBMI2-NEXT:    retq
1555;
1556; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
1557; AVX512VLBW:       # %bb.0:
1558; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1559; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1560; AVX512VLBW-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
1561; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1562; AVX512VLBW-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
1563; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
1564; AVX512VLBW-NEXT:    vzeroupper
1565; AVX512VLBW-NEXT:    retq
1566;
1567; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
1568; AVX512VLVBMI2:       # %bb.0:
1569; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1570; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1571; AVX512VLVBMI2-NEXT:    vinserti128 $1, %xmm3, %ymm0, %ymm0
1572; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
1573; AVX512VLVBMI2-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0
1574; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
1575; AVX512VLVBMI2-NEXT:    vzeroupper
1576; AVX512VLVBMI2-NEXT:    retq
1577;
1578; XOP-LABEL: splatvar_funnnel_v16i8:
1579; XOP:       # %bb.0:
1580; XOP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1581; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1582; XOP-NEXT:    vpsrlw %xmm2, %xmm3, %xmm3
1583; XOP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1584; XOP-NEXT:    vpsrlw %xmm2, %xmm0, %xmm0
1585; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14],xmm3[0,2,4,6,8,10,12,14]
1586; XOP-NEXT:    retq
1587;
1588; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
1589; X86-SSE2:       # %bb.0:
1590; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
1591; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
1592; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1593; X86-SSE2-NEXT:    psrlw %xmm2, %xmm4
1594; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1595; X86-SSE2-NEXT:    pand %xmm3, %xmm4
1596; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1597; X86-SSE2-NEXT:    psrlw %xmm2, %xmm1
1598; X86-SSE2-NEXT:    pand %xmm1, %xmm3
1599; X86-SSE2-NEXT:    packuswb %xmm4, %xmm3
1600; X86-SSE2-NEXT:    movdqa %xmm3, %xmm0
1601; X86-SSE2-NEXT:    retl
1602  %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
1603  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
1604  ret <16 x i8> %res
1605}
1606
1607;
1608; Constant Shifts
1609;
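; Here the per-element shift amounts are compile-time constants, so targets
; without variable vector shifts can lower the left-shift half to multiplies
; by powers of two (pmuludq/pmulld/pmullw) and blend together individual
; immediate right shifts for the other half.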
1610
1611define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
1612; SSE2-LABEL: constant_funnnel_v2i64:
1613; SSE2:       # %bb.0:
1614; SSE2-NEXT:    movdqa %xmm1, %xmm2
1615; SSE2-NEXT:    psrlq $4, %xmm2
1616; SSE2-NEXT:    psrlq $14, %xmm1
1617; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1618; SSE2-NEXT:    movdqa %xmm0, %xmm1
1619; SSE2-NEXT:    psllq $60, %xmm1
1620; SSE2-NEXT:    psllq $50, %xmm0
1621; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1622; SSE2-NEXT:    orpd %xmm2, %xmm0
1623; SSE2-NEXT:    retq
1624;
1625; SSE41-LABEL: constant_funnnel_v2i64:
1626; SSE41:       # %bb.0:
1627; SSE41-NEXT:    movdqa %xmm1, %xmm2
1628; SSE41-NEXT:    psrlq $14, %xmm2
1629; SSE41-NEXT:    psrlq $4, %xmm1
1630; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1631; SSE41-NEXT:    movdqa %xmm0, %xmm1
1632; SSE41-NEXT:    psllq $50, %xmm1
1633; SSE41-NEXT:    psllq $60, %xmm0
1634; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1635; SSE41-NEXT:    por %xmm2, %xmm0
1636; SSE41-NEXT:    retq
1637;
1638; AVX1-LABEL: constant_funnnel_v2i64:
1639; AVX1:       # %bb.0:
1640; AVX1-NEXT:    vpsrlq $14, %xmm1, %xmm2
1641; AVX1-NEXT:    vpsrlq $4, %xmm1, %xmm1
1642; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1643; AVX1-NEXT:    vpsllq $50, %xmm0, %xmm2
1644; AVX1-NEXT:    vpsllq $60, %xmm0, %xmm0
1645; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1646; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1647; AVX1-NEXT:    retq
1648;
1649; AVX2-LABEL: constant_funnnel_v2i64:
1650; AVX2:       # %bb.0:
1651; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1652; AVX2-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1653; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1654; AVX2-NEXT:    retq
1655;
1656; AVX512F-LABEL: constant_funnnel_v2i64:
1657; AVX512F:       # %bb.0:
1658; AVX512F-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1659; AVX512F-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1660; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1661; AVX512F-NEXT:    retq
1662;
1663; AVX512VL-LABEL: constant_funnnel_v2i64:
1664; AVX512VL:       # %bb.0:
1665; AVX512VL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1666; AVX512VL-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1667; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1668; AVX512VL-NEXT:    retq
1669;
1670; AVX512BW-LABEL: constant_funnnel_v2i64:
1671; AVX512BW:       # %bb.0:
1672; AVX512BW-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1673; AVX512BW-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1674; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1675; AVX512BW-NEXT:    retq
1676;
1677; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
1678; AVX512VBMI2:       # %bb.0:
1679; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1680; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1681; AVX512VBMI2-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [4,14]
1682; AVX512VBMI2-NEXT:    vpshrdvq %zmm2, %zmm0, %zmm1
1683; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1684; AVX512VBMI2-NEXT:    vzeroupper
1685; AVX512VBMI2-NEXT:    retq
1686;
1687; AVX512VLBW-LABEL: constant_funnnel_v2i64:
1688; AVX512VLBW:       # %bb.0:
1689; AVX512VLBW-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1690; AVX512VLBW-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1691; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1692; AVX512VLBW-NEXT:    retq
1693;
1694; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
1695; AVX512VLVBMI2:       # %bb.0:
1696; AVX512VLVBMI2-NEXT:    vpshrdvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1697; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1698; AVX512VLVBMI2-NEXT:    retq
1699;
1700; XOPAVX1-LABEL: constant_funnnel_v2i64:
1701; XOPAVX1:       # %bb.0:
1702; XOPAVX1-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1703; XOPAVX1-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1704; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1705; XOPAVX1-NEXT:    retq
1706;
1707; XOPAVX2-LABEL: constant_funnnel_v2i64:
1708; XOPAVX2:       # %bb.0:
1709; XOPAVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1710; XOPAVX2-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1711; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1712; XOPAVX2-NEXT:    retq
1713;
1714; X86-SSE2-LABEL: constant_funnnel_v2i64:
1715; X86-SSE2:       # %bb.0:
1716; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
1717; X86-SSE2-NEXT:    psrlq $4, %xmm2
1718; X86-SSE2-NEXT:    psrlq $14, %xmm1
1719; X86-SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1720; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1721; X86-SSE2-NEXT:    psllq $60, %xmm1
1722; X86-SSE2-NEXT:    psllq $50, %xmm0
1723; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1724; X86-SSE2-NEXT:    orpd %xmm2, %xmm0
1725; X86-SSE2-NEXT:    retl
1726  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
1727  ret <2 x i64> %res
1728}
1729
1730define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
1731; SSE2-LABEL: constant_funnnel_v4i32:
1732; SSE2:       # %bb.0:
1733; SSE2-NEXT:    movdqa %xmm1, %xmm2
1734; SSE2-NEXT:    psrld $7, %xmm2
1735; SSE2-NEXT:    movdqa %xmm1, %xmm3
1736; SSE2-NEXT:    psrld $6, %xmm3
1737; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1738; SSE2-NEXT:    movdqa %xmm1, %xmm2
1739; SSE2-NEXT:    psrld $5, %xmm2
1740; SSE2-NEXT:    psrld $4, %xmm1
1741; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1742; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1743; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1744; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1745; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1746; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1747; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1748; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1749; SSE2-NEXT:    por %xmm1, %xmm0
1750; SSE2-NEXT:    retq
1751;
1752; SSE41-LABEL: constant_funnnel_v4i32:
1753; SSE41:       # %bb.0:
1754; SSE41-NEXT:    movdqa %xmm1, %xmm2
1755; SSE41-NEXT:    psrld $7, %xmm2
1756; SSE41-NEXT:    movdqa %xmm1, %xmm3
1757; SSE41-NEXT:    psrld $5, %xmm3
1758; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1759; SSE41-NEXT:    movdqa %xmm1, %xmm2
1760; SSE41-NEXT:    psrld $6, %xmm2
1761; SSE41-NEXT:    psrld $4, %xmm1
1762; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1763; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1764; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1765; SSE41-NEXT:    por %xmm2, %xmm0
1766; SSE41-NEXT:    retq
1767;
1768; AVX1-LABEL: constant_funnnel_v4i32:
1769; AVX1:       # %bb.0:
1770; AVX1-NEXT:    vpsrld $7, %xmm1, %xmm2
1771; AVX1-NEXT:    vpsrld $5, %xmm1, %xmm3
1772; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1773; AVX1-NEXT:    vpsrld $6, %xmm1, %xmm3
1774; AVX1-NEXT:    vpsrld $4, %xmm1, %xmm1
1775; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1776; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1777; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1778; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1779; AVX1-NEXT:    retq
1780;
1781; AVX2-LABEL: constant_funnnel_v4i32:
1782; AVX2:       # %bb.0:
1783; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1784; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1785; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1786; AVX2-NEXT:    retq
1787;
1788; AVX512F-LABEL: constant_funnnel_v4i32:
1789; AVX512F:       # %bb.0:
1790; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1791; AVX512F-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1792; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1793; AVX512F-NEXT:    retq
1794;
1795; AVX512VL-LABEL: constant_funnnel_v4i32:
1796; AVX512VL:       # %bb.0:
1797; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1798; AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1799; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1800; AVX512VL-NEXT:    retq
1801;
1802; AVX512BW-LABEL: constant_funnnel_v4i32:
1803; AVX512BW:       # %bb.0:
1804; AVX512BW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1805; AVX512BW-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1806; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1807; AVX512BW-NEXT:    retq
1808;
1809; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
1810; AVX512VBMI2:       # %bb.0:
1811; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1812; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1813; AVX512VBMI2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,5,6,7]
1814; AVX512VBMI2-NEXT:    vpshrdvd %zmm2, %zmm0, %zmm1
1815; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1816; AVX512VBMI2-NEXT:    vzeroupper
1817; AVX512VBMI2-NEXT:    retq
1818;
1819; AVX512VLBW-LABEL: constant_funnnel_v4i32:
1820; AVX512VLBW:       # %bb.0:
1821; AVX512VLBW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1822; AVX512VLBW-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1823; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1824; AVX512VLBW-NEXT:    retq
1825;
1826; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
1827; AVX512VLVBMI2:       # %bb.0:
1828; AVX512VLVBMI2-NEXT:    vpshrdvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1829; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1830; AVX512VLVBMI2-NEXT:    retq
1831;
1832; XOPAVX1-LABEL: constant_funnnel_v4i32:
1833; XOPAVX1:       # %bb.0:
1834; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1835; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1836; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1837; XOPAVX1-NEXT:    retq
1838;
1839; XOPAVX2-LABEL: constant_funnnel_v4i32:
1840; XOPAVX2:       # %bb.0:
1841; XOPAVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1842; XOPAVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1843; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1844; XOPAVX2-NEXT:    retq
1845;
1846; X86-SSE2-LABEL: constant_funnnel_v4i32:
1847; X86-SSE2:       # %bb.0:
1848; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
1849; X86-SSE2-NEXT:    psrld $7, %xmm2
1850; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
1851; X86-SSE2-NEXT:    psrld $6, %xmm3
1852; X86-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1853; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
1854; X86-SSE2-NEXT:    psrld $5, %xmm2
1855; X86-SSE2-NEXT:    psrld $4, %xmm1
1856; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1857; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1858; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1859; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1860; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1861; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1862; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1863; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1864; X86-SSE2-NEXT:    por %xmm1, %xmm0
1865; X86-SSE2-NEXT:    retl
1866  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
1867  ret <4 x i32> %res
1868}
1869
1870define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
1871; SSE2-LABEL: constant_funnnel_v8i16:
1872; SSE2:       # %bb.0:
1873; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
1874; SSE2-NEXT:    pandn %xmm1, %xmm2
1875; SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
1876; SSE2-NEXT:    por %xmm1, %xmm2
1877; SSE2-NEXT:    paddw %xmm0, %xmm0
1878; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1879; SSE2-NEXT:    por %xmm2, %xmm0
1880; SSE2-NEXT:    retq
1881;
1882; SSE41-LABEL: constant_funnnel_v8i16:
1883; SSE41:       # %bb.0:
1884; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [u,32768,16384,8192,4096,2048,1024,512]
1885; SSE41-NEXT:    pmulhuw %xmm1, %xmm2
1886; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1887; SSE41-NEXT:    paddw %xmm0, %xmm0
1888; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1889; SSE41-NEXT:    por %xmm2, %xmm0
1890; SSE41-NEXT:    retq
1891;
1892; AVX-LABEL: constant_funnnel_v8i16:
1893; AVX:       # %bb.0:
1894; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512]
1895; AVX-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1896; AVX-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1897; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1898; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1899; AVX-NEXT:    retq
1900;
1901; AVX512F-LABEL: constant_funnnel_v8i16:
1902; AVX512F:       # %bb.0:
1903; AVX512F-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512]
1904; AVX512F-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1905; AVX512F-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1906; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1907; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1908; AVX512F-NEXT:    retq
1909;
1910; AVX512VL-LABEL: constant_funnnel_v8i16:
1911; AVX512VL:       # %bb.0:
1912; AVX512VL-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 # [u,32768,16384,8192,4096,2048,1024,512]
1913; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3,4,5,6,7]
1914; AVX512VL-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1915; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1916; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1917; AVX512VL-NEXT:    retq
1918;
1919; AVX512BW-LABEL: constant_funnnel_v8i16:
1920; AVX512BW:       # %bb.0:
1921; AVX512BW-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1922; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1923; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm1
1924; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
1925; AVX512BW-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1926; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
1927; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1928; AVX512BW-NEXT:    vzeroupper
1929; AVX512BW-NEXT:    retq
1930;
1931; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
1932; AVX512VBMI2:       # %bb.0:
1933; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1934; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1935; AVX512VBMI2-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1936; AVX512VBMI2-NEXT:    vpshrdvw %zmm2, %zmm0, %zmm1
1937; AVX512VBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1938; AVX512VBMI2-NEXT:    vzeroupper
1939; AVX512VBMI2-NEXT:    retq
1940;
1941; AVX512VLBW-LABEL: constant_funnnel_v8i16:
1942; AVX512VLBW:       # %bb.0:
1943; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1944; AVX512VLBW-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1945; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1946; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1947; AVX512VLBW-NEXT:    retq
1948;
1949; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
1950; AVX512VLVBMI2:       # %bb.0:
1951; AVX512VLVBMI2-NEXT:    vpshrdvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
1952; AVX512VLVBMI2-NEXT:    vmovdqa %xmm1, %xmm0
1953; AVX512VLVBMI2-NEXT:    retq
1954;
1955; XOP-LABEL: constant_funnnel_v8i16:
1956; XOP:       # %bb.0:
1957; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1958; XOP-NEXT:    vpaddw %xmm0, %xmm0, %xmm0
1959; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1960; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1961; XOP-NEXT:    retq
1962;
1963; X86-SSE2-LABEL: constant_funnnel_v8i16:
1964; X86-SSE2:       # %bb.0:
1965; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535]
1966; X86-SSE2-NEXT:    pandn %xmm1, %xmm2
1967; X86-SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
1968; X86-SSE2-NEXT:    por %xmm1, %xmm2
1969; X86-SSE2-NEXT:    paddw %xmm0, %xmm0
1970; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [32768,16384,8192,4096,2048,1024,512,256]
1971; X86-SSE2-NEXT:    por %xmm2, %xmm0
1972; X86-SSE2-NEXT:    retl
1973  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
1974  ret <8 x i16> %res
1975}
1976
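; v16i8: both halves are widened to i16 (zero-extended or interleaved with
; zero), multiplied by per-lane power-of-two constants to perform the shifts,
; and narrowed back with packuswb; AVX-512BW instead zips the bytes into words
; and uses a single variable word shift (vpsrlvw).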
1977define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
1978; SSE2-LABEL: constant_funnnel_v16i8:
1979; SSE2:       # %bb.0:
1980; SSE2-NEXT:    pxor %xmm2, %xmm2
1981; SSE2-NEXT:    movdqa %xmm1, %xmm3
1982; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
1983; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,2,4,8,16,32,64,128]
1984; SSE2-NEXT:    psrlw $8, %xmm3
1985; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
1986; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,128,64,32,16,8,4,2]
1987; SSE2-NEXT:    psrlw $8, %xmm1
1988; SSE2-NEXT:    packuswb %xmm3, %xmm1
1989; SSE2-NEXT:    paddb %xmm0, %xmm0
1990; SSE2-NEXT:    movdqa %xmm0, %xmm2
1991; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
1992; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [128,1,2,4,8,16,32,64]
1993; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
1994; SSE2-NEXT:    pand %xmm3, %xmm2
1995; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
1996; SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,64,32,16,8,4,2,1]
1997; SSE2-NEXT:    pand %xmm3, %xmm0
1998; SSE2-NEXT:    packuswb %xmm2, %xmm0
1999; SSE2-NEXT:    por %xmm1, %xmm0
2000; SSE2-NEXT:    retq
2001;
2002; SSE41-LABEL: constant_funnnel_v16i8:
2003; SSE41:       # %bb.0:
2004; SSE41-NEXT:    pxor %xmm2, %xmm2
2005; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2006; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2007; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [256,2,4,8,16,32,64,128]
2008; SSE41-NEXT:    psrlw $8, %xmm1
2009; SSE41-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 # [256,128,64,32,16,8,4,2]
2010; SSE41-NEXT:    psrlw $8, %xmm3
2011; SSE41-NEXT:    packuswb %xmm1, %xmm3
2012; SSE41-NEXT:    paddb %xmm0, %xmm0
2013; SSE41-NEXT:    movdqa %xmm0, %xmm1
2014; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
2015; SSE41-NEXT:    psllw $8, %xmm1
2016; SSE41-NEXT:    por %xmm3, %xmm1
2017; SSE41-NEXT:    pmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
2018; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2019; SSE41-NEXT:    por %xmm1, %xmm0
2020; SSE41-NEXT:    retq
2021;
2022; AVX1-LABEL: constant_funnnel_v16i8:
2023; AVX1:       # %bb.0:
2024; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
2025; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
2026; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [256,2,4,8,16,32,64,128]
2027; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
2028; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
2029; AVX1-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [256,128,64,32,16,8,4,2]
2030; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
2031; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
2032; AVX1-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
2033; AVX1-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 # [0,64,0,16,0,4,0,1,0,1,0,4,0,16,0,64]
2034; AVX1-NEXT:    vpsllw $8, %xmm2, %xmm2
2035; AVX1-NEXT:    vpor %xmm1, %xmm2, %xmm1
2036; AVX1-NEXT:    vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [128,0,32,0,8,0,2,0,128,0,2,0,8,0,32,0]
2037; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2038; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
2039; AVX1-NEXT:    retq
2040;
2041; AVX2-LABEL: constant_funnnel_v16i8:
2042; AVX2:       # %bb.0:
2043; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2044; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 # [256,128,64,32,16,8,4,2,256,2,4,8,16,32,64,128]
2045; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
2046; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
2047; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
2048; AVX2-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
2049; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2050; AVX2-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [128,64,32,16,8,4,2,1,128,1,2,4,8,16,32,64]
2051; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2052; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
2053; AVX2-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2054; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
2055; AVX2-NEXT:    vzeroupper
2056; AVX2-NEXT:    retq
2057;
2058; AVX512F-LABEL: constant_funnnel_v16i8:
2059; AVX512F:       # %bb.0:
2060; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2061; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2062; AVX512F-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
2063; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2064; AVX512F-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2065; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
2066; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
2067; AVX512F-NEXT:    vzeroupper
2068; AVX512F-NEXT:    retq
2069;
2070; AVX512VL-LABEL: constant_funnnel_v16i8:
2071; AVX512VL:       # %bb.0:
2072; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
2073; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm1, %zmm1
2074; AVX512VL-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
2075; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
2076; AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0
2077; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
2078; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
2079; AVX512VL-NEXT:    vzeroupper
2080; AVX512VL-NEXT:    retq
2081;
2082; AVX512BW-LABEL: constant_funnnel_v16i8:
2083; AVX512BW:       # %bb.0:
2084; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2085; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2086; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2087; AVX512BW-NEXT:    vpsllw $8, %ymm0, %ymm0
2088; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
2089; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
2090; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2091; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2092; AVX512BW-NEXT:    vzeroupper
2093; AVX512BW-NEXT:    retq
2094;
2095; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
2096; AVX512VBMI2:       # %bb.0:
2097; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
2098; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2099; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
2100; AVX512VBMI2-NEXT:    vpermt2b %zmm0, %zmm2, %zmm1
2101; AVX512VBMI2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2102; AVX512VBMI2-NEXT:    vpsrlvw %zmm0, %zmm1, %zmm0
2103; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
2104; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2105; AVX512VBMI2-NEXT:    vzeroupper
2106; AVX512VBMI2-NEXT:    retq
2107;
2108; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2109; AVX512VLBW:       # %bb.0:
2110; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2111; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2112; AVX512VLBW-NEXT:    vpsllw $8, %ymm0, %ymm0
2113; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
2114; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2115; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
2116; AVX512VLBW-NEXT:    vzeroupper
2117; AVX512VLBW-NEXT:    retq
2118;
2119; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
2120; AVX512VLVBMI2:       # %bb.0:
2121; AVX512VLVBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2122; AVX512VLVBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2123; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
2124; AVX512VLVBMI2-NEXT:    vpermi2b %ymm0, %ymm1, %ymm2
2125; AVX512VLVBMI2-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
2126; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
2127; AVX512VLVBMI2-NEXT:    vzeroupper
2128; AVX512VLVBMI2-NEXT:    retq
2129;
2130; XOP-LABEL: constant_funnnel_v16i8:
2131; XOP:       # %bb.0:
2132; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2133; XOP-NEXT:    vpaddb %xmm0, %xmm0, %xmm0
2134; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2135; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2136; XOP-NEXT:    retq
2137;
2138; X86-SSE2-LABEL: constant_funnnel_v16i8:
2139; X86-SSE2:       # %bb.0:
2140; X86-SSE2-NEXT:    pxor %xmm2, %xmm2
2141; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
2142; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
2143; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 # [256,2,4,8,16,32,64,128]
2144; X86-SSE2-NEXT:    psrlw $8, %xmm3
2145; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
2146; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [256,128,64,32,16,8,4,2]
2147; X86-SSE2-NEXT:    psrlw $8, %xmm1
2148; X86-SSE2-NEXT:    packuswb %xmm3, %xmm1
2149; X86-SSE2-NEXT:    paddb %xmm0, %xmm0
2150; X86-SSE2-NEXT:    movdqa %xmm0, %xmm2
2151; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
2152; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [128,1,2,4,8,16,32,64]
2153; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255]
2154; X86-SSE2-NEXT:    pand %xmm3, %xmm2
2155; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
2156; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [128,64,32,16,8,4,2,1]
2157; X86-SSE2-NEXT:    pand %xmm3, %xmm0
2158; X86-SSE2-NEXT:    packuswb %xmm2, %xmm0
2159; X86-SSE2-NEXT:    por %xmm1, %xmm0
2160; X86-SSE2-NEXT:    retl
2161  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2162  ret <16 x i8> %res
2163}
2164
2165;
2166; Uniform Constant Shifts
2167;
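; A single immediate amount shared by every lane: the expansion is just an
; immediate right shift of %y, an immediate left shift of %x by the remaining
; bit count, and a por; VBMI2 targets fold the whole sequence into one
; vpshrd with an immediate.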
2168
define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v2i64:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlq $14, %xmm1
; SSE-NEXT:    psllq $50, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_funnnel_v2i64:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlq $14, %xmm1, %xmm1
; AVX-NEXT:    vpsllq $50, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_funnnel_v2i64:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlq $14, %xmm1, %xmm1
; AVX512F-NEXT:    vpsllq $50, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlq $14, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsllq $50, %xmm0, %xmm0
; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlq $14, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllq $50, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshrdq $14, %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlq $14, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsllq $50, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshrdq $14, %xmm0, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_funnnel_v2i64:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrlq $14, %xmm1, %xmm1
; XOP-NEXT:    vpsllq $50, %xmm0, %xmm0
; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    psrlq $14, %xmm1
; X86-SSE2-NEXT:    psllq $50, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
  %res = call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
  ret <2 x i64> %res
}
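
; Targets with VBMI2 select a single funnel-shift instruction here (vpshrdq $14);
; without AVX512VL the operands are widened to zmm (the "kill" annotations above),
; hence the trailing vzeroupper. All other targets use the psrlq $14 / psllq $50 / por
; expansion, since 64 - 14 = 50.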

define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v4i32:
; SSE:       # %bb.0:
; SSE-NEXT:    psrld $4, %xmm1
; SSE-NEXT:    pslld $28, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_funnnel_v4i32:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrld $4, %xmm1, %xmm1
; AVX-NEXT:    vpslld $28, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_funnnel_v4i32:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrld $4, %xmm1, %xmm1
; AVX512F-NEXT:    vpslld $28, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrld $4, %xmm1, %xmm1
; AVX512VL-NEXT:    vpslld $28, %xmm0, %xmm0
; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrld $4, %xmm1, %xmm1
; AVX512BW-NEXT:    vpslld $28, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshrdd $4, %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrld $4, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpslld $28, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshrdd $4, %xmm0, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_funnnel_v4i32:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrld $4, %xmm1, %xmm1
; XOP-NEXT:    vpslld $28, %xmm0, %xmm0
; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v4i32:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    psrld $4, %xmm1
; X86-SSE2-NEXT:    pslld $28, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
  %res = call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
  ret <4 x i32> %res
}
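
; Same pattern at i32 granularity: psrld $4 / pslld $28 (32 - 4 = 28) plus por, or a
; single vpshrdd $4 on the VBMI2 targets.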

define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v8i16:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $7, %xmm1
; SSE-NEXT:    psllw $9, %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_funnnel_v8i16:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX-NEXT:    vpsllw $9, %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_funnnel_v8i16:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX512F-NEXT:    vpsllw $9, %xmm0, %xmm0
; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX512VL-NEXT:    vpsllw $9, %xmm0, %xmm0
; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX512BW-NEXT:    vpsllw $9, %xmm0, %xmm0
; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
; AVX512VBMI2-NEXT:    vpshrdw $7, %zmm0, %zmm1, %zmm0
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsrlw $7, %xmm1, %xmm1
; AVX512VLBW-NEXT:    vpsllw $9, %xmm0, %xmm0
; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpshrdw $7, %xmm0, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_funnnel_v8i16:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrlw $7, %xmm1, %xmm1
; XOP-NEXT:    vpsllw $9, %xmm0, %xmm0
; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    psrlw $7, %xmm1
; X86-SSE2-NEXT:    psllw $9, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
  %res = call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
  ret <8 x i16> %res
}
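
; And at i16 granularity: psrlw $7 / psllw $9 (16 - 7 = 9) plus por, or vpshrdw $7 on
; the VBMI2 targets.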

define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
; SSE-LABEL: splatconstant_funnnel_v16i8:
; SSE:       # %bb.0:
; SSE-NEXT:    psrlw $4, %xmm1
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; SSE-NEXT:    psllw $4, %xmm0
; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE-NEXT:    por %xmm1, %xmm0
; SSE-NEXT:    retq
;
; AVX-LABEL: splatconstant_funnnel_v16i8:
; AVX:       # %bb.0:
; AVX-NEXT:    vpsrlw $4, %xmm1, %xmm1
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT:    retq
;
; AVX512F-LABEL: splatconstant_funnnel_v16i8:
; AVX512F:       # %bb.0:
; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX512F-NEXT:    vpsrlw $4, %xmm1, %xmm0
; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512F-NEXT:    vzeroupper
; AVX512F-NEXT:    retq
;
; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
; AVX512VL:       # %bb.0:
; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX512VL-NEXT:    vpsrlw $4, %xmm1, %xmm0
; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VL-NEXT:    retq
;
; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
; AVX512BW:       # %bb.0:
; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX512BW-NEXT:    vpsrlw $4, %xmm1, %xmm0
; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512BW-NEXT:    vzeroupper
; AVX512BW-NEXT:    retq
;
; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VBMI2:       # %bb.0:
; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512VBMI2-NEXT:    vzeroupper
; AVX512VBMI2-NEXT:    retq
;
; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLBW:       # %bb.0:
; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX512VLBW-NEXT:    vpsrlw $4, %xmm1, %xmm0
; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VLBW-NEXT:    retq
;
; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
; AVX512VLVBMI2:       # %bb.0:
; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
; AVX512VLVBMI2-NEXT:    retq
;
; XOP-LABEL: splatconstant_funnnel_v16i8:
; XOP:       # %bb.0:
; XOP-NEXT:    vpsrlw $4, %xmm1, %xmm1
; XOP-NEXT:    vpsllw $4, %xmm0, %xmm0
; XOP-NEXT:    vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0
; XOP-NEXT:    retq
;
; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
; X86-SSE2:       # %bb.0:
; X86-SSE2-NEXT:    psrlw $4, %xmm1
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
; X86-SSE2-NEXT:    psllw $4, %xmm0
; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
; X86-SSE2-NEXT:    por %xmm1, %xmm0
; X86-SSE2-NEXT:    retl
  %res = call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
  ret <16 x i8> %res
}

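; There is no byte-element funnel-shift instruction (VBMI2 provides vpshrd/vpshld only
; at word, dword and qword granularity), so even the VBMI2 runs above fall back to word
; shifts: psrlw $4 / psllw $4 plus pand masks that clear the bits shifted across byte
; boundaries. AVX512 folds the masking and the or into a single vpternlogd bit-select,
; and XOP does the same with vpcmov. A rough IR-level picture of what those masks
; implement (illustrative only, not part of the autogenerated checks):
;
;   %lo = lshr <16 x i8> %y, splat (i8 4)   ; psrlw $4 + pand with the low-nibble mask
;   %hi = shl  <16 x i8> %x, splat (i8 4)   ; psllw $4 + pand with the high-nibble mask
;   %r  = or   <16 x i8> %hi, %lo           ; por / vpternlogd / vpcmov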