1; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
3; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
4; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
5; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
6; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512F
7; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512VL
8; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
9; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2 | FileCheck %s --check-prefixes=AVX512VBMI2
10; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX512VLBW
11; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vbmi,+avx512vbmi2,+avx512vl | FileCheck %s --check-prefixes=AVX512VLVBMI2
12; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-256 | FileCheck %s --check-prefixes=AVX512VLVBMI2
13; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx10.1-512 | FileCheck %s --check-prefixes=AVX512VLVBMI2
14; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP,XOPAVX1
15; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=XOP,XOPAVX2
16
17; Just one 32-bit run to make sure we do reasonable things for i64 cases.
18; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86-SSE2
19
20declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>)
21declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
22declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
23declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
24
25;
26; Variable Shifts
27;
28
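; In essence, every variable test below expects the target-independent
; expansion
;   fshl(x, y, amt) == (x << (amt & (BW-1))) | ((y >> 1) >> (~amt & (BW-1)))
; with BW the element bit width and >> logical, so no shift amount is ever
; out of range. For v2i64, SSE2/SSE41 apply each 64-bit count separately
; (psrlq/psllq only read the low qword of the count register) and blend the
; two halves, AVX2 and AVX512 use vpsrlvq/vpsllvq, AVX512VBMI2 maps straight
; to vpshldvq (widened to zmm without VL), and XOP's vpshlq shifts right
; when given a negated count.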
29define <2 x i64> @var_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
30; SSE2-LABEL: var_funnnel_v2i64:
31; SSE2:       # %bb.0:
32; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [63,63]
33; SSE2-NEXT:    movdqa %xmm2, %xmm4
34; SSE2-NEXT:    pandn %xmm3, %xmm4
35; SSE2-NEXT:    psrlq $1, %xmm1
36; SSE2-NEXT:    movdqa %xmm1, %xmm5
37; SSE2-NEXT:    psrlq %xmm4, %xmm5
38; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
39; SSE2-NEXT:    psrlq %xmm4, %xmm1
40; SSE2-NEXT:    shufpd {{.*#+}} xmm5 = xmm5[0],xmm1[1]
41; SSE2-NEXT:    pand %xmm3, %xmm2
42; SSE2-NEXT:    movdqa %xmm0, %xmm1
43; SSE2-NEXT:    psllq %xmm2, %xmm1
44; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
45; SSE2-NEXT:    psllq %xmm2, %xmm0
46; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
47; SSE2-NEXT:    orpd %xmm5, %xmm0
48; SSE2-NEXT:    retq
49;
50; SSE41-LABEL: var_funnnel_v2i64:
51; SSE41:       # %bb.0:
52; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm3 = [63,63]
53; SSE41-NEXT:    movdqa %xmm2, %xmm4
54; SSE41-NEXT:    pandn %xmm3, %xmm4
55; SSE41-NEXT:    psrlq $1, %xmm1
56; SSE41-NEXT:    movdqa %xmm1, %xmm5
57; SSE41-NEXT:    psrlq %xmm4, %xmm5
58; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
59; SSE41-NEXT:    psrlq %xmm4, %xmm1
60; SSE41-NEXT:    pblendw {{.*#+}} xmm5 = xmm5[0,1,2,3],xmm1[4,5,6,7]
61; SSE41-NEXT:    pand %xmm3, %xmm2
62; SSE41-NEXT:    movdqa %xmm0, %xmm1
63; SSE41-NEXT:    psllq %xmm2, %xmm1
64; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
65; SSE41-NEXT:    psllq %xmm2, %xmm0
66; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7]
67; SSE41-NEXT:    por %xmm5, %xmm0
68; SSE41-NEXT:    retq
69;
70; AVX1-LABEL: var_funnnel_v2i64:
71; AVX1:       # %bb.0:
72; AVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
73; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
74; AVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
75; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm5
76; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm4[2,3,2,3]
77; AVX1-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
78; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm5[0,1,2,3],xmm1[4,5,6,7]
79; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
80; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm3
81; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
82; AVX1-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
83; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7]
84; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
85; AVX1-NEXT:    retq
86;
87; AVX2-LABEL: var_funnnel_v2i64:
88; AVX2:       # %bb.0:
89; AVX2-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
90; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
91; AVX2-NEXT:    vpsrlq $1, %xmm1, %xmm1
92; AVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
93; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
94; AVX2-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
95; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
96; AVX2-NEXT:    retq
97;
98; AVX512F-LABEL: var_funnnel_v2i64:
99; AVX512F:       # %bb.0:
100; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
101; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
102; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm1
103; AVX512F-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
104; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
105; AVX512F-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
106; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
107; AVX512F-NEXT:    retq
108;
109; AVX512VL-LABEL: var_funnnel_v2i64:
110; AVX512VL:       # %bb.0:
111; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
112; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm4
113; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
114; AVX512VL-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
115; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
116; AVX512VL-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
117; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
118; AVX512VL-NEXT:    retq
119;
120; AVX512BW-LABEL: var_funnnel_v2i64:
121; AVX512BW:       # %bb.0:
122; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
123; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
124; AVX512BW-NEXT:    vpsrlq $1, %xmm1, %xmm1
125; AVX512BW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
126; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
127; AVX512BW-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
128; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
129; AVX512BW-NEXT:    retq
130;
131; AVX512VBMI2-LABEL: var_funnnel_v2i64:
132; AVX512VBMI2:       # %bb.0:
133; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
134; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
135; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
136; AVX512VBMI2-NEXT:    vpshldvq %zmm2, %zmm1, %zmm0
137; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
138; AVX512VBMI2-NEXT:    vzeroupper
139; AVX512VBMI2-NEXT:    retq
140;
141; AVX512VLBW-LABEL: var_funnnel_v2i64:
142; AVX512VLBW:       # %bb.0:
143; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
144; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
145; AVX512VLBW-NEXT:    vpsrlq $1, %xmm1, %xmm1
146; AVX512VLBW-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
147; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm2
148; AVX512VLBW-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
149; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
150; AVX512VLBW-NEXT:    retq
151;
152; AVX512VLVBMI2-LABEL: var_funnnel_v2i64:
153; AVX512VLVBMI2:       # %bb.0:
154; AVX512VLVBMI2-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0
155; AVX512VLVBMI2-NEXT:    retq
156;
157; XOPAVX1-LABEL: var_funnnel_v2i64:
158; XOPAVX1:       # %bb.0:
159; XOPAVX1-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
160; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
161; XOPAVX1-NEXT:    vpshlq %xmm4, %xmm0, %xmm0
162; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
163; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
164; XOPAVX1-NEXT:    vpsubq %xmm2, %xmm3, %xmm2
165; XOPAVX1-NEXT:    vpsrlq $1, %xmm1, %xmm1
166; XOPAVX1-NEXT:    vpshlq %xmm2, %xmm1, %xmm1
167; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
168; XOPAVX1-NEXT:    retq
169;
170; XOPAVX2-LABEL: var_funnnel_v2i64:
171; XOPAVX2:       # %bb.0:
172; XOPAVX2-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
173; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
174; XOPAVX2-NEXT:    vpsrlq $1, %xmm1, %xmm1
175; XOPAVX2-NEXT:    vpsrlvq %xmm4, %xmm1, %xmm1
176; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
177; XOPAVX2-NEXT:    vpsllvq %xmm2, %xmm0, %xmm0
178; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
179; XOPAVX2-NEXT:    retq
180;
181; X86-SSE2-LABEL: var_funnnel_v2i64:
182; X86-SSE2:       # %bb.0:
183; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [63,0,63,0]
184; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
185; X86-SSE2-NEXT:    pandn %xmm4, %xmm5
186; X86-SSE2-NEXT:    psrlq $1, %xmm1
187; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
188; X86-SSE2-NEXT:    psrlq %xmm5, %xmm3
189; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
190; X86-SSE2-NEXT:    psrlq %xmm5, %xmm1
191; X86-SSE2-NEXT:    shufpd {{.*#+}} xmm3 = xmm3[0],xmm1[1]
192; X86-SSE2-NEXT:    pand %xmm4, %xmm2
193; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
194; X86-SSE2-NEXT:    psllq %xmm2, %xmm1
195; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,2,3]
196; X86-SSE2-NEXT:    psllq %xmm2, %xmm0
197; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
198; X86-SSE2-NEXT:    orpd %xmm3, %xmm0
199; X86-SSE2-NEXT:    retl
200  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt)
201  ret <2 x i64> %res
202}
203
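; v4i32: targets without variable 32-bit shifts (SSE2/SSE41/AVX1) build
; 2^amt per lane -- pslld $23 places the masked amount in the float exponent
; field, paddd adds 1065353216 (1.0f) and cvttps2dq converts back -- and
; multiply to shift left, while the right shifts of y extract each lane's
; count with pshuflw/pshufd for psrld; AVX2 and later use vpsrlvd/vpsllvd,
; and the VBMI2 prefixes use vpshldvd.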
204define <4 x i32> @var_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
205; SSE2-LABEL: var_funnnel_v4i32:
206; SSE2:       # %bb.0:
207; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [31,31,31,31]
208; SSE2-NEXT:    movdqa %xmm2, %xmm5
209; SSE2-NEXT:    pandn %xmm4, %xmm5
210; SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
211; SSE2-NEXT:    psrld $1, %xmm1
212; SSE2-NEXT:    movdqa %xmm1, %xmm6
213; SSE2-NEXT:    psrld %xmm3, %xmm6
214; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
215; SSE2-NEXT:    movdqa %xmm1, %xmm3
216; SSE2-NEXT:    psrld %xmm7, %xmm3
217; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
218; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
219; SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
220; SSE2-NEXT:    movdqa %xmm1, %xmm7
221; SSE2-NEXT:    psrld %xmm6, %xmm7
222; SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
223; SSE2-NEXT:    psrld %xmm5, %xmm1
224; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
225; SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
226; SSE2-NEXT:    pand %xmm4, %xmm2
227; SSE2-NEXT:    pslld $23, %xmm2
228; SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
229; SSE2-NEXT:    cvttps2dq %xmm2, %xmm1
230; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
231; SSE2-NEXT:    pmuludq %xmm1, %xmm0
232; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
233; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
234; SSE2-NEXT:    pmuludq %xmm2, %xmm1
235; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
236; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
237; SSE2-NEXT:    por %xmm3, %xmm0
238; SSE2-NEXT:    retq
239;
240; SSE41-LABEL: var_funnnel_v4i32:
241; SSE41:       # %bb.0:
242; SSE41-NEXT:    pmovsxbd {{.*#+}} xmm3 = [31,31,31,31]
243; SSE41-NEXT:    movdqa %xmm2, %xmm4
244; SSE41-NEXT:    pandn %xmm3, %xmm4
245; SSE41-NEXT:    pshuflw {{.*#+}} xmm5 = xmm4[2,3,3,3,4,5,6,7]
246; SSE41-NEXT:    psrld $1, %xmm1
247; SSE41-NEXT:    movdqa %xmm1, %xmm6
248; SSE41-NEXT:    psrld %xmm5, %xmm6
249; SSE41-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
250; SSE41-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[2,3,3,3,4,5,6,7]
251; SSE41-NEXT:    movdqa %xmm1, %xmm8
252; SSE41-NEXT:    psrld %xmm7, %xmm8
253; SSE41-NEXT:    pblendw {{.*#+}} xmm8 = xmm6[0,1,2,3],xmm8[4,5,6,7]
254; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,1,4,5,6,7]
255; SSE41-NEXT:    movdqa %xmm1, %xmm6
256; SSE41-NEXT:    psrld %xmm4, %xmm6
257; SSE41-NEXT:    pshuflw {{.*#+}} xmm4 = xmm5[0,1,1,1,4,5,6,7]
258; SSE41-NEXT:    psrld %xmm4, %xmm1
259; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1,2,3],xmm1[4,5,6,7]
260; SSE41-NEXT:    pblendw {{.*#+}} xmm6 = xmm6[0,1],xmm8[2,3],xmm6[4,5],xmm8[6,7]
261; SSE41-NEXT:    pand %xmm3, %xmm2
262; SSE41-NEXT:    pslld $23, %xmm2
263; SSE41-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
264; SSE41-NEXT:    cvttps2dq %xmm2, %xmm1
265; SSE41-NEXT:    pmulld %xmm1, %xmm0
266; SSE41-NEXT:    por %xmm6, %xmm0
267; SSE41-NEXT:    retq
268;
269; AVX1-LABEL: var_funnnel_v4i32:
270; AVX1:       # %bb.0:
271; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
272; AVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
273; AVX1-NEXT:    vpsrldq {{.*#+}} xmm5 = xmm4[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
274; AVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
275; AVX1-NEXT:    vpsrld %xmm5, %xmm1, %xmm5
276; AVX1-NEXT:    vpsrlq $32, %xmm4, %xmm6
277; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
278; AVX1-NEXT:    vpblendw {{.*#+}} xmm5 = xmm6[0,1,2,3],xmm5[4,5,6,7]
279; AVX1-NEXT:    vpxor %xmm6, %xmm6, %xmm6
280; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
281; AVX1-NEXT:    vpsrld %xmm6, %xmm1, %xmm6
282; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero
283; AVX1-NEXT:    vpsrld %xmm4, %xmm1, %xmm1
284; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm6[4,5,6,7]
285; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm5[2,3],xmm1[4,5],xmm5[6,7]
286; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
287; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
288; AVX1-NEXT:    vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
289; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
290; AVX1-NEXT:    vpmulld %xmm2, %xmm0, %xmm0
291; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
292; AVX1-NEXT:    retq
293;
294; AVX2-LABEL: var_funnnel_v4i32:
295; AVX2:       # %bb.0:
296; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
297; AVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
298; AVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
299; AVX2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
300; AVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
301; AVX2-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
302; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
303; AVX2-NEXT:    retq
304;
305; AVX512F-LABEL: var_funnnel_v4i32:
306; AVX512F:       # %bb.0:
307; AVX512F-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
308; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
309; AVX512F-NEXT:    vpsrld $1, %xmm1, %xmm1
310; AVX512F-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
311; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
312; AVX512F-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
313; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
314; AVX512F-NEXT:    retq
315;
316; AVX512VL-LABEL: var_funnnel_v4i32:
317; AVX512VL:       # %bb.0:
318; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
319; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm4
320; AVX512VL-NEXT:    vpsrld $1, %xmm1, %xmm1
321; AVX512VL-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
322; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
323; AVX512VL-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
324; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
325; AVX512VL-NEXT:    retq
326;
327; AVX512BW-LABEL: var_funnnel_v4i32:
328; AVX512BW:       # %bb.0:
329; AVX512BW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
330; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
331; AVX512BW-NEXT:    vpsrld $1, %xmm1, %xmm1
332; AVX512BW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
333; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
334; AVX512BW-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
335; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
336; AVX512BW-NEXT:    retq
337;
338; AVX512VBMI2-LABEL: var_funnnel_v4i32:
339; AVX512VBMI2:       # %bb.0:
340; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
341; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
342; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
343; AVX512VBMI2-NEXT:    vpshldvd %zmm2, %zmm1, %zmm0
344; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
345; AVX512VBMI2-NEXT:    vzeroupper
346; AVX512VBMI2-NEXT:    retq
347;
348; AVX512VLBW-LABEL: var_funnnel_v4i32:
349; AVX512VLBW:       # %bb.0:
350; AVX512VLBW-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
351; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
352; AVX512VLBW-NEXT:    vpsrld $1, %xmm1, %xmm1
353; AVX512VLBW-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
354; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm2
355; AVX512VLBW-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
356; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
357; AVX512VLBW-NEXT:    retq
358;
359; AVX512VLVBMI2-LABEL: var_funnnel_v4i32:
360; AVX512VLVBMI2:       # %bb.0:
361; AVX512VLVBMI2-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0
362; AVX512VLVBMI2-NEXT:    retq
363;
364; XOPAVX1-LABEL: var_funnnel_v4i32:
365; XOPAVX1:       # %bb.0:
366; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [31,31,31,31]
367; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
368; XOPAVX1-NEXT:    vpshld %xmm4, %xmm0, %xmm0
369; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
370; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
371; XOPAVX1-NEXT:    vpsubd %xmm2, %xmm3, %xmm2
372; XOPAVX1-NEXT:    vpsrld $1, %xmm1, %xmm1
373; XOPAVX1-NEXT:    vpshld %xmm2, %xmm1, %xmm1
374; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
375; XOPAVX1-NEXT:    retq
376;
377; XOPAVX2-LABEL: var_funnnel_v4i32:
378; XOPAVX2:       # %bb.0:
379; XOPAVX2-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [31,31,31,31]
380; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
381; XOPAVX2-NEXT:    vpsrld $1, %xmm1, %xmm1
382; XOPAVX2-NEXT:    vpsrlvd %xmm4, %xmm1, %xmm1
383; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
384; XOPAVX2-NEXT:    vpsllvd %xmm2, %xmm0, %xmm0
385; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
386; XOPAVX2-NEXT:    retq
387;
388; X86-SSE2-LABEL: var_funnnel_v4i32:
389; X86-SSE2:       # %bb.0:
390; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [31,31,31,31]
391; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
392; X86-SSE2-NEXT:    pandn %xmm4, %xmm5
393; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm3 = xmm5[2,3,3,3,4,5,6,7]
394; X86-SSE2-NEXT:    psrld $1, %xmm1
395; X86-SSE2-NEXT:    movdqa %xmm1, %xmm6
396; X86-SSE2-NEXT:    psrld %xmm3, %xmm6
397; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm5[0,1,1,1,4,5,6,7]
398; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
399; X86-SSE2-NEXT:    psrld %xmm7, %xmm3
400; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0]
401; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3]
402; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm6 = xmm5[2,3,3,3,4,5,6,7]
403; X86-SSE2-NEXT:    movdqa %xmm1, %xmm7
404; X86-SSE2-NEXT:    psrld %xmm6, %xmm7
405; X86-SSE2-NEXT:    pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,1,4,5,6,7]
406; X86-SSE2-NEXT:    psrld %xmm5, %xmm1
407; X86-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm7[1]
408; X86-SSE2-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,3],xmm1[0,3]
409; X86-SSE2-NEXT:    pand %xmm4, %xmm2
410; X86-SSE2-NEXT:    pslld $23, %xmm2
411; X86-SSE2-NEXT:    paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
412; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm1
413; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
414; X86-SSE2-NEXT:    pmuludq %xmm1, %xmm0
415; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
416; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
417; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm1
418; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
419; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
420; X86-SSE2-NEXT:    por %xmm3, %xmm0
421; X86-SSE2-NEXT:    retl
422  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt)
423  ret <4 x i32> %res
424}
425
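; v8i16: SSE2/SSE41/AVX1 interleave y and x into 32-bit lanes with
; punpck{l,h}wd (x in the high half) and multiply by 2^amt as above, keeping
; the high 16 bits of each product; AVX2 and AVX512F/VL zero-extend to
; 32-bit lanes, merge x into the high half and use vpsllvd; AVX512BW has
; native vpsrlvw/vpsllvw, and the VBMI2 prefixes use vpshldvw.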
426define <8 x i16> @var_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
427; SSE2-LABEL: var_funnnel_v8i16:
428; SSE2:       # %bb.0:
429; SSE2-NEXT:    movdqa %xmm1, %xmm3
430; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
431; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
432; SSE2-NEXT:    movdqa %xmm2, %xmm4
433; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4,4,5,5,6,6,7,7]
434; SSE2-NEXT:    pslld $23, %xmm4
435; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
436; SSE2-NEXT:    paddd %xmm5, %xmm4
437; SSE2-NEXT:    cvttps2dq %xmm4, %xmm4
438; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
439; SSE2-NEXT:    pmuludq %xmm4, %xmm3
440; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
441; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
442; SSE2-NEXT:    pmuludq %xmm6, %xmm4
443; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
444; SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1]
445; SSE2-NEXT:    psrad $16, %xmm3
446; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
447; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
448; SSE2-NEXT:    pslld $23, %xmm2
449; SSE2-NEXT:    paddd %xmm5, %xmm2
450; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
451; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
452; SSE2-NEXT:    pmuludq %xmm2, %xmm1
453; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
454; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
455; SSE2-NEXT:    pmuludq %xmm4, %xmm1
456; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
457; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
458; SSE2-NEXT:    psrad $16, %xmm0
459; SSE2-NEXT:    packssdw %xmm3, %xmm0
460; SSE2-NEXT:    retq
461;
462; SSE41-LABEL: var_funnnel_v8i16:
463; SSE41:       # %bb.0:
464; SSE41-NEXT:    movdqa %xmm1, %xmm3
465; SSE41-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
466; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
467; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
468; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
469; SSE41-NEXT:    pslld $23, %xmm2
470; SSE41-NEXT:    movdqa {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
471; SSE41-NEXT:    paddd %xmm5, %xmm2
472; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
473; SSE41-NEXT:    pmulld %xmm3, %xmm2
474; SSE41-NEXT:    psrld $16, %xmm2
475; SSE41-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
476; SSE41-NEXT:    pslld $23, %xmm4
477; SSE41-NEXT:    paddd %xmm5, %xmm4
478; SSE41-NEXT:    cvttps2dq %xmm4, %xmm0
479; SSE41-NEXT:    pmulld %xmm1, %xmm0
480; SSE41-NEXT:    psrld $16, %xmm0
481; SSE41-NEXT:    packusdw %xmm2, %xmm0
482; SSE41-NEXT:    retq
483;
484; AVX1-LABEL: var_funnnel_v8i16:
485; AVX1:       # %bb.0:
486; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
487; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
488; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm2[4,4,5,5,6,6,7,7]
489; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
490; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
491; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
492; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
493; AVX1-NEXT:    vpmulld %xmm4, %xmm3, %xmm3
494; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
495; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
496; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
497; AVX1-NEXT:    vpslld $23, %xmm1, %xmm1
498; AVX1-NEXT:    vpaddd %xmm5, %xmm1, %xmm1
499; AVX1-NEXT:    vcvttps2dq %xmm1, %xmm1
500; AVX1-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
501; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm0
502; AVX1-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
503; AVX1-NEXT:    retq
504;
505; AVX2-LABEL: var_funnnel_v8i16:
506; AVX2:       # %bb.0:
507; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
508; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
509; AVX2-NEXT:    vpslld $16, %ymm0, %ymm0
510; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
511; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
512; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
513; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
514; AVX2-NEXT:    vpsrld $16, %ymm0, %ymm0
515; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
516; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
517; AVX2-NEXT:    vzeroupper
518; AVX2-NEXT:    retq
519;
520; AVX512F-LABEL: var_funnnel_v8i16:
521; AVX512F:       # %bb.0:
522; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
523; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
524; AVX512F-NEXT:    vpslld $16, %ymm0, %ymm0
525; AVX512F-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
526; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
527; AVX512F-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
528; AVX512F-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
529; AVX512F-NEXT:    vpsrld $16, %ymm0, %ymm0
530; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
531; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
532; AVX512F-NEXT:    vzeroupper
533; AVX512F-NEXT:    retq
534;
535; AVX512VL-LABEL: var_funnnel_v8i16:
536; AVX512VL:       # %bb.0:
537; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
538; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
539; AVX512VL-NEXT:    vpslld $16, %ymm0, %ymm0
540; AVX512VL-NEXT:    vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7],ymm1[8],ymm0[9],ymm1[10],ymm0[11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
541; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
542; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
543; AVX512VL-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
544; AVX512VL-NEXT:    vpsrld $16, %ymm0, %ymm0
545; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
546; AVX512VL-NEXT:    vzeroupper
547; AVX512VL-NEXT:    retq
548;
549; AVX512BW-LABEL: var_funnnel_v8i16:
550; AVX512BW:       # %bb.0:
551; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
552; AVX512BW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
553; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
554; AVX512BW-NEXT:    vpsrlw $1, %xmm1, %xmm1
555; AVX512BW-NEXT:    vpsrlvw %zmm4, %zmm1, %zmm1
556; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
557; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
558; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
559; AVX512BW-NEXT:    vzeroupper
560; AVX512BW-NEXT:    retq
561;
562; AVX512VBMI2-LABEL: var_funnnel_v8i16:
563; AVX512VBMI2:       # %bb.0:
564; AVX512VBMI2-NEXT:    # kill: def $xmm2 killed $xmm2 def $zmm2
565; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
566; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
567; AVX512VBMI2-NEXT:    vpshldvw %zmm2, %zmm1, %zmm0
568; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
569; AVX512VBMI2-NEXT:    vzeroupper
570; AVX512VBMI2-NEXT:    retq
571;
572; AVX512VLBW-LABEL: var_funnnel_v8i16:
573; AVX512VLBW:       # %bb.0:
574; AVX512VLBW-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
575; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
576; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
577; AVX512VLBW-NEXT:    vpsrlvw %xmm4, %xmm1, %xmm1
578; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm2
579; AVX512VLBW-NEXT:    vpsllvw %xmm2, %xmm0, %xmm0
580; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
581; AVX512VLBW-NEXT:    retq
582;
583; AVX512VLVBMI2-LABEL: var_funnnel_v8i16:
584; AVX512VLVBMI2:       # %bb.0:
585; AVX512VLVBMI2-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0
586; AVX512VLVBMI2-NEXT:    retq
587;
588; XOPAVX1-LABEL: var_funnnel_v8i16:
589; XOPAVX1:       # %bb.0:
590; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
591; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
592; XOPAVX1-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
593; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm2
594; XOPAVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
595; XOPAVX1-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
596; XOPAVX1-NEXT:    vpsrlw $1, %xmm1, %xmm1
597; XOPAVX1-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
598; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
599; XOPAVX1-NEXT:    retq
600;
601; XOPAVX2-LABEL: var_funnnel_v8i16:
602; XOPAVX2:       # %bb.0:
603; XOPAVX2-NEXT:    vpbroadcastw {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15]
604; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm4
605; XOPAVX2-NEXT:    vpshlw %xmm4, %xmm0, %xmm0
606; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm2
607; XOPAVX2-NEXT:    vpxor %xmm3, %xmm3, %xmm3
608; XOPAVX2-NEXT:    vpsubw %xmm2, %xmm3, %xmm2
609; XOPAVX2-NEXT:    vpsrlw $1, %xmm1, %xmm1
610; XOPAVX2-NEXT:    vpshlw %xmm2, %xmm1, %xmm1
611; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
612; XOPAVX2-NEXT:    retq
613;
614; X86-SSE2-LABEL: var_funnnel_v8i16:
615; X86-SSE2:       # %bb.0:
616; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
617; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
618; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
619; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
620; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
621; X86-SSE2-NEXT:    pslld $23, %xmm5
622; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1065353216,1065353216,1065353216,1065353216]
623; X86-SSE2-NEXT:    paddd %xmm4, %xmm5
624; X86-SSE2-NEXT:    cvttps2dq %xmm5, %xmm5
625; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm3[1,1,3,3]
626; X86-SSE2-NEXT:    pmuludq %xmm5, %xmm3
627; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3]
628; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
629; X86-SSE2-NEXT:    pmuludq %xmm6, %xmm5
630; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm5[0,2,2,3]
631; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1]
632; X86-SSE2-NEXT:    psrad $16, %xmm3
633; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
634; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
635; X86-SSE2-NEXT:    pslld $23, %xmm2
636; X86-SSE2-NEXT:    paddd %xmm4, %xmm2
637; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
638; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[1,1,3,3]
639; X86-SSE2-NEXT:    pmuludq %xmm2, %xmm1
640; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
641; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,1,3,3]
642; X86-SSE2-NEXT:    pmuludq %xmm4, %xmm1
643; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
644; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
645; X86-SSE2-NEXT:    psrad $16, %xmm0
646; X86-SSE2-NEXT:    packssdw %xmm3, %xmm0
647; X86-SSE2-NEXT:    retl
648  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt)
649  ret <8 x i16> %res
650}
651
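; v16i8: there are no native byte shifts. SSE2/SSE41/AVX1 interleave y and x
; into 16-bit lanes and pmullw by 2^amt, AVX2 widens those lanes further to
; 32 bits for vpsllvd, AVX512F/VL zero-extend to 32-bit lanes and apply the
; shift-and-or expansion with vpsllvd/vpsrlvd before truncating with
; vpmovdb, AVX512BW/VBMI2 form (x << 8) | y words for vpsllvw, and XOP
; shifts the bytes directly with vpshlb (negative counts shift right).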
652define <16 x i8> @var_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
653; SSE2-LABEL: var_funnnel_v16i8:
654; SSE2:       # %bb.0:
655; SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
656; SSE2-NEXT:    pxor %xmm5, %xmm5
657; SSE2-NEXT:    movdqa %xmm2, %xmm4
658; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
659; SSE2-NEXT:    movdqa %xmm4, %xmm6
660; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
661; SSE2-NEXT:    pslld $23, %xmm6
662; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
663; SSE2-NEXT:    paddd %xmm3, %xmm6
664; SSE2-NEXT:    cvttps2dq %xmm6, %xmm6
665; SSE2-NEXT:    pslld $16, %xmm6
666; SSE2-NEXT:    psrad $16, %xmm6
667; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
668; SSE2-NEXT:    pslld $23, %xmm4
669; SSE2-NEXT:    paddd %xmm3, %xmm4
670; SSE2-NEXT:    cvttps2dq %xmm4, %xmm7
671; SSE2-NEXT:    pslld $16, %xmm7
672; SSE2-NEXT:    psrad $16, %xmm7
673; SSE2-NEXT:    packssdw %xmm6, %xmm7
674; SSE2-NEXT:    movdqa %xmm1, %xmm4
675; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
676; SSE2-NEXT:    pmullw %xmm7, %xmm4
677; SSE2-NEXT:    psrlw $8, %xmm4
678; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
679; SSE2-NEXT:    movdqa %xmm2, %xmm5
680; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
681; SSE2-NEXT:    pslld $23, %xmm5
682; SSE2-NEXT:    paddd %xmm3, %xmm5
683; SSE2-NEXT:    cvttps2dq %xmm5, %xmm5
684; SSE2-NEXT:    pslld $16, %xmm5
685; SSE2-NEXT:    psrad $16, %xmm5
686; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
687; SSE2-NEXT:    pslld $23, %xmm2
688; SSE2-NEXT:    paddd %xmm3, %xmm2
689; SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
690; SSE2-NEXT:    pslld $16, %xmm2
691; SSE2-NEXT:    psrad $16, %xmm2
692; SSE2-NEXT:    packssdw %xmm5, %xmm2
693; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
694; SSE2-NEXT:    pmullw %xmm1, %xmm2
695; SSE2-NEXT:    psrlw $8, %xmm2
696; SSE2-NEXT:    packuswb %xmm4, %xmm2
697; SSE2-NEXT:    movdqa %xmm2, %xmm0
698; SSE2-NEXT:    retq
699;
700; SSE41-LABEL: var_funnnel_v16i8:
701; SSE41:       # %bb.0:
702; SSE41-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
703; SSE41-NEXT:    pxor %xmm3, %xmm3
704; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
705; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
706; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
707; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
708; SSE41-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
709; SSE41-NEXT:    pslld $23, %xmm2
710; SSE41-NEXT:    movdqa {{.*#+}} xmm6 = [1065353216,1065353216,1065353216,1065353216]
711; SSE41-NEXT:    paddd %xmm6, %xmm2
712; SSE41-NEXT:    cvttps2dq %xmm2, %xmm2
713; SSE41-NEXT:    pslld $23, %xmm3
714; SSE41-NEXT:    paddd %xmm6, %xmm3
715; SSE41-NEXT:    cvttps2dq %xmm3, %xmm3
716; SSE41-NEXT:    packusdw %xmm2, %xmm3
717; SSE41-NEXT:    movdqa %xmm1, %xmm7
718; SSE41-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
719; SSE41-NEXT:    pmullw %xmm3, %xmm7
720; SSE41-NEXT:    psrlw $8, %xmm7
721; SSE41-NEXT:    pslld $23, %xmm4
722; SSE41-NEXT:    paddd %xmm6, %xmm4
723; SSE41-NEXT:    cvttps2dq %xmm4, %xmm2
724; SSE41-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
725; SSE41-NEXT:    pslld $23, %xmm5
726; SSE41-NEXT:    paddd %xmm6, %xmm5
727; SSE41-NEXT:    cvttps2dq %xmm5, %xmm3
728; SSE41-NEXT:    packusdw %xmm3, %xmm2
729; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
730; SSE41-NEXT:    pmullw %xmm1, %xmm2
731; SSE41-NEXT:    psrlw $8, %xmm2
732; SSE41-NEXT:    packuswb %xmm7, %xmm2
733; SSE41-NEXT:    movdqa %xmm2, %xmm0
734; SSE41-NEXT:    retq
735;
736; AVX1-LABEL: var_funnnel_v16i8:
737; AVX1:       # %bb.0:
738; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
739; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
740; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
741; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm3[4,4,5,5,6,6,7,7]
742; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
743; AVX1-NEXT:    vbroadcastss {{.*#+}} xmm5 = [1065353216,1065353216,1065353216,1065353216]
744; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
745; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
746; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
747; AVX1-NEXT:    vpslld $23, %xmm3, %xmm3
748; AVX1-NEXT:    vpaddd %xmm5, %xmm3, %xmm3
749; AVX1-NEXT:    vcvttps2dq %xmm3, %xmm3
750; AVX1-NEXT:    vpackusdw %xmm4, %xmm3, %xmm3
751; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
752; AVX1-NEXT:    vpmullw %xmm3, %xmm4, %xmm3
753; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
754; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm4 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
755; AVX1-NEXT:    vpslld $23, %xmm4, %xmm4
756; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
757; AVX1-NEXT:    vcvttps2dq %xmm4, %xmm4
758; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
759; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4,4,5,5,6,6,7,7]
760; AVX1-NEXT:    vpslld $23, %xmm2, %xmm2
761; AVX1-NEXT:    vpaddd %xmm5, %xmm2, %xmm2
762; AVX1-NEXT:    vcvttps2dq %xmm2, %xmm2
763; AVX1-NEXT:    vpackusdw %xmm2, %xmm4, %xmm2
764; AVX1-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
765; AVX1-NEXT:    vpmullw %xmm2, %xmm0, %xmm0
766; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm0
767; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
768; AVX1-NEXT:    retq
769;
770; AVX2-LABEL: var_funnnel_v16i8:
771; AVX2:       # %bb.0:
772; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
773; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm3 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
774; AVX2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
775; AVX2-NEXT:    vpxor %xmm4, %xmm4, %xmm4
776; AVX2-NEXT:    vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
777; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
778; AVX2-NEXT:    vpsllvd %ymm4, %ymm3, %ymm3
779; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
780; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
781; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
782; AVX2-NEXT:    vpsrlw $8, %xmm3, %xmm3
783; AVX2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
784; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
785; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero
786; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
787; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
788; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
789; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm0
790; AVX2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
791; AVX2-NEXT:    vzeroupper
792; AVX2-NEXT:    retq
793;
794; AVX512F-LABEL: var_funnnel_v16i8:
795; AVX512F:       # %bb.0:
796; AVX512F-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
797; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm4
798; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
799; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
800; AVX512F-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
801; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm2
802; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
803; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
804; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
805; AVX512F-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
806; AVX512F-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
807; AVX512F-NEXT:    vpord %zmm1, %zmm0, %zmm0
808; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
809; AVX512F-NEXT:    vzeroupper
810; AVX512F-NEXT:    retq
811;
812; AVX512VL-LABEL: var_funnnel_v16i8:
813; AVX512VL:       # %bb.0:
814; AVX512VL-NEXT:    vpbroadcastd {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
815; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
816; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero,xmm4[4],zero,zero,zero,xmm4[5],zero,zero,zero,xmm4[6],zero,zero,zero,xmm4[7],zero,zero,zero,xmm4[8],zero,zero,zero,xmm4[9],zero,zero,zero,xmm4[10],zero,zero,zero,xmm4[11],zero,zero,zero,xmm4[12],zero,zero,zero,xmm4[13],zero,zero,zero,xmm4[14],zero,zero,zero,xmm4[15],zero,zero,zero
817; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
818; AVX512VL-NEXT:    vpsllvd %zmm4, %zmm0, %zmm0
819; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
820; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
821; AVX512VL-NEXT:    vpsrlw $1, %xmm1, %xmm1
822; AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm1
823; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
824; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm1, %zmm1
825; AVX512VL-NEXT:    vpord %zmm1, %zmm0, %zmm0
826; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
827; AVX512VL-NEXT:    vzeroupper
828; AVX512VL-NEXT:    retq
829;
830; AVX512BW-LABEL: var_funnnel_v16i8:
831; AVX512BW:       # %bb.0:
832; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
833; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
834; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
835; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
836; AVX512BW-NEXT:    vpsllw $8, %ymm0, %ymm0
837; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
838; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
839; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
840; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
841; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
842; AVX512BW-NEXT:    vzeroupper
843; AVX512BW-NEXT:    retq
844;
845; AVX512VBMI2-LABEL: var_funnnel_v16i8:
846; AVX512VBMI2:       # %bb.0:
847; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
848; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
849; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
850; AVX512VBMI2-NEXT:    vpermt2b %zmm0, %zmm3, %zmm1
851; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm0
852; AVX512VBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
853; AVX512VBMI2-NEXT:    vpsllvw %zmm0, %zmm1, %zmm0
854; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0
855; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
856; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
857; AVX512VBMI2-NEXT:    vzeroupper
858; AVX512VBMI2-NEXT:    retq
859;
860; AVX512VLBW-LABEL: var_funnnel_v16i8:
861; AVX512VLBW:       # %bb.0:
862; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
863; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
864; AVX512VLBW-NEXT:    vpsllw $8, %ymm0, %ymm0
865; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
866; AVX512VLBW-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1
867; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
868; AVX512VLBW-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0
869; AVX512VLBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
870; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
871; AVX512VLBW-NEXT:    vzeroupper
872; AVX512VLBW-NEXT:    retq
873;
874; AVX512VLVBMI2-LABEL: var_funnnel_v16i8:
875; AVX512VLVBMI2:       # %bb.0:
876; AVX512VLVBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
877; AVX512VLVBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
878; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
879; AVX512VLVBMI2-NEXT:    vpermi2b %ymm0, %ymm1, %ymm3
880; AVX512VLVBMI2-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm0
881; AVX512VLVBMI2-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
882; AVX512VLVBMI2-NEXT:    vpsllvw %ymm0, %ymm3, %ymm0
883; AVX512VLVBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0
884; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
885; AVX512VLVBMI2-NEXT:    vzeroupper
886; AVX512VLVBMI2-NEXT:    retq
887;
888; XOPAVX1-LABEL: var_funnnel_v16i8:
889; XOPAVX1:       # %bb.0:
890; XOPAVX1-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
891; XOPAVX1-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
892; XOPAVX1-NEXT:    vbroadcastss {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
893; XOPAVX1-NEXT:    vpandn %xmm3, %xmm2, %xmm4
894; XOPAVX1-NEXT:    vpxor %xmm5, %xmm5, %xmm5
895; XOPAVX1-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
896; XOPAVX1-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
897; XOPAVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
898; XOPAVX1-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
899; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
900; XOPAVX1-NEXT:    retq
901;
902; XOPAVX2-LABEL: var_funnnel_v16i8:
903; XOPAVX2:       # %bb.0:
904; XOPAVX2-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
905; XOPAVX2-NEXT:    vpshlb %xmm3, %xmm1, %xmm1
906; XOPAVX2-NEXT:    vpbroadcastb {{.*#+}} xmm3 = [7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7]
907; XOPAVX2-NEXT:    vpandn %xmm3, %xmm2, %xmm4
908; XOPAVX2-NEXT:    vpxor %xmm5, %xmm5, %xmm5
909; XOPAVX2-NEXT:    vpsubb %xmm4, %xmm5, %xmm4
910; XOPAVX2-NEXT:    vpshlb %xmm4, %xmm1, %xmm1
911; XOPAVX2-NEXT:    vpand %xmm3, %xmm2, %xmm2
912; XOPAVX2-NEXT:    vpshlb %xmm2, %xmm0, %xmm0
913; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
914; XOPAVX2-NEXT:    retq
915;
916; X86-SSE2-LABEL: var_funnnel_v16i8:
917; X86-SSE2:       # %bb.0:
918; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
919; X86-SSE2-NEXT:    pxor %xmm5, %xmm5
920; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
921; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
922; X86-SSE2-NEXT:    movdqa %xmm4, %xmm6
923; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4,4,5,5,6,6,7,7]
924; X86-SSE2-NEXT:    pslld $23, %xmm6
925; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216]
926; X86-SSE2-NEXT:    paddd %xmm3, %xmm6
927; X86-SSE2-NEXT:    cvttps2dq %xmm6, %xmm6
928; X86-SSE2-NEXT:    pslld $16, %xmm6
929; X86-SSE2-NEXT:    psrad $16, %xmm6
930; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3]
931; X86-SSE2-NEXT:    pslld $23, %xmm4
932; X86-SSE2-NEXT:    paddd %xmm3, %xmm4
933; X86-SSE2-NEXT:    cvttps2dq %xmm4, %xmm7
934; X86-SSE2-NEXT:    pslld $16, %xmm7
935; X86-SSE2-NEXT:    psrad $16, %xmm7
936; X86-SSE2-NEXT:    packssdw %xmm6, %xmm7
937; X86-SSE2-NEXT:    movdqa %xmm1, %xmm4
938; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
939; X86-SSE2-NEXT:    pmullw %xmm7, %xmm4
940; X86-SSE2-NEXT:    psrlw $8, %xmm4
941; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
942; X86-SSE2-NEXT:    movdqa %xmm2, %xmm5
943; X86-SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7]
944; X86-SSE2-NEXT:    pslld $23, %xmm5
945; X86-SSE2-NEXT:    paddd %xmm3, %xmm5
946; X86-SSE2-NEXT:    cvttps2dq %xmm5, %xmm5
947; X86-SSE2-NEXT:    pslld $16, %xmm5
948; X86-SSE2-NEXT:    psrad $16, %xmm5
949; X86-SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3]
950; X86-SSE2-NEXT:    pslld $23, %xmm2
951; X86-SSE2-NEXT:    paddd %xmm3, %xmm2
952; X86-SSE2-NEXT:    cvttps2dq %xmm2, %xmm2
953; X86-SSE2-NEXT:    pslld $16, %xmm2
954; X86-SSE2-NEXT:    psrad $16, %xmm2
955; X86-SSE2-NEXT:    packssdw %xmm5, %xmm2
956; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
957; X86-SSE2-NEXT:    pmullw %xmm1, %xmm2
958; X86-SSE2-NEXT:    psrlw $8, %xmm2
959; X86-SSE2-NEXT:    packuswb %xmm4, %xmm2
960; X86-SSE2-NEXT:    movdqa %xmm2, %xmm0
961; X86-SSE2-NEXT:    retl
962  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt)
963  ret <16 x i8> %res
964}
965
966;
967; Uniform Variable Shifts
968;
969
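; When the shift amount is splatted, only one scalar count is needed:
; psllq/psrlq read the count from the low 64 bits of their xmm operand, so
; the per-element emulation used above is unnecessary.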
970define <2 x i64> @splatvar_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %amt) nounwind {
971; SSE2-LABEL: splatvar_funnnel_v2i64:
972; SSE2:       # %bb.0:
973; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [63,63]
974; SSE2-NEXT:    movdqa %xmm2, %xmm4
975; SSE2-NEXT:    pandn %xmm3, %xmm4
976; SSE2-NEXT:    psrlq $1, %xmm1
977; SSE2-NEXT:    psrlq %xmm4, %xmm1
978; SSE2-NEXT:    pand %xmm3, %xmm2
979; SSE2-NEXT:    psllq %xmm2, %xmm0
980; SSE2-NEXT:    por %xmm1, %xmm0
981; SSE2-NEXT:    retq
982;
983; SSE41-LABEL: splatvar_funnnel_v2i64:
984; SSE41:       # %bb.0:
985; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm3 = [63,63]
986; SSE41-NEXT:    movdqa %xmm2, %xmm4
987; SSE41-NEXT:    pandn %xmm3, %xmm4
988; SSE41-NEXT:    psrlq $1, %xmm1
989; SSE41-NEXT:    psrlq %xmm4, %xmm1
990; SSE41-NEXT:    pand %xmm3, %xmm2
991; SSE41-NEXT:    psllq %xmm2, %xmm0
992; SSE41-NEXT:    por %xmm1, %xmm0
993; SSE41-NEXT:    retq
994;
995; AVX-LABEL: splatvar_funnnel_v2i64:
996; AVX:       # %bb.0:
997; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
998; AVX-NEXT:    vpandn %xmm3, %xmm2, %xmm4
999; AVX-NEXT:    vpsrlq $1, %xmm1, %xmm1
1000; AVX-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1001; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
1002; AVX-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1003; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1004; AVX-NEXT:    retq
1005;
1006; AVX512F-LABEL: splatvar_funnnel_v2i64:
1007; AVX512F:       # %bb.0:
1008; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
1009; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1010; AVX512F-NEXT:    vpsrlq $1, %xmm1, %xmm1
1011; AVX512F-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1012; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
1013; AVX512F-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1014; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1015; AVX512F-NEXT:    retq
1016;
1017; AVX512VL-LABEL: splatvar_funnnel_v2i64:
1018; AVX512VL:       # %bb.0:
1019; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
1020; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1021; AVX512VL-NEXT:    vpsrlq $1, %xmm1, %xmm1
1022; AVX512VL-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1023; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
1024; AVX512VL-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1025; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1026; AVX512VL-NEXT:    retq
1027;
1028; AVX512BW-LABEL: splatvar_funnnel_v2i64:
1029; AVX512BW:       # %bb.0:
1030; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
1031; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1032; AVX512BW-NEXT:    vpsrlq $1, %xmm1, %xmm1
1033; AVX512BW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1034; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
1035; AVX512BW-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1036; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1037; AVX512BW-NEXT:    retq
1038;
1039; AVX512VBMI2-LABEL: splatvar_funnnel_v2i64:
1040; AVX512VBMI2:       # %bb.0:
1041; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1042; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1043; AVX512VBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
1044; AVX512VBMI2-NEXT:    vpshldvq %zmm2, %zmm1, %zmm0
1045; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1046; AVX512VBMI2-NEXT:    vzeroupper
1047; AVX512VBMI2-NEXT:    retq
1048;
1049; AVX512VLBW-LABEL: splatvar_funnnel_v2i64:
1050; AVX512VLBW:       # %bb.0:
1051; AVX512VLBW-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
1052; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1053; AVX512VLBW-NEXT:    vpsrlq $1, %xmm1, %xmm1
1054; AVX512VLBW-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1055; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm2
1056; AVX512VLBW-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1057; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1058; AVX512VLBW-NEXT:    retq
1059;
1060; AVX512VLVBMI2-LABEL: splatvar_funnnel_v2i64:
1061; AVX512VLVBMI2:       # %bb.0:
1062; AVX512VLVBMI2-NEXT:    vpbroadcastq %xmm2, %xmm2
1063; AVX512VLVBMI2-NEXT:    vpshldvq %xmm2, %xmm1, %xmm0
1064; AVX512VLVBMI2-NEXT:    retq
1065;
1066; XOP-LABEL: splatvar_funnnel_v2i64:
1067; XOP:       # %bb.0:
1068; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [63,63]
1069; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1070; XOP-NEXT:    vpsrlq $1, %xmm1, %xmm1
1071; XOP-NEXT:    vpsrlq %xmm4, %xmm1, %xmm1
1072; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm2
1073; XOP-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1074; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1075; XOP-NEXT:    retq
1076;
1077; X86-SSE2-LABEL: splatvar_funnnel_v2i64:
1078; X86-SSE2:       # %bb.0:
1079; X86-SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [63,0,63,0]
1080; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
1081; X86-SSE2-NEXT:    pandn %xmm3, %xmm4
1082; X86-SSE2-NEXT:    psrlq $1, %xmm1
1083; X86-SSE2-NEXT:    psrlq %xmm4, %xmm1
1084; X86-SSE2-NEXT:    pand %xmm3, %xmm2
1085; X86-SSE2-NEXT:    psllq %xmm2, %xmm0
1086; X86-SSE2-NEXT:    por %xmm1, %xmm0
1087; X86-SSE2-NEXT:    retl
1088  %splat = shufflevector <2 x i64> %amt, <2 x i64> undef, <2 x i32> zeroinitializer
1089  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %splat)
1090  ret <2 x i64> %res
1091}
1092
1093define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %amt) nounwind {
1094; SSE-LABEL: splatvar_funnnel_v4i32:
1095; SSE:       # %bb.0:
1096; SSE-NEXT:    movdqa %xmm1, %xmm3
1097; SSE-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1098; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1099; SSE-NEXT:    psllq %xmm2, %xmm3
1100; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1101; SSE-NEXT:    psllq %xmm2, %xmm1
1102; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
1103; SSE-NEXT:    movaps %xmm1, %xmm0
1104; SSE-NEXT:    retq
1105;
1106; AVX-LABEL: splatvar_funnnel_v4i32:
1107; AVX:       # %bb.0:
1108; AVX-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1109; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1110; AVX-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
1111; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1112; AVX-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1113; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1114; AVX-NEXT:    retq
1115;
1116; AVX512F-LABEL: splatvar_funnnel_v4i32:
1117; AVX512F:       # %bb.0:
1118; AVX512F-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1119; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1120; AVX512F-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
1121; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1122; AVX512F-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1123; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1124; AVX512F-NEXT:    retq
1125;
1126; AVX512VL-LABEL: splatvar_funnnel_v4i32:
1127; AVX512VL:       # %bb.0:
1128; AVX512VL-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1129; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1130; AVX512VL-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
1131; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1132; AVX512VL-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1133; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1134; AVX512VL-NEXT:    retq
1135;
1136; AVX512BW-LABEL: splatvar_funnnel_v4i32:
1137; AVX512BW:       # %bb.0:
1138; AVX512BW-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1139; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1140; AVX512BW-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
1141; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1142; AVX512BW-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1143; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1144; AVX512BW-NEXT:    retq
1145;
1146; AVX512VBMI2-LABEL: splatvar_funnnel_v4i32:
1147; AVX512VBMI2:       # %bb.0:
1148; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1149; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1150; AVX512VBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
1151; AVX512VBMI2-NEXT:    vpshldvd %zmm2, %zmm1, %zmm0
1152; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1153; AVX512VBMI2-NEXT:    vzeroupper
1154; AVX512VBMI2-NEXT:    retq
1155;
1156; AVX512VLBW-LABEL: splatvar_funnnel_v4i32:
1157; AVX512VLBW:       # %bb.0:
1158; AVX512VLBW-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1159; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1160; AVX512VLBW-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
1161; AVX512VLBW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1162; AVX512VLBW-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1163; AVX512VLBW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1164; AVX512VLBW-NEXT:    retq
1165;
1166; AVX512VLVBMI2-LABEL: splatvar_funnnel_v4i32:
1167; AVX512VLVBMI2:       # %bb.0:
1168; AVX512VLVBMI2-NEXT:    vpbroadcastd %xmm2, %xmm2
1169; AVX512VLVBMI2-NEXT:    vpshldvd %xmm2, %xmm1, %xmm0
1170; AVX512VLVBMI2-NEXT:    retq
1171;
1172; XOP-LABEL: splatvar_funnnel_v4i32:
1173; XOP:       # %bb.0:
1174; XOP-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1175; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1176; XOP-NEXT:    vpsllq %xmm2, %xmm3, %xmm3
1177; XOP-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1178; XOP-NEXT:    vpsllq %xmm2, %xmm0, %xmm0
1179; XOP-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm3[1,3]
1180; XOP-NEXT:    retq
1181;
1182; X86-SSE2-LABEL: splatvar_funnnel_v4i32:
1183; X86-SSE2:       # %bb.0:
1184; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
1185; X86-SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm0[2],xmm3[3],xmm0[3]
1186; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1187; X86-SSE2-NEXT:    psllq %xmm2, %xmm3
1188; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
1189; X86-SSE2-NEXT:    psllq %xmm2, %xmm1
1190; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm3[1,3]
1191; X86-SSE2-NEXT:    movaps %xmm1, %xmm0
1192; X86-SSE2-NEXT:    retl
1193  %splat = shufflevector <4 x i32> %amt, <4 x i32> undef, <4 x i32> zeroinitializer
1194  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %splat)
1195  ret <4 x i32> %res
1196}
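; For the splatted v4i32 and v16i8 cases, rather than doing separate left and
; right shifts and ORing them, the lowering interleaves y and x into
; double-width lanes (y in the low half, x in the high half), performs a single
; full-width left shift by the splatted amount, and keeps the upper half of
; each lane, which is exactly fshl(x, y, amt).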
1197
1198define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind {
1199; SSE2-LABEL: splatvar_funnnel_v8i16:
1200; SSE2:       # %bb.0:
1201; SSE2-NEXT:    movd {{.*#+}} xmm3 = [15,0,0,0]
1202; SSE2-NEXT:    movdqa %xmm2, %xmm4
1203; SSE2-NEXT:    pandn %xmm3, %xmm4
1204; SSE2-NEXT:    psrlw $1, %xmm1
1205; SSE2-NEXT:    psrlw %xmm4, %xmm1
1206; SSE2-NEXT:    pand %xmm3, %xmm2
1207; SSE2-NEXT:    psllw %xmm2, %xmm0
1208; SSE2-NEXT:    por %xmm1, %xmm0
1209; SSE2-NEXT:    retq
1210;
1211; SSE41-LABEL: splatvar_funnnel_v8i16:
1212; SSE41:       # %bb.0:
1213; SSE41-NEXT:    pmovsxbq {{.*#+}} xmm3 = [15,0]
1214; SSE41-NEXT:    movdqa %xmm2, %xmm4
1215; SSE41-NEXT:    pandn %xmm3, %xmm4
1216; SSE41-NEXT:    psrlw $1, %xmm1
1217; SSE41-NEXT:    psrlw %xmm4, %xmm1
1218; SSE41-NEXT:    pand %xmm3, %xmm2
1219; SSE41-NEXT:    psllw %xmm2, %xmm0
1220; SSE41-NEXT:    por %xmm1, %xmm0
1221; SSE41-NEXT:    retq
1222;
1223; AVX-LABEL: splatvar_funnnel_v8i16:
1224; AVX:       # %bb.0:
1225; AVX-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1226; AVX-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1227; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
1228; AVX-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1229; AVX-NEXT:    vpand %xmm3, %xmm2, %xmm2
1230; AVX-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1231; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1232; AVX-NEXT:    retq
1233;
1234; AVX512F-LABEL: splatvar_funnnel_v8i16:
1235; AVX512F:       # %bb.0:
1236; AVX512F-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1237; AVX512F-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1238; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
1239; AVX512F-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1240; AVX512F-NEXT:    vpand %xmm3, %xmm2, %xmm2
1241; AVX512F-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1242; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1243; AVX512F-NEXT:    retq
1244;
1245; AVX512VL-LABEL: splatvar_funnnel_v8i16:
1246; AVX512VL:       # %bb.0:
1247; AVX512VL-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1248; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1249; AVX512VL-NEXT:    vpsrlw $1, %xmm1, %xmm1
1250; AVX512VL-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1251; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm2
1252; AVX512VL-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1253; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1254; AVX512VL-NEXT:    retq
1255;
1256; AVX512BW-LABEL: splatvar_funnnel_v8i16:
1257; AVX512BW:       # %bb.0:
1258; AVX512BW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1259; AVX512BW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1260; AVX512BW-NEXT:    vpsrlw $1, %xmm1, %xmm1
1261; AVX512BW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1262; AVX512BW-NEXT:    vpand %xmm3, %xmm2, %xmm2
1263; AVX512BW-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1264; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1265; AVX512BW-NEXT:    retq
1266;
1267; AVX512VBMI2-LABEL: splatvar_funnnel_v8i16:
1268; AVX512VBMI2:       # %bb.0:
1269; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1270; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1271; AVX512VBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
1272; AVX512VBMI2-NEXT:    vpshldvw %zmm2, %zmm1, %zmm0
1273; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1274; AVX512VBMI2-NEXT:    vzeroupper
1275; AVX512VBMI2-NEXT:    retq
1276;
1277; AVX512VLBW-LABEL: splatvar_funnnel_v8i16:
1278; AVX512VLBW:       # %bb.0:
1279; AVX512VLBW-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1280; AVX512VLBW-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1281; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
1282; AVX512VLBW-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1283; AVX512VLBW-NEXT:    vpand %xmm3, %xmm2, %xmm2
1284; AVX512VLBW-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1285; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1286; AVX512VLBW-NEXT:    retq
1287;
1288; AVX512VLVBMI2-LABEL: splatvar_funnnel_v8i16:
1289; AVX512VLVBMI2:       # %bb.0:
1290; AVX512VLVBMI2-NEXT:    vpbroadcastw %xmm2, %xmm2
1291; AVX512VLVBMI2-NEXT:    vpshldvw %xmm2, %xmm1, %xmm0
1292; AVX512VLVBMI2-NEXT:    retq
1293;
1294; XOP-LABEL: splatvar_funnnel_v8i16:
1295; XOP:       # %bb.0:
1296; XOP-NEXT:    vpmovsxbq {{.*#+}} xmm3 = [15,0]
1297; XOP-NEXT:    vpandn %xmm3, %xmm2, %xmm4
1298; XOP-NEXT:    vpsrlw $1, %xmm1, %xmm1
1299; XOP-NEXT:    vpsrlw %xmm4, %xmm1, %xmm1
1300; XOP-NEXT:    vpand %xmm3, %xmm2, %xmm2
1301; XOP-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1302; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
1303; XOP-NEXT:    retq
1304;
1305; X86-SSE2-LABEL: splatvar_funnnel_v8i16:
1306; X86-SSE2:       # %bb.0:
1307; X86-SSE2-NEXT:    movd {{.*#+}} xmm3 = [15,0,0,0]
1308; X86-SSE2-NEXT:    movdqa %xmm2, %xmm4
1309; X86-SSE2-NEXT:    pandn %xmm3, %xmm4
1310; X86-SSE2-NEXT:    psrlw $1, %xmm1
1311; X86-SSE2-NEXT:    psrlw %xmm4, %xmm1
1312; X86-SSE2-NEXT:    pand %xmm3, %xmm2
1313; X86-SSE2-NEXT:    psllw %xmm2, %xmm0
1314; X86-SSE2-NEXT:    por %xmm1, %xmm0
1315; X86-SSE2-NEXT:    retl
1316  %splat = shufflevector <8 x i16> %amt, <8 x i16> undef, <8 x i32> zeroinitializer
1317  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %splat)
1318  ret <8 x i16> %res
1319}
1320
1321define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %amt) nounwind {
1322; SSE-LABEL: splatvar_funnnel_v16i8:
1323; SSE:       # %bb.0:
1324; SSE-NEXT:    movdqa %xmm1, %xmm3
1325; SSE-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
1326; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1327; SSE-NEXT:    psllw %xmm2, %xmm3
1328; SSE-NEXT:    psrlw $8, %xmm3
1329; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1330; SSE-NEXT:    psllw %xmm2, %xmm1
1331; SSE-NEXT:    psrlw $8, %xmm1
1332; SSE-NEXT:    packuswb %xmm3, %xmm1
1333; SSE-NEXT:    movdqa %xmm1, %xmm0
1334; SSE-NEXT:    retq
1335;
1336; AVX-LABEL: splatvar_funnnel_v16i8:
1337; AVX:       # %bb.0:
1338; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1339; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1340; AVX-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
1341; AVX-NEXT:    vpsrlw $8, %xmm3, %xmm3
1342; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1343; AVX-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1344; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
1345; AVX-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1346; AVX-NEXT:    retq
1347;
1348; AVX512F-LABEL: splatvar_funnnel_v16i8:
1349; AVX512F:       # %bb.0:
1350; AVX512F-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1351; AVX512F-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1352; AVX512F-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
1353; AVX512F-NEXT:    vpsrlw $8, %xmm3, %xmm3
1354; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1355; AVX512F-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1356; AVX512F-NEXT:    vpsrlw $8, %xmm0, %xmm0
1357; AVX512F-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1358; AVX512F-NEXT:    retq
1359;
1360; AVX512VL-LABEL: splatvar_funnnel_v16i8:
1361; AVX512VL:       # %bb.0:
1362; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1363; AVX512VL-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1364; AVX512VL-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
1365; AVX512VL-NEXT:    vpsrlw $8, %xmm3, %xmm3
1366; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1367; AVX512VL-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1368; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
1369; AVX512VL-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1370; AVX512VL-NEXT:    retq
1371;
1372; AVX512BW-LABEL: splatvar_funnnel_v16i8:
1373; AVX512BW:       # %bb.0:
1374; AVX512BW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1375; AVX512BW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1376; AVX512BW-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
1377; AVX512BW-NEXT:    vpsrlw $8, %xmm3, %xmm3
1378; AVX512BW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1379; AVX512BW-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1380; AVX512BW-NEXT:    vpsrlw $8, %xmm0, %xmm0
1381; AVX512BW-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1382; AVX512BW-NEXT:    retq
1383;
1384; AVX512VBMI2-LABEL: splatvar_funnnel_v16i8:
1385; AVX512VBMI2:       # %bb.0:
1386; AVX512VBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1387; AVX512VBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1388; AVX512VBMI2-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
1389; AVX512VBMI2-NEXT:    vpsrlw $8, %xmm3, %xmm3
1390; AVX512VBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1391; AVX512VBMI2-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1392; AVX512VBMI2-NEXT:    vpsrlw $8, %xmm0, %xmm0
1393; AVX512VBMI2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1394; AVX512VBMI2-NEXT:    retq
1395;
1396; AVX512VLBW-LABEL: splatvar_funnnel_v16i8:
1397; AVX512VLBW:       # %bb.0:
1398; AVX512VLBW-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1399; AVX512VLBW-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1400; AVX512VLBW-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
1401; AVX512VLBW-NEXT:    vpsrlw $8, %xmm3, %xmm3
1402; AVX512VLBW-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1403; AVX512VLBW-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1404; AVX512VLBW-NEXT:    vpsrlw $8, %xmm0, %xmm0
1405; AVX512VLBW-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1406; AVX512VLBW-NEXT:    retq
1407;
1408; AVX512VLVBMI2-LABEL: splatvar_funnnel_v16i8:
1409; AVX512VLVBMI2:       # %bb.0:
1410; AVX512VLVBMI2-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1411; AVX512VLVBMI2-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1412; AVX512VLVBMI2-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
1413; AVX512VLVBMI2-NEXT:    vpsrlw $8, %xmm3, %xmm3
1414; AVX512VLVBMI2-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1415; AVX512VLVBMI2-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1416; AVX512VLVBMI2-NEXT:    vpsrlw $8, %xmm0, %xmm0
1417; AVX512VLVBMI2-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
1418; AVX512VLVBMI2-NEXT:    retq
1419;
1420; XOP-LABEL: splatvar_funnnel_v16i8:
1421; XOP:       # %bb.0:
1422; XOP-NEXT:    vpunpckhbw {{.*#+}} xmm3 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
1423; XOP-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
1424; XOP-NEXT:    vpsllw %xmm2, %xmm3, %xmm3
1425; XOP-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1426; XOP-NEXT:    vpsllw %xmm2, %xmm0, %xmm0
1427; XOP-NEXT:    vpperm {{.*#+}} xmm0 = xmm0[1,3,5,7,9,11,13,15],xmm3[1,3,5,7,9,11,13,15]
1428; XOP-NEXT:    retq
1429;
1430; X86-SSE2-LABEL: splatvar_funnnel_v16i8:
1431; X86-SSE2:       # %bb.0:
1432; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
1433; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
1434; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1435; X86-SSE2-NEXT:    psllw %xmm2, %xmm3
1436; X86-SSE2-NEXT:    psrlw $8, %xmm3
1437; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
1438; X86-SSE2-NEXT:    psllw %xmm2, %xmm1
1439; X86-SSE2-NEXT:    psrlw $8, %xmm1
1440; X86-SSE2-NEXT:    packuswb %xmm3, %xmm1
1441; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
1442; X86-SSE2-NEXT:    retl
1443  %splat = shufflevector <16 x i8> %amt, <16 x i8> undef, <16 x i32> zeroinitializer
1444  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %splat)
1445  ret <16 x i8> %res
1446}
1447
1448; CGP (CodeGenPrepare) should allow a cross-block splat shift amount to be seen in SDAG (SelectionDAG).
1449; PR37426 - https://bugs.llvm.org/show_bug.cgi?id=37426
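; SelectionDAG is built one basic block at a time, so the splat created in
; %entry would otherwise be opaque to the shift lowering in %loop; sinking it
; lets the backend treat the amount as a uniform value. Since both fshl
; operands are %x, this is a rotate, and targets with variable rotates use them
; directly inside the loop (vprolvd on AVX512, vprotd on XOP).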
1450
1451define void @sink_splatvar(ptr %p, i32 %shift_amt) {
1452; SSE-LABEL: sink_splatvar:
1453; SSE:       # %bb.0: # %entry
1454; SSE-NEXT:    movd %esi, %xmm0
1455; SSE-NEXT:    movq $-1024, %rax # imm = 0xFC00
1456; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1457; SSE-NEXT:    .p2align 4
1458; SSE-NEXT:  .LBB8_1: # %loop
1459; SSE-NEXT:    # =>This Inner Loop Header: Depth=1
1460; SSE-NEXT:    movdqu 1024(%rdi,%rax), %xmm1
1461; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
1462; SSE-NEXT:    psllq %xmm0, %xmm2
1463; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1464; SSE-NEXT:    psllq %xmm0, %xmm1
1465; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
1466; SSE-NEXT:    movups %xmm1, 1024(%rdi,%rax)
1467; SSE-NEXT:    addq $16, %rax
1468; SSE-NEXT:    jne .LBB8_1
1469; SSE-NEXT:  # %bb.2: # %end
1470; SSE-NEXT:    retq
1471;
1472; AVX1-LABEL: sink_splatvar:
1473; AVX1:       # %bb.0: # %entry
1474; AVX1-NEXT:    vmovd %esi, %xmm0
1475; AVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
1476; AVX1-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1477; AVX1-NEXT:    .p2align 4
1478; AVX1-NEXT:  .LBB8_1: # %loop
1479; AVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1480; AVX1-NEXT:    vmovdqu 1024(%rdi,%rax), %xmm1
1481; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
1482; AVX1-NEXT:    vpsllq %xmm0, %xmm2, %xmm2
1483; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1484; AVX1-NEXT:    vpsllq %xmm0, %xmm1, %xmm1
1485; AVX1-NEXT:    vshufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
1486; AVX1-NEXT:    vmovups %xmm1, 1024(%rdi,%rax)
1487; AVX1-NEXT:    addq $16, %rax
1488; AVX1-NEXT:    jne .LBB8_1
1489; AVX1-NEXT:  # %bb.2: # %end
1490; AVX1-NEXT:    retq
1491;
1492; AVX2-LABEL: sink_splatvar:
1493; AVX2:       # %bb.0: # %entry
1494; AVX2-NEXT:    vmovd %esi, %xmm0
1495; AVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
1496; AVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
1497; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [31,31,31,31]
1498; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
1499; AVX2-NEXT:    vpbroadcastd {{.*#+}} xmm1 = [32,32,32,32]
1500; AVX2-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
1501; AVX2-NEXT:    .p2align 4
1502; AVX2-NEXT:  .LBB8_1: # %loop
1503; AVX2-NEXT:    # =>This Inner Loop Header: Depth=1
1504; AVX2-NEXT:    vmovdqu 1024(%rdi,%rax), %xmm2
1505; AVX2-NEXT:    vpsllvd %xmm0, %xmm2, %xmm3
1506; AVX2-NEXT:    vpsrlvd %xmm1, %xmm2, %xmm2
1507; AVX2-NEXT:    vpor %xmm2, %xmm3, %xmm2
1508; AVX2-NEXT:    vmovdqu %xmm2, 1024(%rdi,%rax)
1509; AVX2-NEXT:    addq $16, %rax
1510; AVX2-NEXT:    jne .LBB8_1
1511; AVX2-NEXT:  # %bb.2: # %end
1512; AVX2-NEXT:    retq
1513;
1514; AVX512F-LABEL: sink_splatvar:
1515; AVX512F:       # %bb.0: # %entry
1516; AVX512F-NEXT:    vmovd %esi, %xmm0
1517; AVX512F-NEXT:    vpbroadcastd %xmm0, %xmm0
1518; AVX512F-NEXT:    movq $-1024, %rax # imm = 0xFC00
1519; AVX512F-NEXT:    .p2align 4
1520; AVX512F-NEXT:  .LBB8_1: # %loop
1521; AVX512F-NEXT:    # =>This Inner Loop Header: Depth=1
1522; AVX512F-NEXT:    vmovdqu 1024(%rdi,%rax), %xmm1
1523; AVX512F-NEXT:    vprolvd %zmm0, %zmm1, %zmm1
1524; AVX512F-NEXT:    vmovdqu %xmm1, 1024(%rdi,%rax)
1525; AVX512F-NEXT:    addq $16, %rax
1526; AVX512F-NEXT:    jne .LBB8_1
1527; AVX512F-NEXT:  # %bb.2: # %end
1528; AVX512F-NEXT:    vzeroupper
1529; AVX512F-NEXT:    retq
1530;
1531; AVX512VL-LABEL: sink_splatvar:
1532; AVX512VL:       # %bb.0: # %entry
1533; AVX512VL-NEXT:    vpbroadcastd %esi, %xmm0
1534; AVX512VL-NEXT:    movq $-1024, %rax # imm = 0xFC00
1535; AVX512VL-NEXT:    .p2align 4
1536; AVX512VL-NEXT:  .LBB8_1: # %loop
1537; AVX512VL-NEXT:    # =>This Inner Loop Header: Depth=1
1538; AVX512VL-NEXT:    vmovdqu 1024(%rdi,%rax), %xmm1
1539; AVX512VL-NEXT:    vprolvd %xmm0, %xmm1, %xmm1
1540; AVX512VL-NEXT:    vmovdqu %xmm1, 1024(%rdi,%rax)
1541; AVX512VL-NEXT:    addq $16, %rax
1542; AVX512VL-NEXT:    jne .LBB8_1
1543; AVX512VL-NEXT:  # %bb.2: # %end
1544; AVX512VL-NEXT:    retq
1545;
1546; AVX512BW-LABEL: sink_splatvar:
1547; AVX512BW:       # %bb.0: # %entry
1548; AVX512BW-NEXT:    vmovd %esi, %xmm0
1549; AVX512BW-NEXT:    vpbroadcastd %xmm0, %xmm0
1550; AVX512BW-NEXT:    movq $-1024, %rax # imm = 0xFC00
1551; AVX512BW-NEXT:    .p2align 4
1552; AVX512BW-NEXT:  .LBB8_1: # %loop
1553; AVX512BW-NEXT:    # =>This Inner Loop Header: Depth=1
1554; AVX512BW-NEXT:    vmovdqu 1024(%rdi,%rax), %xmm1
1555; AVX512BW-NEXT:    vprolvd %zmm0, %zmm1, %zmm1
1556; AVX512BW-NEXT:    vmovdqu %xmm1, 1024(%rdi,%rax)
1557; AVX512BW-NEXT:    addq $16, %rax
1558; AVX512BW-NEXT:    jne .LBB8_1
1559; AVX512BW-NEXT:  # %bb.2: # %end
1560; AVX512BW-NEXT:    vzeroupper
1561; AVX512BW-NEXT:    retq
1562;
1563; AVX512VBMI2-LABEL: sink_splatvar:
1564; AVX512VBMI2:       # %bb.0: # %entry
1565; AVX512VBMI2-NEXT:    vmovd %esi, %xmm0
1566; AVX512VBMI2-NEXT:    vpbroadcastd %xmm0, %xmm0
1567; AVX512VBMI2-NEXT:    movq $-1024, %rax # imm = 0xFC00
1568; AVX512VBMI2-NEXT:    .p2align 4
1569; AVX512VBMI2-NEXT:  .LBB8_1: # %loop
1570; AVX512VBMI2-NEXT:    # =>This Inner Loop Header: Depth=1
1571; AVX512VBMI2-NEXT:    vmovdqu 1024(%rdi,%rax), %xmm1
1572; AVX512VBMI2-NEXT:    vprolvd %zmm0, %zmm1, %zmm1
1573; AVX512VBMI2-NEXT:    vmovdqu %xmm1, 1024(%rdi,%rax)
1574; AVX512VBMI2-NEXT:    addq $16, %rax
1575; AVX512VBMI2-NEXT:    jne .LBB8_1
1576; AVX512VBMI2-NEXT:  # %bb.2: # %end
1577; AVX512VBMI2-NEXT:    vzeroupper
1578; AVX512VBMI2-NEXT:    retq
1579;
1580; AVX512VLBW-LABEL: sink_splatvar:
1581; AVX512VLBW:       # %bb.0: # %entry
1582; AVX512VLBW-NEXT:    vpbroadcastd %esi, %xmm0
1583; AVX512VLBW-NEXT:    movq $-1024, %rax # imm = 0xFC00
1584; AVX512VLBW-NEXT:    .p2align 4
1585; AVX512VLBW-NEXT:  .LBB8_1: # %loop
1586; AVX512VLBW-NEXT:    # =>This Inner Loop Header: Depth=1
1587; AVX512VLBW-NEXT:    vmovdqu 1024(%rdi,%rax), %xmm1
1588; AVX512VLBW-NEXT:    vprolvd %xmm0, %xmm1, %xmm1
1589; AVX512VLBW-NEXT:    vmovdqu %xmm1, 1024(%rdi,%rax)
1590; AVX512VLBW-NEXT:    addq $16, %rax
1591; AVX512VLBW-NEXT:    jne .LBB8_1
1592; AVX512VLBW-NEXT:  # %bb.2: # %end
1593; AVX512VLBW-NEXT:    retq
1594;
1595; AVX512VLVBMI2-LABEL: sink_splatvar:
1596; AVX512VLVBMI2:       # %bb.0: # %entry
1597; AVX512VLVBMI2-NEXT:    vpbroadcastd %esi, %xmm0
1598; AVX512VLVBMI2-NEXT:    movq $-1024, %rax # imm = 0xFC00
1599; AVX512VLVBMI2-NEXT:    .p2align 4
1600; AVX512VLVBMI2-NEXT:  .LBB8_1: # %loop
1601; AVX512VLVBMI2-NEXT:    # =>This Inner Loop Header: Depth=1
1602; AVX512VLVBMI2-NEXT:    vmovdqu 1024(%rdi,%rax), %xmm1
1603; AVX512VLVBMI2-NEXT:    vprolvd %xmm0, %xmm1, %xmm1
1604; AVX512VLVBMI2-NEXT:    vmovdqu %xmm1, 1024(%rdi,%rax)
1605; AVX512VLVBMI2-NEXT:    addq $16, %rax
1606; AVX512VLVBMI2-NEXT:    jne .LBB8_1
1607; AVX512VLVBMI2-NEXT:  # %bb.2: # %end
1608; AVX512VLVBMI2-NEXT:    retq
1609;
1610; XOPAVX1-LABEL: sink_splatvar:
1611; XOPAVX1:       # %bb.0: # %entry
1612; XOPAVX1-NEXT:    vmovd %esi, %xmm0
1613; XOPAVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
1614; XOPAVX1-NEXT:    movq $-1024, %rax # imm = 0xFC00
1615; XOPAVX1-NEXT:    .p2align 4
1616; XOPAVX1-NEXT:  .LBB8_1: # %loop
1617; XOPAVX1-NEXT:    # =>This Inner Loop Header: Depth=1
1618; XOPAVX1-NEXT:    vprotd %xmm0, 1024(%rdi,%rax), %xmm1
1619; XOPAVX1-NEXT:    vmovdqu %xmm1, 1024(%rdi,%rax)
1620; XOPAVX1-NEXT:    addq $16, %rax
1621; XOPAVX1-NEXT:    jne .LBB8_1
1622; XOPAVX1-NEXT:  # %bb.2: # %end
1623; XOPAVX1-NEXT:    retq
1624;
1625; XOPAVX2-LABEL: sink_splatvar:
1626; XOPAVX2:       # %bb.0: # %entry
1627; XOPAVX2-NEXT:    vmovd %esi, %xmm0
1628; XOPAVX2-NEXT:    vpbroadcastd %xmm0, %xmm0
1629; XOPAVX2-NEXT:    movq $-1024, %rax # imm = 0xFC00
1630; XOPAVX2-NEXT:    .p2align 4
1631; XOPAVX2-NEXT:  .LBB8_1: # %loop
1632; XOPAVX2-NEXT:    # =>This Inner Loop Header: Depth=1
1633; XOPAVX2-NEXT:    vprotd %xmm0, 1024(%rdi,%rax), %xmm1
1634; XOPAVX2-NEXT:    vmovdqu %xmm1, 1024(%rdi,%rax)
1635; XOPAVX2-NEXT:    addq $16, %rax
1636; XOPAVX2-NEXT:    jne .LBB8_1
1637; XOPAVX2-NEXT:  # %bb.2: # %end
1638; XOPAVX2-NEXT:    retq
1639;
1640; X86-SSE2-LABEL: sink_splatvar:
1641; X86-SSE2:       # %bb.0: # %entry
1642; X86-SSE2-NEXT:    pushl %esi
1643; X86-SSE2-NEXT:    .cfi_def_cfa_offset 8
1644; X86-SSE2-NEXT:    .cfi_offset %esi, -8
1645; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
1646; X86-SSE2-NEXT:    xorl %ecx, %ecx
1647; X86-SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
1648; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1649; X86-SSE2-NEXT:    xorl %edx, %edx
1650; X86-SSE2-NEXT:    .p2align 4
1651; X86-SSE2-NEXT:  .LBB8_1: # %loop
1652; X86-SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
1653; X86-SSE2-NEXT:    movdqu (%eax,%ecx,4), %xmm1
1654; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,2,3,3]
1655; X86-SSE2-NEXT:    psllq %xmm0, %xmm2
1656; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
1657; X86-SSE2-NEXT:    psllq %xmm0, %xmm1
1658; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
1659; X86-SSE2-NEXT:    movups %xmm1, (%eax,%ecx,4)
1660; X86-SSE2-NEXT:    addl $4, %ecx
1661; X86-SSE2-NEXT:    adcl $0, %edx
1662; X86-SSE2-NEXT:    movl %ecx, %esi
1663; X86-SSE2-NEXT:    xorl $256, %esi # imm = 0x100
1664; X86-SSE2-NEXT:    orl %edx, %esi
1665; X86-SSE2-NEXT:    jne .LBB8_1
1666; X86-SSE2-NEXT:  # %bb.2: # %end
1667; X86-SSE2-NEXT:    popl %esi
1668; X86-SSE2-NEXT:    .cfi_def_cfa_offset 4
1669; X86-SSE2-NEXT:    retl
1670entry:
1671  %ins = insertelement <4 x i32> undef, i32 %shift_amt, i32 0
1672  %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer
1673  br label %loop
1674
1675loop:
1676  %index = phi i64 [ 0, %entry ], [ %inc, %loop ]
1677  %addr = getelementptr inbounds i32, ptr %p, i64 %index
1678  %x = load <4 x i32>, ptr %addr, align 4
1679  %fsh = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %x, <4 x i32> %splat)
1680  store <4 x i32> %fsh, ptr %addr, align 4
1681  %inc = add i64 %index, 4
1682  %iv = icmp eq i64 %inc, 256
1683  br i1 %iv, label %end, label %loop
1684
1685end:
1686  ret void
1687}
1688
1689;
1690; Constant Shifts
1691;
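; With constant per-element amounts the modulo masking folds away. The per-lane
; left shifts are lowered either as immediate shifts blended together (v2i64 on
; SSE) or as multiplies by power-of-two constants (pmuludq/pmulld/pmullw), and
; the right-shift half is handled similarly, e.g. with pmulhuw for v8i16.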
1692
1693define <2 x i64> @constant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
1694; SSE2-LABEL: constant_funnnel_v2i64:
1695; SSE2:       # %bb.0:
1696; SSE2-NEXT:    movdqa %xmm1, %xmm2
1697; SSE2-NEXT:    psrlq $60, %xmm2
1698; SSE2-NEXT:    psrlq $50, %xmm1
1699; SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1700; SSE2-NEXT:    movdqa %xmm0, %xmm1
1701; SSE2-NEXT:    psllq $4, %xmm1
1702; SSE2-NEXT:    psllq $14, %xmm0
1703; SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1704; SSE2-NEXT:    orpd %xmm2, %xmm0
1705; SSE2-NEXT:    retq
1706;
1707; SSE41-LABEL: constant_funnnel_v2i64:
1708; SSE41:       # %bb.0:
1709; SSE41-NEXT:    movdqa %xmm1, %xmm2
1710; SSE41-NEXT:    psrlq $50, %xmm2
1711; SSE41-NEXT:    psrlq $60, %xmm1
1712; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1713; SSE41-NEXT:    movdqa %xmm0, %xmm1
1714; SSE41-NEXT:    psllq $14, %xmm1
1715; SSE41-NEXT:    psllq $4, %xmm0
1716; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
1717; SSE41-NEXT:    por %xmm2, %xmm0
1718; SSE41-NEXT:    retq
1719;
1720; AVX1-LABEL: constant_funnnel_v2i64:
1721; AVX1:       # %bb.0:
1722; AVX1-NEXT:    vpsrlq $50, %xmm1, %xmm2
1723; AVX1-NEXT:    vpsrlq $60, %xmm1, %xmm1
1724; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1725; AVX1-NEXT:    vpsllq $14, %xmm0, %xmm2
1726; AVX1-NEXT:    vpsllq $4, %xmm0, %xmm0
1727; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7]
1728; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1729; AVX1-NEXT:    retq
1730;
1731; AVX2-LABEL: constant_funnnel_v2i64:
1732; AVX2:       # %bb.0:
1733; AVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1734; AVX2-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1735; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1736; AVX2-NEXT:    retq
1737;
1738; AVX512F-LABEL: constant_funnnel_v2i64:
1739; AVX512F:       # %bb.0:
1740; AVX512F-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1741; AVX512F-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1742; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1743; AVX512F-NEXT:    retq
1744;
1745; AVX512VL-LABEL: constant_funnnel_v2i64:
1746; AVX512VL:       # %bb.0:
1747; AVX512VL-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1748; AVX512VL-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1749; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1750; AVX512VL-NEXT:    retq
1751;
1752; AVX512BW-LABEL: constant_funnnel_v2i64:
1753; AVX512BW:       # %bb.0:
1754; AVX512BW-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1755; AVX512BW-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1756; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1757; AVX512BW-NEXT:    retq
1758;
1759; AVX512VBMI2-LABEL: constant_funnnel_v2i64:
1760; AVX512VBMI2:       # %bb.0:
1761; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1762; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1763; AVX512VBMI2-NEXT:    vpmovsxbq {{.*#+}} xmm2 = [4,14]
1764; AVX512VBMI2-NEXT:    vpshldvq %zmm2, %zmm1, %zmm0
1765; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1766; AVX512VBMI2-NEXT:    vzeroupper
1767; AVX512VBMI2-NEXT:    retq
1768;
1769; AVX512VLBW-LABEL: constant_funnnel_v2i64:
1770; AVX512VLBW:       # %bb.0:
1771; AVX512VLBW-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1772; AVX512VLBW-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1773; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1774; AVX512VLBW-NEXT:    retq
1775;
1776; AVX512VLVBMI2-LABEL: constant_funnnel_v2i64:
1777; AVX512VLVBMI2:       # %bb.0:
1778; AVX512VLVBMI2-NEXT:    vpshldvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
1779; AVX512VLVBMI2-NEXT:    retq
1780;
1781; XOPAVX1-LABEL: constant_funnnel_v2i64:
1782; XOPAVX1:       # %bb.0:
1783; XOPAVX1-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1784; XOPAVX1-NEXT:    vpshlq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1785; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1786; XOPAVX1-NEXT:    retq
1787;
1788; XOPAVX2-LABEL: constant_funnnel_v2i64:
1789; XOPAVX2:       # %bb.0:
1790; XOPAVX2-NEXT:    vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1791; XOPAVX2-NEXT:    vpsllvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1792; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1793; XOPAVX2-NEXT:    retq
1794;
1795; X86-SSE2-LABEL: constant_funnnel_v2i64:
1796; X86-SSE2:       # %bb.0:
1797; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
1798; X86-SSE2-NEXT:    psrlq $60, %xmm2
1799; X86-SSE2-NEXT:    psrlq $50, %xmm1
1800; X86-SSE2-NEXT:    shufpd {{.*#+}} xmm2 = xmm2[0],xmm1[1]
1801; X86-SSE2-NEXT:    movdqa %xmm0, %xmm1
1802; X86-SSE2-NEXT:    psllq $4, %xmm1
1803; X86-SSE2-NEXT:    psllq $14, %xmm0
1804; X86-SSE2-NEXT:    movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
1805; X86-SSE2-NEXT:    orpd %xmm2, %xmm0
1806; X86-SSE2-NEXT:    retl
1807  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 4, i64 14>)
1808  ret <2 x i64> %res
1809}
1810
1811define <4 x i32> @constant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
1812; SSE2-LABEL: constant_funnnel_v4i32:
1813; SSE2:       # %bb.0:
1814; SSE2-NEXT:    movdqa %xmm1, %xmm2
1815; SSE2-NEXT:    psrld $25, %xmm2
1816; SSE2-NEXT:    movdqa %xmm1, %xmm3
1817; SSE2-NEXT:    psrld $26, %xmm3
1818; SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1819; SSE2-NEXT:    movdqa %xmm1, %xmm2
1820; SSE2-NEXT:    psrld $27, %xmm2
1821; SSE2-NEXT:    psrld $28, %xmm1
1822; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1823; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1824; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1825; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1826; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1827; SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
1828; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1829; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1830; SSE2-NEXT:    por %xmm1, %xmm0
1831; SSE2-NEXT:    retq
1832;
1833; SSE41-LABEL: constant_funnnel_v4i32:
1834; SSE41:       # %bb.0:
1835; SSE41-NEXT:    movdqa %xmm1, %xmm2
1836; SSE41-NEXT:    psrld $25, %xmm2
1837; SSE41-NEXT:    movdqa %xmm1, %xmm3
1838; SSE41-NEXT:    psrld $27, %xmm3
1839; SSE41-NEXT:    pblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1840; SSE41-NEXT:    movdqa %xmm1, %xmm2
1841; SSE41-NEXT:    psrld $26, %xmm2
1842; SSE41-NEXT:    psrld $28, %xmm1
1843; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm1[0,1,2,3],xmm2[4,5,6,7]
1844; SSE41-NEXT:    pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2,3],xmm2[4,5],xmm3[6,7]
1845; SSE41-NEXT:    pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
1846; SSE41-NEXT:    por %xmm2, %xmm0
1847; SSE41-NEXT:    retq
1848;
1849; AVX1-LABEL: constant_funnnel_v4i32:
1850; AVX1:       # %bb.0:
1851; AVX1-NEXT:    vpsrld $25, %xmm1, %xmm2
1852; AVX1-NEXT:    vpsrld $27, %xmm1, %xmm3
1853; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7]
1854; AVX1-NEXT:    vpsrld $26, %xmm1, %xmm3
1855; AVX1-NEXT:    vpsrld $28, %xmm1, %xmm1
1856; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7]
1857; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
1858; AVX1-NEXT:    vpmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1859; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1860; AVX1-NEXT:    retq
1861;
1862; AVX2-LABEL: constant_funnnel_v4i32:
1863; AVX2:       # %bb.0:
1864; AVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1865; AVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1866; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1867; AVX2-NEXT:    retq
1868;
1869; AVX512F-LABEL: constant_funnnel_v4i32:
1870; AVX512F:       # %bb.0:
1871; AVX512F-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1872; AVX512F-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1873; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1874; AVX512F-NEXT:    retq
1875;
1876; AVX512VL-LABEL: constant_funnnel_v4i32:
1877; AVX512VL:       # %bb.0:
1878; AVX512VL-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1879; AVX512VL-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1880; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1881; AVX512VL-NEXT:    retq
1882;
1883; AVX512BW-LABEL: constant_funnnel_v4i32:
1884; AVX512BW:       # %bb.0:
1885; AVX512BW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1886; AVX512BW-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1887; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1888; AVX512BW-NEXT:    retq
1889;
1890; AVX512VBMI2-LABEL: constant_funnnel_v4i32:
1891; AVX512VBMI2:       # %bb.0:
1892; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1893; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1894; AVX512VBMI2-NEXT:    vpmovsxbd {{.*#+}} xmm2 = [4,5,6,7]
1895; AVX512VBMI2-NEXT:    vpshldvd %zmm2, %zmm1, %zmm0
1896; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
1897; AVX512VBMI2-NEXT:    vzeroupper
1898; AVX512VBMI2-NEXT:    retq
1899;
1900; AVX512VLBW-LABEL: constant_funnnel_v4i32:
1901; AVX512VLBW:       # %bb.0:
1902; AVX512VLBW-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1903; AVX512VLBW-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1904; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1905; AVX512VLBW-NEXT:    retq
1906;
1907; AVX512VLVBMI2-LABEL: constant_funnnel_v4i32:
1908; AVX512VLVBMI2:       # %bb.0:
1909; AVX512VLVBMI2-NEXT:    vpshldvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
1910; AVX512VLVBMI2-NEXT:    retq
1911;
1912; XOPAVX1-LABEL: constant_funnnel_v4i32:
1913; XOPAVX1:       # %bb.0:
1914; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1915; XOPAVX1-NEXT:    vpshld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1916; XOPAVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
1917; XOPAVX1-NEXT:    retq
1918;
1919; XOPAVX2-LABEL: constant_funnnel_v4i32:
1920; XOPAVX2:       # %bb.0:
1921; XOPAVX2-NEXT:    vpsrlvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
1922; XOPAVX2-NEXT:    vpsllvd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
1923; XOPAVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
1924; XOPAVX2-NEXT:    retq
1925;
1926; X86-SSE2-LABEL: constant_funnnel_v4i32:
1927; X86-SSE2:       # %bb.0:
1928; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
1929; X86-SSE2-NEXT:    psrld $25, %xmm2
1930; X86-SSE2-NEXT:    movdqa %xmm1, %xmm3
1931; X86-SSE2-NEXT:    psrld $26, %xmm3
1932; X86-SSE2-NEXT:    punpckhqdq {{.*#+}} xmm3 = xmm3[1],xmm2[1]
1933; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
1934; X86-SSE2-NEXT:    psrld $27, %xmm2
1935; X86-SSE2-NEXT:    psrld $28, %xmm1
1936; X86-SSE2-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
1937; X86-SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3]
1938; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
1939; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
1940; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
1941; X86-SSE2-NEXT:    pmuludq {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
1942; X86-SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
1943; X86-SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
1944; X86-SSE2-NEXT:    por %xmm1, %xmm0
1945; X86-SSE2-NEXT:    retl
1946  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 5, i32 6, i32 7>)
1947  ret <4 x i32> %res
1948}
1949
1950define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
1951; SSE-LABEL: constant_funnnel_v8i16:
1952; SSE:       # %bb.0:
1953; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [1,2,4,8,16,32,64,128]
1954; SSE-NEXT:    psrlw $1, %xmm1
1955; SSE-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,4,8,16,32,64,128,256]
1956; SSE-NEXT:    por %xmm1, %xmm0
1957; SSE-NEXT:    retq
1958;
1959; AVX-LABEL: constant_funnnel_v8i16:
1960; AVX:       # %bb.0:
1961; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
1962; AVX-NEXT:    vpsrlw $1, %xmm1, %xmm1
1963; AVX-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256]
1964; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
1965; AVX-NEXT:    retq
1966;
1967; AVX512F-LABEL: constant_funnnel_v8i16:
1968; AVX512F:       # %bb.0:
1969; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
1970; AVX512F-NEXT:    vpsrlw $1, %xmm1, %xmm1
1971; AVX512F-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256]
1972; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
1973; AVX512F-NEXT:    retq
1974;
1975; AVX512VL-LABEL: constant_funnnel_v8i16:
1976; AVX512VL:       # %bb.0:
1977; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
1978; AVX512VL-NEXT:    vpsrlw $1, %xmm1, %xmm1
1979; AVX512VL-NEXT:    vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 # [2,4,8,16,32,64,128,256]
1980; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
1981; AVX512VL-NEXT:    retq
1982;
1983; AVX512BW-LABEL: constant_funnnel_v8i16:
1984; AVX512BW:       # %bb.0:
1985; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1986; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
1987; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
1988; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8]
1989; AVX512BW-NEXT:    vpsrlw $1, %xmm1, %xmm1
1990; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm1, %zmm1
1991; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
1992; AVX512BW-NEXT:    vzeroupper
1993; AVX512BW-NEXT:    retq
1994;
1995; AVX512VBMI2-LABEL: constant_funnnel_v8i16:
1996; AVX512VBMI2:       # %bb.0:
1997; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
1998; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
1999; AVX512VBMI2-NEXT:    vpmovsxbw {{.*#+}} xmm2 = [0,1,2,3,4,5,6,7]
2000; AVX512VBMI2-NEXT:    vpshldvw %zmm2, %zmm1, %zmm0
2001; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2002; AVX512VBMI2-NEXT:    vzeroupper
2003; AVX512VBMI2-NEXT:    retq
2004;
2005; AVX512VLBW-LABEL: constant_funnnel_v8i16:
2006; AVX512VLBW:       # %bb.0:
2007; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2008; AVX512VLBW-NEXT:    vpsrlw $1, %xmm1, %xmm1
2009; AVX512VLBW-NEXT:    vpsrlvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2010; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
2011; AVX512VLBW-NEXT:    retq
2012;
2013; AVX512VLVBMI2-LABEL: constant_funnnel_v8i16:
2014; AVX512VLVBMI2:       # %bb.0:
2015; AVX512VLVBMI2-NEXT:    vpshldvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
2016; AVX512VLVBMI2-NEXT:    retq
2017;
2018; XOP-LABEL: constant_funnnel_v8i16:
2019; XOP:       # %bb.0:
2020; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2021; XOP-NEXT:    vpsrlw $1, %xmm1, %xmm1
2022; XOP-NEXT:    vpshlw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2023; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2024; XOP-NEXT:    retq
2025;
2026; X86-SSE2-LABEL: constant_funnnel_v8i16:
2027; X86-SSE2:       # %bb.0:
2028; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 # [1,2,4,8,16,32,64,128]
2029; X86-SSE2-NEXT:    psrlw $1, %xmm1
2030; X86-SSE2-NEXT:    pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [2,4,8,16,32,64,128,256]
2031; X86-SSE2-NEXT:    por %xmm1, %xmm0
2032; X86-SSE2-NEXT:    retl
2033  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>)
2034  ret <8 x i16> %res
2035}
2036
2037define <16 x i8> @constant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2038; SSE-LABEL: constant_funnnel_v16i8:
2039; SSE:       # %bb.0:
2040; SSE-NEXT:    movdqa %xmm1, %xmm2
2041; SSE-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2042; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 # [1,128,64,32,16,8,4,2]
2043; SSE-NEXT:    psrlw $8, %xmm2
2044; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2045; SSE-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [1,2,4,8,16,32,64,128]
2046; SSE-NEXT:    psrlw $8, %xmm1
2047; SSE-NEXT:    packuswb %xmm2, %xmm1
2048; SSE-NEXT:    movdqa %xmm1, %xmm0
2049; SSE-NEXT:    retq
2050;
2051; AVX-LABEL: constant_funnnel_v16i8:
2052; AVX:       # %bb.0:
2053; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2054; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2]
2055; AVX-NEXT:    vpsrlw $8, %xmm2, %xmm2
2056; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2057; AVX-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
2058; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm0
2059; AVX-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2060; AVX-NEXT:    retq
2061;
2062; AVX512F-LABEL: constant_funnnel_v16i8:
2063; AVX512F:       # %bb.0:
2064; AVX512F-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2065; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2]
2066; AVX512F-NEXT:    vpsrlw $8, %xmm2, %xmm2
2067; AVX512F-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2068; AVX512F-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
2069; AVX512F-NEXT:    vpsrlw $8, %xmm0, %xmm0
2070; AVX512F-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2071; AVX512F-NEXT:    retq
2072;
2073; AVX512VL-LABEL: constant_funnnel_v16i8:
2074; AVX512VL:       # %bb.0:
2075; AVX512VL-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
2076; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2 # [1,128,64,32,16,8,4,2]
2077; AVX512VL-NEXT:    vpsrlw $8, %xmm2, %xmm2
2078; AVX512VL-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2079; AVX512VL-NEXT:    vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 # [1,2,4,8,16,32,64,128]
2080; AVX512VL-NEXT:    vpsrlw $8, %xmm0, %xmm0
2081; AVX512VL-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
2082; AVX512VL-NEXT:    retq
2083;
2084; AVX512BW-LABEL: constant_funnnel_v16i8:
2085; AVX512BW:       # %bb.0:
2086; AVX512BW-NEXT:    vpmovsxbw {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2087; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2088; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2089; AVX512BW-NEXT:    vpsllw $8, %ymm0, %ymm0
2090; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
2091; AVX512BW-NEXT:    vpsllvw %zmm2, %zmm0, %zmm0
2092; AVX512BW-NEXT:    vpsrlw $8, %ymm0, %ymm0
2093; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
2094; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2095; AVX512BW-NEXT:    vzeroupper
2096; AVX512BW-NEXT:    retq
2097;
2098; AVX512VBMI2-LABEL: constant_funnnel_v16i8:
2099; AVX512VBMI2:       # %bb.0:
2100; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
2101; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2102; AVX512VBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,64,1,65,2,66,3,67,4,68,5,69,6,70,7,71,8,72,9,73,10,74,11,75,12,76,13,77,14,78,15,79]
2103; AVX512VBMI2-NEXT:    vpermt2b %zmm0, %zmm2, %zmm1
2104; AVX512VBMI2-NEXT:    vpmovsxbw {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,0,7,6,5,4,3,2,1]
2105; AVX512VBMI2-NEXT:    vpsllvw %zmm0, %zmm1, %zmm0
2106; AVX512VBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0
2107; AVX512VBMI2-NEXT:    vpmovwb %zmm0, %ymm0
2108; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $ymm0
2109; AVX512VBMI2-NEXT:    vzeroupper
2110; AVX512VBMI2-NEXT:    retq
2111;
2112; AVX512VLBW-LABEL: constant_funnnel_v16i8:
2113; AVX512VLBW:       # %bb.0:
2114; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero,xmm1[8],zero,xmm1[9],zero,xmm1[10],zero,xmm1[11],zero,xmm1[12],zero,xmm1[13],zero,xmm1[14],zero,xmm1[15],zero
2115; AVX512VLBW-NEXT:    vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
2116; AVX512VLBW-NEXT:    vpsllw $8, %ymm0, %ymm0
2117; AVX512VLBW-NEXT:    vpor %ymm1, %ymm0, %ymm0
2118; AVX512VLBW-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
2119; AVX512VLBW-NEXT:    vpsrlw $8, %ymm0, %ymm0
2120; AVX512VLBW-NEXT:    vpmovwb %ymm0, %xmm0
2121; AVX512VLBW-NEXT:    vzeroupper
2122; AVX512VLBW-NEXT:    retq
2123;
2124; AVX512VLVBMI2-LABEL: constant_funnnel_v16i8:
2125; AVX512VLVBMI2:       # %bb.0:
2126; AVX512VLVBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $ymm1
2127; AVX512VLVBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $ymm0
2128; AVX512VLVBMI2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,32,1,33,2,34,3,35,4,36,5,37,6,38,7,39,8,40,9,41,10,42,11,43,12,44,13,45,14,46,15,47]
2129; AVX512VLVBMI2-NEXT:    vpermi2b %ymm0, %ymm1, %ymm2
2130; AVX512VLVBMI2-NEXT:    vpsllvw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm0
2131; AVX512VLVBMI2-NEXT:    vpsrlw $8, %ymm0, %ymm0
2132; AVX512VLVBMI2-NEXT:    vpmovwb %ymm0, %xmm0
2133; AVX512VLVBMI2-NEXT:    vzeroupper
2134; AVX512VLVBMI2-NEXT:    retq
2135;
2136; XOP-LABEL: constant_funnnel_v16i8:
2137; XOP:       # %bb.0:
2138; XOP-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
2139; XOP-NEXT:    vpshlb %xmm2, %xmm1, %xmm1
2140; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2141; XOP-NEXT:    vpshlb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2142; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2143; XOP-NEXT:    retq
2144;
2145; X86-SSE2-LABEL: constant_funnnel_v16i8:
2146; X86-SSE2:       # %bb.0:
2147; X86-SSE2-NEXT:    movdqa %xmm1, %xmm2
2148; X86-SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
2149; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 # [1,128,64,32,16,8,4,2]
2150; X86-SSE2-NEXT:    psrlw $8, %xmm2
2151; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
2152; X86-SSE2-NEXT:    pmullw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 # [1,2,4,8,16,32,64,128]
2153; X86-SSE2-NEXT:    psrlw $8, %xmm1
2154; X86-SSE2-NEXT:    packuswb %xmm2, %xmm1
2155; X86-SSE2-NEXT:    movdqa %xmm1, %xmm0
2156; X86-SSE2-NEXT:    retl
2157  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1>)
2158  ret <16 x i8> %res
2159}
2160
2161;
2162; Uniform Constant Shifts
2163;
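; A uniform constant amount needs no masking at all: the default lowering is a
; pair of immediate shifts plus a por, and AVX512VBMI2 targets fold the whole
; operation into a single immediate double-shift (e.g. vpshldq $14, vpshldd $4).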
2164
2165define <2 x i64> @splatconstant_funnnel_v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
2166; SSE-LABEL: splatconstant_funnnel_v2i64:
2167; SSE:       # %bb.0:
2168; SSE-NEXT:    psrlq $50, %xmm1
2169; SSE-NEXT:    psllq $14, %xmm0
2170; SSE-NEXT:    por %xmm1, %xmm0
2171; SSE-NEXT:    retq
2172;
2173; AVX-LABEL: splatconstant_funnnel_v2i64:
2174; AVX:       # %bb.0:
2175; AVX-NEXT:    vpsrlq $50, %xmm1, %xmm1
2176; AVX-NEXT:    vpsllq $14, %xmm0, %xmm0
2177; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2178; AVX-NEXT:    retq
2179;
2180; AVX512F-LABEL: splatconstant_funnnel_v2i64:
2181; AVX512F:       # %bb.0:
2182; AVX512F-NEXT:    vpsrlq $50, %xmm1, %xmm1
2183; AVX512F-NEXT:    vpsllq $14, %xmm0, %xmm0
2184; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
2185; AVX512F-NEXT:    retq
2186;
2187; AVX512VL-LABEL: splatconstant_funnnel_v2i64:
2188; AVX512VL:       # %bb.0:
2189; AVX512VL-NEXT:    vpsrlq $50, %xmm1, %xmm1
2190; AVX512VL-NEXT:    vpsllq $14, %xmm0, %xmm0
2191; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
2192; AVX512VL-NEXT:    retq
2193;
2194; AVX512BW-LABEL: splatconstant_funnnel_v2i64:
2195; AVX512BW:       # %bb.0:
2196; AVX512BW-NEXT:    vpsrlq $50, %xmm1, %xmm1
2197; AVX512BW-NEXT:    vpsllq $14, %xmm0, %xmm0
2198; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
2199; AVX512BW-NEXT:    retq
2200;
2201; AVX512VBMI2-LABEL: splatconstant_funnnel_v2i64:
2202; AVX512VBMI2:       # %bb.0:
2203; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
2204; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2205; AVX512VBMI2-NEXT:    vpshldq $14, %zmm1, %zmm0, %zmm0
2206; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2207; AVX512VBMI2-NEXT:    vzeroupper
2208; AVX512VBMI2-NEXT:    retq
2209;
2210; AVX512VLBW-LABEL: splatconstant_funnnel_v2i64:
2211; AVX512VLBW:       # %bb.0:
2212; AVX512VLBW-NEXT:    vpsrlq $50, %xmm1, %xmm1
2213; AVX512VLBW-NEXT:    vpsllq $14, %xmm0, %xmm0
2214; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
2215; AVX512VLBW-NEXT:    retq
2216;
2217; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v2i64:
2218; AVX512VLVBMI2:       # %bb.0:
2219; AVX512VLVBMI2-NEXT:    vpshldq $14, %xmm1, %xmm0, %xmm0
2220; AVX512VLVBMI2-NEXT:    retq
2221;
2222; XOP-LABEL: splatconstant_funnnel_v2i64:
2223; XOP:       # %bb.0:
2224; XOP-NEXT:    vpsrlq $50, %xmm1, %xmm1
2225; XOP-NEXT:    vpsllq $14, %xmm0, %xmm0
2226; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2227; XOP-NEXT:    retq
2228;
2229; X86-SSE2-LABEL: splatconstant_funnnel_v2i64:
2230; X86-SSE2:       # %bb.0:
2231; X86-SSE2-NEXT:    psrlq $50, %xmm1
2232; X86-SSE2-NEXT:    psllq $14, %xmm0
2233; X86-SSE2-NEXT:    por %xmm1, %xmm0
2234; X86-SSE2-NEXT:    retl
2235  %res = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> <i64 14, i64 14>)
2236  ret <2 x i64> %res
2237}
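; Editorial note (not autogenerated): vpshldq concatenates each x:y element pair
; and shifts left, so VBMI2 targets need no separate shift/or at all. Without
; AVX512VL the instruction only exists in 512-bit form, which is why the plain
; AVX512VBMI2 run widens the xmm inputs to zmm (the "kill" comments), performs
; vpshldq $14 on zmm and ends with vzeroupper, while AVX512VLVBMI2 issues the
; same operation directly on xmm.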
2238
2239define <4 x i32> @splatconstant_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
2240; SSE-LABEL: splatconstant_funnnel_v4i32:
2241; SSE:       # %bb.0:
2242; SSE-NEXT:    psrld $28, %xmm1
2243; SSE-NEXT:    pslld $4, %xmm0
2244; SSE-NEXT:    por %xmm1, %xmm0
2245; SSE-NEXT:    retq
2246;
2247; AVX-LABEL: splatconstant_funnnel_v4i32:
2248; AVX:       # %bb.0:
2249; AVX-NEXT:    vpsrld $28, %xmm1, %xmm1
2250; AVX-NEXT:    vpslld $4, %xmm0, %xmm0
2251; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2252; AVX-NEXT:    retq
2253;
2254; AVX512F-LABEL: splatconstant_funnnel_v4i32:
2255; AVX512F:       # %bb.0:
2256; AVX512F-NEXT:    vpsrld $28, %xmm1, %xmm1
2257; AVX512F-NEXT:    vpslld $4, %xmm0, %xmm0
2258; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
2259; AVX512F-NEXT:    retq
2260;
2261; AVX512VL-LABEL: splatconstant_funnnel_v4i32:
2262; AVX512VL:       # %bb.0:
2263; AVX512VL-NEXT:    vpsrld $28, %xmm1, %xmm1
2264; AVX512VL-NEXT:    vpslld $4, %xmm0, %xmm0
2265; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
2266; AVX512VL-NEXT:    retq
2267;
2268; AVX512BW-LABEL: splatconstant_funnnel_v4i32:
2269; AVX512BW:       # %bb.0:
2270; AVX512BW-NEXT:    vpsrld $28, %xmm1, %xmm1
2271; AVX512BW-NEXT:    vpslld $4, %xmm0, %xmm0
2272; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
2273; AVX512BW-NEXT:    retq
2274;
2275; AVX512VBMI2-LABEL: splatconstant_funnnel_v4i32:
2276; AVX512VBMI2:       # %bb.0:
2277; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
2278; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2279; AVX512VBMI2-NEXT:    vpshldd $4, %zmm1, %zmm0, %zmm0
2280; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2281; AVX512VBMI2-NEXT:    vzeroupper
2282; AVX512VBMI2-NEXT:    retq
2283;
2284; AVX512VLBW-LABEL: splatconstant_funnnel_v4i32:
2285; AVX512VLBW:       # %bb.0:
2286; AVX512VLBW-NEXT:    vpsrld $28, %xmm1, %xmm1
2287; AVX512VLBW-NEXT:    vpslld $4, %xmm0, %xmm0
2288; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
2289; AVX512VLBW-NEXT:    retq
2290;
2291; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v4i32:
2292; AVX512VLVBMI2:       # %bb.0:
2293; AVX512VLVBMI2-NEXT:    vpshldd $4, %xmm1, %xmm0, %xmm0
2294; AVX512VLVBMI2-NEXT:    retq
2295;
2296; XOP-LABEL: splatconstant_funnnel_v4i32:
2297; XOP:       # %bb.0:
2298; XOP-NEXT:    vpsrld $28, %xmm1, %xmm1
2299; XOP-NEXT:    vpslld $4, %xmm0, %xmm0
2300; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2301; XOP-NEXT:    retq
2302;
2303; X86-SSE2-LABEL: splatconstant_funnnel_v4i32:
2304; X86-SSE2:       # %bb.0:
2305; X86-SSE2-NEXT:    psrld $28, %xmm1
2306; X86-SSE2-NEXT:    pslld $4, %xmm0
2307; X86-SSE2-NEXT:    por %xmm1, %xmm0
2308; X86-SSE2-NEXT:    retl
2309  %res = call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 4, i32 4, i32 4, i32 4>)
2310  ret <4 x i32> %res
2311}
2312
2313define <8 x i16> @splatconstant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind {
2314; SSE-LABEL: splatconstant_funnnel_v8i16:
2315; SSE:       # %bb.0:
2316; SSE-NEXT:    psrlw $9, %xmm1
2317; SSE-NEXT:    psllw $7, %xmm0
2318; SSE-NEXT:    por %xmm1, %xmm0
2319; SSE-NEXT:    retq
2320;
2321; AVX-LABEL: splatconstant_funnnel_v8i16:
2322; AVX:       # %bb.0:
2323; AVX-NEXT:    vpsrlw $9, %xmm1, %xmm1
2324; AVX-NEXT:    vpsllw $7, %xmm0, %xmm0
2325; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2326; AVX-NEXT:    retq
2327;
2328; AVX512F-LABEL: splatconstant_funnnel_v8i16:
2329; AVX512F:       # %bb.0:
2330; AVX512F-NEXT:    vpsrlw $9, %xmm1, %xmm1
2331; AVX512F-NEXT:    vpsllw $7, %xmm0, %xmm0
2332; AVX512F-NEXT:    vpor %xmm1, %xmm0, %xmm0
2333; AVX512F-NEXT:    retq
2334;
2335; AVX512VL-LABEL: splatconstant_funnnel_v8i16:
2336; AVX512VL:       # %bb.0:
2337; AVX512VL-NEXT:    vpsrlw $9, %xmm1, %xmm1
2338; AVX512VL-NEXT:    vpsllw $7, %xmm0, %xmm0
2339; AVX512VL-NEXT:    vpor %xmm1, %xmm0, %xmm0
2340; AVX512VL-NEXT:    retq
2341;
2342; AVX512BW-LABEL: splatconstant_funnnel_v8i16:
2343; AVX512BW:       # %bb.0:
2344; AVX512BW-NEXT:    vpsrlw $9, %xmm1, %xmm1
2345; AVX512BW-NEXT:    vpsllw $7, %xmm0, %xmm0
2346; AVX512BW-NEXT:    vpor %xmm1, %xmm0, %xmm0
2347; AVX512BW-NEXT:    retq
2348;
2349; AVX512VBMI2-LABEL: splatconstant_funnnel_v8i16:
2350; AVX512VBMI2:       # %bb.0:
2351; AVX512VBMI2-NEXT:    # kill: def $xmm1 killed $xmm1 def $zmm1
2352; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 def $zmm0
2353; AVX512VBMI2-NEXT:    vpshldw $7, %zmm1, %zmm0, %zmm0
2354; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2355; AVX512VBMI2-NEXT:    vzeroupper
2356; AVX512VBMI2-NEXT:    retq
2357;
2358; AVX512VLBW-LABEL: splatconstant_funnnel_v8i16:
2359; AVX512VLBW:       # %bb.0:
2360; AVX512VLBW-NEXT:    vpsrlw $9, %xmm1, %xmm1
2361; AVX512VLBW-NEXT:    vpsllw $7, %xmm0, %xmm0
2362; AVX512VLBW-NEXT:    vpor %xmm1, %xmm0, %xmm0
2363; AVX512VLBW-NEXT:    retq
2364;
2365; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v8i16:
2366; AVX512VLVBMI2:       # %bb.0:
2367; AVX512VLVBMI2-NEXT:    vpshldw $7, %xmm1, %xmm0, %xmm0
2368; AVX512VLVBMI2-NEXT:    retq
2369;
2370; XOP-LABEL: splatconstant_funnnel_v8i16:
2371; XOP:       # %bb.0:
2372; XOP-NEXT:    vpsrlw $9, %xmm1, %xmm1
2373; XOP-NEXT:    vpsllw $7, %xmm0, %xmm0
2374; XOP-NEXT:    vpor %xmm1, %xmm0, %xmm0
2375; XOP-NEXT:    retq
2376;
2377; X86-SSE2-LABEL: splatconstant_funnnel_v8i16:
2378; X86-SSE2:       # %bb.0:
2379; X86-SSE2-NEXT:    psrlw $9, %xmm1
2380; X86-SSE2-NEXT:    psllw $7, %xmm0
2381; X86-SSE2-NEXT:    por %xmm1, %xmm0
2382; X86-SSE2-NEXT:    retl
2383  %res = call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>)
2384  ret <8 x i16> %res
2385}
2386
2387define <16 x i8> @splatconstant_funnnel_v16i8(<16 x i8> %x, <16 x i8> %y) nounwind {
2388; SSE-LABEL: splatconstant_funnnel_v16i8:
2389; SSE:       # %bb.0:
2390; SSE-NEXT:    psrlw $4, %xmm1
2391; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
2392; SSE-NEXT:    psllw $4, %xmm0
2393; SSE-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
2394; SSE-NEXT:    por %xmm1, %xmm0
2395; SSE-NEXT:    retq
2396;
2397; AVX-LABEL: splatconstant_funnnel_v16i8:
2398; AVX:       # %bb.0:
2399; AVX-NEXT:    vpsrlw $4, %xmm1, %xmm1
2400; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
2401; AVX-NEXT:    vpsllw $4, %xmm0, %xmm0
2402; AVX-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
2403; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
2404; AVX-NEXT:    retq
2405;
2406; AVX512F-LABEL: splatconstant_funnnel_v16i8:
2407; AVX512F:       # %bb.0:
2408; AVX512F-NEXT:    vpsllw $4, %xmm0, %xmm2
2409; AVX512F-NEXT:    vpsrlw $4, %xmm1, %xmm0
2410; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
2411; AVX512F-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2412; AVX512F-NEXT:    vzeroupper
2413; AVX512F-NEXT:    retq
2414;
2415; AVX512VL-LABEL: splatconstant_funnnel_v16i8:
2416; AVX512VL:       # %bb.0:
2417; AVX512VL-NEXT:    vpsllw $4, %xmm0, %xmm2
2418; AVX512VL-NEXT:    vpsrlw $4, %xmm1, %xmm0
2419; AVX512VL-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
2420; AVX512VL-NEXT:    retq
2421;
2422; AVX512BW-LABEL: splatconstant_funnnel_v16i8:
2423; AVX512BW:       # %bb.0:
2424; AVX512BW-NEXT:    vpsllw $4, %xmm0, %xmm2
2425; AVX512BW-NEXT:    vpsrlw $4, %xmm1, %xmm0
2426; AVX512BW-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
2427; AVX512BW-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2428; AVX512BW-NEXT:    vzeroupper
2429; AVX512BW-NEXT:    retq
2430;
2431; AVX512VBMI2-LABEL: splatconstant_funnnel_v16i8:
2432; AVX512VBMI2:       # %bb.0:
2433; AVX512VBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
2434; AVX512VBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
2435; AVX512VBMI2-NEXT:    vpternlogd {{.*#+}} zmm0 = zmm0 ^ (mem & (zmm0 ^ zmm2))
2436; AVX512VBMI2-NEXT:    # kill: def $xmm0 killed $xmm0 killed $zmm0
2437; AVX512VBMI2-NEXT:    vzeroupper
2438; AVX512VBMI2-NEXT:    retq
2439;
2440; AVX512VLBW-LABEL: splatconstant_funnnel_v16i8:
2441; AVX512VLBW:       # %bb.0:
2442; AVX512VLBW-NEXT:    vpsllw $4, %xmm0, %xmm2
2443; AVX512VLBW-NEXT:    vpsrlw $4, %xmm1, %xmm0
2444; AVX512VLBW-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
2445; AVX512VLBW-NEXT:    retq
2446;
2447; AVX512VLVBMI2-LABEL: splatconstant_funnnel_v16i8:
2448; AVX512VLVBMI2:       # %bb.0:
2449; AVX512VLVBMI2-NEXT:    vpsllw $4, %xmm0, %xmm2
2450; AVX512VLVBMI2-NEXT:    vpsrlw $4, %xmm1, %xmm0
2451; AVX512VLVBMI2-NEXT:    vpternlogd {{.*#+}} xmm0 = xmm0 ^ (mem & (xmm0 ^ xmm2))
2452; AVX512VLVBMI2-NEXT:    retq
2453;
2454; XOP-LABEL: splatconstant_funnnel_v16i8:
2455; XOP:       # %bb.0:
2456; XOP-NEXT:    vpsrlw $4, %xmm1, %xmm1
2457; XOP-NEXT:    vpsllw $4, %xmm0, %xmm0
2458; XOP-NEXT:    vpcmov {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0, %xmm0
2459; XOP-NEXT:    retq
2460;
2461; X86-SSE2-LABEL: splatconstant_funnnel_v16i8:
2462; X86-SSE2:       # %bb.0:
2463; X86-SSE2-NEXT:    psrlw $4, %xmm1
2464; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
2465; X86-SSE2-NEXT:    psllw $4, %xmm0
2466; X86-SSE2-NEXT:    pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
2467; X86-SSE2-NEXT:    por %xmm1, %xmm0
2468; X86-SSE2-NEXT:    retl
2469  %res = call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>)
2470  ret <16 x i8> %res
2471}
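; Editorial note (not autogenerated): x86 has no packed 8-bit shifts, so the
; byte case shifts 16-bit words and masks away the bits that cross byte
; boundaries; the pand constants (in the constant pool, not visible here) are
; the usual nibble masks, 0xF0 after psllw $4 and 0x0F after psrlw $4. AVX-512
; folds both masks and the or into one vpternlogd bit-select, and XOP does the
; same with vpcmov. A minimal scalar sketch of the masked form, assuming a
; uniform amount of 4 (the ands are redundant for a true i8 shift and only
; mirror the vector masks); illustrative only, no CHECK lines:
define i8 @splatconstant_fshl_i8_sketch(i8 %x, i8 %y) nounwind {
  %hi0 = shl i8 %x, 4
  %hi = and i8 %hi0, -16         ; 0xF0: bits that stay inside this byte
  %lo0 = lshr i8 %y, 4
  %lo = and i8 %lo0, 15          ; 0x0F
  %r = or i8 %hi, %lo
  ret i8 %r
}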
2472